\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
% Switch: true = enable colored highlighting (rebuttal / colored version); false = disable (camera-ready)

\usepackage{ifthen}
\usepackage{multirow}

\jmlrvolume{-- 120}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026 }

\title[AICL]{A Simple yet Effective Adaptive Inter-organ Contrastive Learning Framework for Unsupervised Domain Adaptation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

    

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Yiyou Sun\midljointauthortext{Contributed equally} \nametag{$^{1}$}} \orcid{0009-0005-9735-9902} 
\midlauthor{\Name{Yiyou Sun\nametag{$^{1}$}} \Email{yiyousun@link.cuhk.edu.hk}\\
\Name{Zheyao Gao\nametag{$^{2}$}} \Email{zheyaogao@cuhk.edu.hk}\\
\Name{Xiaogen Zhou\nametag{$^{2}$}} \Email{xiaogenzhou@cuhk.edu.hk}\\
\Name{Qi Dou\nametag{$^{2}$}} \Email{qidou@cuhk.edu.hk}\\
\Name{Winnie {Chiu Wing Chu}\nametag{$^{1}$}} \Email{winniechu@cuhk.edu.hk}\\
\addr $^{1}$ Department of Imaging and Interventional Radiology, CU Lab of AI in Radiology, The Chinese University of Hong Kong, Hong Kong, SAR, China \AND
\addr $^{2}$ Department of Computer Science and Engineering, Institute of Medical Intelligence, The Chinese University of Hong Kong, SAR, China \AND
}

\begin{document}

\maketitle

\begin{abstract}

Strong unsupervised domain adaptation (UDA) in multi-organ segmentation seeks to unify complementary information from heterogeneous imaging protocols within a single model without sacrificing source-modality performance, yet the substantial domain gap between modalities makes feature-level alignment non-trivial. Pseudo-label learning (PLL) has emerged as the dominant paradigm, but it suffers from information loss due to hard thresholding and bias introduced by class imbalance and noisy predictions. Contrastive learning (CL) offers a complementary direction by structuring semantic contrast, yet existing voxel-level formulations incur prohibitive computational costs on volumetric data and fail to capture the global anatomical context critical for organ segmentation. In this work, we propose Adaptive Inter-organ Contrastive Learning (AICL), a unified UDA framework for 3D multi-organ cross-modality segmentation that exploits PLL and CL synergistically to facilitate better cross-modality feature alignment. AICL employs dynamic soft pseudo-labels as guidance in the feature latent space to organize inter-organ samples into positive-negative pairs for CL. Meanwhile, the model is trained with supervised consistency learning (SCL) using mixed ground truths and pseudo-labels, promoting a more discriminative and compact shared latent space. Extensive experiments and ablation studies on an orbital and a cardiac dataset reveal the effectiveness of each component and a significant advancement in segmentation results.

\end{abstract}


\begin{keywords}
Unsupervised Domain Adaptation, Multi-organ Segmentation
\end{keywords}

\section{Introduction}


Unsupervised domain adaptation (UDA) has become a cornerstone of cross-domain medical image segmentation~\cite{qu2024eh,lin2024safeguarding}, transferring knowledge from a labeled source to an unlabeled target domain~\cite{Shin2023SDC-UDA:Segmentation,Zhao2023LE-UDA:Segmentation,DBLP:journals/corr/abs-2508-00442}. 
The core challenge lies in the domain gap: substantial differences in intensity distributions, contrast profiles, and noise characteristics across modalities make direct feature-level comparison unreliable~\cite{Lee2021UnsupervisedTransfer,mir_cmclip,mir_deception}.

Early UDA methods emphasized either adversarial alignment in feature/output space or image-to-image translation. 
Adversarial alignment encourages global distribution matching but can blur semantic boundaries and underfit minority structures~\cite{Hoffman2016FCNsAdaptation,Chen2017ROAD:Scenes,VuADVENT:Segmentation}. 
Image translation~\cite{Park2020ContrastiveTranslation, Han2021DualTranslation} may partly reduce appearance gaps but risks altering anatomy and depends on cycle or structural constraints that are hard to satisfy in practice. 
Pseudo-label learning (PLL)~\cite{Zhao2023LE-UDA:Segmentation, Shin2023SDC-UDA:Segmentation} has become the cornerstone of modern UDA in medical segmentation. By converting model predictions into supervision, teacher--student frameworks~\cite{mir_stegaffd,chen2025stegavar} leverage uncertainty estimation~\cite{DBLP:journals/patterns/LinTWLJWY24,lin2025tpami,lin2024cvpr} and consistency checks to improve label quality. Despite these measures, inherent noise persists; hard thresholds lose information~\cite{Dumoulin2016AStyle, wang2022u2pl}, while class imbalance and low-confidence predictions introduce significant bias.

Contrastive learning (CL) offers a promising paradigm that addresses multi-organ segmentation by pulling consistent samples together while pushing apart dissimilar ones~\cite{Gu2024UnsupervisedSegmentation,WangHuntingSegmentation,Zhang2023Multi-modalSegmentation,lin2025invariance}. 
Most existing CL approaches, however, construct
contrastive objectives at the \emph{voxel level}, requiring exhaustive pairwise comparisons across full-resolution feature
maps~\cite{Park2020ContrastiveTranslation}. 
In 3D medical imaging, this voxel-wise paradigm relies on dense, voxel-level comparisons, resulting in a substantial computational bottleneck that limits scalability to high-resolution volumetric data. Moreover, pixel-to-pixel representations fail to capture the global contextual information essential for organ segmentation in medical images, where anatomical structures are inherently coherent. %A contrastive formulation that explicitly respects this structure is therefore desirable. 

 
Motivated by the above observations, we propose adaptive inter-organ contrastive learning (AICL), a unified UDA framework for 3D multi-organ cross-modality segmentation that synergizes the semantic guidance of PLL with the representation power of CL in a structure-aware manner. Instead of discarding uncertain target predictions through rigid hard thresholding, AICL retains the soft semantic information encoded in pseudo-labels and exploits it to guide both organ-patch sampling and pseudo-label consistency regularization. In this way, pseudo-labels are not merely treated as noisy supervision, but are further leveraged as semantic cues to dynamically organize the latent space, encouraging tighter class-wise clustering and more robust cross-domain feature alignment. Furthermore, AICL selectively samples positive and negative pairs for CL and performs alignment at the feature-patch level rather than at the voxel level, which is computationally prohibitive for volumetric data. This enables the model to focus on semantically meaningful organ representations while preserving richer local context than isolated pixel-to-pixel comparisons. Our contributions are summarized as follows: (1) We develop an organ-wise pseudo-label guided patch sampling (PGPS) strategy in the cross-modality feature alignment (CMFA) module for CL to guarantee optimal feature discrepancy and efficient feature representation. (2) We implement a pseudo-label-guided CL (PGCL) regularizer that complements the supervised consistency learning (SCL). This module effectively pushes apart cross-modality features from different classes while pulling together those from the same class in the latent embedding space, while remaining robust to the noise inherent in pseudo-labels.




\section{Methods}
%\section{Methodology}

The proposed adaptive inter-organ contrastive learning (AICL) framework processes interleaved inputs $I_{m} \in \mathbb{R}^{H\times W \times D} $ from source and target domains $m \in \{m_{1},m_{2}\}$ within a unified batch $B$ to generate comprehensive segmentation predictions. As illustrated in Fig.~\ref{fig:framework}, our architecture first employs a shared conditional instance normalization (CIN)~\cite{Dumoulin2016AStyle,Bastico2023ATransformers} vision transformer (ViT)~\cite{DBLP:journals/tits/GuoTWWWQL25} encoder to dynamically adapt feature distributions across diverse imaging modalities. %, addressing the challenges of heterogeneous data while preserving the cross-modal content consistency. 
Subsequently, CMFA systematically regularizes the shared latent feature space through PGCL, ensuring anatomical consistency between modalities. Finally, the framework integrates SCL to enforce consistency between predictions and corresponding available ground truth.



\begin{figure}[t!]
\includegraphics[width=\textwidth]{figures/framework3.pdf}
\caption{Flowchart of our proposed method. A. The overall pipeline takes interleaved multi-modal images as inputs. B. The framework utilizes a shared encoder to decouple modality-specific statistics from shared semantic representations $z$. C. CMFA module regularizes positive and negative paired samples generated from $F$ via PGPS by aligning cross-modal patch-wise features. D. SCL module jointly optimizes the model using ground truths $Y$ and pseudo labels $Pseudo$. } \label{fig:framework}
\end{figure}


\subsection{Modality-Adaptive Encoder}
The shared-weight encoder $E(\cdot, m)$ integrates a CIN mechanism, parameterized by modality-specific scaling and shifting parameters $\gamma_{m}$ and $\beta_{m}$ for each modality input $I_{m_{1}}, I_{m_{2}}$. The encoder projects inputs into a unified yet modality-adaptive latent feature space $F_{m}\in \mathbb{R}^{C \times \frac{H}{4} \times \frac{W}{4} \times \frac{D}{4}}$ by independently normalizing instance-specific statistics across source and target domains. In each layer $l$ of the latent feature space, feature maps $z_{l} (m)$ pass through a learnable parameterized CIN, which is shown in Fig.~\ref{fig:framework}B and defined as:
\begin{equation}
\mathrm{CIN}(z)= \gamma_{m} \left(\frac{z-\mu(z)}{\sigma(z)}\right) + \beta_{m},
\end{equation}
where $\mu(z)$ and $\sigma(z)$ represent channel-wise mean and standard deviation computed per instance within each batch.

Modality-specific learnable parameters $\gamma_{m}$ and $\beta_{m}$ are trained to decouple sensing-specific statistics (e.g., intensity, noise, and contrast) from modality-consistent semantic content, enabling a single shared encoder to generalize across modalities. CIN provides lightweight adaptation while preserving modality fidelity, thereby facilitating efficient training without compromising cross-modality alignment.  


\subsection{Cross-Modality Feature Alignment}

\subsubsection{Pseudo-label Guided Patch Sampling in Latent Feature Space}
In our framework, PGPS is illustrated in Fig.~\ref{fig:PGCL}. Specifically, inputs $I_{m}$ pass through a shared-weight encoder $E(\cdot,m)$ and a decoder $D$, yielding voxel-wise pseudo-labels $Y^{'}_{m}=D(E(I_{m}, m)) \in\mathbb{R}^{K\times H\times W \times D}$. To perform cross-modality alignment in latent feature space, we downsample $Y^{'}_{m}$ to feature-layer resolution $\overline{Y^{'}_{m}}\in \mathbb{R}^{K \times \frac{H}{4} \times \frac{W}{4} \times \frac{D}{4}}$ in spatial dimensions, where $k \in \{1,2,\dots,K\}$ indexes organ classes and $(h,w,d), h\in [0, \frac{H}{4}), w\in [0, \frac{W}{4}), d \in [0, \frac{D}{4})$ indexes voxel position. Leveraging the compactness of anatomical structures in medical images, we extract organ-wise feature patch embeddings $\hat{F}^{k}_{m} \in \mathbb{R}^{C \times H_{ps} \times W_{ps} \times D_{ps}}$ from ${F}_{m}$. Specifically, for each class $k$, region of interest (ROI) centers $(h_c^{k},w_c^{k},d_c^{k})$ are obtained as the mean coordinates of the valid voxels of $\overline{Y^{'}_{m}} (k)$. We then crop a 3D cuboid $\mathcal{C}_{m}^{k}$ of a fixed patch size $ps=(H_{ps},W_{ps},D_{ps})$ around the center. Organ-wise feature patches $\hat{F}^{k}_{m}$ are sampled by bounding cubes in feature space to ensure they include a reasonable ratio of foreground and background semantic information. The corresponding equations are formulated as:

\begin{equation}
\mathcal{C}^{k}_{m}=
\left\{(h,w,d)\ \middle|\
|h-h_c^{k}|\le r_h,\ 
|w-w_c^{k}|\le r_w,\ 
|d-d_c^{k}|\le r_d
\right\}.
\end{equation} 

\begin{equation}
\hat{F}^{k}_{m}
=
F_{m}\big[:,\ \mathcal{C}^{k}_{m}\big]
\in\mathbb{R}^{C\times H_{ps}\times W_{ps}\times D_{ps}}.
\end{equation}
where $(r_{h},r_{w},r_{d})$ are half of $ps$ calculated by $2r_{h}+1=H_{ps},2r_{w}+1=W_{ps},2r_{d}+1=D_{ps}$. 


In terms of patch extraction complexity, a dense pixel-wise contrastive loss operates at full resolution, whereas our patch sampling strategy operates only on the sampled patches, that is, $\mathcal{O}((HWD)^2)$ compared to $\mathcal{O}(K \cdot ps^3)$, where $K$ is the number of organ classes. Furthermore, $ps$ is a hyperparameter empirically set according to organ size, balancing local details and global context. %By leveraging class-aware embeddings, PGPS enforces extracted patches are spatially coherent, even when original anatomical structures are misaligned. 
PGPS decouples the dependency on precise anatomical correspondence and preserves rich features exposed to the network, preparing robust multi-organ cross-modality feature alignment. %The position embedding acts as a spatial anchor, guiding the sampling process to focus on semantically consistent regions, thereby enhancing the model's ability to learn modality-invariant representations.

\subsubsection{Pseudo-label Guided Contrastive Learning} 
Inspired by the PatchNCE~\cite{Park2020ContrastiveTranslation} that enforces local feature consistency by contrasting positive and negative image patch pairs, we propose PGCL to extend this concept to CMFA. %Instead of relying solely on spatial proximity, 
PGCL leverages PGPS to create $\hat{F}$ with $l$ set to the last layer of $E(\cdot,m)$, and we compute the feature patch contrastive (FPC) loss as:  %across modalities. Formally, for each class-aware patch $fp^{c}_{m1},fp^{c}_{m2}$ extracted from modality $m1,m2$ via C2CPS, 

\begin{equation}
    l_{fpc}=- \mathbb{E}_{m, m'\sim \{m_{1}, m_{2}\}}\sum_{k=1}^{K}\log \left[  \frac{ \exp(\phi(\hat{F}^{k}_{m}, \hat{F}^{k}_{m'})/\tau) } { \exp(\phi(\hat{F}^{k}_{m}, \hat{F}^{k}_{m'})/\tau)+\sum_{j=1,j\neq k}^{K}\exp(\phi(\hat{F}^{k}_{m}, \hat{F}^{j}_{m'})/\tau)}  \right],
\label{eq1}
\end{equation}
where $\phi(\cdot,\cdot)$ denotes cosine similarity, $\tau$ is a temperature hyperparameter, and $m'$ denotes the modality of the feature patch paired with the anchor patch from modality $m$. Critically, positive pairs are feature patches sharing the same semantic class, while negative pairs come from different classes irrespective of modality.

PGCL enforces that same-class features are close, whether from different or same modalities, while different classes are separated at the organ-wise semantic feature patch level. 


\begin{figure}[t!]
\centering
\includegraphics[width=0.8\textwidth]{figures/PGPS.pdf}
\caption{Illustration of PGPS. Pseudo labels are downsampled into the latent feature space for organ-wise feature embeddings $\hat{F}^{k}_{m}$ generation. The proposed method minimizes the distance (``PULL'') between positive pairs (same $k$), effectively facilitating cross-modality feature alignment.} \label{fig:PGCL}
\end{figure}





\subsection{Supervised Consistency Learning}
In the SCL module, $l_{sup}$ calculates the focal dice \cite{Jadon2020ASegmentation,Lin2017FocalDetection} loss $l_{sup}=l_{dice}+l_{focal}$ between the predictions $Y^{'}$ and ground truth $Y$ from annotated classes, while $l_{supUL}$ calculates the focal dice between $Y^{'}$ and initial pseudo labels $Pseudo$ for organ regions without manual labels. 
To promote agreement between the image-scale features, contrastive structural consistency loss $l_{csc}$ enforces consistency between $Y^{'}_{m_{1}}(k)$ and $ Y^{'}_{m_{2}}(k)$ by constructing positive and negative pairs based on shared or distinct anatomical structures \cite{vandenOordDeepMindRepresentationCoding}.  $l_{csc}$ is defined as:
\begin{equation}
l_{csc}= -\mathbb{E}\sum_{k=1}^{K}\log\frac{\phi(Y^{'}_{m_{1}}(k), Y^{'}_{m_{2}}(k))}{\sum_{j=1}^{K}\phi(Y^{'}_{m_{1}}(k), Y^{'}_{m_{2}}(j))}.
\end{equation}
Overall, the total loss of our framework is:
\begin{equation}
l_{total}= \lambda_{sup}l_{sup} + \lambda_{supUL}l_{supUL} + \lambda_{csc}l_{csc} + \lambda_{fpc}l_{fpc},
\end{equation}
where the $\lambda_{sup}, \lambda_{supUL},\lambda_{csc}, \lambda_{fpc}$ are trade-off parameters scaling the importance of each loss component.



\section{Experiments}
\subsection{Datasets}
% \subsubsection{TAO Dataset} 
\noindent\textbf{TAO Dataset.} The in-house TAO dataset comprises 3D orbital MRI scans from 100 subjects, acquired through two complementary protocols: pre-contrast T1-weighted (T1) and post-contrast T1-weighted (T1c) imaging. The dataset contains full annotations for 20 cases, including extraocular muscle (EOM) groups, optic nerves (ON), and lacrimal glands (LG), partitioned into validation (20\%) and test sets (80\%). The remaining 80 training cases are partially annotated with EOM and ON on T1, and LG on T1c. A standardized preprocessing pipeline is applied, consisting of image registration, cropping inputs to $96\times96\times32$ patches centered in regions with dense anatomical orbital structures, and normalizing the intensity distribution to the range $[0,1]$. Subsequently, random 3D rotations and axis-aligned flips are applied as data augmentation. 


\vspace{1mm}
\noindent\textbf{MS-CMRSeg Dataset.} The publicly available MS-CMRSeg dataset~\cite{Zhuang2016MultivariateMRI,Zhuang2019MultivariateImages} encompasses 45 paired cardiac imaging data. The protocol includes balanced steady-state free precession (bSSFP) cine sequences, serving as the source modality, and late gadolinium enhancement (LGE) sequences as the target modality. We only employ expert-validated annotations delineating three cardiac structures from bSSFP: the left ventricular cavity (LV), right ventricular cavity (RV), and left ventricular myocardium (Myo) across all cases. The dataset is randomly partitioned into 35 LGE/bSSFP pairs for model training and 10 pairs for testing. Similar data processing steps as TAO are applied, and inputs are cropped into $480\times 480\times 16$. 


\subsection{Implementation Details}
 
All experiments were conducted using Python 3.10 and PyTorch 1.13.1 on an NVIDIA A100 GPU with CUDA 11.7. We adopted SwinUNETR~\cite{Hatamizadeh2022SwinImages} as the backbone architecture for TAO and UNET~\cite{Siddique2020U-NetApplications,TarvainenMeanResults} for MS-CMRSeg. SwinUNETR was configured with the feature size of $fs=48$, encoder layer depth of $L=4$, and hidden size of $K=768$. To deal with a small-scale dataset, UNET comprises five resolution levels with feature widths $[16, 64, 128, 256, 512]$. In the encoder, each level begins with a residual unit for downsampling with strides of $[1, 2, 2, 2, 1]$. After iterative tuning of the hyperparameters from empirically initiated sets, we identified the optimal training protocol utilizing an Adam optimizer with a learning rate of $1\times10^{-4}$ and a weight decay of $1\times10^{-5}$, $\lambda_{sup}=1$, and an overlap ratio of 0.5 for sliding window inference. 

We use a ramp-up schedule for the scaling factors, $\lambda_{supUL}(t)=1-\min(1,\max(0, \frac{t-10}{T}))$, $\lambda_{csc}(t)=\min(1,\max(0, \frac{t-10}{T}))$, and $\lambda_{fpc}(t)=0.5\min(1,\max(0, \frac{t-10}{T}))$, for more stable training, where $T=20$. 
Patch size is determined in a data-driven manner by statistically estimating organ size. Concretely, for each class, we compute the bounding-box extents of its ground-truth masks after downsampling to the latent feature resolution and summarize the extents across the training set using median and percentiles. We then select $ps=(7,7,7)$ for TAO and $ps=(25, 25, 9)$ for MS-CMRSeg to allow the masks $\mathcal{C}_{m}^{k}$ to predominantly cover boundary regions for each organ while limiting background inclusion. 


\begin{figure}[t!]
\centering
\includegraphics[width=0.9\textwidth]{figures/visual_tao.pdf}
\caption{The qualitative comparisons of segmentation on the TAO dataset. Each row represents the same case, where the top two rows come from T1 and the bottom two rows come from T1c. Colors indicate different anatomical structures. The compared methods show varying degrees of imprecision on EOMs and ON in T1c and LG in T1 (marked by yellow arrows), particularly for small and low-contrast regions. In contrast, ours more consistently matches the ground-truth extent and location across rows, with cleaner boundaries and fewer false positives.
} \label{fig:visual}
\end{figure}

\subsection{Effectiveness of Our Methods}

\begin{table}[t!]
%\begin{tabular}{|l|c c c c c c c >{\columncolor[gray]{0.8}}c|l|}
\centering
\begin{tabular}{c|cccccccc|c} %{m{2.0cm}|ccccccc>
\hline
Methods & LR  & IR  & MR & SR & SOM & ON &IOM &  LG & \textbf{Avg.} \\
\hline\hline
\multicolumn{10}{c}{T1 Dice[$\%$] $\uparrow$}\\
\hline
REG & --& --& --& --& --& --& --&61.31 & 61.31 \\
\hline

CIN-seg& 78.37 & 88.54 & 88.59 & 78.69 & 80.51 & 83.20 & 67.31 & 65.75 & 79.08 \\
DCLGAN& --& --& --& --& --& --& --&55.40 & 55.40
 \\ 
FPL+&55.12 & 71.64 & 70.13 & 66.07 & 56.95 & 49.69 & 16.20 & 60.50 & 55.79 \\
Ours&\textbf{82.60} & \textbf{90.31} & \textbf{89.74} & \textbf{80.76} & \textbf{82.89} & \textbf{85.57} & \textbf{72.08} & \textbf{68.24} & \textbf{81.52}\\
\hline\hline
\multicolumn{10}{c}{T1 HD95[$mm$] $\downarrow$}\\
\hline
REG & --& --& --& --& --& --&--&9.51&9.51\\
\hline
CIN-seg&6.13 & \textbf{3.41} & 3.48 & 8.70 & 9.54 & 7.16 & 6.47 & \textbf{7.47} & 6.54 \\
DCLGAN & --& --& --& --& --& --& --& 9.80 & 9.80\\ 
FPL+&8.31 & 10.30 & 10.20 & 9.06 & 8.12 & 8.31 & 12.25 & 10.05 & 9.57\\
Ours & \textbf{5.53} & 3.83 & \textbf{3.11} & \textbf{4.59} & \textbf{3.58} & \textbf{4.22} & \textbf{5.25} & 7.71 & \textbf{4.58}\\
\hline
\multicolumn{10}{c}{}\\
\hline
\multicolumn{10}{c}{T1c Dice[$\%$] $\uparrow$}\\
\hline
REG &58.26 & 73.68 & 69.54 & 57.94 & 56.98 & 59.36 & 44.15 & -- & 59.98 \\
\hline
CIN-seg&70.50 & 78.21 & 79.40 & 68.38 & 70.06 & 68.68 & 51.06 & 77.12 & 70.43 \\
DCLGAN& 47.80 & 63.48 & 70.18 & 59.03 & 65.65 & 49.23 & 35.07 & -- & 55.78 \\ 
FPL+& 50.16 & 69.78 & 77.94 & 67.15 & 73.48 & 58.88 & 38.45 & 71.51 & 63.42\\
%\hline
Ours & \textbf{80.67} & \textbf{81.90} & \textbf{85.63} & \textbf{74.97} & \textbf{81.84} & \textbf{74.48} & \textbf{62.79} & \textbf{78.32} & \textbf{77.39}\\

\hline\hline
\multicolumn{10}{c}{T1c HD95[$mm$] $\downarrow$}\\
\hline
REG  & 6.83 & 5.05 & 7.02 & 5.38 & 7.00 & 5.23 & 9.25 & -- & 6.53\\
\hline
CIN-seg&6.49 & 5.64 & 5.29 & 8.23 & 10.47 & 4.44 & 10.29 & 8.42 & 7.41  \\
DCLGAN & 20.13 & 13.32 & 9.93 & 10.50 & 10.03 & 20.94 & 14.12 & -- & 14.14\\ 
FPL+& \textbf{5.71} & 4.93 & 5.20 & 5.57 &3.48 & 5.51 & 10.53 & 9.27& 6.28\\
%\hline
Ours & 5.98 & \textbf{4.68} & \textbf{4.17} & \textbf{5.05} & \textbf{2.88} & \textbf{4.34} & \textbf{8.97} & \textbf{7.71} & \textbf{5.47}\\
\hline
\end{tabular}
\caption{Comparison of different methods for the segmentation of TAO-affected organs on the T1 and T1c modality.  }
\label{tab_t1}
\end{table}

We evaluated the effectiveness of our proposed method by measuring multi-organ segmentation performance in terms of the Dice similarity coefficient (Dice) and the 95th-percentile Hausdorff distance (HD95). 

Table~\ref{tab_t1} presents quantitative comparisons of our proposed method against related unsupervised domain adaptation methods on the TAO dataset, adapting from T1 to T1c. REG compares the source domain labels transformed by a rigid matrix calculated from image registration across modalities and the target domain label. REG depicts the difficulty of resolving the domain gap between inputs. CIN-seg~\cite{Bastico2023ATransformers} utilizes registered pseudo labels as supervision for cross-modal fusion, suffering an averaged 9.6\% Dice decline over organs missing annotations compared to GT-supervised organs. Our method outperforms other methods by a large margin, especially in the missing label case, both in Dice and HD95.
DCLGAN~\cite{Han2021DualTranslation} leverages unsupervised contrastive learning for cross-modal image translation, enabling supervised learning on synthetic target data and source domain labels. However, anatomical distortions introduced during translation degrade performance (e.g., 28.9\% Dice drop for LG in T1), as synthetic images often misalign with ground truth structures. FPL+~\cite{Wu2024FPL+:Segmentation} employs dual-domain pseudo-label generation with noise filtering. While effective in ideal scenarios, its reliance on heuristic thresholds amplifies error propagation under severe annotation sparsity. This framework performs better in T1c than T1 modality because it relies on synthetic T1c images with EOM and ON ground truth, and vice versa.  


To intuitively verify the impact of the PGCL in CMFA module on feature representation learning, we visualized the distribution of feature embeddings $\hat{F}^{k}_{m}$ using t-SNE. Fig.~\ref{fig:tSNE}A--B compares the feature spaces before and after applying PGCL. The feature distribution without PGCL is loose, with low intra-class compactness. Specifically, IOM from T1 and ON from T1c show vague boundaries and potential overlap in the central region. This lack of distinct separability explains the baseline model's struggle with class discrepancy. In contrast, Fig.~\ref{fig:tSNE}B demonstrates that introducing PGCL significantly regularizes the latent space. Enhanced semantic separability validates that our method effectively mitigates the domain shift problem by aligning feature distributions. 

\begin{figure}[t!]
\centering
\floatconts
  {fig:visual_mscmr}
  {\caption{The qualitative results of segmentation on the MS-CMRSeg dataset. Different rows are from different cases. }}
  {\includegraphics[width=0.92\linewidth]{figures/visual_mscmr5.jpg}}
\end{figure}

\begin{table}[h]
%\begin{tabular}{|l|c c c c c c c >{\columncolor[gray]{0.8}}c|l|}
\centering
\begin{tabular}{c|cccc|cccc}
\hline
\multirow{2}{*}{Methods } & \multicolumn{4}{c|}{Dice[$\%$] $\uparrow$} & \multicolumn{4}{c}{HD95[$mm$] $\downarrow$}\\
\cline{2-9}
 & Myo &LV & RV  & Avg & Myo &LV & RV  & Avg \\
\hline
REG & 56.99 & 79.74 & 73.27 & 70.00 & 10.22 & 9.45 & 27.73 & 15.80 \\
\hline
CIN-seg & 70.71 & 86.77 & \textbf{78.71} & 78.73 & 10.22 & 9.45 & 27.73 & 15.80 \\
FPL+ & 68.97 & 86.32 & 73.52 & 76.27 & 45.36 & 10.60 & 49.27 & 35.08 \\
Ours & \textbf{74.97} & \textbf{87.55} & 77.09 & \textbf{79.87} & \textbf{6.47} & \textbf{4.89} & \textbf{9.10} & \textbf{6.82} \\
\hline
\end{tabular}
\caption{Comparison of different methods for the segmentation on MS-CMRSeg adapting bSSFP to LGE.}
\label{tab:mscmr}
\end{table}


\subsection{Ablation Study}



\subsubsection{Effectiveness of Key Components }%\subsubsection{Contribution of Each Component} 
We conducted a systematic ablation study on the TAO dataset, focusing on the bidirectional domain adaptation between T1 and T1c modalities, to evaluate the contributions of the proposed core components, CMFA and SCL. As summarized in Tab.~\ref{tab:ablation}, these components are denoted as $l_{fpc}$ and $l_{csc}$, respectively. Our analysis began with a baseline model devoid of both components, trained solely on cross-modal inputs with the standard supervised focal Dice loss, achieving a modest average Dice of 69.05\% and a relatively high average HD95 of 9.57 mm. A closer inspection reveals significant performance bottlenecks in segmenting small, irregular structures and low-contrast boundaries of IOM, showing a particularly poor Dice score of 47.38\%. This underscores the challenge of learning robust representations for fine-grained orbital structures under domain shift. The integration of both components achieves superior performance, validating the synergistic effect of our dual-consistency framework. The full model reaches an average Dice of 76.38\% (a 7.33\% improvement over the baseline) and drastically reduces the HD95 to 5.47 mm. Most remarkably, the segmentation of the challenging IOM and SOM improves by over 15\% and 11\%, respectively, compared to the baseline. These results demonstrate that combining global alignment $l_{csc}$ with local feature refinement $l_{fpc}$ effectively mitigates domain discrepancies, ensuring anatomically plausible segmentation even for complex orbital structures. 

\begin{figure}[t!]
\centering
\floatconts
  {fig:tSNE}
  {\caption{(A--B) t-SNE visualizations of feature embeddings from the TAO dataset without and with PGCL applied, respectively. The markers $\circ$ and $\triangle$ indicate features from the T1 and T1c modalities, respectively, with the corresponding colors of organ groups marked in the legend. (C--D) Dice and HD95 comparison between training with ``Warm-up'' and ``Cold-start'' using REG as $Pseudo$. (E--F) The same comparison using shifted REG (S-REG). }}
  {\includegraphics[width=\linewidth]{figures/tsne-pl.pdf}}
\end{figure}



\begin{table}[h!]
%\setlength{\tabcolsep}{3pt}
\centering
\begin{tabular}{cc|ccccccc|c|c}
\hline
 \multirow{2}{*}{$l_{fpc}$}& \multirow{2}{*}{$l_{csc}$} &\multicolumn{7}{c|}{$\textbf{T1} \rightarrow \textbf{T1c}$}  &$\textbf{T1c} \rightarrow \textbf{T1}$ &\multirow{2}{*}{Avg}\\

\cline{3-10}
& & LR  & IR  & MR & SR & SOM & ON &IOM & LG &  \\
\hline\hline
\multicolumn{10}{c}{Dice[$\%$] $\uparrow$}\\
\hline
& & 72.15 & 76.82 &81.47 & 69.61 & 70.46 & 68.72 & 47.38 & 65.75 & 69.05 \\ 
 &$\checkmark$ &79.55 & 81.89 & 81.75 & 72.41 & 77.27 & 69.28 & 44.22 & 67.15 & 71.78\\
$\checkmark$ & &80.20 & 80.32 & 84.10 & 74.88 & 76.74 & 70.84 & 53.87 & 66.27 & 73.40\\
$\checkmark$ & $\checkmark$ &\textbf{80.67} & \textbf{81.90} & \textbf{85.64} & \textbf{74.97} & \textbf{81.84} & \textbf{74.48} & \textbf{62.79} & \textbf{68.24} & \textbf{76.38}\\
\hline\hline
\multicolumn{10}{c}{HD95[$mm$] $\downarrow$}\\
\hline

&   &8.31 & 10.30 & 10.20 & 9.06 & 8.12 & 8.31 & 12.25 & 10.05 & 9.57 \\ 
&  $\checkmark$&7.79& 7.09& 7.20& 5.90& 7.10& 5.29& 9.68& 7.69 & 7.21\\
$\checkmark$ & &7.42 & 8.27 & 5.45 & 5.96 & 5.09 & 5.14 & 9.13 &9.87 & 7.04 \\

 $\checkmark$ &  $\checkmark$ & \textbf{5.98} & \textbf{4.68} & \textbf{4.17} & \textbf{5.05} & \textbf{2.88} & \textbf{4.34} & \textbf{8.97} & \textbf{7.71} & \textbf{5.47}\\
\hline

\end{tabular}
\caption{Ablation results of Dice and HD95 on unlabeled organs (LR, IR, MR, SR, SOM, ON, IOM in T1c, and LG in T1) from TAO dataset.}
\label{tab:ablation}
\end{table}


\begin{figure}[t!]
\centering
\floatconts
  {fig:tao_ps}
  {\caption{Organ-wise patch-size statistics and sensitivity analysis of PGPS. (A--C) Boxplots of the organ size extents of $H_{ps}, W_{ps}, D_{ps}$ respectively, measured in the latent feature space for each anatomical structure. (D--E) Robustness analysis of Dice score and HD95 (mm) with respect to the \emph{isotropic} patch size \(ps\in\{5,7,9,11\}\) (applied at the latent feature resolution) for PGPS. The brackets indicate pairwise significant differences, where \textit{ns}: not significant; \textit{***} \(p<0.001\); \textit{****} \(p<0.0001\).}}
  {\includegraphics[width=0.92\linewidth]{figures/ps_tao.pdf}}
\end{figure}

\subsubsection{Influence of Pseudo-label Quality}
A potential concern is that the model relies heavily on the quality of the initial pseudo-labels to stabilize PGPS and PGCL, especially in the first several epochs of training. To examine this, we deliberately corrupt the initialization by shifting the registered labels along a random axis (denoted as S-Reg), creating extreme spatial misalignment. As shown in Table~\ref{tab:shift_pl}, the shifted label transfer itself yields poor segmentation quality (S-Reg: Avg Dice $30.73\%$, Avg HD95 $8.35\ \mathrm{mm}$), confirming that the initialization is substantially degraded. Despite this, our method converges to strong performance (Avg Dice $74.04\%$, Avg HD95 $7.24\ \mathrm{mm}$), substantially outperforming CIN-seg (Avg Dice $57.82\%$, Avg HD95 $12.67\ \mathrm{mm}$). This further demonstrates that our model robustly reduces the dependency on precise anatomical correspondence.

We further compare the warm-up strategy with a cold start under two initializations: registered labels (REG) and shifted registered labels (S-REG). As shown in Fig.~\ref{fig:tSNE}C--F, warm-up yields larger improvements in Dice and HD95 under S-REG initialization than under REG initialization, indicating that delaying and ramping up contrastive learning improves robustness when the initial pseudo-labels are not reliable.

\begin{table}[h!]
%\setlength{\tabcolsep}{3pt}
\centering
\begin{tabular}{c|ccccccc|c|c}
\hline
\multirow{2}{*}{Methods} &\multicolumn{7}{c|}{$\textbf{T1} \rightarrow \textbf{T1c}$}  &$\textbf{T1c} \rightarrow \textbf{T1}$ &\multirow{2}{*}{Avg}\\
\cline{2-9}
& LR  & IR  & MR & SR & SOM & ON &IOM & LG &  \\
\hline\hline
\multicolumn{10}{c}{Dice[$\%$] $\uparrow$}\\
\hline
S-Reg & 29.21 & 44.52 & 28.26 & 32.14 & 17.34 & 26.74 & 26.26 & 41.39 & 30.73 \\
CIN-seg & 71.47 & 68.10 & 64.63 & 51.64 & 52.38 & 61.84 & 40.81 & 51.72 & 57.82 \\
Ours & \textbf{78.54} & \textbf{79.14} & \textbf{85.50} & \textbf{72.85} & \textbf{79.72} & \textbf{71.41} & \textbf{63.19} & \textbf{61.98} & \textbf{74.04} \\
\hline\hline
\multicolumn{10}{c}{HD95[mm] $\downarrow$}\\
\hline
S-Reg & 8.27 & 7.54 & \textbf{5.77} & 7.87 & 7.04 & \textbf{5.58} & 8.85 & 15.87 & 8.35 \\
CIN-seg & 8.81 & 8.18 & 8.26 & 9.34 & 9.69 & 6.65 & 12.47 & 37.98 & 12.67 \\
Ours & \textbf{7.37} & \textbf{6.50} & 6.77 & \textbf{7.22} & \textbf{6.43} & 7.19 & \textbf{8.84} & \textbf{7.59} & \textbf{7.24} \\
\hline

\end{tabular}
\caption{Robustness comparison with shifted registered label (S-Reg) initialization, i.e., under extreme spatial misalignment. Despite the substantial degradation in initialization quality, our method achieves markedly higher Dice and lower HD95 on average.}
\label{tab:shift_pl}
\end{table}

\subsubsection{Hyperparameter Analysis}

We further analyze the sensitivity to $ps$ in PGPS. Fig.~\ref{fig:tao_ps}A--C show that different anatomical structures occupy significantly different extents in the latent feature space. Fig.~\ref{fig:tao_ps}D--E show the robustness of the Dice score and HD95 over varied \emph{isotropic} patch sizes $ps\in\{5,7,9,11\}$ at the latent feature level, which correspond to image-space patch sizes $I_{ps}\in\{[16, 24),[24, 32),[32, 40),[40, 48)\}$. Contrastive regularization in feature space is inherently tolerant of a fixed patch size for all organ regions. In terms of performance, Dice remains statistically stable across the tested values, indicating that AICL is generally robust to moderate patch-size choices. However, HD95 degrades for larger patches ($ps\ge 9$), suggesting that overly large cubes include excessive background and weaken the semantic purity of the sampled embeddings, which mainly harms boundary accuracy. A fixed $ps$ for all organs is primarily a practical constraint of our current PGPS strategy; an organ-adaptive $ps$ with adaptive pooling or multi-scale patch sampling is a promising direction for improvement.






\section{Conclusion}

We propose a simple yet effective AICL method that combines SCL with CMFA normalized by CIN to enhance the alignment of cross-modality fine-grained semantic features. This strategy is advantageous for accommodating multi-modal data by simply feeding interleaved inputs into the same batch. Our model performs comparably to or better than prevailing models in multi-organ segmentation from partially labeled multi-modal MRI.
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported in part by the General Research Fund from the Research Grants Council of the Hong Kong Special Administrative Region, China, under Grant 14200721 and Grant 14100223, in part by the Research Grants Council of the Hong Kong Special Administrative Region, China, under Grant T45-401/22-N, and in part by the IdeaBooster Fund from The Chinese University of Hong Kong University Grants Committee, under Grant IDBF25MED14.}


\bibliography{midl26_120}



\end{document}
