\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{booktabs}
\usepackage{multirow}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[NeuroLangSeg]{NeuroLangSeg: Language-Guided Subcortical Segmentation with Pseudo-Supervision and Anatomical--Linguistic Validation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Ruiying Liu\midljointauthortext{Contributed equally}\nametag{$^{1}$}}\orcid{0000-0001-7851-4253} \Email{rliu60@emory.edu}\\
\addr $^{1}$ Department of Biomedical Informatics, Emory University \AND
\Name{Jialu Liu\midlotherjointauthor\nametag{$^{1}$}} \Email{liujialu2001@gmail.com}\\
\Name{Xuzhe Zhang\nametag{$^{2}$}} \Email{xuzhe.z@columbia.edu}\\
\addr $^{2}$ Department of Biomedical Engineering, Columbia University \AND
\Name{Chuang Huang\nametag{$^{3}$}} \Email{chuan.huang@emory.edu}\\
\addr $^{3}$ Department of Radiology and Imaging Sciences, Emory University \AND
\Name{Yun Wang\nametag{$^{1,4}$}} 
\Email{yun.wang2@emory.edu}\\
\addr $^{4}$ Department of Computer Science, Emory University
}

\begin{document}

\maketitle

\begin{abstract}
Recent advances in vision--language models and LLMs have introduced contextual anatomical reasoning into brain MRI segmentation. However, the field still suffers from a fundamental limitation: the absence of a unified anatomical definition of the structures being segmented. Existing datasets rely on labels produced by heterogeneous manual workflows, often lacking explicit anatomical criteria or consistent annotation standards. As a result, models are trained and evaluated within isolated labeling systems, limiting cross-model comparison and valid anatomical measurements. To address these challenges, we introduce \textbf{NeuroLangSeg}, a language-guided framework that enforces a consistent anatomical protocol for subcortical segmentation. A key component of the framework is an anatomical--linguistic evaluator that acts as a training discriminator, encouraging the model to produce anatomically plausible outputs by assessing shape characteristics, protocol-defined spatial relationships, and age- and sex-adjusted volumetric norms. Building upon this constraint, NeuroLangSeg integrates a pretrained image encoder with protocol-aligned anatomical prompts and a masked pseudo-labeling strategy, enabling data-efficient and interpretable learning under limited supervision. Together, these components yield anatomically consistent segmentations and support subject-level reporting grounded in a unified anatomical standard. Evaluation across diverse MRI datasets—including comparisons with state-of-the-art models—shows that NeuroLangSeg achieves +4.1 DSC / +8.0 NSD in in-site settings and +3.6 DSC / +14.5 NSD in cross-site generalization over the average baseline, enabled by its LLM--visual integration, while delivering anatomically verifiable predictions suitable for both research and clinical use. GitHub: \url{https://github.com/jlliu2001/SAT_MPL}
\end{abstract}

\begin{keywords}
Anatomical Protocol, Language-Driven Segmentation, Anatomical--Linguistic Evaluation, Brain MRI
\end{keywords}

\section{Introduction}

Accurate segmentation of subcortical brain structures is fundamental to quantitative analysis and clinical assessment. Delineating regions such as the hippocampus, amygdala, and thalamus enables detailed investigations of brain development, aging, and neuropathology, supporting downstream analyses of structure--function relationships and population-level biomarkers~\cite{Baribeau2019-aq, Cruz2023-eo, De_Jong2008-ol}. Although manual delineation remains the gold standard for defining anatomical boundaries, it is time-consuming, labor-intensive, and dependent on expert knowledge.

While traditional neuroimaging pipelines such as FreeSurfer~\cite{Fischl2012-lo}, BrainSuite~\cite{Kim2024-hl}, ANTs~\cite{Avants2011-xv}, and FSL~\cite{Jenkinson2012-wx} have been widely used as automated frameworks for structural analysis, their multi-stage registration and optimization procedures are computationally intensive and difficult to scale to large datasets or clinical workflows. In contrast, recent advances in deep learning have substantially improved medical image segmentation, allowing models to learn rich representations directly from MRI data and achieve high accuracy across diverse anatomical and clinical tasks~\cite{BILLOT2023102789, GUHAROY2019713, HENSCHEL2020117012, HENSCHEL2022118933, Estrada_2023, Zhang_2024_CVPR}. However, most existing approaches remain task-specific—trained for a single structure, cohort, or set of labeling rules—and demonstrate lower performance when deployed on heterogeneous datasets, limiting their generalization and clinical applicability.


To enhance flexibility and interpretability, large language models (LLMs) and vision-language models (VLMs) have been developed for medical image segmentation by coupling textual descriptions with visual representations~\cite{Ma2024-tr,10887845,su2025meshprompted,zhao2025largevocabularysegmentationmedicalimages}. These multimodal models incorporate semantic context and enable adaptable segmentation across domains.  Prompt-based VLMs extend this capability to open-vocabulary medical segmentation across organs and modalities~\cite{Ma2024-tr,zhao2025largevocabularysegmentationmedicalimages}, while anatomical priors (e.g., shape templates and mesh constraints) further enhance spatial consistency~\cite{su2025meshprompted}. In neuroimaging, emerging protocol-guided approaches encode hierarchical anatomical relationships, such as topology-based text generation to improve brain segmentation~\cite{10887845}. 

Despite these advances, subcortical segmentation still lacks clinically unified anatomical protocols and evaluation frameworks. First, current visual backbones are constrained by their training labels. Most large-scale datasets rely on FreeSurfer-derived masks because they are readily available~\cite{FISCHL2002341, Tae2008-po}. Some models such as FastSurfer~\cite{HENSCHEL2020117012, HENSCHEL2022118933, Estrada_2023}, SynthSeg~\cite{BILLOT2023102789}, and QuickNAT~\cite{GUHAROY2019713} largely reproduce or refine these outputs. However, FreeSurfer boundaries often diverge from expert manual labels~\cite{Morey2009-ns, SCHOEMAKER20161, Lerch2017-mj}, introducing systematic structural bias into both training and evaluation. Second, even when manual segmentations are available, there is still no clinically unified protocol: different experts and software tools apply different delineation rules—for example, outlining the amygdala or hippocampus with different boundaries—resulting in inconsistent ground-truth masks across datasets~\cite{Geuze2005-ph, Yushkevich2015-jj}. Although recent vision--language models incorporate textual cues, they do not resolve this underlying protocol mismatch. Finally, current segmentation frameworks lack a standardized evaluation pipeline to assess anatomical accuracy from a clinical perspective. Conventional metrics based on overlap, such as the Dice coefficient, quantify geometric similarity but fail to capture the morphological integrity, topological consistency, or biological validity of the predicted structures~\cite{BABALOLA20091435}.


To address these challenges, we propose \textbf{NeuroLangSeg}, a language-guided subcortical segmentation framework with pseudo-supervision and anatomical--linguistic validation based on a consistent anatomical protocol from Neuromorphometrics, Inc.~\cite{Landman2012MultiAtlas}. Our main contributions are: 1) Contextual anatomical prompts are encoded and fused with visual features, enabling flexible, prompt-conditioned segmentation across structures and cohorts. 2) A unified visual backbone combines large-scale 3D masked autoencoder pretraining, label-efficient pseudo-label refinement, and global--local stabilization to improve robustness across scanners, ages, and modalities. 3) Clinical anatomical protocols encoded by an LLM guide morphological and topological discriminators. During inference, the evaluator integrates morphological, topological, and BrainChart-normalized volumetric metrics (adjusted for age and sex) to assess anatomical consistency.

Together, these components make NeuroLangSeg a clinically aligned and explainable framework for subcortical segmentation, providing both high accuracy and anatomical validation. To our knowledge, it is the first model to unify language-guided learning, semi-supervised segmentation, and anatomical--linguistic evaluation. Experiments on healthy and clinical cohorts show strong generalization and anatomically consistent performance across diverse populations.


\section{Method}
We address subcortical segmentation across heterogeneous MRI cohorts that differ in annotation policies and lack a unified, protocol-driven evaluation standard (Figure~\ref{fig1}). Each sample contains a 3D MRI volume $ X \in \mathbb{R}^{H \times W \times D} $ and its corresponding segmentation map $Y \in \mathbb{R}^{H \times W \times D}$. A textual prompt $T$ describing the target anatomical structure is provided and converted into a semantic embedding $g_{\text{text}}(T)$, which is combined with visual features extracted from the MAE encoder to guide structure-specific prediction. The fused representation is passed to a segmentation decoder to produce structure-specific masks:
\begin{equation}\label{eq1}
\hat{Y}
= f(X, T)
= \Psi\!\left(
    h_{\text{vis}}\!\left(g_{\text{vis}}(X)\right),\;
    h_{\text{query}}\!\left(g_{\text{vis}}(X),\, g_{\text{text}}(T)\right)
\right),
\end{equation}
where $ g_{\text{vis}} $ is the visual encoder, $ h_{\text{vis}} $ is the visual decoder, $ g_{\text{text}} $ is the text encoder, and $ h_{\text{query}} $ is the query decoder. The segmentation head $ \Psi $ performs dot-product matching and projection to generate the final mask $ \hat{Y} $. Predictions are evaluated using morphological, topological, and volumetric metrics to assess anatomical consistency.

\begin{figure}[htbp]
\centering
  \includegraphics[width=1\linewidth]{fig1.png}
  \caption{Overview of NeuroLangSeg Segmentation and Evaluation}
  \label{fig1}
\end{figure}

\subsection{Language-Guided Prompt Encoding}


\textbf{Text Encoder:} To encode anatomical concepts and their associated positional knowledge, we adopt a BERT-based text encoder $g_{\text{text}}$ initialized from a biomedical language model~\cite{zhao2025largevocabularysegmentationmedicalimages} and further adapted through supervised fine-tuning. The encoder maps heterogeneous textual descriptions, including structure names, morphological definitions, and pairwise spatial relations, into a unified embedding space. This allows the resulting text embedding $g_{\text{text}}(T)$ to capture structure-specific location cues and facilitates grounding of anatomical terms within the volumetric imaging space.


\noindent\textbf{Query Decoder:} To adapt the text-derived representation to each MRI volume, we employ a Transformer-based query decoder $h_{\text{query}}$ that fuses textual embeddings with multi-scale visual features. The text embedding acts as the query, and image features serve as keys and values. A stack of cross-attention decoder blocks refines the query by attending to anatomy-relevant visual cues, enabling inference of subject-specific variations. The query $h_{\text{query}}(g_{\text{vis}}(X),\, g_{\text{text}}(T))$ is matched with voxel-level features $h_{\text{vis}}(g_{\text{vis}}(X))$ in the segmentation head, ensuring alignment between textual priors and the spatial context of the MRI scans.

\subsection{Unified Visual Backbone for Label-Efficient Segmentation}

We construct a unified visual backbone by combining 3D MAE pretraining, masked pseudo-label refinement, and global--local stabilization. This backbone provides a strong initialization for subsequent vision--language fine-tuning.

\noindent\textbf{MAE pretraining.}
A 3D Masked Autoencoder (MAE)~\cite{he2021maskedautoencodersscalablevision} is first trained on large-scale MRI volumes to learn modality- and site-invariant features. Local patches and a downsampled global view are randomly masked and reconstructed using an MSE loss, yielding a pretrained visual encoder $ g_{\text{vis}} $.

\noindent\textbf{Masked Pseudo-Labeling (MPL).}
To enable label-efficient domain adaptation, we adopt a 3D MPL teacher--student framework~\cite{grill2020bootstrap,DBLP:journals/corr/TarvainenV17}. We pair the pretrained MAE encoder $g_{\text{vis}}$ with a segmentation decoder $h_{\text{vis}}$ to build the segmentation network $f_{\text{vis}}=h_{\text{vis}}\circ g_{\text{vis}}$. Given an input
image $x_s$ and label $y_s$ from the source domain, the teacher model $f_{\theta}$ provides pseudo-labels for the unlabeled target image $x_t$, and the student model $f_{\phi}$ learns from the masked source image $x_s^{M}$ and the masked target image $x_t^{M}$ by minimizing the following loss with weight $\beta$:
\begin{equation}
\mathcal{L}_{\text{MPL}}
= \mathcal{L}_{\text{Seg}}\!\left(f_{\phi}(x_t^{M}),\, f_{\theta}(x_t)\right)
+ \beta\,\mathcal{L}_{\text{Seg}}\!\left(f_{\phi}(x_s^{M}),\,y_s\right).
\end{equation}
Here, $\mathcal{L}_{\text{Seg}}$ is a compound segmentation loss that consists of cross-entropy and Dice loss~\cite{Zhang_2024_CVPR}.

\noindent\textbf{Global--Local Collaboration (GLC).}
To stabilize pseudo-labels under domain shift, the GLC module~\cite{Zhang_2024_CVPR} fuses high-resolution local patches with global context extracted from the MAE encoder and regularizes their consistency. The full GLC formulation is provided in Appendix~A. The visual backbone is pretrained with: 
\begin{equation}
\mathcal{L}_{\text{vis}}
=
\mathcal{L}_{\text{FSS}}
+ \mathcal{L}_{\text{MPL}}
+ \mathcal{L}_{\text{GLC}},
\end{equation}
where 
$
\mathcal{L}_{\text{FSS}}
= \beta\,\mathcal{L}_{\text{Seg}}(f_{\phi}(x_s),y_s)
$
is the loss of regular fully-supervised segmentation in 
source data and $ \mathcal{L}_{\text{GLC}} $ contains the global--local consistency terms (Appendix~A). After these stages, the visual backbone is fine-tuned jointly with the language-guided module using \textbf{only the supervised segmentation loss} $ \mathcal{L}_{\text{FSS}} $.

\subsection{Anatomical--Linguistic Discriminator}
\subsubsection{Morphological Discriminator}
Different subcortical structures exhibit distinct morphological variations, which serve as crucial reference points during manual annotation. Considering the shape characteristics of brain regions, the shape encoder $\mathcal{F}_{\text{shape}}$ employs an SE(3)-equivariant convolutional neural network~\cite{billot2024se} to extract shape features invariant to rigid transformations, mapping 3D annotations into a compact shape embedding space. 


The shape encoder is pretrained using a denoising autoencoder framework, mapping noisy inputs to embeddings, which are reconstructed by a decoder comprising transposed 3D convolutions with instance normalization. The reconstruction loss combines MSE and soft Dice loss. During the training of NeuroLangSeg, the pre-trained shape encoder is used to constrain the morphological features. In each training step, both the prediction $\hat{Y}$ and the ground truth $Y$ are forwarded through the fixed $\mathcal{F}_{\text{shape}}$ to obtain their respective shape embeddings. The discrepancy between two embeddings is quantified using the MSE loss, which enforces the network to capture anatomically plausible shapes:
\begin{equation}
    \mathcal{L}_{\text{shape}}(\hat{Y},Y)=\left\|\mathcal{F}_{\text{shape}}\left(\hat{Y}\right) - \mathcal{F}_{\text{shape}}\left(Y\right) \right\|_2^{2}
\end{equation}

\subsubsection{Topological Discriminator}

In addition to shape characteristics, the spatial relationships among subcortical nuclei provide crucial cues for manual annotation. To extract these positional features, we used an LLM to parse natural-language descriptions in annotation protocols provided by Neuromorphometrics, Inc. We used the following prompt to extract anatomical rules into a JSON format: \textit{``Please extract the morphological features, relevant reference regions for manual annotation, and positional relationship descriptions... and convert them into a structured JSON description.''} The LLM output identified 37 key anatomical pairs (15 left, 15 right, and 7 cross-hemisphere; see Appendix Table~\ref{tab:K37_pairs}) and defined their relational types in a structured JSON format.
For example, from the sentence \textit{``the hippocampus is posterior and inferior to the amygdala,''} the LLM outputs the structured JSON: \textit{hippocampus-amygdala: \{relative\_position: [-1, -1, 0], adjacency\_ratio: 1, adjacency\_vector: [1, 1, 0]\}}.
The discrete direction vector encodes the posterior--inferior offset under a standardized anatomical coordinate system (anterior, superior, right as positive). The adjacency ratio and vector denote whether two structures share a boundary and the dominant direction from one centroid toward the shared interface.


While the LLM identifies which relationships matter, the quantitative features are formalized by computing the statistics from the training set's ground truths.
Each structure pair $(i,j)$ is thus represented by a 7D relational feature $\mathbf{r}_{ij} = [ \Delta\mathbf{c}_{ij}, A_{ij},\mathbf{d}_{ij} ]$,
where $\Delta\mathbf{c}_{ij}$ is the continuous relative position, $A_{ij}$ is the adjacency ratio, and $\mathbf{d}_{ij}$ is the adjacency-direction vector. To account for inter-subject variability in age and development, all relative position vectors are explicitly normalized based on the subject's total brain volume before being processed by the discriminator. For each subject, anatomical pairs form the relational matrix $\mathbf{R} \in \mathbb{R}^{K\times7}$, encoding the full anatomical topology. $K=37$ is the number of anatomical pairs.
The MLP-based location encoder $\mathcal{F}_{\text{loc}}$ is pretrained on the task of reconstructing the relational matrices $\mathbf{R}$ extracted from the annotation images of all subjects in the normal cohort.
In NeuroLangSeg training, this fixed $\mathcal{F}_{\text{loc}}$ enforces topological consistency: for $\hat{Y}$ and $Y$, their relational matrices $\mathbf{R}_{\hat{Y}}$ and $\mathbf{R}_Y$ are extracted and encoded as global location embeddings. A Mean Squared Error (MSE) loss minimizes the discrepancy between the two embeddings, constraining the network to preserve accurate anatomical relationships:
\begin{equation}
    \mathcal{L}_{\text{loc}}(\hat{Y},Y)=\left\|\mathcal{F}_{\text{loc}}\left(\mathbf{R}_{\hat{Y}}\right) - \mathcal{F}_{\text{loc}}\left(\mathbf{R}_Y\right) \right\|_2^{2}
\end{equation}

\subsection{Total Loss}
The total training objective of \textbf{NeuroLangSeg} integrates supervised segmentation with protocol-guided anatomical constraints. The supervised term, $\mathcal{L}_{\text{FSS}}$, combines binary cross-entropy and soft Dice losses to encourage both voxel-level accuracy and region-level overlap fidelity. Two auxiliary regularizers are used: a shape loss $\mathcal{L}_{\text{shape}}$ that penalizes deviations from protocol-defined morphological characteristics, and a location loss $\mathcal{L}_{\text{loc}}$ that constrains predictions to anatomically valid spatial neighborhoods derived from protocol-based adjacency rules. The anatomical--linguistic discriminators that define these protocol constraints are not optimized jointly with the segmentation model; they are trained once using manual labels and a fixed anatomical protocol and are frozen during segmentation training and evaluation. The overall loss is defined as:
\begin{equation}\label{eq6}
\mathcal{L}_{\text{total}}(\hat{Y},Y)
= \lambda_1 \mathcal{L}_{\text{FSS}}(\hat{Y},Y)
+ \lambda_2 \mathcal{L}_{\text{shape}}(\hat{Y},Y)
+ \lambda_3 \mathcal{L}_{\text{loc}}(\hat{Y},Y),
\end{equation}
where $\lambda_1$, $\lambda_2$, and $\lambda_3$ control the relative contributions of segmentation fidelity, morphological regularization, and anatomical location consistency.

\section{Evaluation}
\subsection{Classical Metrics}
We evaluate segmentation quality using Dice Similarity Coefficient (\textbf{DSC}) and Normalized Surface Distance (\textbf{NSD}) \cite{Nikolov2021-ro} against manual labels when available. 
\begin{equation}
\mathrm{DSC}(\hat{Y},Y) = \frac{2|\hat{Y} \cap Y|}{|\hat{Y}| + |Y|},
\qquad
\mathrm{NSD}(\hat{Y},Y)
=
\frac{
|\partial \hat{Y} \cap B_{\partial Y}| +
|\partial Y \cap B_{\partial \hat{Y}}|
}{
|\partial \hat{Y}| + |\partial Y|
}
\end{equation}
The \textbf{DSC} measures volumetric overlap between a prediction $\hat{Y}$ and the manual ground truth $Y$, and \textbf{NSD} evaluates boundary agreement within a tolerance $\tau$. $B_{\partial \hat{Y}}$ and $B_{\partial Y}$ denote tolerance bands around the prediction and ground-truth boundaries with $\tau=1$.

\subsection{Anatomical--Linguistic Evaluators}
For large-scale or clinical datasets without manual annotations, we rely on three anatomical evaluators---morphological, topological, and volumetric.

The morphological evaluator assesses whether a predicted structure conforms to its anatomical shape. Using $\mathcal{F}_{\text{shape}}$, we derive per-label shape priors by encoding the annotations of all healthy instances of each structure and averaging them into a prototype vector $\boldsymbol{\mu}_{\text{shape}} \in \mathbb{R}^{128}$. During evaluation, $\hat{Y}$ is encoded by $\mathcal{F}_{\text{shape}}$, and its embedding is compared with the prototype via cosine similarity. 
This similarity is reported as the shape-consistency score, reflecting the morphological correctness of the prediction.
\begin{equation}
\text{Score}_{\text{shape}}(\hat{Y}) = 
\frac{\langle \mathcal{F}_{\text{shape}}(\hat{Y}),\, \boldsymbol{\mu}_{\text{shape}}  \rangle
}{
\| \mathcal{F}_{\text{shape}}(\hat{Y}) \|_2 \, \| \boldsymbol{\mu}_{\text{shape}} \|_2
}.
\end{equation}
The topological evaluator measures whether predicted regions preserve correct anatomical spatial relationships. We extract the mean $\boldsymbol{\mu}_{\text{loc}} \in \mathbb{R}^{K \times 7}$ and standard deviation $\boldsymbol{\sigma}_{\text{loc}} \in \mathbb{R}^{K \times 7}$ of the relational features across all annotated subjects, where $K=37$ is the number of anatomical pairs. During evaluation, relational features $\mathbf{\hat{R}}\in \mathbb{R}^{K \times 7}$ are extracted from the segmentation $\hat{Y}$, normalized using $\boldsymbol{\mu}_{\text{loc}}$ and $\boldsymbol{\sigma}_{\text{loc}}$, and converted into a topological correctness score with scaling coefficient $\rho$:
 \begin{equation}
\text{Score}_{\text{loc}}(\mathbf{\hat{R}}) = \exp\left(-\rho
\frac{\sqrt{\sum{(\mathbf{\hat{R}}-\boldsymbol{\mu}_{\text{loc}})^2/\boldsymbol{\sigma}_{\text{loc}}^2}}
}{\sqrt{K}}\right).
\end{equation}
The volumetric evaluator checks whether predicted structure volumes align with population norms. For each structure, the predicted volume $V$ is converted to an age- and sex-adjusted BrainChart $z$-score \cite{Bethlehem2022-eo,10.7554/eLife.72904}, which we denote as $\text{Score}_{\text{vol}}$. Volumes with $|\text{Score}_{\text{vol}}|\le 2$ are considered plausible:
\begin{equation}
\text{Score}_{\text{vol}} = \frac{V - \mu_{\text{ref}}(age,sex)}{\sigma_{\text{ref}}(age,sex)}.
\end{equation}

\section{Experiments}

We evaluate NeuroLangSeg across three complementary settings: (1) in-site segmentation, (2) cross-site generalization, and (3) clinical disease-cohort assessment. Segmentation accuracy (DSC, NSD) is reported wherever manual labels are available, while the three anatomical--linguistic evaluators (morphological, topological, volumetric) quantify anatomical robustness in both labeled and unlabeled datasets. Across all experiments, we compare NeuroLangSeg with four visual-only segmentation models (FastSurfer~\cite{HENSCHEL2020117012, HENSCHEL2022118933}, QuickNAT~\cite{GUHAROY2019713}, MAPSeg~\cite{Zhang_2024_CVPR}, and nnU-Net~\cite{Isensee2021-pv}), as well as SAT~\cite{zhao2025largevocabularysegmentationmedicalimages} as the vision--language baseline. The FastSurfer~\cite{HENSCHEL2022118933} baseline utilizes the latest VINNA architecture, which incorporates an internal augmentation strategy for resolution independence. Notably, nnU-Net and MAPSeg serve as the underlying backbones for both SAT and NeuroLangSeg to ensure a controlled comparison of linguistic integration. While methods like SynthSeg~\cite{BILLOT2023102789} are popular for domain-agnostic full-brain segmentation, they were excluded here as they rely on intensity simulations for whole-brain labels and are not directly applicable to our focus on protocol-specific subcortical structures and anatomical--linguistic alignment.

\subsection{Dataset}
\textbf{MAE Pretraining:} We compile 11{,}948 unlabeled T1/T2 MRI scans spanning ages 1--100 years from nine publicly available datasets (e.g., \textbf{ABCD} \cite{Casey2018-xm} and \textbf{HCP} \cite{Harms2018-vl}; full list in Appendix B). These scans contain no manual labels and are used solely for self-supervised MAE pretraining.
\noindent\textbf{Pseudo-supervised Fine-tuning:} A total of 118 manually labeled T1-weighted subjects spanning ages 1--100 years are drawn from the \textbf{ADNI} \cite{Jack2008-tj}, \textbf{CANDI} \cite{Kennedy2012-sz}, \textbf{OASIS} \cite{Marcus2010-sh}, \textbf{Colin} \cite{Holmes1998-aq}, and \textbf{BCP} \cite{Howell2019-ok} datasets. These
subjects provide ground-truth annotations for supervised fine-tuning and in-site/cross-site segmentation evaluation.
\noindent\textbf{Clinical Cohorts:} We additionally include two non--manually labeled clinical datasets—20 subjects from the \textbf{BrainTS (BraTS)}~\cite{Li2023-ep} tumor cohort and 30 subjects from the \textbf{ADNI}~\cite{Jack2008-tj} Alzheimer's disease cohort—which are used exclusively to evaluate out-of-distribution anatomical generalization without manual ground truth.

\subsection{Experimental Settings}
\textbf{In-Site Segmentation (Exp. 1):} We evaluate performance under matched training and testing conditions using the 118 manually labeled subjects. The dataset is randomly split 50\% for training, 10\% for validation, and 40\% for testing.

\noindent\textbf{Cross-Site Generalization (Exp. 2):} To quantify generalization under realistic domain shift, we use the \textbf{ADNI} and \textbf{Colin} datasets as an external test cohort. Twelve \textbf{ADNI} subjects and one \textbf{Colin} subject are withheld from fine-tuning, thereby providing an independent evaluation of cross-site performance.

\noindent\textbf{Disease--Cohort Assessment (Exp. 3):} To evaluate clinical robustness and out-of-distribution behavior, we apply NeuroLangSeg to the \textbf{BraTS} tumor dataset and the \textbf{ADNI} Alzheimer's cohort. Since no manual labels are available, evaluation is performed using the morphological, topological, and volumetric anatomical--linguistic evaluators.

\begin{figure}[ht]
\centering
  \includegraphics[width=1\linewidth]{seg_result1.png}
  \caption{Qualitative comparisons. Coronal and sagittal planes, and zoomed-in regions of interest, respectively. Major segmentation errors are highlighted with red arrows. Ground-truth boundaries are indicated by dotted lines, while segmentations from different methods are shown as transparent overlays.}
  \label{fig2}
\end{figure}

\begin{table}[t]
  \caption{Segmentation performance (DSC and NSD) across seven subcortical structures for in-site and cross-site generalization. Bold indicates the best performance.}
  \label{DSC_NSD_result}
  \centering
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{lcccccccc|cccccccc}
      \toprule
      \multirow{2}{*}{Method} &
      \multicolumn{8}{c|}{\textbf{DSC \%}} &
      \multicolumn{8}{c}{\textbf{NSD \%}} \\
      \cmidrule(lr){2-9} \cmidrule(lr){10-17}
       & HIPP & AMG & CD & PT & PD & TM & AB& Avg
       & HIPP & AMG & CD & PT & PD & TM & AB & Avg\\
      \midrule
      \multicolumn{17}{l}{\textbf{In-site}} \\[-1mm]
    \midrule
    FastSurfer & 85.4   & 80.9   & 88.2 & 88.7   & 82.2   & 91.6 & 78.3 & 85.0 & 91.6 & 88.2 & 94.4 & 92.9 & 87.6 & 89.0 & 92.5 & 90.9\\
    QuickNAT   & 77.5   & 59.5   & 80.7 & 83.4   & 72.7   & 87.6 & 59.3 & 74.4 & 78.2 & 43.2 & 79.4 & 82.5 & 61.0 & 74.6 & 61.9 & 68.7\\
    nnU-Net    & 85.4   & 81.1   & 87.9 & 88.4   & 84.0   & 91.1 & 75.9& 84.8 & 93.4   & 91.7   & 94.7 & 91.9   & 91.5   & 90.1 & 91.5& 92.1  \\
    MAPSeg     & 85.7   & 79.8   & 88.3 & 88.7   & 84.5   & 91.5 & 76.8 & 85.0 & 91.9 & 85.5 & 95.0 & 93.2 & 88.6 & 88.8 & 91.3 & 90.6\\
    SAT & 85.7 & 81.2 & 87.4 & 88.3 & 84.4& 90.9& 77.1 & 85.0 &92.6 &91.7 &95.2 &95.1 &91.3 &89.6 &93.8 & 92.8  \\
    \midrule            % ← this is the separator line you want
    NeuroLangSeg       & \textbf{87.6} &\textbf{84.0} &\textbf{88.9} &\textbf{89.3} &\textbf{86.2} &\textbf{92.0} &\textbf{80.4} & \textbf{86.9} &\textbf{95.3} &\textbf{94.9} &\textbf{97.2} &\textbf{96.1} &\textbf{93.5} &\textbf{92.8} &\textbf{95.7} &\textbf{95.0}\\
    \midrule
    \multicolumn{17}{l}{\textbf{Cross-site}} \\[-1mm]
    \midrule
      
    FastSurfer & 77.2   & 69.1   & 85.4 & 86.2   & 75.5   & 87.8 & 70.1 & 78.8 & 67.2 & 62.1 & 76.4 & 72.7 & 65.1 & 63.7 & 69.6 & 68.1\\
    QuickNAT   & 74.2   & 63.1   & 76.8 & 80.5   & 66.5   & 86.6 & 64.0 & 73.1 & 61.1 & 42.6 & 60.2 & 59.7 & 37.1 & 57.8 & 53.6 & 53.2\\
    nnU-Net    & \textbf{84.6}	 & 79.3	 & \textbf{87.7}	 & 86.9	 & 80.5	 & 89.4	 & 76.3	 & 83.5 & 92.8	 & 90.3	 & 95.1	 & 88.9	 & 88.0	 & 86.4	 & 93.2	 & 90.7 \\
    MAPSeg     & 83.9   & 79.3   & 87.3 & 87.9   & \textbf{81.6}   & \textbf{89.8} & 77.0 & 83.8 & 90.3 & 87.8 & 95.9 & 95.1 & 89.7 & 87.1 & 92.1 & 91.1\\
    SAT & 83.2	 & 77.6	 & 86.7	 & 86.7	 & 80.1	 & 89.0	 & 75.3	 & 82.6 & 91.5	 & 90.2	 & 96.4	 & 95.2	 & 89.8	 & 87.5	 & 94.4	 & 92.2  \\
    \midrule            % ← this is the separator line you want
    NeuroLangSeg       & 84.5	 & \textbf{80.0}	 & 86.8	 & \textbf{88.1}	 & 81.0	 & \textbf{89.8}	 & \textbf{78.1}	 & \textbf{84.0} & \textbf{93.5}	 & \textbf{92.8}	 & \textbf{97.1}	 & \textbf{97.0}	 & \textbf{91.5}	 & \textbf{89.7}	 & \textbf{96.0}	 & \textbf{93.6} \\
      \bottomrule
    \end{tabular}%
}
  \vspace{-2mm}
{\scriptsize HIPP:Hippocampus, AMG:Amygdala, TM:Thalamus, 
CD:Caudate, PT:Putamen, PD:Pallidum, AB:Accumbens}
\end{table}

\begin{table}[t]
  \caption{Segmentation performance (DSC and NSD) across seven subcortical structures for ablation study. Bold indicates the best performance.}
  \label{ablation_result}
  \centering
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{lcccccccc|cccccccc}
      \toprule
      \multirow{2}{*}{Method} &
      \multicolumn{8}{c|}{\textbf{DSC \%}} &
      \multicolumn{8}{c}{\textbf{NSD \%}}\\
      \cmidrule(lr){2-9} \cmidrule(lr){10-17}
       & HIPP & AMG & CD & PT & PD & TM & AB& Avg
       & HIPP & AMG & CD & PT & PD & TM & AB & Avg\\
      \midrule
      \multicolumn{17}{l}{\textbf{In-site}} \\[-1mm]
    \midrule
    w/o discriminators & 86.2 & 81.9 & 87.5 & 88.0 & 84.5 & 91.0 & 77.5 & 85.2 & 93.8 & 92.8 & 95.9 & 94.8 & 91.2 & 89.8 & 94.0 & 93.2\\
    w/o morphological discriminator   & 86.6 & 82.4 & 87.9 & 88.4 & 84.9 & 91.3 & 78.3 & 85.7& 93.5 & 94.4 & 95.0 & 94.7 & 91.6 & 92.2 & 95.3 & 93.8\\
    w/o topological discriminator    & 87.2 & 83.5 & 88.4 & 88.9 & 85.9 & 91.7 & 79.4 & 86.4& 92.8 & \textbf{95.1} & 95.2 & 94.6 & \textbf{94.6} & \textbf{92.8} & 95.6 & 94.4  \\
    \midrule            % ← this is the separator line you want
    NeuroLangSeg       & \textbf{87.6} &\textbf{84.0} &\textbf{88.9} &\textbf{89.3} &\textbf{86.2} &\textbf{92.0} &\textbf{80.4} & \textbf{86.9} &\textbf{95.3} &94.9&\textbf{97.2} &\textbf{96.1} &93.5 &\textbf{92.8} &\textbf{95.7} &\textbf{95.0}\\
        \midrule
    \multicolumn{17}{l}{\textbf{Cross-site}} \\[-1mm]
    \midrule
    w/o discriminators & 84.1 & 79.3 & 86.5 & 87.5 & 80.1 & 89.3 & 77.5 & 83.5 & 92.6 & 91.9 & 96.7 & 96.6 & 91.3 & 89.0 & 95.7 & 93.4\\
    w/o morphological discriminator   & 84.3 & 79.6 & 86.5 & 87.7 & 80.1 & 89.4 & 77.2 & 83.5& 92.8 & 92.4 & 96.6 & 96.8 & 91.4 & 89.2 & 95.3 & 93.5\\
    w/o topological discriminator    & \textbf{84.9} & 79.7 & 86.7 & 88.0 & 80.3 & \textbf{89.8} & 77.5 & 83.8& \textbf{93.8} & \textbf{93.3} & \textbf{97.2} & \textbf{97.3} & 91.3 & \textbf{90.8} & 95.9 & \textbf{94.2}  \\
    \midrule            % ← this is the separator line you want
    NeuroLangSeg       & 84.5	 & \textbf{80.0}	 & \textbf{86.8}	 & \textbf{88.1}	 & \textbf{81.0}	 & \textbf{89.8}	 & \textbf{78.1}	 & \textbf{84.0} & 93.5	 & 92.8	 & 97.1	 & 97.0	 & \textbf{91.5}	 & 89.7	 & \textbf{96.0}	 & 93.6\\
      \bottomrule
    \end{tabular}%
}
  \vspace{-2mm}
{\scriptsize HIPP:Hippocampus, AMG:Amygdala, TM:Thalamus, 
CD:Caudate, PT:Putamen, PD:Pallidum, AB:Accumbens}
\end{table}


\begin{table}[t]
  \caption{Anatomical--Linguistic Evaluators' average scores for in-site, cross-site (CN), and clinical cohorts (AD and tumor). *** indicates a statistically significant difference with $p<0.001$. Stable CN scores indicate protocol-consistent anatomy, while significant deviations in AD and tumor reflect pathological changes.}
  \label{anatomical_scores}
  \centering
  \resizebox{1\linewidth}{!}{%
    \begin{tabular}{lcccccc|c|c}
      \toprule
      \multirow{2}{*}{Score} &
      \multicolumn{6}{c|}{\textbf{In-site and Cross-site (CN)}} &
      \multicolumn{1}{c|}{\textbf{ADNI (AD)}} & \multicolumn{1}{c}{\textbf{BraTS (Tumor)}}\\
      \cmidrule(lr){2-7} \cmidrule(lr){8-8} \cmidrule(lr){9-9} 
       & FastSurfer & QuickNAT & nnU-Net &MAPSeg&SAT& NeuroLangSeg
       & NeuroLangSeg & NeuroLangSeg \\
      \midrule
      
    Shape & $78.1\pm1.7$ & $78.9\pm1.9$   & $79.4\pm1.3$ & $79.0\pm1.3$   & $79.5\pm1.4$
   & $79.2\pm1.7$& $71.2\pm1.4^{***}$   & $71.4\pm1.4^{***}$    \\
    Location   & $87.9\pm2.8$&	$84.0\pm2.5^{***}$&	$86.9\pm4.2$&	$88.3\pm2.6$&	$87.3\pm2.9$&$87.9\pm2.4$  & $77.1\pm2.7^{***}$   & $67.4\pm3.5^{***}$  \\
    Volume     & $1.39\pm0.51^{***}$	 & $2.17\pm0.32^{***}$
	 & $1.05\pm0.34$	 & $1.02\pm0.35$	 & $1.09\pm0.35$
	 & $0.94\pm0.24$ & $1.82\pm0.49^{***}$ & $2.24\pm0.53^{***}$ \\
      \bottomrule
    \end{tabular}%
  }
\end{table}

\begin{figure}[ht]
\centering
  \includegraphics[width=1\linewidth]{scores.png}
  \caption{Shape, location, and volume-score distributions of subcortical regions in Cognitively Normal (CN), Alzheimer's Disease (AD), and tumor participants, stratified by sex.}
  \label{fig3}
\end{figure}


\subsection{Results and Discussion}
\textbf{1. Segmentation Accuracy and Visualization:} 

Table~\ref{DSC_NSD_result} summarizes the in/cross-site DSC/NSD performance. In in-site evaluation, NeuroLangSeg obtains the highest average DSC (86.9\%) and NSD (95.0\%), exceeding the strongest baseline by +1.9\% DSC and +2.2\% NSD, with larger gains over the average baseline (+4.1\% DSC, +8.0\% NSD), particularly for small structures such as the amygdala and accumbens. Under cross-site evaluation, NeuroLangSeg again achieves the highest average DSC (84.0\%) and NSD (93.6\%). This yields +0.2\% DSC and +1.4\% NSD gain over the strongest baseline and substantial improvements over the average baseline (+3.6\% DSC, +14.5\% NSD). 

Figure~\ref{fig2} shows in-site qualitative segmentation results. In the coronal view, several baselines enlarge or shrink the amygdala. In the sagittal view, manual annotations are inherently discontinuous because they were drawn primarily in the coronal plane. Models trained only on these labels tend to compensate incorrectly: methods such as SAT, MAPSeg, and QuickNAT often enlarge the structure, FastSurfer tends to shrink it, and nnU-Net frequently yields missing segments. In contrast, NeuroLangSeg produces anatomically consistent shapes across views without artificial expansion, collapse, or disappearance. Minor 1--2 pixel over-segmentation may still occur, which is a common and well-known behavior in deep learning-based segmentation methods.

Table~\ref{ablation_result} reports ablation results for the in-site and cross-site settings. Removing all discriminators leads to a clear performance degradation in both DSC and NSD across all subcortical structures. Excluding either the morphological or the topological discriminator generally reduces overall segmentation accuracy compared with the full model.

\noindent\textbf{2. Anatomical--Linguistic Evaluation and Clinical Generalization:} 

Table~\ref{anatomical_scores} reports the average anatomical--linguistic evaluator scores for in-site and cross-site cognitively normal (CN) subjects and clinical cohorts. For CN participants, NeuroLangSeg achieves high shape and location scores and volume scores close to 0 (within $[-2,2]$), indicating good alignment with the anatomical protocol. Volume score is summarized using $|\text{Score}_{\text{vol}}|$ to properly capture deviation magnitude. Other segmentation methods also produce CN scores that cluster near the reference values, as expected for healthy controls; however, their morphological, topological, and volumetric metrics are consistently lower than those of NeuroLangSeg. As expected, ADNI (AD) and BraTS (tumor) cohorts show reduced scores due to pathology-related changes in morphology and spatial organization. 

A one-way ANOVA was used to assess whether the evaluators distinguish anatomical quality. Within CN subjects, evaluator scores from each method were compared to NeuroLangSeg. Most methods showed no significant difference, as all were tested on the same CN cohort. Only QuickNAT and FastSurfer differed significantly ($p<0.001$), consistent with their lower DSC/NSD performance. Using NeuroLangSeg across CN, AD, and tumor groups, all three evaluators showed significant differences ($p<0.001$), indicating stable scores in healthy controls and clear sensitivity to disease-related anatomical changes.

Figure~\ref{fig3} illustrates these patterns using violin plots of our method's evaluator score distributions for CN, AD, and tumor participants. CN subjects cluster tightly around the reference values, whereas AD and tumor participants display volume scores outside $[-2,2]$ and reductions in shape and location scores. Slightly lower CN scores for the amygdala and pallidum arise because our manual labels are smaller than the FreeSurfer-derived volumes used in the BrainChart reference.

\section{Conclusion}
We introduced \textbf{NeuroLangSeg}, a language-guided framework that unifies visual features with protocol-consistent anatomical reasoning for subcortical MRI segmentation. Through MAE pretraining, pseudo-supervised fine-tuning, and anatomical--linguistic evaluation, our method delivers accurate, consistent, and clinically interpretable segmentations across in-site, cross-site, and disease cohorts. Quantitative comparison with state-of-the-art models shows substantial gains, including \textbf{+8.0\% NSD} in in-site evaluation and \textbf{+14.5\% NSD} in cross-site generalization over the average baseline. ANOVA analyses further confirm that our anatomical--linguistic scores significantly distinguish healthy controls from pathological cases while remaining stable within CN subjects. By grounding segmentation in a standardized anatomical protocol, NeuroLangSeg advances robust, interpretable, and clinically aligned neuroimaging segmentation. Future work will extend NeuroLangSeg to infant, pediatric, and fetal MRI, as well as to additional disease cohorts, to further assess robustness across developmental stages and pathological conditions.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by NIH grants R00HD103912, R01MH133313 (Y.W.), and Neuromorphometrics, Inc.}


\bibliography{midl26_123}


\appendix

\section{Model Architecture}
\subsection{Text Encoder}
Using the text encoder $g_{\text{text}}$, each text prompt $T$, including structure names, morphological definitions, and pairwise spatial relations, is tokenized and processed through a multi-layer self-attention stack, producing a $d$-dimensional representation.
Given a batch of paired textual descriptions $(T_i, T_i^{'})$, each anatomical structure's concept $T_i$ is paired with $T_i^{'}$, corresponding either to (1) a descriptive phrase capturing its anatomical morphology $T_i^{\text{mor}}$, or (2) relational statements describing its spatial relations relative to other structures $T_i^{\text{rela}}$. 
The encoder produces embedding pairs $(z_i,z_i^{'})$:
\begin{equation}
    z_i = g_{\text{text}}(T_i),\qquad z_i^{'} = g_{\text{text}}(T_i^{'}), \qquad z_i,z_i^{'} \in \mathbb{R}^{d}
    \tag{A1}
\end{equation}
To encourage semantically aligned anatomical concepts to share a nearby representation, we optimized $g_{\text{text}}$ using contrastive learning, with an InfoNCE contrastive loss.
\begin{equation}
    L_{\text{text}}=-\frac{1}{N}\sum_{i=1}^N{\left[\log\frac{\exp\left(\frac{z_i\cdot z_i^{'}}{\tau}\right)}{\sum_{k=1}^N{\mathbf{1}_{i\neq k}\exp\left(\frac{z_i\cdot z_k^{'}}{\tau}\right)}}+\log\frac{\exp\left(\frac{z_i\cdot z_i^{'}}{\tau}\right)}{\sum_{k=1}^N{\mathbf{1}_{i\neq k}\exp\left(\frac{z_k\cdot z_i^{'}}{\tau}\right)}}\right]}
    \tag{A2}
\end{equation}
where $\tau$ is the temperature coefficient, and $N$ is the number of subcortical regions.
\subsection{Query Decoder}

The text embedding $z$ serves as the initial \textit{Query}, while the visual feature $g_{\text{vis}}(X)$ extracted from the image encoder serves as the \textit{Keys} and \textit{Values}. A stack of decoder blocks, each consisting of cross-attention followed by feed-forward layers, progressively refines the representation:

\begin{equation}
    q = h_{\text{query}}\!\left(g_{\text{vis}}(X),\, g_{\text{text}}(T)\right) , \qquad q \in \mathbb{R}^{d}
    \tag{A3}
\end{equation}

Through this cross-attention mechanism, the query decoder allows the text embedding to attend to anatomically relevant visual cues, enabling the model to infer subject-specific variations in location, orientation, and shape. The output $q$ serves as an image-conditioned anatomical query, which is subsequently matched against voxel-wise visual features $h_{\text{vis}}\!\left(g_{\text{vis}}(X)\right)$ in the segmentation head, using a dot-product operation to generate the final segmentation. 

\subsection{3D Multi-Scale Masked Autoencoder (MAE)}
Our 3D MAE uses 3D ResNet blocks \cite{Zhang_2024_CVPR} instead of Vision Transformers. The encoder is composed of eight 3D ResNet blocks and we adopt an asymmetric architecture with a lightweight decoder, detailed in Supplementary Figure~\ref{fig:mae_model}. During training, the model jointly learns from two input types—randomly sampled local patches $ x $ and a downsampled version of the full volumetric scan $ X $, both resized to $ 96^3 $ voxels. To enable self-supervised learning, both $ x $ and $ X $ are partitioned into non-overlapping 3D patches and subjected to random masking. For $ x $, we use a patch size of $ 8^3 $ and mask 70\% of the patches uniformly at random. For $ X $, which provides a broader field of view (FOV), we use a smaller patch size of $ 4^3 $ while maintaining the same masking ratio. The resulting masked inputs, denoted as $ x_M $ and $ X_M $, are passed to the MAE, which is trained to reconstruct the original unmasked volumes using mean squared error loss computed only on the masked regions.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.8\linewidth]{Appendix1.png}
  \caption{Illustrations of MAE 3D ResNet Block and 3D architectures.}
  \label{fig:mae_model}
\end{figure}

\subsection{3D Masked Pseudo-Labeling (MPL)}
MAPSeg employs a 3D \textbf{Masked Pseudo-Labeling (MPL)} strategy based on a teacher--student architecture (Figure~\ref{fig:mpl_GLC}). The segmentation backbone combines the pretrained MAE encoder $g_{\text{vis}}$ with a lightweight 3D decoder $h_{\text{vis}}$ adapted from DeepLabV3. The decoder uses a 3D Atrous Spatial Pyramid Pooling (ASPP) module with multi-scale dilated convolutions to enlarge the effective receptive field. The student model processes both labeled source volumes and unlabeled target volumes, while the teacher model—an exponential moving average (EMA) of the student—produces stable pseudo-labels for the target domain.

To improve generalization, the student receives masked input volumes, following the same masking scheme used during MAE pretraining. This forces the model to rely on global context rather than local intensity alone. MPL integrates (i) supervised loss on source data and (ii) consistency loss between teacher and student predictions on target data. This yields a label-efficient adaptation mechanism that leverages MAE-learned priors while mitigating noisy pseudo-label propagation.

The teacher model's parameters $\theta$ are updated during training via an
exponential moving average (EMA) of the student model's parameters $\phi$~\cite{DBLP:journals/corr/TarvainenV17}:
\begin{equation}
\theta_{t+1} \leftarrow \alpha \theta_t + (1 - \alpha)\phi_t,
\tag{A4}
\end{equation}
where $t$ and $t+1$ denote training iterations and $\alpha$ is the EMA update weight.
For models initialized from large-scale MAE pretraining, we set $\alpha=0.999$ during
the first 1{,}000 steps and $\alpha=0.9999$ afterwards. For models pretrained on
small-scale source and target datasets (e.g., only dozens of scans), we set
$\alpha=0.99$ during the first 1{,}000 steps, $\alpha=0.999$ during the next 2{,}000 steps,
and $\alpha=0.9999$ for the remaining training. The teacher model $f_{\theta}$ is
initialized with the student model's parameters $\phi$ after a warm-up stage
(e.g., 1{,}000 iterations) on the source-domain data.

\subsection{3D Global-Local Collaboration (GLC)}

To improve pseudo-label stability under large domain shifts, we adopt the 
\textbf{Global--Local Collaboration (GLC)} module~\cite{Zhang_2024_CVPR}. For each scan (Figure~\ref{fig:mpl_GLC}), we extract a 
high-resolution local patch $x$ and a downsampled global volume $X$. The encoder 
$g_{\text{vis}}$ produces local and global features:
\begin{equation}
\chi_{\text{loc}} = g_{\text{vis}}(x), 
\qquad
\chi_{\text{glo}} = \text{upsample}(M \odot g_{\text{vis}}(X)),
\tag{A5}
\end{equation}
where $M$ is a binary mask and $\odot$ denotes cropping followed by interpolation 
to match spatial dimensions. The GLC module fuses the two feature streams by 
channel-wise concatenation:
\begin{equation}
f_{\text{vis}}(x) = h_{\text{vis}}(\chi_{\text{loc}} \oplus \chi_{\text{glo}}),
\tag{A6}
\end{equation}
forming a unified 1024-dimensional latent representation processed by the ASPP head 
for segmentation. To provide global supervision, the model also predicts from the 
global view via
\begin{equation}
f_{\text{vis}}(X) = h_{\text{vis}}(g_{\text{vis}}(X) \oplus g_{\text{vis}}(X)).
\tag{A7}
\end{equation}

To enforce alignment between local and global information, we impose a cosine 
similarity regularizer:
\begin{equation}
\mathcal{L}_{\cos}(x,X)
= 1 -
\frac{\chi_{\text{loc}} \cdot \chi_{\text{glo}}}
     {\max\!\left(\|\chi_{\text{loc}}\|_2 \cdot
                  \|\chi_{\text{glo}}\|_2,\,
                  \epsilon\right)} .
\tag{A8}
\end{equation}

The GLC losses for source and target data are:
\begin{equation}
\mathcal{L}^{S}_{\text{GLC}}
= \gamma\!\left[
\mathcal{L}_{\text{Seg}}(f_{\phi}(X_s),Y_s)
+ \mathcal{L}_{\text{Seg}}(f_{\phi}(X_s^{M}),Y_s)
\right]
+ \delta\!\left[
\mathcal{L}_{\cos}(x_s,X_s)
+ \mathcal{L}_{\cos}(x_s^{M},X_s^{M})
\right],
\tag{A9}
\end{equation}
\begin{equation}
\mathcal{L}^{T}_{\text{GLC}}
= 2\gamma\,\mathcal{L}_{\text{Seg}}(f_{\phi}(X_t^{M}),f_{\theta}(X_t))
+ 2\delta\,\mathcal{L}_{\cos}(x_t^{M},X_t^{M}).
\tag{A10}
\end{equation}

During training, local $96 \times 96 \times 96$ patches are randomly sampled to 
provide high-resolution detail while the global branch maintains coarse contextual 
awareness. At inference, predictions are generated with a sliding-window scheme 
(stride 80) to cover the full volume.

With the supervised loss
\begin{equation}
\mathcal{L}_{\text{FSS}}
= \beta\,\mathcal{L}_{\text{Seg}}(f_{\phi}(x_s), y_s),
\tag{A11}
\end{equation}
the full training objective becomes:
\begin{equation}
\mathcal{L}_{\text{vis}}
=
\mathcal{L}_{\text{FSS}}
+ \mathcal{L}_{\text{MPL}}
+ \mathcal{L}_{\text{GLC}},
\qquad
\mathcal{L}_{\text{GLC}}
= 
\mathcal{L}^{S}_{\text{GLC}}
+ \mathcal{L}^{T}_{\text{GLC}}.
\tag{A12}
\end{equation}

\begin{figure}[htbp]
  \centering
  \includegraphics[width=1\linewidth]{Appendix2.png}
  \caption{Illustrations of MPL and GLC.}
  \label{fig:mpl_GLC}
\end{figure}

\subsection{Morphological Discriminator}
The morphological discriminator adopts the SE(3)-equivariant convolutional neural network~\cite{billot2024se} as the shape encoder $\mathcal{F}_{\text{shape}}$ to extract features.
The shape encoder is pretrained using a denoising autoencoder framework, mapping noisy inputs to embeddings, which are reconstructed by a decoder comprising transposed 3D convolutions with instance normalization. For framework details, please refer to the supplementary Figure~\ref{fig:shape_encoder}.

The reconstruction task is trained by minimizing the difference between the reconstructed image and the input image, and the loss uses a combination of multi-class Dice loss and MSE loss. For the input labeled image $Y$, the reconstructed labeled image $Y^{\text{recon}}$ is obtained after passing through the shape encoder and decoder. The loss is calculated as follows:
\begin{equation}
    \mathcal{L}_{\text{shape\_recon}}=1-\frac{2|Y^{\text{recon}} \cap Y|}{|Y^{\text{recon}}| + |Y|}+\left\|Y^{\text{recon}} - Y \right\|_2^{2}
    \tag{A13}
\end{equation}
\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.8\linewidth]{shape_encoder.png}
  \caption{Illustrations of Morphological Discriminator.}
  \label{fig:shape_encoder}
\end{figure}
\subsection{Topological Discriminator}

    The topological discriminator employs an MLP as the location encoder $\mathcal{F}_{\text{loc}}$, which is pretrained to reconstruct relation features from noisy inputs. For framework details, please refer to the supplementary Figure~\ref{fig:loc_encoder}. 
    For each subject, we extract relation vectors $\mathbf{R} \in \mathbb{R}^{K\times7}$ from the ground truth, where $K=37$ is the number of anatomical pairs (listed in Table~\ref{tab:K37_pairs}). The relation feature $\mathbf{r}_{ij} = [ \Delta\mathbf{c}_{ij}, A_{ij},\mathbf{d}_{ij} ]$ is composed of the continuous relative position $\Delta\mathbf{c}_{ij}$, the adjacency ratio $A_{ij}$, and the adjacency-direction vector $\mathbf{d}_{ij}$. $\Delta\mathbf{c}_{ij}=c_i-c_j$ is the difference between the centroid $c_i$ of structure $i$ and the centroid $c_j$ of structure $j$. The adjacency ratio $A_{ij}=\frac{|N_{i\rightarrow j}|}{|B_i|}$ is the proportion of structure $i$'s boundary voxels that are shared with structure $j$, and the adjacency-direction vector $\mathbf{d}_{ij}=c_i-\frac{\sum_{v\in N_{i\rightarrow j}}{v}}{|N_{i\rightarrow j}|}$ is defined as the difference between the centroid of structure $i$ as a whole and the centroid of the subset of structure $i$'s boundary voxels that are adjacent to $j$. Here, $N_{i\rightarrow j}$ is the subset of boundary voxels of structure $i$ that are in direct spatial contact with structure $j$, and $B_i$ denotes the set of all boundary voxels belonging to anatomical structure $i$. 
    We then generate $n = 100$ perturbed versions of the relation matrix $\mathbf{R}$ by adding Gaussian noise ($\sigma = 0.1$).
    The encoder maps noisy relations to embeddings, which are reconstructed by a feedforward decoder. The reconstruction loss is: 
    \begin{equation}
    \mathcal{L}_{\text{loc\_recon}} = \left\|\mathbf{R} - \mathbf{R}^{\text{recon}}\right\|_2^2
    \tag{A14}
    \end{equation}
    where $\mathbf{R}^{\text{recon}}$ denotes the reconstructed relation matrix. 
    
    The details of the parameter settings for the pre-training tasks of the morphological discriminator and the topological discriminator are presented in Table~\ref{Discriminator_configure}.
\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.8\linewidth]{location_encoder.png}
  \caption{Illustrations of Topological Discriminator.}
  \label{fig:loc_encoder}
\end{figure}

\begin{table}[ht]
\centering
\caption{List of the 37 anatomically relevant structure pairs used to construct relation vectors.}
\label{tab:K37_pairs}
\resizebox{\linewidth}{!}{%
\begin{tabular}{ll}
\hline
\textbf{Category} & \textbf{Structure pairs} \\
\hline
Left intra-hemispheric (15) &
(L-HIPP, L-AMG), (L-HIPP, L-TM), (L-HIPP, L-AB), \\
& (L-AMG, L-AB), (L-AMG, L-TM), (L-AMG, L-PT), \\
& (L-CD, L-PT), (L-CD, L-PD), (L-CD, L-AB), \\
& (L-CD, L-TM), (L-PT, L-PD), (L-PT, L-AB), \\
& (L-PT, L-TM), (L-PD, L-TM), (L-AB, L-TM) \\

Right intra-hemispheric (15) &
(R-HIPP, R-AMG), (R-HIPP, R-TM), (R-HIPP, R-AB), \\
& (R-AMG, R-AB), (R-AMG, R-TM), (R-AMG, R-PT), \\
& (R-CD, R-PT), (R-CD, R-PD), (R-CD, R-AB), \\
& (R-CD, R-TM), (R-PT, R-PD), (R-PT, R-AB), \\
& (R-PT, R-TM), (R-PD, R-TM), (R-AB, R-TM) \\

Inter-hemispheric (7) &
(L-HIPP, R-HIPP), (L-AMG, R-AMG), (L-CD, R-CD), \\
& (L-PT, R-PT), (L-PD, R-PD), (L-TM, R-TM), (L-AB, R-AB) \\
\hline
\end{tabular}%
}
\end{table}

\section{Dataset Description}

We gather T1-weighted and T2-weighted MRI data spanning ages 1--100 years from 15 publicly available datasets. The detailed dataset information is as follows:
\begin{itemize}
    \item \textbf{ABCD: Adolescent Brain Cognitive Development Study} \cite{Casey2018-xm} is a large-scale, longitudinal neuroimaging and behavioral study tracking brain development and child health in over 10,000 U.S. children aged 9--10 years. Participants were enrolled at ages 9--10 and are being followed into their early 20s. We collected 2930 subjects with 3211 longitudinal scans spanning 9-17 years, with 3211 T1-weighted and 3209 T2-weighted images. 
    \item \textbf{ABIDE-I: Autism Brain Imaging Data Exchange} \cite{Di_Martino2014-sc} is a cross-sectional multi-site initiative that shares resting-state fMRI and structural MRI data from individuals with autism and typically developing controls. We collected 1102 subjects/scans of T1-weighted images spanning 6 - 64 years. 
    \item \textbf{ADHD-200: ADHD-200 Global Competition} \cite{Bellec2017-wm} is a cross-sectional multi-site dataset sharing resting-state fMRI and structural MRI data to identify biomarkers of Attention Deficit Hyperactivity Disorder (ADHD). We collected 869 subjects/scans of T1-weighted images spanning 7 - 26 years.
    \item \textbf{ADNI: Alzheimer's Disease Neuroimaging Initiative} \cite{Jack2008-tj} is a longitudinal, multi-site study designed to develop clinical, imaging, genetic, and biochemical biomarkers for early detection and tracking of Alzheimer's disease. We collected 50 subjects/scans of T1-weighted images spanning 60-96 years.
    \item \textbf{BCP: Baby Connectome Project} \cite{Howell2019-ok} is a longitudinal neuroimaging study aiming to map early brain development and connectivity from infancy through early childhood. We collected 2126 subjects with 2444 longitudinal scans spanning 0-7 years, with 2406 T1-weighted and 2347 T2-weighted images.
    \item \textbf{HBN: Healthy Brain Network} \cite{alexander2017open} is a cross-sectional transdiagnostic pediatric study collecting neuroimaging, behavioral, cognitive, and genetic data to better understand mental health and learning disorders. We collected 1729 subjects/scans spanning 5 - 22 years, with 1698 T1-weighted and 562 T2-weighted images.
    \item \textbf{HCP-A: Human Connectome Project -- Aging} \cite{Bookheimer2019-li} is a cross-sectional dataset focused on understanding brain connectivity and aging across the adult lifespan. We collected 725 subjects/scans spanning 36 - 100 years, with 725 T1-weighted and 725 T2-weighted images.
    \item \textbf{HCP-D: Human Connectome Project -- Development} \cite{Somerville2018-vv} is a cross-sectional study examining brain development and connectivity from childhood through young adulthood. We collected 652 subjects/scans spanning 6 - 22 years, with 652 T1-weighted and 652 T2-weighted images.
    \item \textbf{HCP-YA: Human Connectome Project -- Young Adult} \cite{Harms2018-vl} is a cross-sectional study to map the healthy human connectome by collecting and freely distributing neuroimaging and behavioral data on 1,200 normal young adults, aged 22--35.
    \item \textbf{PING: Pediatric Imaging, Neurocognition, and Genetics} \cite{Jernigan2016-sa} is a cross-sectional study designed to assess brain development and its genetic and environmental influences in children and adolescents. We collected 754 subjects/scans spanning 0 - 22 years, with 752 T1-weighted and 106 T2-weighted images.
    \item \textbf{CANDI: Child and Adolescent Neuro Development Initiative} \cite{Kennedy2012-sz} includes structural MRI scans of children and adolescents, supporting research on brain development, psychiatric disorders, and neuroanatomical differences across diagnoses.
    \item \textbf{OASIS: Open Access Series of Imaging Studies} \cite{Marcus2010-sh} provides structural brain MRI data across the adult lifespan, including individuals with and without Alzheimer's disease, to support neurodegenerative and aging research.
    \item \textbf{COLIN: Colin27 Brain Atlas} \cite{Holmes1998-aq} is a high-resolution MRI brain template created by averaging 27 T1-weighted scans of a single individual.
    \item \textbf{BraTS2023: The Brain Tumor Segmentation (BraTS) Challenge 2023} \cite{Li2023-ep} provides an expanded multi-site mpMRI dataset ($\sim$4,500 cases) with expert tumor delineations across diverse populations and tumor types, enabling benchmarking of segmentation, missing-data handling, and cross-task generalizability.
\end{itemize}


We pretrain our models on a dataset of approximately 12,000 subjects, including both T1-weighted and T2-weighted scans. All datasets were preprocessed using N4 bias correction and skull stripping. Full dataset details are provided in Supplementary Table~\ref{pretrain_data}.

\begin{table}[t]
\caption{Dataset summary across age ranges for pretraining: modality, number of subjects, scans, and age span.}
\label{pretrain_data}
\centering
\begin{tabular}{lccccc}
\toprule
Dataset & Modality & Subjects & T1 Scans & T2 Scans & Age (yrs) \\
\midrule
\textbf{ABCD}       & T1w, T2w & 2930 & 3211 & 3209   & 9--16 \\
\textbf{ABIDE-I}    & T1w      & 1102 & 1102 & --   & 6--64 \\
\textbf{ADHD-200}   & T1w      & 869  & 869  & --   & 7--26 \\
\textbf{BCP}        & T1w, T2w & 2126 & 5183 & 4303   & 1--7 \\
\textbf{HBN}        & T1w, T2w & 1729 & 1684 & 560   & 6--64 \\
\textbf{HCP-A}      & T1w, T2w & 725 & 725 & 725   & 36--100 \\
\textbf{HCP-D}      & T1w, T2w & 652 & 652 & 652   & 6--21 \\
\textbf{HCP-YA}     & T1w, T2w & 1061  & 10  & 660   & 22--35 \\
\textbf{PING}       & T1w, T2w & 754 & 752 & 106   & 3--21 \\
\bottomrule
\end{tabular}
\end{table}

We use 118 manually labeled subjects from \textbf{ADNI} \cite{Jack2008-tj}, \textbf{CANDI} \cite{Kennedy2012-sz}, \textbf{OASIS} \cite{Marcus2010-sh}, \textbf{Colin} \cite{Holmes1998-aq}, and \textbf{BCP-50} \cite{Howell2019-ok} (details in Supplementary Table~\ref{segmentation_data}). For robust model development, 50\% of subjects are used for training, 10\% for validation, and 40\% are held out for testing. In our study, \textbf{CANDI}, \textbf{Colin}, and \textbf{OASIS} are single-site datasets. The \textbf{ADNI} manual dataset includes 30 subjects spanning 25 distinct sites. \textbf{ADNI} subjects are split such that 15 subjects from 12 sites are used for training, 3 subjects from 2 sites are used for validation, and 12 subjects from 11 different sites are used for testing. \textbf{Colin} and 12 \textbf{ADNI} testing subjects are reserved exclusively for cross-site inference. Two non--manually labeled clinical datasets---the \textbf{BraTS}~\cite{Li2023-ep} tumor cohort and the \textbf{ADNI}~\cite{Jack2008-tj} Alzheimer's disease cohort (Table~\ref{clinical_data})---are used exclusively to evaluate out-of-distribution anatomical generalization.

\begin{table}[t]
\caption{Dataset summary for subcortical segmentation: modality, number of subjects, scans, and age span.}
\label{segmentation_data}
\centering
\begin{tabular}{lccccc}
\toprule
Dataset & Modality & Subjects & T1 Scans & T2 Scans & Age (yrs) \\
\midrule
\textbf{OASIS}       & T1w & 50 & 70 & --   & 18--93 \\
\textbf{ADNI}    & T1w      & 29 & 30 & --   & 71--88 \\
\textbf{CANDI}   & T1w      & 13  & 13  & --   & 5--15 \\
\textbf{Colin}       & T1w      & 1  & 1  & --   & 27 \\
\textbf{BCP}      & T1w, T2w      & 25 & 25 & 25   & 1--2 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[t]
\caption{Dataset summary for clinical generalization: modality, number of subjects, scans, and age span.}
\label{clinical_data}
\centering
\begin{tabular}{lccccc}
\toprule
Dataset & Modality & Subjects & T1 Scans & T2 Scans & Age (yrs) \\
\midrule
\textbf{ADNI}    & T1w      & 30 & 30 & --   & 60--96 \\
\textbf{BraTS}   & T1w      & 20  & 20  & --   & 50--85 \\
\bottomrule
\end{tabular}
\end{table}


\section{Baseline}
\subsection{FastSurfer}
FastSurferCNN \cite{HENSCHEL2020117012} is a 2D fully convolutional architecture designed for fast whole-brain segmentation and serves as the segmentation module within the FastSurfer pipeline. The network follows an encoder--decoder design similar to QuickNAT but introduces several architectural improvements: competitive dense blocks (replacing concatenation with maxout operations) to encourage feature competition, unpooling layers for spatially accurate upsampling, and a wider contextual field to better capture neuroanatomical boundaries. FastSurferCNN predicts 2D segmentations for axial, coronal, and sagittal slices, which are combined through a multi-view aggregation strategy to produce the final 3D mask.

In its original formulation, FastSurfer is trained on FreeSurfer-derived labels for 95 anatomical structures, providing a high-speed alternative to traditional surface-based processing. Because the method relies on 2D slice-wise predictions, its performance can vary across views, especially for small or discontinuous subcortical structures.

For our study, we employ FastSurferCNN as a baseline and fine-tune it on our 7-class subcortical label set under the same training conditions as the other methods.
\subsection{QuickNAT}
QuickNAT \cite{GUHAROY2019713} is a 2D fully convolutional framework that performs segmentation on individual slices rather than full 3D volumes. The method trains three independent F-CNNs on single coronal, axial, and sagittal slices, and fuses their outputs through a view-aggregation 
module to obtain the final 3D prediction. Each F-CNN adopts an encoder--decoder architecture with skip connections, unpooling layers, and dense connections to improve gradient flow and feature reuse. The network is optimized using a combination of multi-class Dice loss and weighted logistic loss to address class imbalance and enhance boundary delineation.

In its original formulation, QuickNAT is pre-trained using auxiliary labels generated by FreeSurfer and then fine-tuned on expert manual segmentations. This strategy leverages large-scale automated annotations while adapting to higher-quality ground truth. Because 
QuickNAT operates on single 2D slices without explicit 3D contextual modeling, its predictions may vary across views, particularly for small or irregularly shaped subcortical structures.

For our experiments, we fine-tune QuickNAT on our 7-class subcortical label set to serve as a baseline under consistent training and evaluation conditions.
\subsection{nnU-Net}
nnU-Net~\cite{Isensee2021-pv} is a self-configuring segmentation framework that automatically adapts its network architecture, data preprocessing strategies, and training pipelines to the characteristics of a given dataset. Following the original design, we rely entirely on nnU-Net's built-in mechanisms, including its automated determination of patch size, batch size, normalization scheme, deep supervision, and data augmentation policies.

For our experiments, we use the default 3D full-resolution configuration. During training, the model is optimized with the standard combination of Dice loss and cross-entropy loss under the framework's predefined schedule, including the default learning rate, optimizer settings, and training epochs. After training, inference is performed using nnU-Net's standard test-time augmentation and sliding-window strategy.

\subsection{MAPSeg}
MAPSeg \cite{Zhang_2024_CVPR} is an unsupervised domain adaptation (UDA) framework designed for volumetric medical image segmentation. It integrates 3D masked autoencoding (MAE) with a masked pseudo-labeling (MPL) strategy and a global--local consistency (GLC) objective to improve robustness across heterogeneous imaging domains. The framework is self-supervised during pretraining through 3D MAE reconstruction, and subsequently refines pseudo-labels using MPL to adapt the model to new domains without requiring manual annotations. GLC further stabilizes training by enforcing consistency between global volumetric context and local structural details.

MAPSeg was originally proposed for centralized, federated, and test-time UDA settings, allowing models trained on one domain to generalize to unseen scanners or cohorts. Because MAPSeg operates directly on 3D volumes with domain-adaptive pseudo-labeling, it can handle cross-domain variations more effectively than purely supervised baselines.

For our experiments, we use MAPSeg's domain-adapted 3D backbone and fine-tune it on our 7-class subcortical label set to serve as a UDA-based baseline under consistent training conditions.

\subsection{SAT}
SAT~\cite{zhao2025largevocabularysegmentationmedicalimages} is a large-vocabulary medical image segmentation framework integrating an image encoder with a text-aware feature modulation module. The central design of SAT involves a text--image feature alignment mechanism and a hierarchical decoder with Mixture-of-Experts layers. The encoding process of SAT involves two encoders: a vision encoder responsible for extracting multi-scale 3D visual representations from the input medical volume, and a text encoder mapping natural language descriptions of target structures into embedding vectors. During the decoding stage, the segmentation decoder dynamically fuses these two modalities to produce structure-specific predictions.

In our experiments, we employed the SAT-nano variant built upon the nnU-Net vision backbone, as recommended in the original paper. We fine-tuned SAT-nano on our dataset using only the image domain and segmentation supervision corresponding to our task. No additional large-scale pretraining or external datasets were used.

\section{Experiment Settings}
\subsection{Visual-Backbone}
\textbf{MAE Pretraining.} 
For MAE pretraining, we follow the training configurations listed in Table~\ref{MAE_configure}. Each mini-batch contains a randomly sampled local patch $x$ and a downsampled global scan $X$. The masking patch size specified in Table~\ref{MAE_configure} is applied only to $x$; for $X$, the masking patch is always set to half the size due to its larger field of view. During the MAE stage, we apply random 3D affine transformations with isotropic scaling between 75--150\% and rotation sampled from $[-40^\circ, 40^\circ]$.

\noindent\textbf{MPL-GLC.}
For centralized UDA brain MRI segmentation, the detailed training configurations are provided in Table~\ref{Fine-tune_configure}. Each mini-batch contains four patches: a local--global pair $(x, X)$ from the source domain and another pair from the target domain (each of size $96^3$). 
During warm-up epochs, the model is trained exclusively on the source domain. Model selection is based on the validation \textit{Score}, with a patience of 50 epochs.

\textit{Target-domain augmentation.}
We apply a random 3D affine transformation with isotropic scaling of 70--130\% and rotation sampled from $[-30^\circ, 30^\circ]$.

\textit{Source-domain augmentation.}
A stronger augmentation pipeline is used for the source domain, including random affine (70--140\% scaling, $[-30^\circ, 30^\circ]$ rotation), random bias field, and random gamma transformation ($\gamma \in [e^{-0.4}, \, e^{0.4}]$).

\textit{Teacher--student update.}
The parameters $\theta$ of the teacher model $f_{\theta}$ are updated using an exponential moving average (EMA) of the parameters $\phi$ of the student model $f_{\phi}$:
\begin{equation}
    \theta_{t+1} \leftarrow \alpha \theta_t + (1-\alpha)\phi_t ,
    \tag{A15}
\end{equation}
where $t$ denotes the training iteration and $\alpha$ is the EMA decay rate.

\textit{EMA scheduling.}
For models initialized from large-scale MAE pretraining, 
we set $\alpha = 0.999$ for the first 1{,}000 steps and $0.9999$ thereafter. For models pretrained only on small-scale datasets (tens of scans), we use $\alpha = 0.99$ for the first 1{,}000 steps, $0.999$ for the next 2{,}000 steps, and $0.9999$ for the remaining iterations.

The teacher network is initialized using the student parameters after a warm-up stage (e.g., 1{,}000 iterations) trained solely on the source domain.

\begin{table*}[t]
\centering
\begin{minipage}[t]{0.48\linewidth}
\centering
\caption{MAE Pretraining Configurations}
\label{MAE_configure}
\resizebox{\linewidth}{!}{
\begin{tabular}{ll}
\toprule
config & value \\
\midrule
patch size& $96\times96\times96$ \\
local masking patch & $8\times8\times8$ \\
global masking patch & $4\times4\times4$ \\
masking ratio & 70\% \\
optimizer & AdamW \\
learning rate & $2\times10^{-4}$ \\
weight decay & 0.05 \\
momentum & $\beta_1{=}0.9,\ \beta_2{=}0.95$ \\
lr scheduler & cosine annealing \\
epochs & 300 \\
batch size & 4 \\
iters/epoch & 500 \\
aug.\ prob. & 0.35 \\
augmentation & random affine \\
\bottomrule
\end{tabular}
}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\linewidth}
\centering
\caption{Fine-tuning Configurations}
\label{Fine-tune_configure}
\resizebox{\linewidth}{!}{
\begin{tabular}{ll}
\toprule
config & value \\
\midrule
patch size& $96\times96\times96$ \\
local masking patch & $8\times8\times8$ \\
global masking patch & $4\times4\times4$ \\
masking ratio & 70\% \\
optimizer & AdamW \\
learning rate & $1\times10^{-4}$ \\
weight decay & 0.01 \\
momentum & $\beta_1{=}0.9,\ \beta_2{=}0.999$ \\
lr scheduler & cosine WR \\
total epochs & 100 \\
warmup epochs & first 10 \\
early stop & 50 \\
batch size & 1 \\
iters/epoch & 100 \\
target aug. & random affine \\
source aug. & random affine, bias field, gamma \\
source prediction weight & $\beta{=}0.5$ \\
EMA update weight & $\alpha{=}0.999/0.9999$ \\
auxiliary global loss weight & $\gamma{=}0.05$\\
cosine similarity weight &$\delta{=}0.05$ \\
\bottomrule
\end{tabular}
}
\end{minipage}
\end{table*}
\subsection{Language-Guided Prompt Encoding}
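The detailed training configurations of NeuroLangSeg are provided in Table~\ref{NeuroLangSeg_configure}, and the pretraining configurations of the shape and location encoders are provided in Table~\ref{Discriminator_configure}.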
\begin{table*}[t]
\centering
\begin{minipage}[t]{0.48\linewidth}
\centering
\caption{NeuroLangSeg Configurations}
\label{NeuroLangSeg_configure}
\resizebox{\linewidth}{!}{
\begin{tabular}{ll}
\toprule
config & value \\
\midrule
patch size& $96\times96\times96$ \\
embedding dimension & 512 \\
text max query & 32 \\
optimizer & AdamW \\
learning rate & $1\times10^{-4}$ \\
weight decay & 0.01 \\
momentum & $\beta_1{=}0.9,\ \beta_2{=}0.999$ \\
lr scheduler & cosine annealing \\
epochs & 500 \\
batch size & 2 \\
iters/epoch & 500 \\
shape loss weight & 0.5 \\
location loss weight & 0.5 \\
\bottomrule
\end{tabular}
}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\linewidth}
\centering
\caption{Discriminator Pretraining Configurations}
\label{Discriminator_configure}
\resizebox{\linewidth}{!}{
\begin{tabular}{ll}
\toprule
config & value \\
\midrule
\textbf{shape encoder} & \\
\midrule
shape embedding dimension & 128 \\
shape encoder crop size  & $96\times96\times96$ \\
optimizer & Adam \\
loss & Dice+MSE \\
learning rate & $1\times10^{-4}$ \\
weight decay & $1\times10^{-5}$ \\
lr scheduler & cosine annealing \\
epochs & 50 \\
batch size & 8 \\
augmentation & gaussian noise \\
noise perturbations & 100 \\
\midrule
\textbf{location encoder} & \\
\midrule
relation pairs & 37 \\
location embedding dimension & 64 \\
optimizer & Adam \\
loss & MSE \\
learning rate & $1\times10^{-3}$ \\
weight decay & $1\times10^{-4}$ \\
lr scheduler & cosine annealing \\
epochs & 50 \\
batch size & 32 \\
augmentation & gaussian noise \\
noise perturbations & 100 \\

\bottomrule
\end{tabular}
}
\end{minipage}
\end{table*}

\section{Results}
Figure~\ref{fig4} shows qualitative segmentation results across methods in the cross-site setting. Table~\ref{supply_score} reports anatomical--linguistic evaluator scores across seven subcortical structures for in-site and cross-site CN subjects and clinical cohorts. Figure~\ref{fig5} shows the anatomical--linguistic evaluator score distributions of the seven subcortical structures across methods.

\begin{table}[htbp]
\caption{Anatomical--linguistic evaluator scores across seven subcortical structures for in-site, cross-site, and clinical generalization. *, $p < 0.05$; **, $p < 0.01$; ***, $p < 0.001$; n.s., not significant, using ANOVA with Bonferroni correction for multiple comparisons.}
  \label{supply_score}
  \centering
  \resizebox{\linewidth}{!}{
\begin{tabular}{@{}llccccccc@{}}
\toprule
\multirow{2}{*}{Score} & \multirow{2}{*}{Methods}  & \multicolumn{7}{c}{\textbf{Seven subcortical structures}}                         \\ \cmidrule(l){3-9} 
 &  & HIPP  & AMG & CD & PT                        & PD  & TM & AB \\ \midrule
\multirow{10}{*}{Shape}    & \multicolumn{8}{l}{\textbf{In-site and Cross-site (CN)}}                                       \\ \cmidrule(l){2-9} 
& FastSurfer     & 88.2$\pm$2.3  & 74.8$\pm$2.4  & 73.3$\pm$1.8  & 75.9$\pm$2.1 & 74.2$\pm$1.0  & 86.0$\pm$3.7  & 74.1$\pm$6.4  \\
& QuickNAT                  & $\mathbf{94.6\pm1.6^{***}}$  & $\mathbf{79.9\pm3.2^{***}}$  & $\mathbf{71.2\pm1.8^{***}}$  & 77.4$\pm$1.7  & 74.0$\pm$1.1  & 84.9$\pm$2.5  & $\mathbf{70.1\pm8.4^{*}}$  \\
& nnU-Net                   & 89.7$\pm$1.9  & 76.5$\pm$1.2  & 74.4$\pm$1.8  & 77.6$\pm$1.6  & 73.6$\pm$0.9  & 87.3$\pm$3.2  & 76.6$\pm$5.4  \\
& MAPSeg                    & 89.3$\pm$2.1  & 76.7$\pm$1.1  & 73.4$\pm$1.7  & 75.5$\pm$2.3  & 74.0$\pm$0.9  & 86.3$\pm$3.1  & 77.7$\pm$5.3  \\
 & SAT                       & 90.0$\pm$2.0  & 76.0$\pm$2.0  & 73.9$\pm$1.4  & 76.5$\pm$1.6  & 73.7$\pm$0.9  & 87.6$\pm$2.7  & 78.9$\pm$5.7  \\
& NeuroLangSeg              & 90.0$\pm$2.3  & 75.1$\pm$2.7  & 74.1$\pm$1.9  & 76.8$\pm$1.8  & 74.3$\pm$1.0  & 86.9$\pm$2.9  & 77.2$\pm$6.7  \\ \cmidrule(l){2-9} 
 & \multicolumn{8}{l}{\textbf{Clinical generalization}}                                \\ \cmidrule(l){2-9} 
& (ADNI) AD-NeuroLangSeg     & $\mathbf{81.2\pm5.7^{***}}$  & $\mathbf{63.4\pm3.4^{***}}$  & $\mathbf{70.9\pm1.6^{***}}$  & $\mathbf{75.7\pm0.8^{**}}$  & $\mathbf{75.4\pm0.6^{***}}$  & $\mathbf{66.7\pm3.4^{***}}$  & $\mathbf{65.5\pm1.6^{***}}$  \\
& (BraTS) Tumor-NeuroLangSeg & $\mathbf{85.7\pm3.7^{***}}$  & $\mathbf{64.6\pm3.0^{***}}$  & $\mathbf{70.2\pm2.3^{***}}$  & 77.6$\pm$0.6  & 74.2$\pm$0.9  & $\mathbf{64.7\pm5.6^{***}}$ & $\mathbf{62.8\pm1.5^{***}}$  \\ \midrule
\multirow{10}{*}{Location} & \multicolumn{8}{l}{\textbf{In-site and Cross-site (CN)}}                                       \\ \cmidrule(l){2-9} 
 & FastSurfer                & 85.5$\pm$5.4  & 88.6$\pm$3.4  & 86.9$\pm$3.2  & 87.1$\pm$3.9  & $\mathbf{88.6\pm2.6^{***}}$  & 87.1$\pm$4.2  & 87.2$\pm$3.2  \\
& QuickNAT                  & $\mathbf{82.7\pm3.7^{***}}$  & $\mathbf{79.1\pm9.5^{***}}$  & $\mathbf{81.8\pm4.6^{***}}$  & $\mathbf{79.6\pm3.3^{***}}$  & $\mathbf{77.0\pm10.2^{***}}$ & $\mathbf{77.2\pm8.6^{***}}$  & 82.4$\pm$4.2  \\
 & nnU-Net                   & 84.8$\pm$5.0  & 88.0$\pm$4.3  & 85.4$\pm$4.0  & 86.5$\pm$4.7  & 87.8$\pm$4.3  & $\mathbf{85.9\pm4.7^{**}}$  & 86.0$\pm$4.6  \\
  & MAPSeg                    & 86.3$\pm$3.7  & 89.0$\pm$2.9  & 87.3$\pm$3.0  & 88.1$\pm$2.7  & $\mathbf{88.9\pm2.6^{***}}$  & 87.8$\pm$3.3  & 87.8$\pm$3.0  \\
 & SAT                       & 85.8$\pm$3.7  & 88.2$\pm$3.2  & 85.8$\pm$3.4  & 86.3$\pm$3.5  & 87.2$\pm$3.5  & 87.4$\pm$3.2  & 86.4$\pm$3.9  \\
 & NeuroLangSeg              & 86.9$\pm$3.1  & 89.7$\pm$2.5  & 87.9$\pm$2.7  & 86.2$\pm$2.4  & 85.7$\pm$2.3  & 89.8$\pm$2.8  & 85.8$\pm$4.0  \\ \cmidrule(l){2-9} 
& \multicolumn{8}{l}{\textbf{Clinical generalization}}                                  \\ \cmidrule(l){2-9} 
 & (ADNI) AD-NeuroLangSeg     & $\mathbf{69.3\pm4.9^{***}}$  & $\mathbf{80.2\pm5.1^{***}}$  & $\mathbf{70.6\pm3.4^{***}}$  & $\mathbf{74.7\pm3.9^{***}}$  & $\mathbf{74.3\pm3.4^{***}}$  & $\mathbf{74.0\pm4.2^{***}}$  & $\mathbf{73.0\pm5.3^{***}}$  \\
  & (BraTS) Tumor-NeuroLangSeg & $\mathbf{70.0\pm5.7^{***}}$  & $\mathbf{73.9\pm6.2^{***}}$  & $\mathbf{67.6\pm9.0^{***}}$  & $\mathbf{66.1\pm6.9^{***}}$  & $\mathbf{67.0\pm6.7^{***}}$  & $\mathbf{70.5\pm11.7^{***}}$ & $\mathbf{62.7\pm10.9^{***}}$ \\ \midrule
\multirow{10}{*}{Volume}   & \multicolumn{8}{l}{\textbf{In-site and Cross-site (CN)}}                                      \\ \cmidrule(l){2-9} 
& FastSurfer                & $\mathbf{1.46\pm0.96^{**}}$ & $\mathbf{2.25\pm1.05^{***}}$ & $\mathbf{1.40\pm1.01^{**}}$ & 1.02$\pm$0.77 & 1.79$\pm$1.59 & 0.75$\pm$0.66 & 1.06$\pm$0.75 \\
 & QuickNAT                  & $\mathbf{1.55\pm1.10^{**}}$ & $\mathbf{3.56\pm1.34^{***}}$ & $\mathbf{1.69\pm1.13^{***}}$ & 0.84$\pm$0.57 & $\mathbf{2.54\pm0.94^{***}}$ & $\mathbf{1.53\pm0.79^{***}}$ & $\mathbf{3.50\pm1.02^{***}}$ \\
& nnU-Net                   & 1.10$\pm$0.70 & $\mathbf{1.93\pm0.89^{*}}$ & 0.82$\pm$0.60 & 0.75$\pm$0.54 & 1.29$\pm$0.76 & 0.67$\pm$0.52 & 0.79$\pm$0.61 \\
 & MAPSeg                    & 1.01$\pm$0.68 & 1.40$\pm$0.68 & 1.13$\pm$0.80 & 0.94$\pm$0.65 & 1.28$\pm$0.82 & 0.70$\pm$0.55 & 0.71$\pm$0.49 \\
& SAT                       & 0.98$\pm$0.80 & $\mathbf{1.92\pm0.90^{*}}$ & 1.07$\pm$0.69 & 0.68$\pm$0.54 & 1.36$\pm$1.03 & 0.74$\pm$0.60 & 0.86$\pm$0.60 \\
 & NeuroLangSeg              & 0.91$\pm$0.67 & 1.44$\pm$0.77 & 0.84$\pm$0.55 & 0.72$\pm$0.50 & 1.13$\pm$0.74 & 0.78$\pm$0.54 & 0.77$\pm$0.59 \\ \cmidrule(l){2-9} 
  & \multicolumn{8}{l}{\textbf{Clinical generalization}}                                \\ \cmidrule(l){2-9} 
& (ADNI) AD-NeuroLangSeg     & $\mathbf{2.30\pm1.27^{***}}$ & $\mathbf{2.88\pm1.23^{***}}$ & $\mathbf{1.62\pm1.02^{***}}$ & $\mathbf{1.41\pm0.84^{***}}$ & $\mathbf{2.80\pm0.87^{***}}$ & 0.81$\pm$0.54 & 0.93$\pm$0.78 \\
& (BraTS) Tumor-NeuroLangSeg & $\mathbf{2.12\pm1.29^{***}}$ & $\mathbf{3.64\pm1.97^{***}}$ & 1.20$\pm$1.41 & $\mathbf{1.59\pm1.53^{***}}$ & $\mathbf{3.58\pm1.46^{***}}$ & $\mathbf{1.70\pm1.87^{***}}$ & $\mathbf{1.83\pm1.71^{***}}$ \\ \bottomrule
\end{tabular}
}

\vspace{2mm}

\noindent{\scriptsize HIPP: Hippocampus, AMG: Amygdala, CD: Caudate, PT: Putamen, PD: Pallidum, TM: Thalamus, AB: Accumbens}
\end{table}

\begin{figure}[ht]
\centering
  \includegraphics[width=0.9\linewidth]{seg_result2.png}
  \caption{Qualitative segmentation performance in the cross-site setting. Coronal and sagittal views are shown, together with corresponding zoomed-in regions of interest. Major segmentation errors are highlighted with red arrows. Ground-truth boundaries are indicated by dotted lines, while segmentations from different methods are shown as transparent overlays.}
  \label{fig4}
\end{figure}

\begin{figure}[ht]
\centering
  \includegraphics[width=1\linewidth]{method_comparison_boxplots.png}
  \caption{Shape, location, and Z-score distributions of seven subcortical structures compared to baselines. *** indicates a statistically significant difference with $p<0.001$ under Bonferroni correction.}
  \label{fig5}
\end{figure}


\end{document}

