\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{wrapfig}
% \usepackage{subcaption}
\usepackage{mathtools}
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{tabularray}
\usepackage{booktabs}
\usepackage{upgreek}
\usepackage{enumitem}
\usepackage{bm}
\usepackage{bbm}
\usepackage{censor}
\usepackage{tikz}
\usepackage{caption}

% \usepackage{textcomp}
\usepackage{siunitx} % For S column type in tables
% \usepackage[english]{babel}
% \usepackage{lastpage}
\usepackage{placeins}
\usepackage{fontawesome5}  
\usepackage{array}    % For table spacing

\usepackage{booktabs}

\usepackage{algorithm}
\usepackage{algpseudocode}

\usepackage{algorithm2e} % vlined optional
% \SetKwInput{KwIn}{Input}
% \SetKwInput{KwOut}{Output}

\usepackage{hyperref}
\newcolumntype{C}[1]{>{\centering\arraybackslash}p{#1}}




\newcolumntype{C}[1]{>{\centering\arraybackslash}p{#1}}

% Define a new column type for spacing
\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}}


\usepackage{multirow}
\usepackage{makecell}
% \usepackage{subfigure}

\usepackage{tikz}
\usetikzlibrary{arrows.meta,positioning,fit,calc,backgrounds}
\usepackage{amsmath,amssymb}
\usepackage{xcolor}


\newcommand*{\Scale}[2][4]{\scalebox{#1}{$#2$}}%


\definecolor{LightCyan}{rgb}{0.88,1,1}
\definecolor{lightskyblue}{RGB}{225, 235, 240}


\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\definecolor{Gray}{gray}{0.90}
\definecolor{white}{rgb}{1.0, 1.0, 1.0}
\definecolor{Lightgreen}{RGB}{218, 246, 230 }


\usepackage{graphics,color}
\usepackage{xcolor}

\usepackage{graphicx,verbatim}
\newcommand{\red}[1]{{\color{black}#1}}


\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 235}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[UltraSemiNet]{Calibration-Aware Semi-Supervised Fetal Head Segmentation with Boundary-Positive Contrast}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\midlauthor{\Name{Ufaq Khan\midljointauthortext{Corresponding Author}\nametag{$^{1}$}} \Email{ufaq.khan@mbzuai.ac.ae}\\
\Name{Umair Nawaz\nametag{$^{1}$}} \Email{umair.nawaz@mbzuai.ac.ae}\\
\Name{Tajamul Ashraf\nametag{$^{1}$}} \Email{tajamul.ashraf@mbzuai.ac.ae}\\
\Name{Tausifa Jan Saleem\nametag{$^{1}$}} \Email{tausifa.saleem@mbzuai.ac.ae}\\
\Name{Massimo Caputo\nametag{$^{2}$}} \Email{M.caputo@bristol.ac.uk}\\
\Name{Srinivas Ananth Narayan\nametag{$^{3}$}} \Email{Srinivas.narayan@uhbw.nhs.uk}\\
\Name{Muhammad Bilal\nametag{$^{4}$}} \Email{Muhammad.bilal@bcu.ac.uk}\\
\Name{Junaid Qadir\nametag{$^{5}$}} \Email{jqadir@qu.edu.qa}\\
\Name{Muhammad Haris\nametag{$^{1}$}} \Email{muhammad.haris@mbzuai.ac.ae}\\
\AND
\addr $^{1}$ Mohamed Bin Zayed University of Artificial Intelligence, UAE\\
\addr $^{2}$ University of Bristol, UK\\
\addr $^{3}$ University Hospital Bristol and Weston, UK\\
\addr $^{4}$ Birmingham City University, UK\\
\addr $^{5}$ Qatar University, Qatar
}



\begin{document}

\maketitle

\begin{abstract}
Accurate fetal head segmentation in ultrasound is hard to scale as labels are scarce and most errors occur at the head--background interface under speckle, shadowing, and low contrast. We present \emph{UltraSemiNet}, a teacher--student framework that makes cross--pseudo supervision (CPS) \emph{selective} via temperature calibration and a dual gate requiring high confidence and test-time augmentation (TTA) stability. We also introduce two boundary-focused modules that complement CPS: \textbf{SAT}, a boundary-positive spatial contrast that learns \emph{through} ambiguous edges using an entropy belt and a soft-IoU agreement test; and \textbf{PCM}, a prototype-guided curriculum that maintains uncertainty-weighted head/background prototypes and targets feature--prototype discrepancies. Across two datasets (FBUI and HC18), UltraSemiNet improves overlap and boundary metrics over a calibrated CPS baseline (e.g., Dice $0.927{\rightarrow}0.971$; HD95 $7.9{\rightarrow}6.8$\,px), with similar cross-dataset trends. Crucially, the calibrated gate reduces miscalibration of the \emph{accepted} pseudo-labels: both expected calibration error (ECE) and Brier score decrease overall, with the largest gains within the 0--2\,px boundary band, alongside improvements in pseudo-label accuracy. Ablations show CPS calibration, SAT, and PCM are complementary and concentrate improvements on boundary-sensitive metrics. \red{In a blinded study, UltraSemiNet achieved better segmentation performance than two senior fetal medicine experts when evaluated against the dataset reference masks, indicating the potential to reduce manual refinements.} Our code can be accessed on \href{https://github.com/Ufaqkhan/UltraSemiNet}{UltraSemiNet}.
% In a blinded study, UltraSemiNet achieved greater agreement with an adjudicated rater consensus than either of two senior fetal medicine experts and met their quality checklist criteria more consistently, indicating the potential to reduce manual refinements.
% \red{Some parts of our code are available at \url{https://anonymous.4open.science/r/UltraSemiNet-4-MIDL}, and the full code will be shared upon acceptance.}
\end{abstract}

\begin{keywords}
Fetal Segmentation, Semi-Supervised Learning, Boundary-Aware Learning
\end{keywords}


\section{Introduction}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\red{Accurate segmentation of the fetal head on ultrasound is a foundational step for fetal biometry, in particular for head circumference (HC) and biparietal diameter (BPD) estimation. Clinically, HC and BPD are among the most routinely acquired fetal biometric measurements and are central to (i) pregnancy dating/gestational age (GA) estimation beyond the first trimester and (ii) longitudinal growth surveillance to detect growth disorders such as fetal growth restriction (FGR) and large-for-gestational-age, which are associated with adverse perinatal outcomes~\citep{salomon2019isuog,figueras2018diagnosis}. These measurements are interpreted against widely used international standards (e.g., INTERGROWTH-21st and WHO), which model expected HC/BPD trajectories across gestation for clinical decision-making and cross-population comparability~\citep{papageorghiou2014international,kiserud2017world}. Importantly, clinical practice guidelines emphasize standardized acquisition planes, visibility of key anatomical landmarks, and accurate ellipse/caliper placement under routine quality control~\citep{salomon2019isuog}. In this context, a reliable head mask is not only a segmentation output but also a practical pre-measurement representation that stabilizes ellipse-based HC/BPD tools, reduces manual contour editing, and can support real-time quality control by flagging frames where the skull boundary is unreliable.}

At the same time, the skull--background interface is precisely where ultrasound is most challenging due to speckle, acoustic shadowing, attenuation, and low local contrast \citep{Noble2006UltrasoundSurvey}. These factors create two coupled obstacles: \textit{(i)} label scarcity at scale (pixel-accurate annotation is time-consuming and requires domain expertise), and \textit{(ii)} boundary ambiguity even when labels exist. The latter is compounded by inter-operator variation in annotation conventions (e.g., ellipse-like contours vs.\ fine-grained masks) \citep{Zhang2020HC18Regression}. As a consequence, fully supervised models can overfit limited labels or produce contours that appear plausible but are misaligned at the boundary, inflating boundary-sensitive errors such as \red{Average Surface Distance (ASD) and the 95th percentile Hausdorff Distance (HD95)} despite reasonable Dice scores \citep{shen2023survey}.

Semi-supervised segmentation is therefore attractive because it can leverage large unlabeled collections through self-training and consistency regularization~\citep{Tarvainen2017MeanTeacher,Chen2021CPS}. However, in fetal ultrasound, the pixels most likely to be mislabeled are those near the head boundary. If pseudo-label selection relies on raw softmax confidence, which is often miscalibrated~\citep{Guo2017Calibration}, self-training can systematically amplify edge errors. Similarly, enforcing consistency without checking whether predictions are stable under perturbations can reinforce acquisition artifacts~\citep{Wang2019TTASeg}. A subtler issue is that many representation-learning strategies avoid uncertain regions when forming contrastive pairs (e.g., sampling only confident pixels or memory exemplars), which sharpens features in easy interiors while leaving boundary features under-constrained~\citep{Wang2022UGPCL,wang2021exploring,alonso2021semi}. Together, miscalibrated pseudo-labels and boundary avoidance can create a gap between overlap metrics and the boundary fidelity required for reliable biometry.

We address semi-supervised binary fetal head segmentation in ultrasound with minimal pixel-level annotation. The technical challenge is to exploit unlabeled data without reinforcing boundary errors while learning representations that keep the head and background well separated, particularly at the interface. Concretely, we seek a training recipe that \textit{(i)} selects reliable pseudo-labels under explicit checks for confidence and augmentation stability, \textit{(ii)} learns discriminative features through ambiguous boundary neighborhoods only when local evidence agrees, and \textit{(iii)} regularizes the global feature space so that the two classes form compact, well-separated clusters despite class imbalance and annotation-style variability.

We propose \emph{UltraSemiNet}, a teacher--student framework that combines cross--pseudo supervision (CPS) with two uncertainty-aware representation modules tailored to ultrasound boundaries. In practice, UltraSemiNet supports two complementary clinical roles: \textit{(i)} as a pre-measurement step, it produces a clean head mask that stabilizes ellipse-based HC/BPD tools and reduces manual contour editing, and \textit{(ii)} as a real-time quality check, its calibrated confidence and augmentation-stability scores can flag frames with unreliable boundaries for immediate reacquisition before measurements are finalized. Methodologically, we \textit{(i)} calibrate teacher probabilities via temperature scaling on a small labeled subset and gate pseudo-labels using high binary confidence and stability under flip/rotation test-time augmentation (TTA); \textit{(ii)} introduce \textbf{SAT}, a boundary-positive \textbf{S}p\textbf{A}\textbf{T}ial contrast that detects an entropy-based ``boundary belt'' and admits positive pairs across edges only when local probability fields agree (soft-IoU gate); and \textit{(iii)} introduce \textbf{PCM}, a prototype-guided curriculum miner that maintains uncertainty-weighted teacher prototypes for head/background and prioritizes pixels whose student features are far from their predicted class prototype. \red{Although we focus on 2D fetal head ultrasound in this work, the proposed calibration-aware selective supervision and boundary-focused representation learning are broadly applicable and can be extended to other obstetric targets and to 3D/volumetric acquisitions, as discussed in the limitations section.}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \red{The accurate segmentation of the fetal head on ultrasound is a foundational step for fetal biometry, especially for head circumference (HC) and biparietal diameter (BPD) estimation.
% Clinically, HC and BPD are routine fetal biometric measurements used to (i) estimate gestational age (GA) beyond the first trimester when early crown--rump length dating is unavailable/unreliable, and (ii) perform longitudinal growth monitoring to identify fetal growth disorders such as fetal growth restriction and large-for-gestational-age, which are associated with adverse perinatal outcomes \cite{vanDenHeuvel2018PloSOne,salomon2019isuog,figueras2018diagnosis}. 

% These head measurements are interpreted against widely used international standards (e.g., INTERGROWTH-21st and WHO), which explicitly model serial ultrasound HC/BPD trajectories across gestation for clinical decision-making and cross-population comparability \cite{papageorghiou2014international,kiserud2017world}. Importantly, practice guidelines emphasize standardized acquisition planes, clear depiction of anatomical landmarks, and precise ellipse/caliper placement under routine quality-control.}


% In addition, the boundary between the skull and the background is precisely where ultrasound is most challenging due to speckle, acoustic shadowing, and low local contrast \citep{Noble2006UltrasoundSurvey}. These factors produce two practical obstacles: (i) label scarcity at scale, and (ii) boundary ambiguity even when labels exist, compounded by variation in annotation style (e.g., ellipse-like contours vs. fine-grained masks) \citep{Zhang2020HC18Regression}. As a result, fully supervised models often overfit limited labels or yield contours that are visually plausible yet misaligned, inflating boundary-sensitive errors like \red{Average Surface Distance (ASD) and 95th percentile Hausdorff Distance (HD95)} despite reasonable Dice \citep{shen2023survey}.


% Semi-supervised segmentation is attractive because it can leverage large unlabeled collections via self-training and consistency \cite{Tarvainen2017MeanTeacher,Chen2021CPS}. However, in ultrasound, the pixels most likely to be mislabeled are those near the head boundary. If we trust raw softmax confidence, which is often miscalibrated, pseudo-labels can amplify edge errors. Similarly, enforcing consistency without checking the stability of the prediction can also reinforce artifacts. \citep{Guo2017Calibration, Wang2019TTASeg}. A subtler issue is that many representation-learning strategies avoid uncertain regions when constructing contrastive pairs (e.g., by sampling confident pixels or class-memory exemplars), which sharpens features in easy interiors while leaving boundary features under-constrained \citep{Wang2022UGPCL, Wang2021CrossPixContrast, Alonso2021PixelMemory}. Together, miscalibrated pseudo-labels and boundary avoidance create a gap between overlap metrics and the boundary fidelity required in practice.


% We address semi-supervised \emph{binary} segmentation of the fetal head in ultrasound with minimal pixel-level annotation. The technical challenge is to exploit unlabeled data without reinforcing boundary errors and to learn feature representations so that the two classes (head and background) remain well separated at the interface.  
% Concretely, we seek a training recipe that (1) selects reliable pseudo-labels under explicit checks for confidence \emph{and} augmentation stability; (2) learns discriminative features through ambiguous boundary neighborhoods only when local evidence agrees; and (3) regularizes the global feature space so that the two classes form compact, well-separated clusters, despite class imbalance and annotation style variability.


% We propose \emph{UltraSemiNet}, a teacher--student framework that combines cross--pseudo supervision (CPS) with two uncertainty-aware representation modules tailored to ultrasound boundaries. In practice, UltraSemiNet fits into the clinical workflow in two complementary ways. \textit{(i)}, as a pre-measurement step, it produces a clean head mask that stabilizes ellipse-based HC/BPD tools and reduces manual contour editing, shortening scan time and standardizing measurements across operators. \textit{(ii)}, as a real-time quality check, its calibrated confidence and augmentation-stability scores flag frames with unreliable boundaries for immediate reacquisition before measurements are finalized. Because the approach is semi-supervised, sites can leverage large unlabeled archives and a small labeled seed set, thereby easing deployment where expert annotation fundings are limited. 


% In terms of methodology, we first \emph{calibrate} teacher probabilities via temperature scaling on a small labeled subset and gate pseudo-labels with a dual criterion, i.e., high binary confidence and stability under flip/rotation test-time augmentation (TTA). This converts CPS from a blind self-training signal into a selective supervision source targeted to trustworthy pixels. Second, we introduce \textbf{SAT}, a boundary-positive \textbf{S}p\textbf{A}\textbf{T}ial contrast that uses calibrated probabilities to detect an entropy-based ``boundary belt'' and admits positive pairs across edges only when the local probability fields agree (soft-IoU gate), with anchor weights that focus learning on clinically relevant ambiguity while avoiding extreme noise. Third, we introduce \textbf{PCM}, a prototype-guided curriculum miner that maintains \emph{uncertainty-weighted} teacher prototypes for head and background and prioritizes pixels whose student features are far from their predicted class prototype, producing a simple global structure prior without brittle hard-patch heuristics. 



\textbf{Contributions.} \textit{(i)} We formulate a \emph{calibrated} CPS scheme for binary ultrasound segmentation that accepts pseudo-labels only when both confident and TTA-stable, reducing confirmation bias near boundaries. \textit{(ii)} We propose \textbf{SAT}, an uncertainty-aware, boundary-positive spatial contrast that admits cross-edge positives via a soft-IoU agreement test on probability crops.
% and concentrates gradients with an uncertainty-informed anchor weight. 
\textit{(iii)} We propose \textbf{PCM}, a lightweight prototype-guided curriculum that replaces heuristic hard-patch mining with uncertainty-weighted teacher prototypes and a discrepancy-based selection of semantically hard pixels, improving global feature separation under class imbalance. \textit{(iv)} We conduct a blinded reader study with two senior fetal medicine experts, in which UltraSemiNet achieves better segmentation performance than both experts when evaluated against the dataset reference masks.

% \paragraph{Contributions.}
% \begin{itemize}

%     \item We formulate a \emph{calibrated} CPS scheme for binary ultrasound segmentation that accepts pseudo-labels only when both confident and TTA-stable, reducing confirmation bias near boundaries.
    
%     \item We propose \textbf{SAT}, an uncertainty-aware, \emph{boundary-positive} spatial contrast that admits cross-edge positives via a soft-IoU agreement test on local probability crops and concentrates gradients with an uncertainty-informed anchor weight.
    
%     \item We propose \textbf{PCM}, a lightweight prototype-guided curriculum that replaces heuristic hard-patch mining with uncertainty-weighted teacher prototypes and a discrepancy-based selection of semantically hard pixels, improving global feature separation under class imbalance.
    
%     % \item We present a thorough evaluation on FBUI and an external HC18 subset with different splits, ablations, calibration analysis, and efficiency measurement.  

%     \item We conduct a blinded reader study with two senior fetal-medicine experts demonstrating higher agreement of UltraSemiNet outputs, indicating potential to reduce manual refinement.
% \end{itemize}


\section{Related work}
%\subsection{Deep Learning in Medical Imaging}
% \noindent\textbf{Deep Learning in Medical Imaging:} 
% Deep learning, and in particular Convolutional Neural Networks (CNNs), has transformed medical image segmentation by learning complex spatial hierarchies directly from data. The U-Net architecture, with its symmetric encoder–decoder and skip connections, exemplifies this progress, achieving state-of-the-art performance in fetal anatomy delineation by effectively capturing both global context and fine details \cite{ronneberger2015u,zhang2018deep}. To mitigate the scarcity of annotated medical images, semi-supervised methods such as Cross-Pseudo Supervision and the Mean Teacher paradigm exploit unlabeled data to improve segmentation accuracy while reducing labeling costs \cite{chen2021semi,tarvainen2017mean}. Generative Adversarial Networks (GANs) further enhance model robustness by synthesizing realistic training examples that reflect patient variability, thereby augmenting limited datasets and promoting generalization to unseen conditions \cite{goodfellow2014generative,frid2018synthetic}. More recently, contrastive learning approaches have been shown to bolster discriminative power by compelling models to distinguish similar from dissimilar image regions without extensive supervision, a principle that underlies the PCM algorithm’s focus on hard, informative patches in ultrasound segmentation \cite{chaitanya2020contrastive}.

\noindent\textbf{Calibration-aware pseudo-labeling.}
The utility of pseudo-labels depends on how well predicted probabilities reflect correctness. Simple temperature scaling improves this alignment and yields cleaner supervision when applied before thresholding or re-labeling~\citep{choi2024consistency,joy2023sample}. Beyond a single global cutoff, adaptive acceptance schedules, either class-balanced self-training or curriculum thresholds, reduce bias toward majority regions and progressively admit harder pixels. Stability checks under flip/rotation test-time augmentation (TTA) further filter unreliable pseudo-labels that would otherwise reinforce boundary artifacts~\citep{tan2024less}. In medical semi-supervised learning (SSL), uncertainty-aware teachers down-weight or gate ambiguous regions to curb error propagation near interfaces~\citep{alizadehsani2024handling,vazhentsev2025uncertainty,chen2023deep}.

UltraSemiNet adopts this calibration-aware view end-to-end, where we (i) temperature-scale the teacher on a small labeled split each epoch, (ii) enforce a dual gate that requires both high \emph{calibrated} confidence and TTA-based stability before a pixel supervises CPS, and (iii) reuse the same calibrated probabilities to drive SAT’s boundary-positive pairing and PCM’s uncertainty-weighted prototype updates, ensuring selection and representation shaping are governed by a shared reliability criterion.


%\subsection{Classical Techniques in Fetal Ultrasound Segmentation}
\noindent\textbf{Classical Techniques in Fetal Ultrasound Segmentation.} The segmentation of fetal structures in ultrasound images initially relied heavily on classical image processing methods. Techniques such as thresholding~\citep{Liu2019}, region growing~\citep{Yuheng2017}, and Active Contour Models (ACM)~\citep{Kass1988} formed the basis of early segmentation efforts. While these methods provided initial frameworks for segmentation, they often required substantial manual intervention and struggled with the intrinsic challenges of ultrasound imaging, such as speckle noise and low contrast~\citep{khan2024fetr}, limiting their effectiveness and reliability in clinical applications~\citep{smith2010classical,khan2026aurora}.
The advent of deep-learning architectures has substantially advanced fetal ultrasound segmentation.

% Reviewer QXgM - 1 - Deep-Learning-based Fetal Ultrasound Segmentation
% \noindent \textbf{Deep-Learning-based Fetal Ultrasound Segmentation:} TransFSM \cite{10310100} combines deformable self attention with a boundary‐aware decoder and an auxiliary Transformer head to accurately delineate multiple anatomies and biometric landmarks in images exhibiting variable shapes and ambiguous boundaries. RTSeg-Net \cite{ou2024rtseg} further demonstrates real‐time performance in intrapartum ultrasound by integrating distribution-shifting convolutional blocks, tokenized MLP modules, and efficient feature fusion, achieving high accuracy on resource-constrained devices such as the Jetson Nano. In a large‐scale study of fetal echocardiography, an AI model trained on 5,363 images from 2,551 pregnancies not only segmented standard planes with high fidelity but also prospectively identified additional diagnostic views in 100 patients, underscoring its potential to enhance both the accuracy and efficiency of prenatal cardiac assessment \cite{taksoee2024ai}.


\section{Methodology}
\label{sec:method}
\noindent \textbf{Overview.}
We present the method in the same order it is executed during training: 
(i) update the exponential moving average (EMA) teacher, calibrate its probabilities, and select reliable pseudo-labels via a confidence+stability gate;
(ii) use the calibrated teacher to define boundary-aware positive/negative pairs and apply spatial contrast;
(iii) maintain uncertainty-weighted class prototypes and perform curriculum mining to regularize the global embedding space.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{figure*}[ht]
    \centering
    \includegraphics[width=0.75\linewidth]{Images/Ultra-MIDL.png}
    \caption{An overview of UltraSemiNet: weak/strong views feed teacher $f_T$ (temperature-calibrated, TTA-averaged) and student $f_S$. The student minimizes the overall loss $\mathcal{L}$ using CPS, boundary-positive SAT, and PCM modules, while the teacher is updated via EMA.}
    \label{fig:main}
\end{figure*}

\subsection{Preliminaries and Notation}
\label{sec:notation}
Let $\mathcal{D}_L=\{(x,y)\}$ be labeled images with binary masks $y\in\{0,1\}^{h\times w}$ (1 = head, 0 = background), and $\mathcal{D}_U=\{x\}$ unlabeled images, as depicted in Fig.~\ref{fig:main}. We use a student network $f_S(\cdot;\theta_S)$ and an EMA teacher $f_T(\cdot;\theta_T)$, which is updated as
\begin{equation}
\label{eq:ema}
\theta_T \leftarrow \alpha\,\theta_T + (1-\alpha)\,\theta_S,\qquad \alpha\in[0.99,0.999].
\end{equation}
At pixel $i$, the networks output head probabilities $p_S(i),p_T(i)\in[0,1]$, and the teacher pseudo-label is $\hat y_T(i)=\mathbbm{1}[p_T(i)\ge 0.5]$. The teacher uncertainty is measured by the binary entropy
\begin{equation}
\label{eq:entropy}
u(i) = -p_T(i)\log p_T(i) -(1-p_T(i))\log(1-p_T(i)),
\end{equation}
and we use normalized entropy $\tilde u(i)=u(i)/\log 2\in[0,1]$ to identify boundary regions. For representation learning, we use $\ell_2$-normalized pixel embeddings $v_i\in\mathbb{R}^{d}$ from a lightweight projection head.




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


% \subsection{Preliminaries and Notation}
% \label{sec:notation}
% Let $\mathcal{D}_L=\{(x,y)\}$ be labeled images with binary masks $y\in\{0,1\}^{h\times w}$ for the fetal head (1=head, 0=background), and $\mathcal{D}_U=\{x\}$ unlabeled images, as depicted in Figure~\ref{fig:main}.
% A shared encoder--decoder backbone maps $x$ to a feature map $H\in\mathbbm{R}^{D\times h\times w}$.
% We maintain a \emph{student} $f_S(\cdot;\theta_S)$ and an EMA \emph{teacher} $f_T(\cdot;\theta_T)$, which can be updated as
% \begin{equation}
% \label{eq:ema}
% \theta_T \leftarrow \alpha\,\theta_T + (1-\alpha)\,\theta_S,\qquad \alpha\in[0.99,0.999].
% \end{equation}
% At pixel $i$, both networks output the \emph{head probability} $p_S(i),p_T(i)\in[0,1]$ (sigmoid of the head logit). 
% The teacher pseudo-label is $\hat y_T(i)=\mathbbm{1}[\,p_T(i)\ge 0.5\,]$.
% We quantify uncertainty by the \emph{binary entropy} which is:
% \begin{equation}
% \label{eq:entropy}
% u(i) = -\,p_T(i)\log p_T(i)\;-\;\big(1-p_T(i)\big)\log\big(1-p_T(i)\big),
% \end{equation}
% which rises at $p_T(i)=0.5$ and is small inside confident interior/background.
% For representation learning we obtain $\mathcal{L}_2$-normalized embeddings $v_i=\mathrm{Norm}(\phi(H_i))\in\mathbbm{R}^{d}$ via a two-layer projection head $\phi$ (default $d{=}128$).
% SAT uses a disk $\mathcal{N}_r(i)=\{j:\|j-i\|\le r\}$ for positives and an annulus $\mathcal{A}_{[d_{\min},d_{\max}]}(i)=\{j:d_{\min}\le\|j-i\|\le d_{\max}\}$ for negatives.
% For local agreement tests, we extract a \emph{single-channel} teacher probability crop $\mathcal{M}_i\in[0,1]^{s\times s}$ centered at $i$ (window size $s$).

\subsection{Calibrated Cross--Pseudo Supervision (CPS)}
\label{sec:cps}
CPS leverages unlabeled data by letting teacher and student supervise each other. However, using raw probabilities directly risks reinforcing errors. We therefore calibrate the teacher probabilities~\citep{kim2025role,forest2024calibrated} so that confidence aligns better with empirical correctness, and we restrict CPS to pixels that are both confident and stable under simple test-time augmentations.

\paragraph{Probability calibration.}
At the start of each epoch, we fit a temperature $\mathcal{T}>0$ on a small labeled subset by minimizing negative log-likelihood. 
If $\ell_T(i)$ denotes the teacher head logit, the \emph{calibrated} probability is
% \begin{equation}
% \label{eq:calib}
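$p_T(i)\;\leftarrow\;\sigma\!\big(\ell_T(i)/\mathcal{T}\big)$, where $\sigma(z)=\tfrac{1}{1+e^{-z}}$ is the logistic sigmoid. Calibration is re-estimated periodically (e.g., every 3 epochs) to track the evolving teacher.
% NOTE(review): the paragraph opens with "At the start of each epoch, we fit a temperature", whereas this sentence says "every 3 epochs"; confirm the actual schedule and state it once.
All downstream decisions in the epoch use the calibrated $p_T$. For binary segmentation, a natural confidence measure is $c_T(i)=\max\{\,p_T(i),\,1-p_T(i)\,\}$. A pixel is accepted for CPS only if it is (i) confident and (ii) stable under simple TTA (flips and $90^\circ$ rotations).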
Let $\overline{p}_T^{\mathrm{TTA}}(i)$ be the mean teacher probability under these augmentations (mapped back).
We measure stability using the Bernoulli KL divergence between $p_T(i)$ and $\overline{p}_T^{\mathrm{TTA}}(i)$:
% A pixel is eligible for CPS if it is confident and its probability is stable under flips/rotations.
% Let $\overline{p}_T^{\mathrm{TTA}}(i)$ be the average teacher probability at $i$ over horizontal/vertical flips and $90^\circ$ rotations (mapped back to the original coordinates). For scalars $p,q\in(0,1)$, $\mathrm{Bern}(p)$ denotes the Bernoulli distribution with success probability $p$. Here, we have an acceptance indicator as:
\begin{equation}
\label{eq:acc-mask-bin}
\begin{aligned}
\mathbbm{1}_{\mathrm{acc}}^{T}(i)
&= \mathbbm{1}\!\left[c_T(i)\ge \tau(t)\right]\,\times\,\mathbbm{1}\!\left[\mathrm{KL}\!\Big(\mathrm{Bern}\big(p_T(i)\big)\,\big\|\,\mathrm{Bern}\big(\overline{p}_T^{\mathrm{TTA}}(i)\big)\Big)\le \delta\right],
\end{aligned}
\end{equation}
where $\tau(t)$ anneals linearly from $0.95$ to $0.80$ over training and $\delta$ is a small KL cutoff. The accepted teacher labels are
\begin{equation}
\label{eq:masked-pl-bin}
\tilde y_T(i)=
\begin{cases}
\hat y_T(i), & \text{if } \mathbbm{1}_{\mathrm{acc}}^{T}(i)=1,\\[2pt]
\text{ignore}, & \text{otherwise.}
\end{cases}
\end{equation}

Symmetrically, we obtain $\tilde y_S(i)$ with $p_S(i)$ and $c_S(i)=\max\{p_S(i),1-p_S(i)\}$, defining $\mathbbm{1}_{\mathrm{acc}}^{S}(i)$ via Eq.~\eqref{eq:acc-mask-bin}. Here, $\hat y_T(i)=\mathbbm{1}[p_T(i)\ge 0.5]$ denotes the \emph{raw teacher pseudo-label} at pixel $i$, obtained by thresholding the calibrated probability $p_T(i)$, whereas $\tilde y_T(i)$ represents the \emph{accepted} pseudo-label. We then supervise the student using teacher-accepted pixels $\Omega_T=\{i:\mathbbm{1}_{\mathrm{acc}}^{T}(i)=1\}$, and supervise the teacher (EMA) using student-accepted pixels $\Omega_S=\{i:\mathbbm{1}_{\mathrm{acc}}^{S}(i)=1\}$:
\begin{equation}
\label{eq:cps-bin}
\begin{aligned}
\mathcal{L}_{\mathrm{CPS}}
&= \frac{1}{|\Omega_T|}\!\sum_{i\in\Omega_T}\!\mathrm{CE}\big(\tilde y_T(i),\, p_S(i)\big) +\; \frac{1}{|\Omega_S|}\!\sum_{i\in\Omega_S}\!\mathrm{CE}\big(\tilde y_S(i),\, p_T(i)\big).
\end{aligned}
\end{equation}

By restricting CPS to confident and augmentation-stable pixels, we reduce confirmation bias where the model is most error-prone (near the head boundary) and provide consistent, calibrated probabilities $p_T(i)$ and uncertainty $u(i)$, as depicted in Figure~\red{4}(b) of the supplementary material.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{SAT: Uncertainty-Aware Boundary-Positive Spatial Contrast}
\label{sec:sat}
The SAT module is designed around a simple observation, i.e., in binary fetal head segmentation, most of the ambiguity lies exactly where the head meets the background. 
Rather than discarding these pixels as ``noisy,'' SAT turns them into useful supervision, but only when the nearby evidence supports doing so. 
In practice, SAT uses the \emph{calibrated} head probabilities of the teacher $p_T(i)\in[0,1]$ (Sec.~\ref{sec:cps}) to identify potentially ambiguous locations and then shapes the student’s representations to be consistent across those locations and well-separated from the background. 

We begin by quantifying local uncertainty using the binary entropy $u(i)=-p_T(i)\log p_T(i)-\big(1-p_T(i)\big)\log\big(1-p_T(i)\big)$, and normalize it by its maximum, $\tilde u(i)=u(i)/\log 2\in[0,1]$. Pixels with intermediate $\tilde u$ are likely to lie on (or near) the head boundary. We therefore form a \emph{boundary belt} by marking pixels whose normalized entropy falls in $[\epsilon_1,\epsilon_2]$, i.e.,
$b(i)=\mathbbm{1}\!\big[\epsilon_1 \le \tilde u(i) \le \epsilon_2\big]$, where $\epsilon_1=0.40$ and $\epsilon_2=0.95$.


Now, ambiguous pixels are not automatically trusted. Before we let them act as ``positives'' in a contrastive pair, we ask whether the local probability fields around two candidate pixels agree. For that we extract a single-channel probability crop $\mathcal{M}_i\in[0,1]^{s\times s}$ centered at $i$ and compute a soft intersection-over-union with a neighbor $j$,
\begin{equation}
\label{eq:siou-bin}
\mathrm{sIoU}(\mathcal{M}_i,\mathcal{M}_j)=
\frac{\sum_{\omega}\min\{\mathcal{M}_i(\omega),\mathcal{M}_j(\omega)\}}
{\sum_{\omega}\max\{\mathcal{M}_i(\omega),\mathcal{M}_j(\omega)\}},
\end{equation}
which measures how similar the surrounding head probabilities are within an $s\times s$ window.

\paragraph{Positive/negative sampling with CPS consistency.}

With these ingredients, SAT constructs contrastive pairs as follows. \red{For each anchor pixel $i$, we sample one positive within a small radius $r$ that shares the same pseudo-label, and $K_-$ negatives from a wider ring region whose pseudo-label is opposite.}
% For an anchor pixel $i$ with teacher pseudo-label $\hat y_T(i)=\mathbbm{1}[p_T(i)\ge 0.5]$, we look for a \emph{positive} $p(i)$ in a small disk $\mathcal{N}_r(i)$ that shares the same pseudo-label. 
If the anchor is away from the boundary ($b(i)=0$), we simply require that both pixels are eligible under the CPS acceptance rule (Eq.~\eqref{eq:acc-mask-bin}), ensuring that they are confident and stable. If the anchor lies in the boundary belt ($b(i)=1$), we only admit a positive with probability $q_b(t)$ and when the local fields agree, i.e., $\mathrm{sIoU}(\mathcal{M}_i,\mathcal{M}_{p(i)})>\tau_{\mathrm{bIoU}}$. Moreover, at least one of the two pixels must be CPS-accepted to guard against spurious matches. Now, the negatives are drawn from an annulus and must carry the opposite pseudo-label. Here, to avoid background overwhelming the loss in this binary setting, we sample negatives so that head and background contribute in a balanced way whenever possible.

To focus learning where it matters, each anchor is given a weight that increases with local ambiguity but does not reward extreme uncertainty. Mathematically, it can be shown as: 
$w(i)=\Big(\gamma_1\big(1-\tilde u(i)\big)+\gamma_2\,\overline{\tilde u}_{\mathcal{N}_r(i)}\Big)\,\big(1+\lambda_b\,b(i)\big),$
where $\overline{\tilde u}_{\mathcal{N}_r(i)}$ averages $\tilde u$ in the positive radius, $\gamma_1{=}0.7$, $\gamma_2{=}0.3$, and a small boost $\lambda_b{=}0.5$ nudges attention toward the belt. 
Using $\ell_2$-normalized student embeddings, SAT minimizes a cosine InfoNCE loss:


\begin{equation}
\label{eq:sat-bin}
\resizebox{0.7\textwidth}{!}{$
\mathcal{L}_{\mathrm{SAT}}
=\sum_{i} w(i)\left[
-\log
\frac{\exp\!\big(\langle v_i^{S}, v_{p(i)}^{S}\rangle/\beta\big)}
{\exp\!\big(\langle v_i^{S}, v_{p(i)}^{S}\rangle/\beta\big)
+\sum_{n\in\mathcal{N}(i)}\exp\!\big(\langle v_i^{S}, v_{n}^{S}\rangle/\beta\big)}
\right],\;\; \beta=0.07
$}
\end{equation}

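Here, $p(i)$ denotes the positive admitted for anchor $i$ and $\mathcal{N}(i)$ its set of sampled negatives. In effect, SAT asks the representation to be continuous across genuinely consistent boundary neighborhoods and to separate them decisively from the background nearby. 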
Because admission and weighting depend on the same calibrated probabilities and acceptance rule used in CPS, the contrastive signal remains logically consistent with supervision selection, i.e., ambiguous regions shape the features only when the evidence supports it, leading to sharper, more reliable contours (improved HD95/ASD) without sacrificing Dice. 
In practice, we use $r{=}5$, $s{=}15$, $K_-{=}64$ negatives per anchor, ramp $q_b(t)$ from $0$ to $0.4$ during the first half of training, and choose $\tau_{\mathrm{bIoU}}\in\{0.5,0.6,0.7\}$.


% ------------------------------------------------------------

\subsection{PCM: Prototype-Guided Curriculum Miner}
\label{sec:pcm}
Where SAT shapes features \emph{locally} around the boundary, PCM provides a lightweight \emph{global} structure by maintaining running prototypes for ``head'' and ``background'' and asking the student’s features to organize themselves around these references. This is done in a way that is robust to label noise, i.e., prototypes are updated only from pixels that the teacher deems reliable under the CPS acceptance rule, and those contributions are further down-weighted when uncertainty is high.

Specifically, for each class $c\in\{0,1\}$ we maintain an EMA prototype $\mu_c$. At a training step, teacher embeddings $v_i^{T}$ with pseudo-label $\hat y_T(i)=c$ are pooled into a temporary estimate $\tilde\mu_c$ using weights: $\omega(i)=\mathbbm{1}_{\mathrm{acc}}^{T}(i)\,\big(1-\tilde u(i)\big),$ so that only confident, augmentation-stable pixels contribute and highly uncertain cases have little influence. The class prototype is then updated by exponential averaging,
\begin{equation}
\label{eq:proto-bin}
\resizebox{0.8\textwidth}{!}{$
\tilde{\mu}_c = \frac{\sum_{i:\,\hat y_T(i)=c}\,\omega(i)\,v_i^{T}}{\sum_{i:\,\hat y_T(i)=c}\,\omega(i)},\quad
\mu_c \leftarrow \rho\,\mu_c + (1-\rho)\,\tilde{\mu}_c,\quad
\rho = 0.99.
$}
\end{equation}



We warm-start $\mu_c$ from the labeled set. 
If a batch contains no reliable pixels for a class, the EMA simply carries the previous prototype forward. Not all pixels are equally informative for organizing the feature space. To determine where PCM should act most strongly, we define a semantic hardness score at each pixel by measuring how far the student’s feature is from the prototype of its own predicted class. 
Let $p_S(i)$ be the student head probability (so $1-p_S(i)$ is background), and set $\pi_1(i)=p_S(i)$ and $\pi_0(i)=1-p_S(i)$.
% Let $p_S(i)$ be the student’s head probability and set $\pi_1(i)=p_S(i)$, $\pi_0(i)=1-p_S(i)$. 
We compute
% \begin{equation}
% \label{eq:kappa-bin}
% \resizebox{0.7\textwidth}{!}{$
% \begin{aligned}
$\kappa(i) = \max\Bigl\{p_{S}(i)\bigl(1-\cos\langle v_i^{S},\mu_1\rangle\bigr), (1-p_S(i))\bigl(1-\cos\langle v_i^{S},\mu_0\rangle\bigr)
\Bigr\},$
which is large when a pixel is predicted as head (or background) but its feature is inconsistent with the corresponding prototype. 
A curriculum then selects the top $\gamma(t)$ fraction of pixels by $\kappa(i)$ 
% (with $\gamma$ annealed from $0.3$ to $0.7$ as training stabilizes)
and forms the active set $\Omega_\gamma$. 
Because background typically occupies more area than head, we subsample background within $\Omega_\gamma$ so that both classes contribute comparably.

Finally, PCM pulls features toward the prototype of their predicted class while pushing them away from the other class, using a margin-based cosine loss:


\begin{equation}
\label{eq:pcm-bin}
\centering
\resizebox{0.9\textwidth}{!}{$ % adjust 0.85 as needed
\begin{aligned}
\mathcal{L}_{\mathrm{PCM}}
&=\frac{1}{|\Omega_\gamma|}\!
\sum_{i\in\Omega_\gamma}
\Big[
\pi_1(i)\,\big(m - \cos\langle v_i^{S},\mu_1\rangle + \cos\langle v_i^{S},\mu_0\rangle\big)_+ +\;\pi_0(i)\,\big(m - \cos\langle v_i^{S},\mu_0\rangle + \cos\langle v_i^{S}, \mu_1\rangle\big)_+\Big], m=0.2.
\end{aligned}
$}
\end{equation}



Here, $(\cdot)_+=\max(0,\cdot)$. The effect is to produce compact, well-separated head and background clusters in the embedding space, guided by prototypes that are themselves estimated from reliable pixels. 
As prototype updates use the same acceptance indicator and entropy as CPS, PCM’s global organization is aligned with the supervision that trains the classifier head and with the local boundary shaping performed by SAT. In practice, we compute $\kappa(i)$ at resolutions of $1/2$ and $1/4$ for efficiency, then upsample the resulting mask. Here, if prototypes begin to drift or collapse, increasing the margin to $m{=}0.3$ or slowing the curriculum ramp mitigates the issue.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Experimental Setup}


\begin{figure*}[hbtp]
\centering
% Additional figure spanning full width
\includegraphics[width=0.8\textwidth]{Images/UltraSemiNet-Qualitative.png}\vspace{-0.6em}
\caption*{(A)}
\vspace{-0.9em}
% \label{fig:Q_results} % duplicate: the figure-level label follows the main \caption below
\begin{minipage}{0.5\textwidth}
\centering
\includegraphics[width=0.8\textwidth]{Images/PCM-MIDL.png}\vspace{-0.6em}
\caption*{(B)}
\vspace{-1.5em}
\label{fig:contrast-enhancement}
\end{minipage}%
\hfill
\begin{minipage}{0.44\textwidth}
\centering
\includegraphics[width=0.9\textwidth]{Images/Best-Worst.png}\vspace{-0.6em}
\caption*{(C)}
\vspace{-1.3em}
\label{fig:best-worst-predictions}
\end{minipage}
\vspace{0.3em}
\caption{(A) Original ultrasound images, ground truth segmentations, and model outputs with Dice scores; (B) Before and after PCM; (C) Best (a-c) and worst (d-f) UltraSemiNet predictions.}
\label{fig:Q_results}
\end{figure*}


\noindent\textbf{Datasets.}
We evaluate on two fetal head ultrasound collections.
(1) \emph{FBUI}~\cite{alzubaidi2023large}: 3{,}832 images spanning 18–40 weeks GA, split \textbf{by subject} into 60\%/20\%/20\% train/val/test.
(2) \emph{HC18 subset}~\cite{van2018automated}: It contains 999 fetal brain images with binary head annotations.
\red{Unless stated otherwise, FBUI provides the labeled supervision and also contributes the additional unlabeled images for semi-supervised training, while HC18 is used solely for cross-dataset evaluation.} Further details are provided in the supplementary material.

\noindent\textbf{Cross-validation and cross-dataset tests.}
All in-distribution results are reported under \textbf{five-fold} patient-level CV on FBUI.
For cross-dataset generalization, models are trained on FBUI and evaluated on the HC18 subset without retuning thresholds. 

\noindent\textbf{Evaluation metrics.}
\red{We report Dice Similarity Coefficient (DSC), Average Surface Distance (ASD), and 95th percentile Hausdorff distance (HD95).
DSC measures region overlap, ASD summarizes average contour deviation, and HD95 reflects near-worst-case boundary error while being robust to outliers. All metrics are computed per image and averaged over the test set.}

% \paragraph{Per-epoch routine.}
% \begin{enumerate}[leftmargin=1.2em,itemsep=2pt,topsep=2pt]
%   \item \textbf{Calibrate teacher.} Fit temperature $\mathcal{T}$ on a small labeled subset; rescale teacher logits $\ell_T\!\mapsto\!\ell_T/\mathcal{T}$ to obtain calibrated $p_T$. Repeat every 3--5 epochs.
%   \item \textbf{Schedule updates.} Linearly anneal: confidence threshold $\tau(t):0.95\!\rightarrow\!0.80$ (binary CPS), boundary admission $q_b(t):0\!\rightarrow\!0.4$ (SAT), curriculum keep ratio $\gamma(t):0.3\!\rightarrow\!0.7$ (PCM).
% \end{enumerate}

% \paragraph{Per-iteration routine.}
% \begin{enumerate}[leftmargin=1.2em,itemsep=2pt,topsep=2pt]
%   \item \textbf{Mini-batch.} Sample labeled $(x_L,y_L)$ and unlabeled $x_U$. 
%   Apply weak augs to teacher views; weak/strong augs to student views (ultrasound-specific brightness/contrast, speckle injection, mild elastic, flips/90$^\circ$ rotations).
%   \item \textbf{Forward passes.} 
%   Teacher (weak): $p_T, v^{T}\leftarrow f_T(x_L^{w}\cup x_U^{w})$. 
%   Student (weak/strong): $p_S, v^{S}\leftarrow f_S(x_L^{s}\cup x_U^{s})$.
%   \item \textbf{CPS acceptance (binary).} 
%   Compute confidence $c_T(i)=\max\{p_T(i),1-p_T(i)\}$ and TTA-average $\overline{p}_T^{\mathrm{TTA}}(i)$. 
%   Form $\mathbbm{1}_{\mathrm{acc}}^{T}(i)$ via Eq.~\eqref{eq:acc-mask-bin}; define $\tilde y_T(i)$ via Eq.~\eqref{eq:masked-pl-bin}. 
%   Symmetrically compute $\mathbbm{1}_{\mathrm{acc}}^{S}(i)$ and $\tilde y_S(i)$. 
%   Evaluate $\mathcal{L}_{\mathrm{CPS}}$ (Eq.~\eqref{eq:cps-bin}).
%   \item \textbf{SAT pairs and loss.} 
%   From calibrated $p_T$, compute normalized entropy $\tilde u(i)$ and boundary belt $b(i)$ (Eq.~\eqref{eq:belt-bin}). 
%   Build agreement-gated positives using $\mathrm{sIoU}$ on single-channel crops (Eq.~\eqref{eq:siou-bin}) and CPS consistency; sample class-balanced negatives from the annulus. 
%   Compute weights $w(i)$ (Eq.~\eqref{eq:weight-bin}) and evaluate $\mathcal{L}_{\mathrm{SAT}}$ (Eq.~\eqref{eq:sat-bin}).
%   \item \textbf{PCM prototypes, curriculum, loss.} 
%   Update head/background prototypes $\{\mu_1,\mu_0\}$ with teacher embeddings and weights $\omega(i)$ (Eqs.~\eqref{eq:proto-bin}--\eqref{eq:proto-w-bin}). 
%   Rank pixels by semantic hardness $\kappa(i)$ (Eq.~\eqref{eq:kappa-bin}) and form $\Omega_\gamma$ with class balancing. 
%   Evaluate $\mathcal{L}_{\mathrm{PCM}}$ (Eq.~\eqref{eq:pcm-bin}).
%   \item \textbf{Supervised loss.} Compute $\mathcal{L}_{\mathrm{sup}}$ on $(x_L,y_L)$ (Eqs.~\eqref{eq:sup}--\eqref{eq:dice}).
%   \item \textbf{Update.} 
%   Form total loss $\mathcal{L}$ (Eq.~\eqref{eq:total}); update student by backprop; update teacher by EMA (Eq.~\eqref{eq:ema}). 
% \end{enumerate}

\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{12pt}
\renewcommand{\arraystretch}{1.05}
\caption{\textbf{FBUI (in-distribution) results} under 5-fold cross-validation.}
\label{tab:main-fbui}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Method} & \textbf{DSC $\uparrow$} & \textbf{ASD (px) $\downarrow$} & \textbf{HD95 (px) $\downarrow$} \\
\midrule
U\textendash Net~\cite{ronneberger2015u}                & $0.913 \pm 0.015$ & $2.41 \pm 1.25$ & $9.8 \pm 4.2$ \\
Attention U\textendash Net~\cite{oktay2018attention}    & $0.921 \pm 0.013$ & $2.12 \pm 1.19$ & $8.7 \pm 3.9$ \\
ViT~\cite{dosovitskiy2020image}                         & $0.906 \pm 0.018$ & $2.78 \pm 1.43$ & $11.3 \pm 5.1$ \\
2D nnU\textendash Net~\cite{isensee2021nnu}             & $0.930 \pm 0.011$ & $1.94 \pm 1.07$ & $8.2 \pm 3.6$ \\
Mean Teacher (MT)~\cite{tarvainen2017mean}              & $0.918 \pm 0.014$ & $2.23 \pm 1.18$ & $9.3 \pm 4.0$ \\
CPS-only~\cite{chen2021semi}                            & $0.927 \pm 0.012$ & $1.86 \pm 1.09$ & $7.9 \pm 3.8$ \\
MedSAM~\cite{ma2024segment}                             & $0.821 \pm 0.061$ & $6.85 \pm 3.72$ & $26.4 \pm 11.7$ \\
SAMUS~\cite{lin2024beyond}                              & $0.846 \pm 0.049$ & $5.72 \pm 3.11$ & $22.7 \pm 10.5$ \\
\red{Cross-Teaching}~\cite{luo2022semi}                 & $0.944 \pm 0.046$ & $2.62 \pm 1.21$ & $8.2 \pm 4.9$ \\
\red{MC-Net+}~\cite{wu2022mutual}                       & $0.884 \pm 0.042$ & $4.02 \pm 1.95$ & $10.2 \pm 3.5$ \\
\midrule
\textbf{UltraSemiNet (ours)}                            & $\bm{0.971 \pm 0.010}$ & $\bm{1.07 \pm 0.92}$ & $\bm{6.8 \pm 3.2}$ \\
\bottomrule
\end{tabular}%
}
\vspace{-0.4em}
\end{table}

\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{5.5pt}
\renewcommand{\arraystretch}{1.12}
\caption{\textbf{Cross-dataset results:} train on FBUI, test on HC18 (no retuning). We report ASD and HD95 in both pixels and millimeters using the provided per-image pixel spacing.}
\label{tab:cross}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}lccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{DSC $\uparrow$}} & \multicolumn{2}{c}{\textbf{ASD $\downarrow$}} & \multicolumn{2}{c}{\textbf{HD95 $\downarrow$}} \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6}
& & \textbf{px} & \textbf{mm} & \textbf{px} & \textbf{mm} \\
\midrule
U\textendash Net~\cite{ronneberger2015u}                & $0.887 \pm 0.028$ & $3.21 \pm 1.92$ & $0.45 \pm 0.27$ & $14.2 \pm 6.3$ & $1.99 \pm 0.88$ \\
Attention U\textendash Net~\cite{oktay2018attention}    & $0.896 \pm 0.026$ & $2.98 \pm 1.78$ & $0.42 \pm 0.25$ & $13.1 \pm 5.8$ & $1.83 \pm 0.81$ \\
2D nnU\textendash Net~\cite{isensee2021nnu}             & $0.905 \pm 0.022$ & $2.75 \pm 1.63$ & $0.38 \pm 0.23$ & $12.4 \pm 5.5$ & $1.73 \pm 0.77$ \\
Mean Teacher (MT)~\cite{tarvainen2017mean}              & $0.898 \pm 0.025$ & $2.91 \pm 1.71$ & $0.41 \pm 0.24$ & $12.8 \pm 5.7$ & $1.79 \pm 0.80$ \\
CPS-only~\cite{chen2021semi}                            & $0.907 \pm 0.022$ & $2.63 \pm 1.56$ & $0.37 \pm 0.22$ & $11.7 \pm 5.1$ & $1.64 \pm 0.71$ \\
MedSAM~\cite{ma2024segment}                             & $0.803 \pm 0.072$ & $7.41 \pm 4.10$ & $1.04 \pm 0.57$ & $28.9 \pm 12.6$ & $4.04 \pm 1.76$ \\
SAMUS~\cite{lin2024beyond}                              & $0.828 \pm 0.064$ & $6.32 \pm 3.65$ & $0.88 \pm 0.51$ & $24.7 \pm 11.2$ & $3.45 \pm 1.57$ \\
\red{Cross-Teaching}~\cite{luo2022semi}                 & $0.897 \pm 0.037$ & $2.98 \pm 2.76$ & $0.42 \pm 0.39$ & $11.5 \pm 6.8$ & $1.61 \pm 0.95$ \\
\red{MC-Net+}~\cite{wu2022mutual}                       & $0.850 \pm 0.032$ & $4.17 \pm 2.56$ & $0.58 \pm 0.36$ & $13.9 \pm 9.4$ & $1.94 \pm 1.31$ \\
\midrule
\textbf{UltraSemiNet (ours)}                            & $\bm{0.925 \pm 0.019}$ & $\bm{1.27 \pm 1.44}$ & $\bm{0.18 \pm 0.20}$ & $\bm{10.3 \pm 4.7}$ & $\bm{1.44 \pm 0.66}$ \\
\bottomrule
\end{tabular}%
}
\vspace{-0.4em}
\end{table}



% \begin{table}[htbp]
% \centering
% \large
% \begingroup
% \renewcommand{\arraystretch}{1.25}

% \begin{minipage}[t]{0.45\linewidth}
% \centering
% \captionof{table}{FBUI (in-distribution) results under 5-fold CV.}
% \label{tab:main-fbui}
% \resizebox{\linewidth}{!}{%
% \begin{tabular}{@{}lccc@{}}
% \toprule
% Method & DSC $\uparrow$ & ASD (px) $\downarrow$ & HD95 (px) $\downarrow$ \\
% \midrule
% U\textendash Net~\cite{ronneberger2015u}                & $0.913 \pm 0.015$ & $2.41 \pm 1.25$ & $9.8 \pm 4.2$ \\
% Attention U\textendash Net~\cite{oktay2018attention}     & $0.921 \pm 0.013$ & $2.12 \pm 1.19$ & $8.7 \pm 3.9$ \\
% ViT~\cite{dosovitskiy2020image}                          & $0.906 \pm 0.018$ & $2.78 \pm 1.43$ & $11.3 \pm 5.1$ \\
% 2D nnU\textendash Net  \cite{isensee2021nnu}             & $0.930 \pm 0.011$ & $1.94 \pm 1.07$ & $8.2 \pm 3.6$ \\
% Mean Teacher (MT)~\cite{tarvainen2017mean}               & $0.918 \pm 0.014$ & $2.23 \pm 1.18$ & $9.3 \pm 4.0$ \\
% CPS-only~\cite{chen2021semi}                             & $0.927 \pm 0.012$ & $1.86 \pm 1.09$ & $7.9 \pm 3.8$ \\
% MedSAM \cite{ma2024segment}                              & $0.821 \pm 0.061$ & $6.85 \pm 3.72$ & $26.4 \pm 11.7$ \\
% SAMUS \cite{lin2024beyond}                               & $0.846 \pm 0.049$ & $5.72 \pm 3.11$ & $22.7 \pm 10.5$ \\
% \red{Cross-Teaching} \cite{luo2022semi}                               & $0.944 \pm 0.046$ & $2.62 \pm 1.21$ & $8.2 \pm 4.9$ \\
% \red{MC-Net+} \cite{wu2022mutual}                               & $0.884 \pm 0.042$ & $4.02 \pm 1.95$ & $10.2 \pm 3.5$ \\


% \midrule
% \textbf{UltraSemiNet (ours)}                              & \textbf{0.971 $\pm$ 0.010} & \textbf{1.07 $\pm$ 0.92} & \textbf{6.8 $\pm$ 3.2} \\
% \bottomrule
% \end{tabular}%
% }
% \vspace{-0.25em}
% \end{minipage}\hfill%
% \begin{minipage}[t]{0.49\linewidth}
% \centering
% \captionof{table}{Cross-dataset results: train on FBUI, test on HC18 (no retuning).}
% \label{tab:cross}
% \resizebox{\linewidth}{!}{%
% \begin{tabular}{@{}lccc@{}}
% \toprule
% Method & DSC $\uparrow$ & ASD (px) $\downarrow$ & HD95 (px) $\downarrow$ \\
% \midrule
% U\textendash Net~\cite{ronneberger2015u}                & $0.887 \pm 0.028$ & $3.21 \pm 1.92$ & $14.2 \pm 6.3$ \\
% Attention U\textendash Net~\cite{oktay2018attention}     & $0.896 \pm 0.026$ & $2.98 \pm 1.78$ & $13.1 \pm 5.8$ \\
% 2D nnU\textendash Net \cite{isensee2021nnu}              & $0.905 \pm 0.022$ & $2.75 \pm 1.63$ & $12.4 \pm 5.5$ \\
% Mean Teacher (MT)~\cite{tarvainen2017mean}               & $0.898 \pm 0.025$ & $2.91 \pm 1.71$ & $12.8 \pm 5.7$ \\
% CPS-only~\cite{chen2021semi}                             & $0.907 \pm 0.022$ & $2.63 \pm 1.56$ & $11.7 \pm 5.1$ \\
% MedSAM \cite{ma2024segment}                              & $0.803 \pm 0.072$ & $7.41 \pm 4.10$ & $28.9 \pm 12.6$ \\
% SAMUS \cite{lin2024beyond}                               & $0.828 \pm 0.064$ & $6.32 \pm 3.65$ & $24.7 \pm 11.2$ \\
% \red{Cross-Teaching} \cite{luo2022semi}                               & $0.897 \pm 0.037$ & $2.98 \pm 2.76$ & $11.5 \pm 6.8$ \\
% \red{MC-Net+} \cite{wu2022mutual}                               & $0.850 \pm 0.032$ & $4.17 \pm 2.56$ & $13.9 \pm 9.4$ \\


% \midrule
% \textbf{UltraSemiNet (ours)}                              & \textbf{0.925 $\pm$ 0.019} & \textbf{1.27 $\pm$ 1.44} & \textbf{10.3 $\pm$ 4.7} \\
% \bottomrule
% \end{tabular}%
% }
% \end{minipage}

% \endgroup
% \vspace{-0.08in}
% \end{table}



\noindent\textbf{Baselines.}
We compare \emph{UltraSemiNet} against: U-Net~\cite{ronneberger2015u}, Attention U-Net~\cite{oktay2018attention}, Vision Transformer (ViT)~\cite{dosovitskiy2020image}, Mean Teacher (MT)~\cite{tarvainen2017mean}, CPS-only~\cite{chen2021semi}, Cross-Teaching~\cite{luo2022semi}, and MC-Net+~\cite{wu2022mutual}.
To contextualize results with widely used references, we also include 2D nnU-Net \cite{isensee2021nnu} (default config) and promptable foundation models MedSAM \cite{ma2024segment} and SAMUS \cite{lin2024beyond}.
% All methods share the same backbone family, data splits, and augmentations; method-specific heads/losses follow their original papers.

\noindent\textbf{Semi-supervised protocol.}
In each fold, we sample \textbf{1,500 labeled} images from the FBUI training split; the remaining FBUI training images are treated as \textbf{unlabeled}.
The teacher–student model uses \emph{calibrated} teacher probabilities (temperature refit every 3–5 epochs) and accepts unlabeled pixels for CPS only if they pass a binary confidence threshold annealed $0.95\!\rightarrow\!0.80$ and a flip/rot90 TTA stability test (KL\,$\le 0.15$) (Sec.~\ref{sec:cps}). \red{For temperature scaling, we construct a small calibration set by randomly sampling $N{=}200$ labeled images from the training split of the current fold. The calibration set is strictly disjoint from the validation and test subjects. To prevent leakage, our k-fold splitting is performed at the patient level, i.e., images are grouped by patient ID prior to partitioning, and all samples from the same patient are assigned to the same fold.}

\noindent\textbf{Foundation-model protocol.}
\red{We include MedSAM~\cite{ma2024segment} and SAMUS~\cite{lin2024beyond} as foundation-model baselines and evaluate them using a consistent fine-tuning protocol on FBUI. Specifically, we fine-tune each model only on the labeled FBUI split (no HC18 usage), using the same input resolution ($224\times224$), preprocessing/augmentations, and a comparable training budget. Following their prompt-based design, we use box prompts constructed as the tight bounding box around the head region from the training masks (with a fixed padding). All predictions are mapped back to the evaluation resolution before computing metrics.}


\noindent\textbf{Implementation details.}
Images are resized to $224{\times}224$ and intensity-normalized.
The teacher uses \emph{weak} views (flips, 90$^\circ$ rotations), whereas the student uses weak+strong views (brightness/contrast/gamma jitter, mild elastic, light speckle). Models are trained on 1$\times$A100 with total batch size 32 for 10{,}000 iterations (five-fold CV unless noted). We use AdamW (lr $1\!\times\!10^{-4}$, weight decay $1\!\times\!10^{-4}$), cosine decay with 5\% warmup, gradient clipping at 1.0. The student is optimized with $\mathcal{L}_{\mathrm{sup}}{+}\lambda_{\mathrm{CPS}}\mathcal{L}_{\mathrm{CPS}}{+}\lambda_{\mathrm{SAT}}\mathcal{L}_{\mathrm{SAT}}{+}\lambda_{\mathrm{PCM}}\mathcal{L}_{\mathrm{PCM}}$ (defaults: $\lambda_{\mathrm{CPS}}{=}1.0$, $\lambda_{\mathrm{SAT}}{=}0.3$, $\lambda_{\mathrm{PCM}}{=}0.2$). The teacher is updated by EMA with $\alpha{=}0.996$.
SAT uses one positive and $K_-=64$ negatives per anchor (radius $r{=}5$, annulus $[7,11]$, gate $\tau_{\mathrm{bIoU}}{=}0.6$), whereas PCM maintains head/background prototypes with EMA $\rho{=}0.99$ and a curriculum keep ratio $\gamma(t):0.3\!\rightarrow\!0.7$. Early stopping was also applied on validation DSC with patience of 30.

\noindent\textbf{Computational cost.} \red{UltraSemiNet contains 17.27 million parameters and requires approximately 61.20 GFLOPs for a standard $1\times224\times224$ input. On a single NVIDIA A100 GPU, training takes approximately 55 seconds per epoch. At inference time, UltraSemiNet runs in 4.25 ms per sample (corresponding to $\sim$235 FPS), indicating that the proposed method can operate efficiently in real-time clinical workflows.}




% \begin{table}[t]
% \centering
% \caption{\textbf{Computational cost comparison.} Training time is reported per epoch on the same hardware. Latency is per-sample inference time using a single forward pass. $^\ast$CPS-only reports the effective parameter/FLOP cost when two networks are used during training. Inference uses a single network.}
% \small
% \setlength{\tabcolsep}{6pt}
% \begin{tabular}{lcccc}
% \toprule
% \textbf{Method} & \textbf{Params (M)} & \textbf{FLOPs (G)} & \textbf{Train time/epoch (s)} & \textbf{Latency (ms)} \\
% \midrule
% % U-Net (Sup. only)          & 17.27 & 61.20   & $\sim$15   & 3.23 \\
% Mean Teacher (MT)          & 17.27 & 61.20   & $\sim$25   & 3.23 \\
% CPS-only                   & 34.54$^\ast$ & 122.40$^\ast$ & $\sim$50   & 3.23 \\
% Cross-Teaching             & 44.44 & 61.34   & $\sim$50   & 3.19 \\
% MedSAM (Ma et al.)         & 86.04 & $>$300  & $>$120     & $\sim$45.0 \\
% \textbf{UltraSemiNet (Ours)} & \textbf{17.27} & \textbf{61.20} & \textbf{$\sim$40} & \textbf{3.23} \\
% \bottomrule
% \end{tabular}

% \label{tab:compute_cost}
% \end{table}



\section{Results}
\label{sec:results}








\noindent \textbf{Main Results on FBUI (5-fold CV).}
\label{sec:res_fbui}
Table~\ref{tab:main-fbui} reports results on FBUI under five-fold, patient-level cross-validation with 1{,}500 labeled images and the remaining training images treated as unlabeled. 
UltraSemiNet achieves the best Dice while yielding the largest gains on boundary-sensitive metrics (ASD, HD95). 
Improvements over CPS-only are modest in Dice (as expected for a relatively clear binary target) but consistent and more pronounced for HD95, indicating sharper and more reliable contours.

\noindent \textbf{Cross-Dataset Generalization (Train on FBUI, Test on HC18 subset).}
\label{sec:res_cross}
We next assess robustness under distribution shift by training on FBUI and evaluating on the HC18 subset without retuning thresholds. 
All methods degrade, but UltraSemiNet maintains the strongest performance and the largest boundary gains (Table~\ref{tab:cross}). Reporting ASD/HD95 in millimeters makes the results directly clinically interpretable and confirms that the improvements persist beyond resized pixel-space evaluation.

\begin{figure*}[hbtp]
  \centering
  \begin{minipage}[t]{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/tsne_pre_sat_pcm.pdf}\\[-2pt]
    {\scriptsize \textbf{(a)} t-SNE (early; before SAT/PCM)}
  \end{minipage}\hfill
  \begin{minipage}[t]{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/tsne_post_sat_pcm.pdf}\\[-2pt]
    {\scriptsize \textbf{(b)} t-SNE (late; with SAT+PCM)}
  \end{minipage}\hfill
  \begin{minipage}[t]{0.26\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/calibration_ece_over_epochs.pdf}\\[-2pt]
    {\scriptsize \textbf{(c)} ECE over epochs (lower is better)}
  \end{minipage}\hfill
  \begin{minipage}[t]{0.26\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/pseudo_label_accuracy_over_epochs.pdf}\\[-2pt]
    {\scriptsize \textbf{(d)} Pseudo-label accuracy (CPS vs.\ w/o CPS)}
  \end{minipage}

  \caption{\textbf{UltraSemiNet dynamics.} (a--b) SAT+PCM progressively sharpen feature separation in t-SNE (more compact and separable head/background clusters). (c) Calibrated CPS with TTA-stability reduces expected calibration error (ECE) compared to naive selection. (d) CPS yields higher pseudo-label accuracy and faster convergence.}
  \label{fig:all-panels}
\end{figure*}


\noindent \textbf{Qualitative Analysis.}
\label{sec:res_qual}
Figure~\ref{fig:Q_results}A shows qualitative results along with a DSC score for each method where UltraSemiNet reduces boundary leakage.
% into low-contrast background and closes small gaps under acoustic shadowing. 
Figure~\ref{fig:Q_results}B shows that PCM further refines contrast in high-entropy areas, whereas
% clarifying anatomical details often obscured in ultrasound images. 
Figure~\ref{fig:Q_results}C shows the best and worst predictions for our model.
Figure~\ref{fig:all-panels} summarizes why UltraSemiNet improves downstream accuracy. Here, SAT+PCM turn entangled early features into compact, separated clusters (a--b). CPS reduces ECE across training (c) and produces higher pseudo-label accuracy throughout (d). We also provide further analysis in Figure~\ref{fig:all-panels-supp} of the appendix, presenting training \textbf{dynamics} where CPS acceptance rises while TTA KL falls (a), boundary pixels steadily catch up to interiors as SAT focuses learning at ambiguous edges (b), and reliability becomes near-diagonal at late epochs (c).


\noindent \textbf{Clinical and Practical Implications.}
Table~\ref{tab:overall-abl-calib}a compares UltraSemiNet predictions with those of two medical experts (ME1, ME2) on a total of 60 samples from the FBUI and HC18 datasets, revealing that UltraSemiNet consistently outperforms the medical experts. In terms of role and experience, ME1 is a board-certified fetal medicine specialist with over 10 years of experience in obstetric ultrasound, while ME2 is a senior sonographer with over 13 years of dedicated practice in high-risk pregnancy imaging. Each expert independently segmented the fetal head boundary on a set of 30 images from each dataset. These annotations were then compared with the ground-truth masks provided by the source dataset and the predicted masks generated by our UltraSemiNet model. 
% To contextualize performance under inter-rater variability, we additionally derive a consensus reference from the two independent expert segmentations using a blinded adjudication procedure, and we report agreement to this reference alongside agreement to the dataset masks. 
\red{UltraSemiNet shows greater agreement with the consensus reference than either individual expert, suggesting that its contours are at least as consistent with the consolidated clinical standard as those of human raters in this setting.} 
% In addition to geometric agreement, both experts evaluated outputs using a predefined clinical quality checklist (e.g., plausibility of the contour and adequacy of the boundary for measurement); UltraSemiNet met these criteria more consistently, indicating potential to reduce manual refinements during routine biometry.

% To account for inter-rater variability, we additionally formed an \emph{adjudicated rater consensus} from the two expert segmentations following a blinded adjudication procedure, and we report agreement to this consensus as a clinically meaningful reference. In particular, UltraSemiNet achieves higher agreement with the adjudicated consensus than either individual expert, indicating that the model output is at least as consistent with the consensus standard as human raters in this setting. Beyond boundary agreement, both experts also assessed each output using a predefined clinical quality checklist (e.g., correct plane/landmark visibility and contour plausibility). 



% ------------------------------------------------------------

\noindent \textbf{Ablations and Calibration.}
\label{sec:res_ablation}
We isolate the contributions of calibration/gating, SAT, and PCM as depicted in Table~\ref{tab:overall-abl-calib}b. All ablations use the same backbone, schedules, and labeled fraction. 
% Calibration and TTA-gating alone improve CPS, primarily reducing HD95. SAT provides the largest boundary improvements, and PCM improves both Dice and boundary metrics by organizing features globally. Combining SAT and PCM yields the best overall performance with the lowest variance across folds.
% ------------------------------------------------------------
% \noindent \textbf{Calibration and Reliability}
% \label{sec:res_calib}
We quantify probability reliability using Expected Calibration Error (ECE; 15 bins) and Brier score on the student’s probability map. 
Table~\ref{tab:overall-abl-calib}c shows that calibrating CPS and using SAT/PCM reduces ECE and Brier versus naive CPS.


% ------------------------------------------------------------




\begin{table*}[hbtp]
\centering
\footnotesize
\caption{Comparative performance, ablations, and probability calibration. HD95/ASD in pixels (px).}
\label{tab:overall-abl-calib}
\setlength{\tabcolsep}{3pt}

% ===== (a) Model vs Experts =====
\begin{minipage}[t]{0.85\textwidth}
\centering
\renewcommand{\arraystretch}{1.15}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}l*{9}{S[table-format=2.3]}@{}}
\toprule
\textbf{Dataset} & \multicolumn{3}{c}{\textbf{UltraSemiNet (Ours)}} & \multicolumn{3}{c}{\textbf{Medical Expert 1}} & \multicolumn{3}{c}{\textbf{Medical Expert 2}} \\
\cmidrule(lr{5pt}){2-4} \cmidrule(lr{5pt}){5-7} \cmidrule(lr{5pt}){8-10}
 & {\textbf{DSC} $\uparrow$} & {\textbf{HD95} $\downarrow$} & {\textbf{ASD} $\downarrow$}
 & {\textbf{DSC} $\uparrow$} & {\textbf{HD95} $\downarrow$} & {\textbf{ASD} $\downarrow$}
 & {\textbf{DSC} $\uparrow$} & {\textbf{HD95} $\downarrow$} & {\textbf{ASD} $\downarrow$} \\
\midrule
\textbf{FBUI} & 0.971 & 6.800 & 1.070 & 0.965 & 14.160 & 1.160 & 0.949 & 23.020 & 1.230 \\
\textbf{HC18} & 0.925 & 10.300 & 1.270 & 0.919 & 17.450 & 1.530 & 0.897 & 26.270 & 3.870 \\
\bottomrule
\end{tabular}%
}
\caption*{a) Comparison with two senior medical experts.}
\label{tab:perf_vs_experts}
\end{minipage}

\vspace{4pt}

% ===== (b) + (c) using minipage (side-by-side) =====
\noindent
\begin{minipage}[t]{0.4\textwidth}
\centering
\renewcommand{\arraystretch}{1.15}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Variant} & \textbf{DSC} $\uparrow$ & \textbf{ASD} $\downarrow$ & \textbf{HD95} $\downarrow$ \\
\midrule
CPS (no calib., no TTA)     & $0.940 \pm 0.013$ & $1.90 \pm 1.10$ & $8.6 \pm 4.1$ \\
CPS (calib.\ \& TTA gate)   & $0.952 \pm 0.012$ & $1.55 \pm 1.05$ & $7.9 \pm 3.8$ \\
CPS + SAT (w/o PCM)         & $0.962 \pm 0.011$ & $1.28 \pm 0.98$ & $7.2 \pm 3.5$ \\
CPS + PCM (w/o SAT)         & $0.958 \pm 0.011$ & $1.34 \pm 1.00$ & $7.4 \pm 3.6$ \\
\textbf{UltraSemiNet (Full)}& \textbf{0.971 $\pm$ 0.010} & \textbf{1.07 $\pm$ 0.92} & \textbf{6.8 $\pm$ 3.2} \\
\bottomrule
\end{tabular}%
}
\captionof*{table}{b) Ablations on FBUI.}
\label{tab:ablations}
\end{minipage}\hfill%
\begin{minipage}[t]{0.42\textwidth}
\centering
\renewcommand{\arraystretch}{1.15}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Variant} & \textbf{ECE (\%)} $\downarrow$ & \textbf{Brier ($\!\times\!10^{-2}$)} $\downarrow$ \\
\midrule
CPS (no calibration) & $5.9 \pm 1.1$ & $3.8 \pm 0.4$ \\
CPS (calibrated)     & $3.7 \pm 0.9$ & $3.5 \pm 0.3$ \\
\textbf{UltraSemiNet} & \textbf{2.9 $\pm$ 0.8} & \textbf{3.3 $\pm$ 0.3} \\
\bottomrule
\end{tabular}%
}
\captionof*{table}{c) Calibration on FBUI.}
\label{tab:calibration}
\end{minipage}

\end{table*}




% By delivering higher accuracy and reducing the variability that is common in manual annotations, UltraSemiNet improves prenatal diagnostics through reliable assessments of fetal development. Its robust performance with minimal training data underscores its value as a supportive clinical tool, enabling more precise anomaly detection, and effective fetal growth monitoring.
% and better-informed interventions in diverse conditions where labeled data are often limited.

\section{Limitations and Future Work}
This paper targets 2D \emph{binary} skull segmentation. Extending the approach beyond the head to additional obstetric targets (e.g., abdominal circumference, femur length, placenta, and standard cardiac views), to multi-class anatomical segmentation, and to 3D/volumetric acquisitions is a natural next step. \red{Concretely, for 3D/4D volumes, we would replace the 2D backbone with a standard 3D encoder--decoder (e.g., 3D U-Net/nnU-Net-style 3D configuration \cite{cciccek20163d,isensee2021nnu}), apply the same temperature calibration and confidence+stability gating voxel-wise using a small set of 3D-consistent flips/rotations, and lift SAT/PCM by defining the entropy belt, probability-crop sIoU check, and prototype mining over 3D patches/neighborhoods. The added cost scales mainly with volume size and the number of TTA views and can be controlled through patch-based training and low-resolution gating.}


Although we observed cross-dataset gains, broader evaluation under stronger domain shifts (scanner vendors, protocols) and prospective studies are needed. Finally, integrating calibrated uncertainty with measurement tools (e.g., auto-ellipse placement), conformal risk control, and active selection for targeted annotation could further improve safety and data efficiency. \red{Moreover, we note that SynthStrip \cite{hoopes2022synthstrip} is a brain-extraction (``skull-stripping'') method designed for 3D neuroimaging (MRI/CT) and is therefore not directly comparable to our task of 2D fetal ultrasound head boundary segmentation. We therefore restrict comparisons to methods applicable to 2D ultrasound segmentation under the same training protocol. As future work, we will investigate whether ultrasound-specific preprocessing or anatomy-prior approaches inspired by such methods can further improve robustness under severe attenuation and incomplete skull visibility.}



\section{Conclusion}
We presented UltraSemiNet, a semi-supervised framework for fetal head ultrasound that combines calibrated cross-pseudo supervision with two boundary-aware modules, leveraging SAT and PCM. SAT heightens spatial sensitivity along ambiguous skull–background interfaces, while PCM organizes features globally via uncertainty-weighted class prototypes, together yielding sharper, more reliable contours. On FBUI (5-fold CV), UltraSemiNet reached 0.971 Dice and reduced HD95 to 6.8 px, with consistent boundary-metric gains on cross-dataset HC18 evaluations. 
% Limitation and future work have been provided in Sup.






% \clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl26_235}

\clearpage
\appendix

\section{Supplementary Material}

\subsection{Overall Objective and Training}
\label{sec:overall}
UltraSemiNet combines supervised learning on labeled images with three unlabeled objectives that are all derived from the \emph{same, calibrated} teacher probabilities (Sec.~\ref{sec:cps}). 
On labeled data $(x,y)\in\mathcal{D}_L$ (binary head mask $y\in\{0,1\}^{h\times w}$), we use a standard cross-entropy plus soft Dice loss with $\lambda_{\mathrm{dice}}{=}1$ and $\varepsilon=1$, defined as: 
$\mathcal{L}_{\mathrm{sup}}
=\frac{1}{|\mathcal{D}_L|}\sum_{(x,y)}
\Big[
\mathrm{CE}\!\big(y,\,p_S\big)
+\lambda_{\mathrm{dice}}\ \mathrm{DSC}\!\big(y,\,p_S\big)
\Big],$ $\mathrm{DSC}(y,p_S)=1-\frac{2\sum_i y(i)\,p_S(i)+\varepsilon}{\sum_i y(i)+\sum_i p_S(i)+\varepsilon}.$

On unlabeled data, CPS (Sec.~\ref{sec:cps}) provides \emph{accepted} pseudo-labels using confidence and TTA-stability, SAT (Sec.~\ref{sec:sat}) shapes local features near the boundary using agreement-gated positives, and PCM (Sec.~\ref{sec:pcm}) enforces a global head/background structure via uncertainty-weighted prototypes and a curriculum. The final training objective is a weighted sum:
% \begin{equation}
% \label{eq:total}
% \resizebox{0.9\linewidth}{!}{$
$\mathcal{L}
=
\mathcal{L}_{\mathrm{sup}}
+\lambda_{\mathrm{CPS}}\ \mathcal{L}_{\mathrm{CPS}}
+\lambda_{\mathrm{SAT}}\ \mathcal{L}_{\mathrm{SAT}}
+\lambda_{\mathrm{PCM}}\ \mathcal{L}_{\mathrm{PCM}}
$
% \end{equation}
with defaults $\lambda_{\mathrm{CPS}}{=}1.0$, $\lambda_{\mathrm{SAT}}{=}0.3$, $\lambda_{\mathrm{PCM}}{=}0.2$. 
Gradients flow through the \emph{student} only and the teacher is updated by EMA (Eq.~\eqref{eq:ema}). TTA averages are used for \emph{gating} but are not part of the backpropagation graph.
% \subsection{Training Algorithm}
% \label{sec:algorithm}
We follow a simple loop that keeps the CPS acceptance, SAT pairing, and PCM prototypes \emph{consistent} by basing all decisions on the calibrated teacher probabilities from Sec.~\ref{sec:cps}. We also provide the complete procedure in Algorithm~\ref{alg:ultraseminet}, which shows the overall flow across the different modules.
% \paragraph{Optimization.}
Furthermore, we use AdamW (lr $1\!\times\!10^{-4}$, weight decay $1\!\times\!10^{-4}$), cosine decay with linear warmup (5\% of steps), gradient clipping at 1.0, and set $\alpha{=}0.996$ for EMA.

% \subsection{Experimental Setup Contd.}
% \noindent\textbf{Semi-supervised protocol.}
% In each fold, we sample \textbf{1,500 labeled} images from the FBUI training split; the remaining FBUI training images are treated as \textbf{unlabeled}.
% The teacher–student model uses \emph{calibrated} teacher probabilities (temperature refit every 3–5 epochs) and accepts unlabeled pixels for CPS only if they pass a binary confidence threshold annealed $0.95\!\rightarrow\!0.80$ and a flip/rot90 TTA stability test (KL\,$\le 0.15$) (Sec.~\ref{sec:cps}). \red{For temperature scaling, we construct a small calibration set by randomly sampling $N{=}200$ labeled images from the training split of the current fold. The calibration set is strictly disjoint from the validation and test subjects. To prevent leakage, our k-fold splitting is performed at the patient level, i.e., images are grouped by patient ID prior to partitioning, and all samples from the same patient are assigned to the same fold.}

% \noindent\textbf{Foundation-model protocol.}
% \red{We include MedSAM \cite{ma2024segment} and SAMUS \cite{lin2024beyond} as baselines for the foundation model and evaluate them using a consistent fine-tuning protocol on FBUI. Specifically, we fine-tune each model only on the FBUI-labeled split (no HC18 usage), using the same input resolution ($224\times224$), preprocessing/augmentations, and a comparable training budget. Following their prompt-based design, we use box prompts constructed as the tight bounding box around the head region from the training masks (with a fixed padding). All predictions are mapped back to the evaluation resolution before computing metrics.}



% \noindent\textbf{Cross-validation and cross-dataset tests.}
% All in-distribution results are reported under \textbf{five-fold} patient-level CV on FBUI.
% For cross-dataset generalization, models are trained on FBUI and evaluated on the HC18 subset without retuning thresholds. 

% \subsection{Limitations and future work.}
% This paper targets 2D \emph{binary} skull segmentation, which can be extended beyond the head to additional obstetric targets (e.g., abdominal circumference, femur length, placenta, and standard cardiac views), to multi-class anatomical segmentation, and to 3D/volumetric acquisitions, which is a natural next step. \red{Concretely, for 3D/4D volumes, we would replace the 2D backbone with a standard 3D encoder--decoder (e.g., 3D U-Net/nnU-Net-style 3D configuration \cite{cciccek20163d,isensee2021nnu}), apply the same temperature calibration and confidence+stability gating voxel-wise using a small set of 3D-consistent flips/rotations, and lift SAT/PCM by defining the entropy belt, probability-crop sIoU check, and prototype mining over 3D patches/neighborhoods. The added cost scales mainly with volume size and the number of TTA views and can be controlled through patch-based training and low-resolution gating.}


% Although we observed cross-dataset gains, broader evaluation under stronger domain shifts (scanner vendors, protocols) and prospective studies are needed. Finally, integrating calibrated uncertainty with measurement tools (e.g., auto-ellipse placement), conformal risk control, and active selection for targeted annotation could further improve safety and data efficiency. \red{Moreover, we note that SynthStrip \cite{hoopes2022synthstrip} is a brain-extraction (``skull-stripping'') method designed for 3D neuroimaging (MRI/CT) and is therefore not directly comparable to our task of 2D fetal ultrasound head boundary segmentation. We therefore restrict comparisons to methods applicable to 2D ultrasound segmentation under the same training protocol. As future work, we will investigate whether ultrasound-specific preprocessing or anatomy-prior approaches inspired by such methods can further improve robustness under severe attenuation and incomplete skull visibility.}




\begin{figure}[hbtp]
  \centering

  \begin{minipage}[t]{0.32\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/acceptance_rate_and_tta_kl_over_epochs.pdf}
    % \vspace{2pt}
    \captionof*{figure}{\scriptsize (a) CPS acceptance $\uparrow$ vs.\ TTA KL $\downarrow$}
    % \label{fig:cps-tta} % per-panel labels need subcaption; keep commented if not using it
  \end{minipage}\hfill%
  \begin{minipage}[t]{0.32\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/boundary_vs_interior_accuracy_over_epochs.pdf}
    % \vspace{2pt}
    \captionof*{figure}{\scriptsize (b) Boundary vs.\ interior pixel accuracy}
    % \label{fig:boundary-interior}
  \end{minipage}\hfill%
  \begin{minipage}[t]{0.32\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Images/reliability_diagram_selected_pseudo_labels.pdf}
    % \vspace{2pt}
    \captionof*{figure}{\scriptsize (c) Reliability diagram (late epoch)}
    % \label{fig:reliability}
  \end{minipage}

  \caption{Acceptance/uncertainty trends, boundary vs.\ interior accuracy, and calibration.}
  \label{fig:all-panels-supp}
\end{figure}





\begin{table*}[ht]
\centering
\small
\caption{Key hyperparameters used in UltraSemiNet. We report defaults and suggested sweep ranges used in ablations.}
\label{tab:key-hparams}
\begingroup
% \renewcommand{\arraystretch}{1.3} % more vertical space
\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{@{}%
    >{\raggedright\arraybackslash}p{0.20\linewidth}%
    >{\raggedright\arraybackslash}p{0.4\linewidth}%
    >{\centering\arraybackslash}p{0.12\linewidth}%
    >{\centering\arraybackslash}p{0.13\linewidth}@{}}
\toprule
\textbf{Name} & \textbf{Role} & \textbf{Default} & \textbf{Sweep} \\
\midrule
% $\alpha$ & EMA decay for teacher & $0.996$ & $[0.99,0.999]$ \\
% $d$ & Embedding dimension of $v_i$ & $128$ & $\{64,128,256\}$ \\
$r$ & Positive radius for SAT & $5$ px & $\{3,5,7\}$ \\
% $[d_{\min},d_{\max}]$ & Negative annulus (px) & $[7,11]$ & $\{[5,9],[7,11],[9,13]\}$ \\
$s$ & Local window size for $\mathcal{M}_i$ & $15$ & $\{11,15,19\}$ \\
% $\beta$ & InfoNCE temperature & $0.07$ & $\{0.05,0.07,0.10\}$ \\
$[\epsilon_1,\epsilon_2]$ & Entropy belt thresholds & \red{$[0.4,0.95]$} & \red{$\{[0.2,0.9],[0.4,0.95]\}$} \\
$\tau_{\mathrm{bIoU}}$ & Soft-IoU gate for boundary positives & $0.6$ & $\{0.5,0.6,0.7\}$ \\
$q_b(t)$ & Boundary-positive admission (max) & $0.4$ & $\{0.2,0.4\}$ \\
$\gamma_1,\gamma_2$ & Anchor weight coefficients & $0.7,\ 0.3$ & fixed \\
$\tau_c(t)$ & CPS class-wise thresholds & $0.95\!\rightarrow\!0.80$ & slope $\{\times 0.5, \times 1\}$ \\
$\delta$ & TTA disagreement (KL) cutoff & $0.15$ & $\{0.10,0.15,0.20\}$ \\
% $\rho$ & Prototype EMA factor & $0.99$ & $\{0.98,0.99,0.995\}$ \\
$\tau$ & Temperature in $\pi_c(i)$ & $0.5$ & $\{0.3,0.5,0.7\}$ \\
$m$ & PCM margin & $0.2$ & $\{0.1,0.2,0.3\}$ \\
$\gamma(t)$ & Curriculum keep ratio (max) & $0.7$ & $\{0.5,0.7\}$ \\
\bottomrule
\end{tabular}%
}
\endgroup
\vspace{-0.4em}
\end{table*}


\begin{algorithm}[hbtp]
\centering
\resizebox{\linewidth}{!}{%
\begin{minipage}{1.0\linewidth}
\small
\DontPrintSemicolon
\SetAlgoLined
\caption{\red{UltraSemiNet Training Framework}}
\label{alg:ultraseminet}
\KwIn{Labeled set $\mathcal{D}_L$, Unlabeled set $\mathcal{D}_U$; Student $\theta_S$, Teacher $\theta_T$; Calibration set $\mathcal{D}_{cal} \subset \mathcal{D}_L$.}
\KwOut{Optimized Student Model $\theta_S$}
\textbf{Initialize:} Prototype EMA $\rho$, Teacher EMA $\alpha$, SAT radii $r$, Thresholds $\tau, \delta, \epsilon_1, \epsilon_2$. Warm-up prototypes $\mu_0, \mu_1$ using $\mathcal{D}_L$.\;
\For{epoch $=1$ to $E$}{
  \tcp{1. Reliability Calibration (Once per epoch)}
  Extract logits from $\mathcal{D}_{cal}$ using $\theta_T$. Minimize NLL to find scaling factor $\mathcal{T}$.\;
  
  \ForEach{batch $\mathcal{B}_L \subset \mathcal{D}_L, \mathcal{B}_U \subset \mathcal{D}_U$}{
    \tcp{2. Multi-View Forward Pass}
    Generate weak ($x^w$) and strong ($x^s$) views for $x \in \mathcal{B}_U$.\;
    $\text{Teacher}: p_T = \sigma(f(x^w; \theta_T)/\mathcal{T})$; \quad $\text{Student}: p_S, z_S = f(x^s; \theta_S)$ (logits/feats).\;
    
    \tcp{3. CPS: Dual-Gating Mechanism}
    Confidence: $m_{conf} = \mathbbm{1}[\max(p_T) \ge \tau]$. Stability: $m_{stab} = \mathbbm{1}[D_{KL}(p_T \,\|\, p_{TTA}) \le \delta]$.\;
    Global Gate: $\mathcal{G} = m_{conf} \cdot m_{stab}$. Pseudo-label $\hat{y} = \text{argmax}(p_T)$.\;
    $\mathcal{L}_{CPS} = \frac{1}{|\mathcal{G}|} \sum \mathcal{G} \cdot \ell_{ce}(p_S, \hat{y})$.\;
    \tcp{4. SAT: Spatial Awareness \& Boundary Belt}
    Entropy map $H(p)$; Boundary Belt $\mathcal{B} = \mathbbm{1}[\epsilon_1 \le H(p) \le \epsilon_2]$.\;
    Sample anchors $a$. If $a \in \mathcal{B}$, filter by structural consistency ($\text{sIoU} > \tau_{b}$).\;
    Contrastive loss $\mathcal{L}_{SAT}$ between anchors and context-aware negatives.\;
    \tcp{5. PCM: Prototype Consistency}
    Update global prototypes $\mu_c \leftarrow \rho \mu_c + (1-\rho) \bar{z}_{\hat{y}}$ using confident features.\;
    Compute hardness $\kappa = p_S \cdot \text{sim}(z_S, \mu_{1-\hat{y}})$. Select top-ranked hard pixels.\;
    $\mathcal{L}_{PCM} = \ell_{contrast}(z_S, \mu_{\hat{y}}, \mu_{1-\hat{y}})$.\;
    \tcp{6. Optimization}
    $\mathcal{L}_{total} = \mathcal{L}_{sup}(\mathcal{B}_L) + \lambda_{CPS} \mathcal{L}_{CPS} + \lambda_{SAT} \mathcal{L}_{SAT} + \lambda_{PCM} \mathcal{L}_{PCM}$.\;
    Update $\theta_S \leftarrow \theta_S - \eta \nabla \mathcal{L}_{total}$; \quad Update $\theta_T \leftarrow \alpha \theta_T + (1-\alpha) \theta_S$.\;
  }
}
\end{minipage}%
}
\end{algorithm}


% \subsection{Overall Objective and Training}
% \label{sec:overall}
% UltraSemiNet combines supervised learning on labeled images with three unlabeled objectives that are all derived from the \emph{same, calibrated} teacher probabilities (Sec.~\ref{sec:cps}). 
% On labeled data $(x,y)\in\mathcal{D}_L$ (binary head mask $y\in\{0,1\}^{h\times w}$), we use a standard cross-entropy plus soft Dice loss with $\lambda_{\mathrm{dice}}{=}1$ and $\varepsilon=1$, which are presented as: 
% $\mathcal{L}_{\mathrm{sup}}
% =\frac{1}{|\mathcal{D}_L|}\sum_{(x,y)}
% \Big[
% \mathrm{CE}\!\big(y,\,p_S\big)
% +\lambda_{\mathrm{dice}}\ \mathrm{DSC}\!\big(y,\,p_S\big)
% \Big],$ $\mathrm{DSC}(y,p_S)=1-\frac{2\sum_i y(i)\,p_S(i)+\varepsilon}{\sum_i y(i)+\sum_i p_S(i)+\varepsilon}.$

% On unlabeled data, CPS (Sec.~\ref{sec:cps}) provides \emph{accepted} pseudo-labels using confidence and TTA-stability, SAT (Sec.~\ref{sec:sat}) shapes local features near the boundary using agreement-gated positives, and PCM (Sec.~\ref{sec:pcm}) enforces a global head/background structure via uncertainty-weighted prototypes and a curriculum. The final training objective is a weighted sum:
% % \begin{equation}
% % \label{eq:total}
% % \resizebox{0.9\linewidth}{!}{$
% $\mathcal{L}
% =
% \mathcal{L}_{\mathrm{sup}}
% +\lambda_{\mathrm{CPS}}\ \mathcal{L}_{\mathrm{CPS}}
% +\lambda_{\mathrm{SAT}}\ \mathcal{L}_{\mathrm{SAT}}
% +\lambda_{\mathrm{PCM}}\ \mathcal{L}_{\mathrm{PCM}}
% $
% % \end{equation}
% with defaults $\lambda_{\mathrm{CPS}}{=}1.0$, $\lambda_{\mathrm{SAT}}{=}0.3$, $\lambda_{\mathrm{PCM}}{=}0.2$. 
% Gradients flow through the \emph{student} only and the teacher is updated by EMA (Eq.~\eqref{eq:ema}). TTA averages are used for \emph{gating} but are not part of the backpropagation graph.
% % \subsection{Training Algorithm}
% % \label{sec:algorithm}
% We follow a simple loop that keeps the CPS acceptance, SAT pairing, and PCM prototypes \emph{consistent} by basing all decisions on the calibrated teacher probabilities from Sec.~\ref{sec:cps}. We also prvovide the complete algorithm givenn above which shows the overall flow across different modules.
% % \paragraph{Optimization.}
% Furthermore, we use AdamW (lr $1\!\times\!10^{-4}$, weight decay $1\!\times\!10^{-4}$), cosine decay with linear warmup (5\% of steps), gradient clipping at 1.0, and set $\alpha{=}0.996$ for EMA.


% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}

