\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{caption}
% \captionsetup{skip=8pt}
\usepackage{multirow}
\usepackage{mwe} % to get dummy images
\usepackage{booktabs} % for \toprule, \midrule, \bottomrule
\jmlrvolume{-- 328}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}
% \usepackage{tabularx}
\usepackage{graphicx}
\title[Distillation of DINOv3 into a Lightweight Foundation Model]{Domain-Constrained Distillation of DINOv3 into a Lightweight Foundation Model Toward Point-of-Care Ultrasound}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Md Jaber Al Nahian\midljointauthortext{Corresponding Author}\nametag{$^{1}$}} \orcid{0000-0003-3140-4588} \Email{mdjabera@ualberta.ca}\\
\addr $^{1}$ Faculty of Medicine and Dentistry-Radiology and Diagnostic Imaging Department, University of Alberta, AB, CA \\
% \addr $^{2}$ Address 2 \AND
\Name{Shrimanti Ghosh\nametag{$^{1}$}} \Email{shrimant@ualberta.ca}\\
\Name{Jacob Jaremko\nametag{$^{1}$}} \Email{jjaremko@ualberta.ca}\\
\Name{Abhilash Hareendranathan\midljointauthortext{Co-corresponding Author}\nametag{$^{1}$}} \Email{hareendr@ualberta.ca}
% \addr $^{1}$ Faculty of Medicine and Dentistry-Radiology and Diagnostic Imaging Department, University of Alberta, AB, CA
}

\begin{document}

\maketitle

\begin{abstract}
Vision foundation models such as DINOv3 provide powerful representations but are too computationally demanding for point-of-care ultrasound (POCUS), whereas lightweight CNNs remain deployable yet brittle when faced with diverse anatomies and acquisition styles. We bridge this gap with a domain-constrained distillation framework that transfers DINOv3 ViT-B/16 knowledge into a compact ResNet-50, achieving roughly 3.4$\times$ compression while preserving the teacher’s billion-scale visual priors. Using a large, heterogeneous ultrasound corpus and physics-aware augmentations, the distilled model delivers substantial linear-probe improvements over standard CNN baselines and consistently outperforms the ViT teacher on challenging, heterogeneous datasets. It further offers marked gains in limited-label regimes, reflecting the realities of POCUS workflows where annotated data are scarce. Embedding visualizations show that the distilled encoder forms clearer, anatomy-aware clusters than the teacher, indicating successful alignment to ultrasound structure. Together, these results demonstrate that large-scale natural-image priors can be distilled into a lightweight, generalizable encoder suitable for resource-constrained clinical deployment.
\end{abstract}


\begin{keywords}
DINOv3, Distillation, POCUS, Foundation Model, Domain Adaptation.
\end{keywords}

\section{Introduction}
Large vision foundation models (FMs) such as DINOv3 and related self-supervised ViT encoders achieve strong transfer across many visual tasks by pretraining on  billions of natural images \cite{fm_clip,fm_dinov2,fm_dinov3,fm_sam}. These models are increasingly attractive for medical imaging, where labeled data are scarce and distribution shifts are common. However, most existing FMs require massive computational resources including GPUs and large memory.  These resources are often not available in clinical settings. Modern handheld POCUS probes are frequently paired with smartphones or tablets for acquisition and display~\cite{EchoNous2025,knight20232d,jaremko2023ai}. Deploying large ViT-based foundation models for real-time inference on such resource-constrained mobile hardware can be challenging due to latency and memory/compute overhead, motivating model compression and distillation for point-of-care use~\cite{li2022efficientformer}.

ViT-based models are powerful, yet their high memory and processing requirements make them challenging to run efficiently on edge devices in real time \cite{azad2024advances}. Compact convolutional neural networks (CNNs) are much easier to deploy, but they are often trained on small private ultrasound datasets and are prone to generalization failures across anatomies, scanners, and acquisition protocols \cite{CNN_1}. In practice, clinicians must choose between accurate but impractical models and practical but brittle ones.

We tackle this deployment–performance trade-off by treating FM adaptation as a \emph{knowledge preservation} problem rather than a pure compression problem. Starting from a DINOv3 ViT-B/16 teacher pretrained on 1.7B natural images, we distill its representations into a lightweight ResNet-50 student trained on a curated, large-scale ultrasound corpus of 162,000 unlabeled B-mode images from 35 diverse public datasets. The aim is not only to reduce parameter count, but to transfer both generic visual structure learned at billion scale (edges, shapes, hierarchical abstractions) and ultrasound-specific appearance patterns shaped by B-mode physics and clinical scanning practice. We implement this \emph{domain-constrained} adaptation by supervising the student only through DINOv3 token embeddings, while training on ultrasound-only data with ultrasound-aware augmentations (horizontal flips, moderate zoom, mild blur) that reflect plausible B-mode acquisition changes.

We show that this distillation strategy produces a compact ultrasound foundation model that, in certain settings, is competitive with or occasionally outperforms the heavy ViT teacher in ultrasound segmentation and classification tasks. It also maintains strong performance in low-label regimes. Representation analyses suggest that the distilled model retains useful natural-image structure while forming anatomically meaningful clusters across diverse ultrasound domains. Overall, our results suggest that billion-scale natural-image pretraining can be transferred into a lightweight CNN without sacrificing accuracy, offering a promising step toward foundation models that better align with the computational constraints of POCUS systems. A detailed discussion of related work on vision FMs, medical FMs, and medical distillation is provided in Section~\ref{sec:related_work}.


\section{Related Work}
\label{sec:related_work}
\subsection{Ultrasound-specific deep learning}

Before FM-style pretraining, ultrasound applications primarily relied on task-specific CNNs trained from scratch or initialized from supervised ImageNet weights~\cite{us_cnn1,us_cnn2,us_cnn3}. U-Net variants and ResNet-based encoders have been widely deployed for lesion segmentation, organ boundary detection, and view classification~\cite{us_unet_resnet1,us_unet_resnet2}. While compact enough for embedded deployment, these models are typically trained on small, single-center datasets and exhibit poor cross-domain generalization.

\subsection{Vision foundation models and knowledge distillation}

Large-scale vision foundation models including CLIP, DINOv2/DINOv3, and Segment Anything (SAM/SAM2) achieve strong zero-shot and transfer performance across classification, detection, and segmentation by pretraining on hundreds of millions to billions of natural images~\cite{fm_clip,fm_dinov2,fm_dinov3,fm_sam}. Knowledge distillation---where a large teacher supervises a smaller student via feature, logit, or attention matching---is widely used to compress such models for edge deployment~\cite{kd_hinton,kd_fitnets,kd_attention}. Recent work has distilled SAM-like segmenters and DINO-style self-supervised ViTs into compact students, showing that much of the teacher's representational power can be retained in lighter architectures~\cite{kd_sam_distill,kd_dino_distill}. These approaches, however, are almost exclusively evaluated on natural-image benchmarks.

\subsection{Foundation models in medical imaging}

Medical imaging has rapidly adopted foundation-model pretraining, with adaptations of SAM (e.g., MedSAM, Sam2Rad) and other large encoders improving performance and label efficiency across CT, MRI, X-ray, and histopathology~\cite{medsam_paper,Sam2Rad,med_fm_ct_mri,fm_CT, med_fm_hist}. For ultrasound specifically, emerging ultrasound foundation models trained across multiple organs and anatomies demonstrate promising transfer to segmentation and classification under limited labels~\cite{usfm1,usfm2,usfm3}. Most of these models are trained on large but highly heterogeneous and often imbalanced datasets.


\subsection{Knowledge distillation in medical imaging}

Knowledge distillation has been applied in medical imaging to compress large segmentation networks, ensembles, and self-supervised encoders~\cite{med_kd_seg,med_kd_ssl1,med_kd_ssl2}. Existing techniques distill logits, intermediate feature maps, or contrastive representations, sometimes with uncertainty weighting or region-aware losses. Many prior works primarily use supervised task-specific teachers (discarding a wealth of natural-image priors), focus on a single modality or anatomy, or treat domain adaptation as downstream fine-tuning rather than an integral part of the distillation process. In ultrasound, distillation has mainly been used to compress task-specific models rather than to build general-purpose ultrasound foundation models~\cite{us_kd_task_specific}.

\subsection{Research Gap and Proposed Solution}
Existing foundation models and distillation methods do not systematically address how to preserve large-scale natural-image priors while adapting to specialized medical modalities such as ultrasound. In practice, current ultrasound approaches still trade off between heavy ViT-based FMs (high capacity but impractical for point-of-care deployment) and lightweight CNNs (easy to deploy but not generalizable). Motivated by this gap, we propose a domain-constrained, feature-level distillation framework that transfers DINOv3 token representations into a compact ResNet-50 student, using ultrasound-only data and ultrasound-aware augmentations to tailor the encoder for POCUS settings.

\section{Method}
\label{sec:method}

Our goal is to obtain a compact, ultrasound-specific encoder by distilling a large DINOv3 ViT-B/16 teacher into a ResNet-50 student using a large corpus of unlabeled ultrasound images. Fig.~\ref{fig:overview} summarizes the pipeline. In this section, we describe the ultrasound corpus, the teacher--student architecture, the feature-level distillation objective, the ultrasound-aware augmentations, and the optimization details.

\begin{figure}[t]
  \centering
  \includegraphics[width=.8\linewidth]{method_white}
  \caption{Overview of the proposed domain-constrained distillation 
framework. A frozen DINOv3 ViT-B/16 teacher produces rich 
generic visual representations that are distilled into a compact 
ResNet-50 student via a token-wise feature alignment loss 
$\mathcal{L}_{\text{distill}}$, using ultrasound-only data and 
ultrasound-aware augmentations. The resulting encoder is evaluated 
on downstream segmentation, classification, linear probing, and 
representation analysis.}
  \label{fig:overview}
\end{figure}




% \subsection{Ultrasound Corpus}
% \label{subsec:data}

% We curate a heterogeneous corpus of $\sim$160,000 unlabeled B-mode ultrasound images spanning diverse anatomies (breast, thyroid, cardiac, fetal, musculoskeletal) and acquisition settings. Images are extracted from public and institutional datasets, standardized to a fixed resolution, and cleaned by removing overlays and discarding non-ultrasound or corrupted frames. No labels are used during training, encouraging the encoder to learn transferable, anatomy-agnostic ultrasound representations.
\subsection{Ultrasound Corpus}
\label{subsec:data}

We curate a heterogeneous corpus of \(\sim\)162{,}000 unlabeled B-mode ultrasound images spanning diverse anatomies (e.g., breast, thyroid, cardiac, fetal, and musculoskeletal) and acquisition settings. The corpus is formed by converting volumetric/video ultrasound data into 2D frames: for 3D studies we export 2D slice frames, and for video clips we extract a subset of informative, non-redundant frames to reduce near-duplicates while preserving clinically meaningful content. Beyond this frame extraction, we apply minimal preprocessing, including basic cleaning (removing overlays/annotations when present) and discarding non-ultrasound or corrupted frames. A detailed list of the constituent datasets and sources is provided in Table~\ref{tab:dataset}. No labels are used during training, encouraging the encoder to learn transferable, anatomy-agnostic ultrasound representations.

% \subsection{Ultrasound Corpus}
% \label{subsec:data}

% We curate a heterogeneous corpus of $\sim$160,000 unlabeled B-mode ultrasound images spanning diverse anatomies (breast, thyroid, cardiac, fetal, musculoskeletal) and acquisition settings. Images are extracted from public and institutional datasets, standardized to a fixed resolution, and cleaned by removing overlays and discarding non-ultrasound or corrupted frames. A detailed inventory of the constituent datasets and their corresponding sources is provided in Table \ref{tab:dataset}. No labels are used during training, encouraging the encoder to learn transferable, anatomy-agnostic ultrasound representations.
% \subsection{Ultrasound corpus}
% \label{subsec:data}

% We construct a heterogeneous corpus of B-mode ultrasound images spanning multiple anatomies, scanners, and acquisition protocols. The corpus aggregates several public and institutional datasets, including breast, thyroid, cardiac, fetal, and musculoskeletal ultrasound, yielding approximately $N \approx 1.6 \times 10^5$ unlabeled 2D images.

% From each source, we extract B-mode frames, remove color or text overlays where possible, and standardize images to a fixed resolution. Obvious non-ultrasound images, calibration screens, and heavily corrupted frames are discarded via a simple quality-control step. No labels are used during distillation; labels are only required later for downstream evaluation. This design encourages the encoder to learn anatomy-agnostic ultrasound structure that can transfer across organs and tasks.
% \begin{table}[ht]
% \centering
% \scriptsize % Slightly larger than tiny for better readability
% \caption{Dataset Names and Citations}
% \label{tab:dataset_citations}
% % X columns automatically calculate width. 
% % l columns wrap tightly around the citation.
% \begin{tabularx}{\textwidth}{@{} X l X l @{}}
% \toprule
% \textbf{Dataset Name} & \textbf{Citation} & \textbf{Dataset Name} & \textbf{Citation} \\ \midrule
% \textbf{105 US Images} & \cite{hann2017algorithm} & \textbf{BUS-UCLM} & \cite{vallez2025bus} \\
% \textbf{AbdomenUS} & \cite{vitale2020improving} & \textbf{BUSBRA} & \cite{gomez2024bus} \\
% \textbf{AULI} & \cite{yiming2022annotated} & \textbf{BUS\_UC} & \cite{iqbal2024memory} \\
% \textbf{brachial\_plexus-master} & \cite{tyagi2024nerve} & \textbf{Cactus Dataset} & \cite{elmekki2025cactus} \\
% \textbf{BrEaST} & \cite{pawlowska2024curated} & \textbf{CAMUS} & \cite{leclerc2019deep} \\
% \textbf{Common Carotid Artery US} & \cite{momot2022common} & \textbf{COVID-BLUES-main} & \cite{wiedemann2025covid} \\
% \textbf{EchoCP} & \cite{xu2020imagechd} & \textbf{EchoNet-Dynamic} & \cite{ouyang2020video} \\
% \textbf{Fast-U-Net} & \cite{ashkani2022fast} & \textbf{FASS} & \cite{da2023fetal} \\
% \textbf{Fetal Echo First Trimester} & \cite{stoean2021assessment} & \textbf{Fetal Plane Zendoo} & \cite{burgos2020evaluation} \\
% \textbf{GIST514-DB} & \cite{he2022query2} & \textbf{HC} & \cite{van2018automated} \\
% \textbf{Injury localization dataset} & \cite{kumar2025novel} & \textbf{JNU-IFM} & \cite{lu2022jnu} \\
% \textbf{kidneyUS} & \cite{singla2023open} & \textbf{LUS Phantom} & \cite{mclaughlan2024lung} \\
% \textbf{MicroSeg} & \cite{shao2024micro} & \textbf{MMOTU-2D} & \cite{DBLP:journals/corr/abs-2207-06799} \\
% \textbf{MMOTU-3D} & \cite{DBLP:journals/corr/abs-2207-06799} & \textbf{PSFHS} & \cite{jieyun2024pubic} \\
% \textbf{mu-RegPro} & \cite{baum2023mr} & \textbf{S1} & \cite{guo2021segmentation} \\
% \textbf{Segthy} & \cite{kronke2022tracked} & \textbf{STMUS} & \cite{marzola2021deep} \\
% \textbf{STU Hospital} & \cite{xbhlk_STU-Hospital_2026} & \textbf{Ultrasound Fetus Dataset} & \cite{anitha2024ultrasound} \\
% \textbf{UBPD} & \cite{ding2022mallesnet} & & \\ \bottomrule
% \end{tabularx}
% \end{table}
% \begin{table}[ht]
% \centering
% % \tiny is strictly not necessary inside resizebox, but helps start small
% \tiny
% \caption{Overview of the public and institutional datasets comprising the ultrasound corpus, along with their primary references.}
% \label{tab:dataset}

% % The magic command: force the content to fit \textwidth
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}ll@{\hspace{1cm}}ll@{}}
% \toprule
% \textbf{Dataset Name} & \textbf{Reference} & \textbf{Dataset Name} & \textbf{Reference} \\ \midrule
% \textbf{105 US Images} & \cite{hann2017algorithm} & \textbf{BUS-UCLM} & \cite{vallez2025bus} \\
% \textbf{AbdomenUS} & \cite{vitale2020improving} & \textbf{BUSBRA} & \cite{gomez2024bus} \\
% \textbf{AULI} & \cite{yiming2022annotated} & \textbf{BUS\_UC} & \cite{iqbal2024memory} \\
% \textbf{Brachial Plexus} & \cite{tyagi2024nerve} & \textbf{Cactus Dataset} & \cite{elmekki2025cactus} \\
% \textbf{BrEaST} & \cite{pawlowska2024curated} & \textbf{CAMUS} & \cite{leclerc2019deep} \\
% \textbf{Common Carotid Artery US} & \cite{momot2022common} & \textbf{COVID-BLUES-main} & \cite{wiedemann2025covid} \\
% \textbf{EchoCP} & \cite{xu2020imagechd} & \textbf{EchoNet-Dynamic} & \cite{ouyang2020video} \\
% \textbf{Fast-U-Net} & \cite{ashkani2022fast} & \textbf{FASS} & \cite{da2023fetal} \\
% \textbf{Fetal Echocardiography} & \cite{stoean2021assessment} & \textbf{Fetal Plane Zendoo} & \cite{burgos2020evaluation} \\
% \textbf{GIST514-DB} & \cite{he2022query2} & \textbf{HC} & \cite{van2018automated} \\
% \textbf{Injury localization dataset} & \cite{kumar2025novel} & \textbf{JNU-IFM} & \cite{lu2022jnu} \\
% \textbf{kidneyUS} & \cite{singla2023open} & \textbf{LUS Phantom} & \cite{mclaughlan2024lung} \\
% \textbf{MicroSeg} & \cite{shao2024micro} & \textbf{MMOTU-2D} & \cite{DBLP:journals/corr/abs-2207-06799} \\
% \textbf{MMOTU-3D} & \cite{DBLP:journals/corr/abs-2207-06799} & \textbf{PSFHS} & \cite{jieyun2024pubic} \\
% \textbf{mu-RegPro} & \cite{baum2023mr} & \textbf{S1} & \cite{guo2021segmentation} \\
% \textbf{Segthy} & \cite{kronke2022tracked} & \textbf{STMUS} & \cite{marzola2021deep} \\
% \textbf{STU Hospital} & \cite{xbhlk_STU-Hospital_2026} & \textbf{Ultrasound Fetus} & \cite{anitha2024ultrasound} \\
% \textbf{UBPD} & \cite{ding2022mallesnet} & & \\ \bottomrule
% \end{tabular}%
% }
% \end{table}
\begin{table}[ht]
\centering
\tiny
\caption{Overview of the public and institutional datasets comprising the ultrasound corpus, listed with their references and target anatomy.}
\label{tab:dataset}
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}ll@{\hspace{1cm}}ll@{}}
\toprule
\textbf{Dataset (Ref)} & \textbf{Anatomy} & \textbf{Dataset (Ref)} & \textbf{Anatomy} \\ \midrule
\textbf{105 US Images} \cite{hann2017algorithm} & Liver & \textbf{LUS Phantom} \cite{mclaughlan2024lung} & Lung \\
\textbf{AbdomenUS} \cite{vitale2020improving} & Abdomen & \textbf{MicroSeg} \cite{shao2024micro} & Prostate \\
\textbf{AULI} \cite{yiming2022annotated} & Liver & \textbf{MMOTU-2D} \cite{DBLP:journals/corr/abs-2207-06799} & Ovary \\
\textbf{Brachial Plexus} \cite{tyagi2024nerve} & Nerve & \textbf{MMOTU-3D} \cite{DBLP:journals/corr/abs-2207-06799} & Ovary \\
\textbf{BrEaST} \cite{pawlowska2024curated} & Breast & \textbf{PSFHS} \cite{jieyun2024pubic} & Fetal Head \\
\textbf{BUS-UCLM} \cite{vallez2025bus} & Breast & \textbf{mu-RegPro} \cite{baum2023mr} & Prostate \\
\textbf{BUSBRA} \cite{gomez2024bus} & Breast & \textbf{S1} \cite{guo2021segmentation} & Breast \\
\textbf{BUS\_UC} \cite{iqbal2024memory} & Breast & \textbf{Segthy} \cite{kronke2022tracked} & Thyroid \\
\textbf{Cactus Dataset} \cite{elmekki2025cactus} & Cardiac & \textbf{STMUS} \cite{marzola2021deep} & MSK \\
\textbf{CAMUS} \cite{leclerc2019deep} & Cardiac & \textbf{STU Hospital} \cite{xbhlk_STU-Hospital_2026} & Thyroid \\
\textbf{Carotid Artery} \cite{momot2022common} & Carotid Artery & \textbf{US Fetus} \cite{anitha2024ultrasound} & Fetus \\
\textbf{COVID-BLUES} \cite{wiedemann2025covid} & Lung & \textbf{UBPD} \cite{ding2022mallesnet} & Brachial Plexus \\
\textbf{EchoCP} \cite{xu2020imagechd} & Cardiac & \textbf{HC} \cite{van2018automated} & Fetal Head \\
\textbf{EchoNet-Dynamic} \cite{ouyang2020video} & Cardiac & \textbf{JNU-IFM} \cite{lu2022jnu} & Fetal Head \\
\textbf{Fast-U-Net} \cite{ashkani2022fast} & Fetal Head & \textbf{KidneyUS} \cite{singla2023open} & Kidney \\
\textbf{FASS} \cite{da2023fetal} & Fetal Abdomen & \textbf{GIST514-DB} \cite{he2022query2} & Gastrointestinal \\
\textbf{Fetal Echo} \cite{stoean2021assessment} & Fetal Heart & \textbf{Injury Loc.} \cite{kumar2025novel} & Spinal Cord \\
\textbf{Fetal Plane} \cite{burgos2020evaluation} & Multi-organ & & \\ \bottomrule
\end{tabular}%
}
\end{table}
\subsection{Teacher and student architectures}
\label{subsec:arch}

We adopt a high-capacity DINOv3 vision transformer as the teacher and a compact ResNet as the student.

% \paragraph{Teacher: DINOv3 ViT-B/16.}
% The teacher encoder $T$ is a DINOv3 ViT-B/16 model pre-trained self-supervised on $\approx 1.7$~billion natural images. For an input image $x \in \mathbb{R}^{3 \times 224 \times 224}$ (grayscale replicated to three channels), the ViT processes non-overlapping $16 \times 16$ patches and produces a sequence of token embeddings
% \begin{equation}
%     Z_T(x) \in \mathbb{R}^{N_{\text{tok}} \times d_T},
% \end{equation}
% where $N_{\text{tok}}$ is the number of image tokens and $d_T$ is the teacher embedding dimension. In our setup, token features from $n$ intermediate transformer blocks (we set $n = 2$) are extracted and aggregated to form the teacher representation used for distillation.

\paragraph{Teacher: DINOv3 ViT-B/16.}
The teacher encoder $T$ is a DINOv3 ViT-B/16 model pre-trained 
self-supervised on $\approx 1.7$~billion natural images. For an 
input image $x \in \mathbb{R}^{3 \times 224 \times 224}$ (grayscale replicated to three channels), the ViT processes non-overlapping $16{\times}16$ pixel patches, yielding a $14{\times}14$ grid of $N_{\text{tok}} = 196$ patch token embeddings. Patch token embeddings from the last $n=2$ transformer blocks are extracted, concatenated along the feature dimension, and flattened in raster-scan order to produce:
\begin{equation}
    Z_T(x) \in \mathbb{R}^{196 \times 2d_T},
\end{equation}
where $d_T = 768$ is the ViT-B embedding dimension, giving a final teacher representation of dimension $2d_T = 1536$. The teacher is frozen throughout training.

% \paragraph{Student: ResNet-50.}
% The student encoder $S$ is a standard ResNet-50. Given $x$, the student produces a convolutional feature map
% \begin{equation}
%     F_S(x) \in \mathbb{R}^{C_S \times H_S \times W_S}.
% \end{equation}
% This feature map is reshaped into a sequence of spatial tokens and passed through a small projection head $g_\phi$ (an MLP with two layers and a hidden dimension of 4096) to match the teacher embedding dimension:
% \begin{equation}
%     Z_S(x) = g_\phi\big(\text{flatten}(F_S(x))\big) \in \mathbb{R}^{N_{\text{tok}} \times d_T}.
% \end{equation}
% The ViT-B/16 teacher has $\sim 86$M parameters, whereas the ResNet-50 student has $\sim 25$M parameters, yielding a $\sim 3$--$4\times$ reduction in parameter count and model size.

\paragraph{Student: ResNet-50.}
The student encoder $S$ is a standard ResNet-50. Given $x$, the 
student produces a convolutional feature map
\begin{equation}
    F_S(x) \in \mathbb{R}^{C_S \times H_S \times W_S}.
\end{equation}
This feature map is passed through a projection head $g_\phi$, which 
applies a single linear layer mapping each spatial location to the teacher 
representation dimension $2d_T = 1536$, bilinearly upsamples the result to the teacher's $14{\times}14$ 
resolution, and flattens it in raster-scan order:
\begin{equation}
    Z_S(x) = g_\phi\bigl(F_S(x)\bigr) \in \mathbb{R}^{196 \times 2d_T}.
\end{equation}
The ViT-B/16 teacher has $\sim$86M parameters, whereas the ResNet-50 
student has $\sim$25M parameters, yielding a $\sim3$--$4\times$ 
reduction in parameter count and model size.
% \subsection{Feature-level distillation}
% \label{subsec:distillation}

% We use a feature-level knowledge distillation scheme that aligns teacher and student token embeddings on unlabeled ultrasound images. For each image $x$, we apply a stochastic augmentation $a(\cdot)$ to obtain $\tilde{x} = a(x)$ and feed the same view to both teacher and student:
% \begin{align}
%     Z_T &= Z_T(\tilde{x}) = T(\tilde{x}) \in \mathbb{R}^{N_{\text{tok}} \times d_T}, \\
%     Z_S &= Z_S(\tilde{x}) = g_\phi\big(\text{flatten}(F_S(\tilde{x}))\big) \in \mathbb{R}^{N_{\text{tok}} \times d_T}.
% \end{align}

\subsection{Feature-level distillation}
\label{subsec:distillation}

We use a feature-level knowledge distillation scheme that aligns 
teacher and student token embeddings on unlabeled ultrasound images. 
For each image $x$, we apply a stochastic augmentation $a(\cdot)$ 
to obtain $\tilde{x} = a(x)$ and feed the same view to both teacher 
and student:
\begin{align}
    Z_T &= Z_T(\tilde{x}) = T(\tilde{x}) 
          \in \mathbb{R}^{196 \times 2d_T}, \\
    Z_S &= Z_S(\tilde{x}) = g_\phi\bigl(F_S(\tilde{x})\bigr) 
          \in \mathbb{R}^{196 \times 2d_T}.
\end{align}

% \paragraph{Token Alignment.}
% To ensure spatial consistency during distillation, we align the ViT-B/16 teacher’s $16{\times}16$ token grid with the ResNet-50 student’s output. We extract $n$ intermediate feature maps from the teacher, resize them if necessary, and concatenate them into a unified token sequence. The student’s $8{\times}8$ feature map is projected via an MLP and upsampled to match the teacher’s resolution. Both outputs are reshaped into sequences and supervised using a mean squared error loss, enabling effective token-wise alignment without modifying either backbone.



\paragraph{Token Alignment.}
To ensure spatial consistency during distillation, patch token 
embeddings from the last $n=2$ transformer blocks of the ViT-B/16 
teacher are extracted and spatially resized if necessary to match 
the last block's $14{\times}14$ resolution. They are then 
concatenated along the feature dimension into a unified 
representation of shape $(B,\;2d_T,\;14,\;14)$, where $B$ is the batch size, and flattened 
in raster-scan order into $(B,\;196,\;2d_T)$. The ResNet-50 
student's convolutional feature map is projected via a single 
linear layer and bilinearly upsampled to match the teacher's $14{\times}14$ resolution, then similarly 
flattened in the same raster-scan order into $(B,\;196,\;2d_T)$. The student sequence is supervised by the teacher sequence using a token-wise MSE loss; no feature normalization is applied, and 
neither backbone architecture is modified.
% \paragraph{Distillation loss.}
% The distillation loss is implemented as a token-wise mean squared error (MSE) between teacher and student embeddings. Let $Z_T(i) \in \mathbb{R}^{d_T}$ and $Z_S(i) \in \mathbb{R}^{d_T}$ denote the embeddings of the $i$-th token in the sequence. The loss for one image is
% \begin{equation}
%     \mathcal{L}_{\text{distill}}(x)
%     =
%     \frac{1}{N_{\text{tok}}}
%     \sum_{i=1}^{N_{\text{tok}}}
%     \big\| Z_T(i) - Z_S(i) \big\|_2^2,
%     \label{eq:distill_loss}
% \end{equation}
% and the batch loss is obtained by averaging \eqref{eq:distill_loss} across the mini-batch. The loss returns a single scalar and is exactly zero when teacher and student features are identical. We do not use any additional logit-based distillation or contrastive loss; all supervision is mediated through the teacher token embeddings.

\paragraph{Distillation loss.}
The distillation loss is implemented as a token-wise mean squared 
error~(MSE) between teacher and student embeddings without feature 
normalization. Let $Z_T(i) \in \mathbb{R}^{2d_T}$ and 
$Z_S(i) \in \mathbb{R}^{2d_T}$ denote the embeddings of the $i$-th 
token in the sequence, where $i = 14r + c + 1$ indexes the 
spatial location $(r,c)$, with $r, c \in \{0, \dots, 13\}$, in the $14{\times}14$ grid. Spatial 
correspondence between the $i$-th teacher and student tokens is 
enforced by construction: both representations are reshaped to 
the same $14{\times}14$ grid prior to flattening in raster-scan 
order, ensuring that token $i$ in both sequences corresponds to 
the same image region. The loss for one image is
\begin{equation}
    \mathcal{L}_{\text{distill}}(x)
    =
    \frac{1}{196}
    \sum_{i=1}^{196}
    \big\| Z_T(i) - Z_S(i) \big\|_2^2,
    \label{eq:distill_loss}
\end{equation}
and the batch loss is obtained by averaging \eqref{eq:distill_loss} 
across the mini-batch. We 
do not use any additional logit-based distillation or contrastive 
loss; all supervision is mediated through the teacher token 
embeddings.
% \paragraph{Mixup regularization.}
% We further apply image-level mixup within each batch as a regularizer. Given two images $x_a$ and $x_b$ and a mixing coefficient $\lambda \sim \mathcal{U}(0,1)$, the mixed input is
% \begin{equation}
%     \tilde{x}_{\text{mix}} = \lambda \tilde{x}_a + (1 - \lambda)\tilde{x}_b,
% \end{equation}
% and the corresponding teacher and student features are linearly interpolated. This encourages smoother transitions in feature space and improves stability during training with large batches.
\paragraph{Mixup regularization.}
We further apply image-level mixup within each batch as a 
regularizer. Given two images $x_a$ and $x_b$ and a mixing 
coefficient $\lambda \sim \mathcal{U}(0,1)$, the mixed input is
\begin{equation}
    \tilde{x}_{\text{mix}} = \lambda \tilde{x}_a + 
    (1 - \lambda)\tilde{x}_b.
\end{equation}
The mixed image is then forwarded through both teacher and student, 
so that the resulting token embeddings implicitly reflect the 
convex combination of the two input images. This encourages 
smoother transitions in feature space and improves stability 
during training with large batches.

\subsection{Ultrasound-aware data augmentations}
\label{subsec:augs}

We restrict distillation to transformations that reflect real ultrasound acquisition. Images are resized and cropped to mimic natural variation in zoom and field-of-view; horizontal flips are allowed, but vertical flips are excluded because ultrasound probes have a fixed orientation relative to the skin surface, making upside-down views physically impossible in clinical practice. Mild Gaussian blur models depth-dependent defocus, and color jitter is removed because ultrasound is inherently grayscale. Since the teacher is frozen, these choices ensure that the student learns invariances tied to actual probe motion and imaging physics, forming a strictly ``domain-constrained'' augmentation pipeline.
% \subsection{Ultrasound-aware data augmentations}
% \label{subsec:augs}

% To constrain distillation to physically plausible transformations, we use an ultrasound-aware augmentation pipeline. Each image is resized and randomly cropped to a fixed $256 \times 256$ resolution to mimic zooming and moderate changes in the field-of-view due to probe repositioning. We then apply horizontal flips with moderate probability, while disabling vertical flips, reflecting realistic left--right probe orientation changes without introducing anatomically implausible upside-down views. A mild Gaussian blur is added to capture focus variation and smoothing effects commonly observed in B-mode imaging. Grayscale jitter is disabled, since ultrasound images are inherently grayscale and color jitter does not correspond to a meaningful acquisition change. The augmented image is converted to a PyTorch tensor and used as the single view for both teacher and student. Together with the ultrasound-only corpus, these augmentations define our ``domain-constrained'' setup: the model learns invariance to acquisition-related factors such as zoom, mild blur, and horizontal orientation, while remaining faithful to ultrasound physics.


% \subsection{Optimization and implementation details}
% \label{subsec:optim}

% We train the student encoder using mini-batch stochastic optimization on the unlabeled ultrasound corpus. Unless otherwise stated, all experiments use a ResNet-50 student with default torchvision initialization and a DINOv3 ViT-B/16 teacher with publicly available pre-trained weights. The projection head is a two-layer MLP with hidden dimension 4096 that maps student tokens to the teacher embedding space, and features from $n = 2$ intermediate ViT blocks are used as teacher tokens. We employ a global batch size of 512 images, distributed across all available GPUs on a single node, and train for 1000 epochs of distillation. Optimization is performed with AdamW, using a learning rate of $1 \times 10^{-4}$ and weight decay of $0.05$, in mixed bfloat16 precision on NVIDIA GPUs. During training, we periodically save model checkpoints every 100 epochs and export deployment-ready student weights every 200 epochs. Interrupted runs are resumed to completion to fully utilize compute budget. We implement this training procedure using the DistillationV2 module from the LightlyTrain library, but the approach itself is framework-agnostic.



\subsection{Optimization and implementation details}
\label{subsec:optim}

We distill a ResNet-50 student from a DINOv3 ViT-B/16 teacher using mini-batch training on the unlabeled ultrasound corpus. A lightweight two-layer MLP projects student features into the teacher embedding space, and tokens from two intermediate ViT blocks are used as supervision. Training runs for 1000 epochs with AdamW (learning rate $1\times10^{-4}$, weight decay $0.05$), batch size 512, and mixed bfloat16 precision on NVIDIA L40 GPUs.

\section{Experiments}
\label{sec:experiments}

\subsection{Tasks and datasets}
\label{subsec:datasets}

We evaluate the proposed encoder on two segmentation tasks and one classification task. DDTI is a thyroid nodule segmentation dataset of 637 B-mode ultrasound images with expert pixel-level nodule masks~\cite{pedraza2015ddti}. We use 445 images for training, 127 for validation, and 65 for testing, with splits constructed at the patient level to avoid leakage. BUSI~\cite{BUSI} is a breast ultrasound dataset of 780 images with lesion masks and image-level labels. For BUSI segmentation, we use the provided binary lesion masks for benign, malignant, and normal cases and create train, validation, and test splits analogous to DDTI. For BUSI classification, we define a three-class problem with normal (133 images), benign (437 images), and malignant (210 images), using disjoint class-stratified train, validation, and test sets built from class-wise folders. Importantly, neither DDTI nor BUSI is included in the unlabeled ultrasound corpus used for distillation, ensuring that downstream evaluation is performed on held-out datasets not seen during the distillation stage.

% \subsection{Models and baselines}
% \label{subsec:models}

% All downstream experiments are based on ResNet-50 backbones under four initialization schemes: (i) \textbf{R50-Rand}, a ResNet-50 trained from scratch; (ii) \textbf{R50-ImgNet}, a ResNet-50 initialized from supervised ImageNet-1k weights; (iii) \textbf{R50-Distill-Default} (ours), a ResNet-50 obtained by distilling a DINOv3 ViT-B/16 teacher using default natural-image augmentations; and (iv) \textbf{R50-Distill-US} (ours), a ResNet-50 obtained by distilling the same teacher on an ultrasound-only corpus with ultrasound-aware augmentations as described in Section~\ref{sec:method}. For segmentation, each ResNet-50 variant is used as the encoder in the same U-Net-style architecture implemented with Segmentation Models PyTorch, so that only the backbone initialization differs. As a high-capacity reference, we also fine-tune a DINOv3 ViT-B/16 backbone with a lightweight segmentation head. For BUSI classification, we use a standard ResNet-50 classifier (global average pooling followed by a linear head) with the four initializations above.

\subsection{Models and baselines}
\label{subsec:models}

All downstream experiments use ResNet-50 backbones with three initialization schemes: (i) \textbf{R50-Rand}, where weights are randomly initialized at the start of downstream training (no pretraining) and the model is then trained end-to-end using the same downstream protocol as all baselines; (ii) \textbf{R50-Distill-Default} (ours), initialized from a ResNet-50 distilled from a DINOv3 ViT-B/16 teacher on the curated ultrasound-only corpus using default ImageNet augmentations; and (iii) \textbf{R50-Distill-US} (ours), initialized from the same distillation procedure but using ultrasound-aware augmentations on the same ultrasound-only corpus, as described in Section~\ref{sec:method}. For segmentation, each ResNet-50 variant is used as the encoder in the same U-Net-style architecture implemented with Segmentation Models PyTorch, so that only the backbone initialization differs. As a high-capacity reference, we also fine-tune a DINOv3 ViT-B/16 backbone with a lightweight segmentation head. For BUSI classification, we use a standard ResNet-50 classifier (global average pooling followed by a linear head) with the three initializations above.
% \subsection{Training protocols and evaluation metrics}
% \label{subsec:protocols}

% For DDTI and BUSI segmentation, all models use the same U-Net decoder with a ResNet-50 or ViT-B/16 encoder at a fixed input resolution. Training uses standard geometric augmentations, while validation/test images undergo only resizing and normalization. Linear probing freezes the encoder to isolate representation quality, and full fine-tuning updates all parameters under identical optimization settings. For BUSI classification, we fine-tune a ResNet-50 with a three-way output head using the same preprocessing pipeline. All initialization variants (R50-Rand, R50-Distill-Default, R50-Distill-US) share identical training schedules, and model selection is based on validation accuracy. For segmentation on DDTI and BUSI, we report mean Dice coefficient and mean Intersection-over-Union (mIoU) on the held-out test sets. For BUSI classification, we report overall test accuracy and macro-averaged F1 score.


\subsection{Training Protocols, Implementation Details, and Evaluation Metrics}
\label{subsec:protocols}

\textbf{Training Protocols.} We evaluate models under two protocols: (1) \emph{Linear Probing}, where the encoder is frozen and only a linear head is trained; and (2) \emph{Full Fine-Tuning}, where the backbone and head are trained end-to-end. All decoders/heads are initialized from scratch during fine-tuning. For fairness, all baselines use the same data splits and task-specific training recipe, with a single learning rate for backbone and head (no layer-wise LR decay). All experiments are conducted on a single NVIDIA L40 GPU on the Compute Canada \emph{Vulcan} cluster.


\textbf{Implementation Details.} All models (DINOv3 teacher, distilled ResNet-50 student, and downstream baselines) are trained using 3-channel inputs to match standard pretrained backbones. Since ultrasound images are grayscale, we convert each image to a 3-channel tensor by channel replication (i.e., copying the same intensity map to R/G/B). For segmentation, we fine-tune for 50 epochs at $256{\times}256$ (batch size 32) using AdamW (lr=$10^{-4}$) with cosine warm restarts and loss $0.8\,\mathcal{L}_{\text{Dice}}+0.2\,\mathcal{L}_{\text{Focal}}$. For classification, we fine-tune for 50 epochs at $224{\times}224$ (batch size 16) using AdamW (lr=$10^{-4}$) with cosine warm restarts and cross-entropy.


\textbf{Evaluation Metrics.}
Segmentation performance is evaluated using the Mean Dice Similarity Coefficient (DSC) and Mean Intersection over Union (mIoU), computed \emph{per-image} for the foreground class and averaged over the test set.
% Let $P_i$ and $G_i$ denote the predicted and ground-truth masks:
% \begin{equation}
% \mathrm{DSC} = \frac{1}{N} \sum_{i=1}^{N} \frac{2|P_i \cap G_i|}{|P_i| + |G_i|}, \quad
% \mathrm{mIoU} = \frac{1}{N} \sum_{i=1}^{N} \frac{|P_i \cap G_i|}{|P_i \cup G_i|}.
% \end{equation}
For classification, we report Accuracy and F1-score. To account for class imbalance, the F1-score is computed via \emph{macro-averaging}: precision, recall, and the resulting F1-score are calculated for each class $c$ independently, and the per-class F1-scores are averaged with equal weight over the $C$ classes ($\text{F1} = \frac{1}{C} \sum_{c=1}^{C} \text{F1}_c$).
% \subsection{Training protocols}
% \label{subsec:protocols}

% For DDTI (thyroid) and BUSI (breast) segmentation, we use a standard U-Net architecture with a ResNet-50 encoder at fixed input resolution. Training images are augmented with common geometric and intensity perturbations and then normalized; validation and test images are only resized and normalized. All ResNet-50 backbones share the same optimization and learning-rate schedule, and in the main comparison the encoder is frozen (linear probing) so that differences across models primarily reflect representation quality. As a high-capacity reference, we also fine-tune a DINOv3 ViT-B/16 backbone with a lightweight segmentation head under an analogous setup.

% For BUSI classification, we fine-tune a ResNet-50 classifier with a three-way output layer for normal, benign, and malignant lesions. Images are resized to a fixed resolution, augmented with light geometric and photometric transformations during training, and normalized consistently with the segmentation pipeline. Models are trained end-to-end with standard cross-entropy optimization, using identical training settings across all initialization schemes (R50-Rand, R50-Distill-Default, R50-Distill-US). Model selection is based on validation performance, and we report test results from the best checkpoint. All models are implemented in PyTorch and trained on NVIDIA L40 GPUs.

% \subsection{Evaluation metrics}
% \label{subsec:metrics}

% \textbf{Segmentation.} We report \emph{Mean Dice} and \emph{Mean IoU} computed \emph{per image} on the test set. For each image, predicted probability maps are converted to a binary mask using the same post-processing for all models, and we compute
% $\mathrm{Dice}=\frac{2|P\cap G|}{|P|+|G|}$ and $\mathrm{IoU}=\frac{|P\cap G|}{|P\cup G|}$,
% where $P$ and $G$ denote the predicted and ground-truth foreground masks. We then average each metric across all test images to obtain the reported mean values. Since the task is binary, metrics are computed for the foreground only (background is not treated as a separate class).

% \textbf{Classification.} We report \emph{Accuracy} and \emph{F1 score} on the test set. Accuracy is computed as the fraction of correctly predicted labels. F1 is computed from precision and recall on the predicted labels and reported as the dataset-level score.




% \section{Results}
% \label{sec:results}

% \subsection{Linear probing on frozen encoders}
% \label{subsec:results_linear}

% Table~\ref{tab:linear_probing} reports linear-probe performance, where only the segmentation/classification heads are trained and all encoders are frozen. On DDTI thyroid nodule segmentation, the DINOv3-distilled ResNet--50 with default natural-image augmentations slightly outperforms ImageNet pretraining in Dice (0.7378 vs.\ 0.7273) and IoU (0.5999 vs.\ 0.5892), and closely matches the DINOv3 ViT teacher (Dice 0.7391). On BUSI segmentation, both distilled variants improve over random initialization, and the ultrasound-aware distilled encoder achieves the best Dice (0.6083) and IoU (0.4666), indicating better transfer of lesion morphology under purely frozen features. For BUSI three-way classification, ImageNet pretraining attains the highest accuracy (0.7452), while the default distilled model provides the best macro F1 (0.7037), suggesting more balanced performance across classes.

% \begin{table}[t]
% \centering
% \caption{Linear probing on frozen encoders. DDTI and BUSI segmentation are evaluated by mean Dice and mean IoU. BUSI classification is a 3-way task (normal/benign/malignant) evaluated by accuracy and macro F1.}
% \label{tab:linear_probing}
% \scriptsize
% \begin{tabular}{lcccccc}
% \toprule
% \multirow{2}{*}{Model} &
% \multicolumn{2}{c}{DDTI Seg.} &
% \multicolumn{2}{c}{BUSI Seg.} &
% \multicolumn{2}{c}{BUSI Cls.} \\
%  & Dice & IoU & Dice & IoU & Acc & F1 \\
% \midrule
% ResNet-50 (Random Weight) & 0.6177 & 0.4639 & 0.4090 & 0.3093 & 0.5605 & 0.4768 \\
% ResNet-50 (ImageNet Weight) & 0.7273 & 0.5892 & 0.5255 & 0.4180 & \textbf{0.7452} & \textbf{0.7037} \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) & \textbf{0.7378} & \textbf{0.5999} & 0.5771 & 0.4329 & 0.7325 & \textbf{0.7037} \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.7334 & 0.5864 & \textbf{0.6083} & \textbf{0.4666} & \textbf{0.7452} & 0.6768 \\
% DINOv3 ViT-B/16 (Teacher) & 0.7391 & 0.6033 & 0.2384 & 0.1729 & -- & -- \\
% \bottomrule
% \end{tabular}
% \end{table}

% \subsection{Full fine-tuning on all labeled data}
% \label{subsec:results_full_finetune}

% Table~\ref{tab:full_finetune} summarizes full fine-tuning performance when all encoder parameters are updated. On DDTI, ImageNet pretraining achieves the highest Dice (0.7953), but the ultrasound-aware distilled encoder is essentially tied (0.7933) and exceeds the DINOv3 teacher (0.7821), showing that billion-scale ViT priors can be preserved in a 25M-parameter CNN without sacrificing thyroid segmentation quality. On BUSI segmentation, the ultrasound-aware distilled encoder yields the best Dice (0.7571) and IoU (0.6271), clearly outperforming both ImageNet pretraining (Dice 0.7233) and random initialization (Dice 0.5525). For BUSI classification, the same encoder also gives the strongest performance, with accuracy 0.8918 and macro F1 0.8871, improving over the ImageNet baseline (accuracy 0.8662, F1 0.8572). These results indicate that ultrasound-specific distillation is particularly beneficial for breast lesion segmentation and classification.

% \begin{table}[t]
% \centering
% \caption{Full fine-tuning on all labeled data. Metrics as in Table~\ref{tab:linear_probing}.}
% \label{tab:full_finetune}
% \scriptsize
% \begin{tabular}{lcccccc}
% \toprule
% \multirow{2}{*}{Model} &
% \multicolumn{2}{c}{DDTI Seg.} &
% \multicolumn{2}{c}{BUSI Seg.} &
% \multicolumn{2}{c}{BUSI Cls.} \\
%  & Dice & IoU & Dice & IoU & Acc & F1 \\
% \midrule
% ResNet-50 (Random Weight) & 0.6605 & 0.5297 & 0.5525 & 0.4629 & 0.6561 & 0.6526 \\
% ResNet-50 (ImageNet Weight) & \textbf{0.7953} & \textbf{0.6902} & 0.7233 & \textbf{0.6416} & 0.8662 & 0.8572 \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.7832 & 0.6768 & 0.7315 & 0.6097 & 0.8726 & 0.8632 \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.7933 & 0.6856 & \textbf{0.7571} & 0.6271 & \textbf{0.8918} & \textbf{0.8871} \\
% DINOv3 ViT-B/16 (Teacher) & 0.7821 & 0.6707 & 0.2838 & 0.1957 & -- & -- \\
% \bottomrule
% \end{tabular}
% \end{table}

% \subsection{Limited-label regimes}
% \label{subsec:results_limited}

% To study label-efficiency, we fine-tune each ResNet-50 encoder on 5\%, 10\%, 20\%, and 50\% of the labeled data for both segmentation and classification. Table~\ref{tab:limited_data} aggregates Dice for DDTI and BUSI segmentation together with BUSI classification accuracy at each label fraction.

% For DDTI segmentation, ImageNet pretraining remains a strong baseline across all label fractions, but the ultrasound-aware distilled encoder achieves the highest Dice at 10\% and 20\% labels (0.6574 and 0.7309), narrowing or surpassing the ImageNet gap in the low--mid label regime. For BUSI segmentation, all pretrained models substantially outperform random initialization, with the ImageNet encoder slightly ahead at 50\% labels (Dice 0.6781), while the distilled variants are competitive at 10--20\% labels.

% For BUSI classification, the benefits of ultrasound-aware distillation are more pronounced. At 5\% labels, the ultrasound-specific encoder already gives the highest accuracy (0.7197 vs.\ 0.6943 for ImageNet), and it maintains a clear lead at 20\% (0.8025 vs.\ 0.7134) and 50\% (0.8726 vs.\ 0.8217). These trends suggest that domain-constrained distillation is particularly effective in POCUS-like low-label classification settings, where both data and compute are constrained.

% % \begin{table}[t]
% % \centering
% % \caption{Limited-data performance for different label fractions. We report segmentation Dice on DDTI (thyroid) and BUSI (breast), and BUSI 3-way classification accuracy, when fine-tuning on 5\%, 10\%, 20\%, and 50\% of labeled data.}
% % \label{tab:limited_data}
% % \scriptsize
% % \begin{tabular}{lcccccccccccc}
% % \toprule
% % & \multicolumn{4}{c}{DDTI Dice} & \multicolumn{4}{c}{BUSI Dice} & \multicolumn{4}{c}{BUSI Acc} \\
% % Model & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% \\
% % \midrule
% % ResNet-50 (Random Weight) &
% % 0.4269 & 0.5237 & 0.5597 & 0.6007 &
% % 0.1911 & 0.3381 & 0.3895 & 0.4038 &
% % 0.2675 & 0.5605 & 0.6242 & 0.6561 \\
% % ResNet-50 (ImageNet Weight) &
% % \textbf{0.5601} & 0.6466 & 0.6889 & \textbf{0.7721} &
% % \textbf{0.3920} & \textbf{0.4835} & \textbf{0.5495} & \textbf{0.6781} &
% % 0.6943 & \textbf{0.6688} & 0.7134 & 0.8217 \\
% % ResNet-50 (DINOv3 Distilled, Default Aug.) &
% % 0.4958 & 0.6536 & 0.7109 & 0.7672 &
% % 0.2494 & \textbf{0.4857} & 0.5011 & 0.6185 &
% % 0.4968 & 0.5860 & 0.6688 & 0.7452 \\
% % ResNet-50 (DINOv3 Distilled, US-Specific Aug.) &
% % 0.5233 & \textbf{0.6574} & \textbf{0.7309} & 0.7661 &
% % 0.2752 & 0.4600 & 0.5036 & 0.5543 &
% % \textbf{0.7197} & 0.6242 & \textbf{0.8025} & \textbf{0.8726} \\
% % \bottomrule
% % \end{tabular}
% % \end{table}
% \begin{table*}[t]
% \centering
% \caption{Limited-data performance for different label fractions. We report segmentation Dice on DDTI (thyroid) and BUSI (breast), and BUSI 3-way classification accuracy, when fine-tuning on 5\%, 10\%, 20\%, and 50\% of labeled data.}
% \label{tab:limited_data}
% \setlength{\tabcolsep}{3pt}  % tighten column spacing
% \scriptsize
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{lcccccccccccc}
% \toprule
% & \multicolumn{4}{c}{DDTI Dice} & \multicolumn{4}{c}{BUSI Dice} & \multicolumn{4}{c}{BUSI Acc} \\
% Model & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% \\
% \midrule
% ResNet-50 (Random Weight) &
% 0.4269 & 0.5237 & 0.5597 & 0.6007 &
% 0.1911 & 0.3381 & 0.3895 & 0.4038 &
% 0.2675 & 0.5605 & 0.6242 & 0.6561 \\
% ResNet-50 (ImageNet Weight) &
% \textbf{0.5601} & 0.6466 & 0.6889 & \textbf{0.7721} &
% \textbf{0.3920} & \textbf{0.4835} & \textbf{0.5495} & \textbf{0.6781} &
% 0.6943 & \textbf{0.6688} & 0.7134 & 0.8217 \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) &
% 0.4958 & 0.6536 & 0.7109 & 0.7672 &
% 0.2494 & \textbf{0.4857} & 0.5011 & 0.6185 &
% 0.4968 & 0.5860 & 0.6688 & 0.7452 \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) &
% 0.5233 & \textbf{0.6574} & \textbf{0.7309} & 0.7661 &
% 0.2752 & 0.4600 & 0.5036 & 0.5543 &
% \textbf{0.7197} & 0.6242 & \textbf{0.8025} & \textbf{0.8726} \\
% \bottomrule
% \end{tabular}%
% }
% \end{table*}

% \subsection{Capacity--performance trade-off}
% \label{subsec:capacity_tradeoff}

% To explicitly quantify the deployment--performance trade-off, Table~\ref{tab:capacity_tradeoff} compares the heavy ViT teacher with the most relevant ResNet-50 baselines in terms of parameter count and full fine-tuning performance. The DINOv3 ViT-B/16 backbone has roughly 86\,M parameters, while ResNet-50 has about 25\,M parameters. Despite this $\sim 3\times$ reduction in capacity, our ultrasound-aware distilled encoder closely matches the teacher on DDTI and clearly improves over the ImageNet baseline on BUSI segmentation and classification, achieving the best BUSI classification accuracy overall. This supports our framing of domain-constrained distillation as a way to preserve billion-scale ViT priors in a form suitable for point-of-care deployment.

% \begin{table}[t]
% \centering
% \caption{Model capacity vs.\ performance after full fine-tuning. Parameter counts are approximate.}
% \label{tab:capacity_tradeoff}
% \scriptsize
% \begin{tabular}{lcccc}
% \toprule
% Model & Params (M) & DDTI Dice & BUSI Dice & BUSI Acc \\
% \midrule
% DINOv3 ViT-B/16 (Teacher) & 86 & 0.7821 & 0.2838 & -- \\
% ResNet-50 (ImageNet Weight) & 25 & \textbf{0.7953} & 0.7233 & 0.8662 \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 25 & 0.7933 & \textbf{0.7571} & \textbf{0.8918} \\
% \bottomrule
% \end{tabular}
% \end{table}


% \section{Results}
% \label{sec:results}

% \subsection{Linear probing on frozen encoders}
% \label{subsec:results_linear}

% Table~\ref{tab:linear_probing} reports linear-probe performance, where only the segmentation/classification heads are trained and all encoders are frozen. On DDTI thyroid nodule segmentation, the DINOv3-distilled ResNet--50 with default natural-image augmentations nearly matches the ViT teacher (Dice 0.7378 vs.\ 0.7391; IoU 0.5999 vs.\ 0.6033), while both clearly outperform a randomly initialized backbone. On BUSI segmentation, both distilled variants improve substantially over random initialization, and the ultrasound-aware distilled encoder achieves the best Dice (0.6083) and IoU (0.4666), indicating better transfer of lesion morphology under purely frozen features. For BUSI three-way classification, both distilled models outperform training from scratch; the US-specific encoder yields the highest accuracy (0.7452), whereas the default distilled encoder attains the best macro F1 (0.7037), suggesting more balanced performance across classes.

% \begin{table}[t]
% \centering
% \caption{Linear probing on frozen encoders. DDTI and BUSI segmentation are evaluated by mean Dice and mean IoU. BUSI classification is a 3-way task (normal/benign/malignant) evaluated by accuracy and macro F1.}
% \label{tab:linear_probing}
% \scriptsize
% \begin{tabular}{lcccccc}
% \toprule
% \multirow{2}{*}{Model} &
% \multicolumn{2}{c}{DDTI Seg.} &
% \multicolumn{2}{c}{BUSI Seg.} &
% \multicolumn{2}{c}{BUSI Cls.} \\
%  & Dice & IoU & Dice & IoU & Acc & F1 \\
% \midrule
% ResNet-50 (Random Weight) & 0.6177 & 0.4639 & 0.4090 & 0.3093 & 0.5605 & 0.4768 \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.7378 & 0.5999 & 0.5771 & 0.4329 & 0.7325 & \textbf{0.7037} \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.7334 & 0.5864 & \textbf{0.6083} & \textbf{0.4666} & \textbf{0.7452} & 0.6768 \\
% DINOv3 ViT-B/16 (Teacher) & \textbf{0.7391} & \textbf{0.6033} & 0.2384 & 0.1729 & -- & -- \\
% \bottomrule
% \end{tabular}
% \end{table}

% \subsection{Full fine-tuning on all labeled data}
% \label{subsec:results_full_finetune}

% Table~\ref{tab:full_finetune} summarizes performance when all encoder parameters are updated. On DDTI, the ultrasound-aware distilled encoder achieves the best Dice (0.7933) and IoU (0.6856), slightly surpassing both the default distilled variant and the ViT teacher (Dice 0.7821). On BUSI segmentation, the US-specific distilled encoder again provides the strongest results (Dice 0.7571, IoU 0.6271), far above the randomly initialized model and dramatically better than the ViT teacher, which suffers from a pronounced domain gap. For BUSI classification, the same encoder attains the highest accuracy (0.8918) and macro F1 (0.8871). Overall, these results show that ultrasound-specific distillation compresses the billion-scale ViT prior into a 25M-parameter CNN that matches or exceeds the teacher on thyroid segmentation and substantially improves breast lesion segmentation and classification.

% \begin{table}[t]
% \centering
% \caption{Full fine-tuning on all labeled data. Metrics as in Table~\ref{tab:linear_probing}.}
% \label{tab:full_finetune}
% \scriptsize
% \begin{tabular}{lcccccc}
% \toprule
% \multirow{2}{*}{Model} &
% \multicolumn{2}{c}{DDTI Seg.} &
% \multicolumn{2}{c}{BUSI Seg.} &
% \multicolumn{2}{c}{BUSI Cls.} \\
%  & Dice & IoU & Dice & IoU & Acc & F1 \\
% \midrule
% ResNet-50 (Random Weight) & 0.6605 & 0.5297 & 0.5525 & 0.4629 & 0.6561 & 0.6526 \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.7832 & 0.6768 & 0.7315 & 0.6097 & 0.8726 & 0.8632 \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & \textbf{0.7933} & \textbf{0.6856} & \textbf{0.7571} & \textbf{0.6271} & \textbf{0.8918} & \textbf{0.8871} \\
% DINOv3 ViT-B/16 (Teacher) & 0.7821 & 0.6707 & 0.2838 & 0.1957 & -- & -- \\
% \bottomrule
% \end{tabular}
% \end{table}

% \subsection{Limited-label regimes}
% \label{subsec:results_limited}

% To study label-efficiency, we fine-tune each ResNet-50 encoder on 5\%, 10\%, 20\%, and 50\% of the labeled data for both segmentation and classification. Table~\ref{tab:limited_data} aggregates Dice for DDTI and BUSI segmentation together with BUSI classification accuracy at each label fraction.

% For DDTI segmentation, both distilled models consistently outperform training from scratch, with the US-specific encoder achieving the best Dice at 5\%, 10\%, and 20\% labels and the default distilled variant slightly ahead at 50\%. For BUSI segmentation, the distilled backbones again dominate the random baseline: the US-specific encoder is strongest at very low labels (5\% and 20\%), while the default distilled model is best at 10\% and 50\%. 

% For BUSI classification, the gains from ultrasound-aware distillation are most pronounced. The US-specific encoder achieves the best accuracy at every label fraction, from 5\% (0.7197 vs.\ 0.4968 for the default distilled model and 0.2675 for random) up to 50\% (0.8726 vs.\ 0.7452 and 0.6561). These trends suggest that domain-constrained distillation is particularly effective in POCUS-like low-label classification settings, where both data and compute are constrained.

% \begin{table*}[t]
% \centering
% \caption{Limited-data performance for different label fractions. We report segmentation Dice on DDTI (thyroid) and BUSI (breast), and BUSI 3-way classification accuracy, when fine-tuning on 5\%, 10\%, 20\%, and 50\% of labeled data.}
% \label{tab:limited_data}
% \setlength{\tabcolsep}{3pt}
% \scriptsize
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{lcccccccccccc}
% \toprule
% & \multicolumn{4}{c}{DDTI Dice} & \multicolumn{4}{c}{BUSI Dice} & \multicolumn{4}{c}{BUSI Acc} \\
% Model & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% \\
% \midrule
% ResNet-50 (Random Weight) &
% 0.4269 & 0.5237 & 0.5597 & 0.6007 &
% 0.1911 & 0.3381 & 0.3895 & 0.4038 &
% 0.2675 & 0.5605 & 0.6242 & 0.6561 \\
% ResNet-50 (DINOv3 Distilled, Default Aug.) &
% 0.4958 & 0.6536 & 0.7109 & \textbf{0.7672} &
% 0.2494 & \textbf{0.4857} & 0.5011 & \textbf{0.6185} &
% 0.4968 & 0.5860 & 0.6688 & 0.7452 \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) &
% \textbf{0.5233} & \textbf{0.6574} & \textbf{0.7309} & 0.7661 &
% \textbf{0.2752} & 0.4600 & \textbf{0.5036} & 0.5543 &
% \textbf{0.7197} & \textbf{0.6242} & \textbf{0.8025} & \textbf{0.8726} \\
% \bottomrule
% \end{tabular}%
% }
% \end{table*}

% \subsection{Capacity--performance trade-off}
% \label{subsec:capacity_tradeoff}

% To explicitly quantify the deployment--performance trade-off, Table~\ref{tab:capacity_tradeoff} compares the heavy ViT teacher with our ultrasound-aware distilled ResNet--50 in terms of parameter count and full fine-tuning performance. The DINOv3 ViT-B/16 backbone has roughly 86\,M parameters, while the distilled ResNet-50 has about 25\,M parameters. Despite this $\sim 3\times$ reduction in capacity, the distilled encoder closely matches the teacher on DDTI thyroid segmentation and clearly improves BUSI breast lesion segmentation and classification accuracy. This supports our framing of domain-constrained distillation as a way to preserve billion-scale ViT priors in a form suitable for point-of-care deployment.

% \begin{table}[t]
% \centering
% \caption{Model capacity vs.\ performance after full fine-tuning. Parameter counts are approximate.}
% \label{tab:capacity_tradeoff}
% \scriptsize
% \begin{tabular}{lcccc}
% \toprule
% Model & Params (M) & DDTI Dice & BUSI Dice & BUSI Acc \\
% \midrule
% DINOv3 ViT-B/16 (Teacher) & 86 & 0.7821 & 0.2838 & -- \\
% ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 25 & \textbf{0.7933} & \textbf{0.7571} & \textbf{0.8918} \\
% \bottomrule
% \end{tabular}
% \end{table}

% % \section{Discussion}
% % \label{sec:discussion}

% % Our results support three main claims. First, we show that it is possible to \emph{preserve billion-scale ViT priors in a compact 25M-parameter ResNet}. The ultrasound-aware distilled encoder closely tracks the 86M-parameter DINOv3 ViT-B/16 teacher on DDTI segmentation, and, after full fine-tuning, even slightly exceeds the teacher while remaining competitive with ImageNet pretraining. This suggests that the core representational benefits of large ViTs can be transferred into a deployment-friendly CNN backbone.

% % Second, \emph{ultrasound-aware distillation and augmentations matter most on heterogeneous breast data}. Across both linear probing and full fine-tuning, the ultrasound-specific distilled encoder consistently delivers the best BUSI segmentation Dice and the strongest BUSI classification performance, outperforming both random and ImageNet initializations. This indicates that constraining the distillation process to ultrasound-only data and physically plausible transformations helps the student encode lesion appearance and context that are not well captured by generic natural-image priors alone.

% % Third, \emph{the largest gains appear in label-efficient, POCUS-like classification regimes}. In BUSI three-way classification with limited labels, the ultrasound-aware distilled encoder offers the strongest improvements over ImageNet, particularly at 5--20\% labels, where additional annotations are costly but deployment demands remain high. Together, these findings suggest that domain-constrained distillation is a promising recipe for building lightweight, label-efficient ultrasound foundation models that bridge the gap between billion-scale pretraining and real-world point-of-care deployment.



% % % \section{Results}
% % % \label{sec:results}

% % % \subsection{Linear probing on frozen encoders}
% % % \label{subsec:results_linear}

% % % Table~\ref{tab:linear_probing} reports linear-probe performance when only the final segmentation/classification heads are trained. On DDTI thyroid nodule segmentation, the DINOv3-distilled ResNet--50 with default natural-image augmentations slightly outperforms ImageNet pretraining in Dice (0.7378 vs.\ 0.7273) and IoU (0.5999 vs.\ 0.5892). For BUSI breast lesion segmentation, both distilled variants improve over random initialization, and the ultrasound-aware distilled encoder achieves the highest Dice (0.6083), indicating better transfer of lesion morphology. For BUSI three-way classification, ImageNet pretraining still attains the best accuracy (0.7452), while the default distilled model yields the highest macro F1 (0.7037), suggesting more balanced performance across the three classes.

% % % \begin{table}[t]
% % % \centering
% % % \caption{Linear probing results on frozen encoders. DDTI and BUSI segmentation are evaluated by mean Dice and mean IoU. BUSI classification is a 3-way task (normal/benign/malignant) evaluated by accuracy and macro F1.}
% % % \label{tab:linear_probing}
% % % \scriptsize
% % % \begin{tabular}{lcccccc}
% % % \toprule
% % % \multirow{2}{*}{Model} &
% % % \multicolumn{2}{c}{DDTI Segmentation} &
% % % \multicolumn{2}{c}{BUSI Segmentation} &
% % % \multicolumn{2}{c}{BUSI Classification} \\
% % %  & Dice & IoU & Dice & IoU & Acc & F1 \\
% % % \midrule
% % % ResNet-50 (Random Weight) & 0.6177 & 0.4639 & 0.4090 & 0.3093 & 0.5605 & 0.4768 \\
% % % ResNet-50 (ImageNet Weight) & 0.7273 & 0.5892 & 0.5255 & 0.4180 & 0.7452 & 0.7037 \\
% % % ResNet-50 (DINOv3 Distilled, Default Aug.) & \textbf{0.7378} & \textbf{0.5999} & 0.5771 & 0.4329 & 0.7325 & \textbf{0.7037} \\
% % % ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.7334 & 0.5864 & \textbf{0.6083} & \textbf{0.4666} & \textbf{0.7452} & 0.6768 \\
% % % DINOv3 ViT-B/16 (Teacher) & 0.7391 & 0.6033 & -- & -- & -- & -- \\
% % % \bottomrule
% % % \end{tabular}
% % % \end{table}


% % % \subsection{Full fine-tuning on the full dataset}
% % % \label{subsec:results_full_finetune}

% % % Table~\ref{tab:full_finetune} summarizes full fine-tuning performance when all encoder parameters are updated. On DDTI segmentation, the ImageNet-initialized ResNet--50 attains the highest Dice (0.7953), with the ultrasound-specific distilled model close behind (0.7933) and matching or exceeding the DINOv3 teacher (0.7821). On BUSI segmentation, the ultrasound-aware distilled encoder achieves the best Dice (0.7571) and IoU (0.6271), clearly outperforming both ImageNet pretraining (Dice 0.7233) and random initialization (Dice 0.5525). For BUSI classification, the same distilled encoder yields the strongest performance, with accuracy 0.8918 and macro F1 0.8871, improving over the ImageNet baseline (accuracy 0.8662, F1 0.8572).

% % % \begin{table}[t]
% % % \centering
% % % \caption{Full fine-tuning results on all labeled data. Metrics as in Table~\ref{tab:linear_probing}.}
% % % \label{tab:full_finetune}
% % % \scriptsize
% % % \begin{tabular}{lcccccc}
% % % \toprule
% % % \multirow{2}{*}{Model} &
% % % \multicolumn{2}{c}{DDTI Segmentation} &
% % % \multicolumn{2}{c}{BUSI Segmentation} &
% % % \multicolumn{2}{c}{BUSI Classification} \\
% % %  & Dice & IoU & Dice & IoU & Acc & F1 \\
% % % \midrule
% % % ResNet-50 (Random Weight) & 0.6605 & 0.5297 & 0.5525 & 0.4629 & 0.6561 & 0.6526 \\
% % % ResNet-50 (ImageNet Weight) & \textbf{0.7953} & \textbf{0.6902} & 0.7233 & 0.6416 & 0.8662 & 0.8572 \\
% % % ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.7832 & 0.6768 & 0.7315 & 0.6097 & 0.8726 & 0.8632 \\
% % % ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.7933 & 0.6856 & \textbf{0.7571} & \textbf{0.6271} & \textbf{0.8918} & \textbf{0.8871} \\
% % % DINOv3 ViT-B/16 (Teacher) & 0.7821 & 0.6707 & -- & -- & -- & -- \\
% % % \bottomrule
% % % \end{tabular}
% % % \end{table}


% % % \subsection{Limited-label regimes}
% % % \label{subsec:results_limited}

% % % To study label-efficiency, we fine-tune each encoder on 5\%, 10\%, 20\%, and 50\% of the labeled data for DDTI and BUSI. Table~\ref{tab:limited_seg} reports segmentation Dice. For DDTI, ImageNet pretraining consistently gives the strongest Dice across all label fractions, though the ultrasound-aware distilled encoder narrows the gap at 20\% labels (0.7309 vs.\ 0.6889). For BUSI segmentation, all pretrained models substantially outperform random initialization, with the ImageNet encoder achieving the best Dice at 50\% labels (0.6781), while the distilled variants remain competitive at intermediate fractions.

% % % Table~\ref{tab:limited_cls} shows BUSI three-way classification accuracy under the same label fractions. Here the benefits of ultrasound-aware distillation are more pronounced: at 5\% labels, the ultrasound-specific distilled encoder achieves the highest accuracy (0.7197 vs.\ 0.6943 for ImageNet), and it maintains a clear lead at 20\% (0.8025 vs.\ 0.7134) and 50\% (0.8726 vs.\ 0.8217). These trends indicate that, while ImageNet pretraining remains a strong baseline for segmentation, the ultrasound-distilled encoder provides the largest gains for low-label breast lesion classification.

% % % \begin{table}[t]
% % % \centering
% % % \caption{Limited-data segmentation performance (Dice) on DDTI (thyroid) and BUSI (breast) for different label fractions.}
% % % \label{tab:limited_seg}
% % % \scriptsize
% % % \begin{tabular}{lcccccccc}
% % % \toprule
% % % \multirow{2}{*}{Model} &
% % % \multicolumn{4}{c}{DDTI Dice} &
% % % \multicolumn{4}{c}{BUSI Dice} \\
% % %  & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% \\
% % % \midrule
% % % ResNet-50 (Random Weight) & 0.4269 & 0.5237 & 0.5597 & 0.6007 & 0.1911 & 0.3381 & 0.3895 & 0.4038 \\
% % % ResNet-50 (ImageNet Weight) & \textbf{0.5601} & 0.6466 & 0.6889 & \textbf{0.7721} & \textbf{0.3920} & \textbf{0.4835} & \textbf{0.5495} & \textbf{0.6781} \\
% % % ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.4958 & 0.6536 & 0.7109 & 0.7672 & 0.2494 & 0.4857 & 0.5011 & 0.6185 \\
% % % ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & 0.5233 & \textbf{0.6574} & \textbf{0.7309} & 0.7661 & 0.2752 & 0.4600 & 0.5036 & 0.5543 \\
% % % \bottomrule
% % % \end{tabular}
% % % \end{table}

% % % \begin{table}[t]
% % % \centering
% % % \caption{Limited-data BUSI 3-way classification accuracy for different label fractions.}
% % % \label{tab:limited_cls}
% % % \scriptsize
% % % \begin{tabular}{lcccc}
% % % \toprule
% % % Model & 5\% & 10\% & 20\% & 50\% \\
% % % \midrule
% % % ResNet-50 (Random Weight) & 0.2675 & 0.5605 & 0.6242 & 0.6561 \\
% % % ResNet-50 (ImageNet Weight) & 0.6943 & \textbf{0.6688} & 0.7134 & 0.8217 \\
% % % ResNet-50 (DINOv3 Distilled, Default Aug.) & 0.4968 & 0.5860 & 0.6688 & 0.7452 \\
% % % ResNet-50 (DINOv3 Distilled, US-Specific Aug.) & \textbf{0.7197} & 0.6242 & \textbf{0.8025} & \textbf{0.8726} \\
% % % \bottomrule
% % % \end{tabular}
% % % \end{table}


\section{Results and Discussion}
\label{sec:results}

\subsection{Linear probing on frozen encoders}
\label{subsec:results_linear}

Table~\ref{tab:linear_probing} summarizes linear-probe performance across segmentation and classification tasks. On DDTI, both distilled models substantially outperform the randomly initialized baseline, with \textbf{R50-Distill-Default} obtaining the best Dice (0.7378) and IoU (0.6252), indicating successful transfer of ViT teacher knowledge to a compact CNN. In contrast, the \textbf{DINOv3 ViT-B/16} teacher underperforms (Dice 0.6503), reflecting limited robustness to thyroid-domain grayscale and speckle statistics.  On BUSI, the domain gap becomes more pronounced: the teacher collapses to a Dice of 0.2384, while the ultrasound-aware \textbf{R50-Distill-US} achieves the strongest segmentation performance (Dice 0.6083, IoU 0.5259). Similarly, \textbf{R50-Distill-US} yields the best BUSI classification accuracy (0.7452), whereas \textbf{R50-Distill-Default} attains the highest macro F1 (0.7037). These results confirm that distillation on ultrasound-only data, paired with physics-consistent augmentations, produces representations substantially better aligned with downstream ultrasound tasks than the generic natural-image ViT teacher.


\begin{table}[t]
\centering
\caption{Linear probing on frozen encoders. DDTI and BUSI segmentation are evaluated by mean Dice and mean IoU. BUSI classification is a 3-way task (normal/benign/malignant) evaluated by accuracy and macro F1.}
\label{tab:linear_probing}
\tiny
\begin{tabular}{lcccccc}
\toprule
\multirow{2}{*}{Model} &
\multicolumn{2}{c}{DDTI Seg.} &
\multicolumn{2}{c}{BUSI Seg.} &
\multicolumn{2}{c}{BUSI Cls.} \\
 & Dice & IoU & Dice & IoU & Acc & F1 \\
\midrule
R50-Rand & 0.6028 & 0.4647 & 0.4365 & 0.3369 & 0.6115 & 0.4204 \\
R50-Distill-Default & \textbf{0.7378} & \textbf{0.6252} & 0.5771 & 0.4857 & 0.7325 & \textbf{0.7037} \\
R50-Distill-US & 0.7334 & 0.6192 & \textbf{0.6083} & \textbf{0.5259} & \textbf{0.7452} & 0.6768 \\
DINOv3 ViT-B/16 & 0.6503 & 0.4743 & 0.2384 & 0.1729 & -- & -- \\
\bottomrule
\end{tabular}
\end{table}

% \subsection{Full fine-tuning on all labeled data}
% \label{subsec:results_full_finetune}
% As shown in Table~\ref{tab:full_finetune}, full fine-tuning substantially amplifies the benefits of distillation. Both distilled ResNet-50 models outperform the randomly initialized baseline across all tasks, and the ultrasound-aware variant closely matches the ViT teacher on DDTI despite having only a fraction of the parameters. The ViT’s inconsistent Dice–IoU behavior further suggests difficulty aligning its natural-image features with ultrasound boundary structure. On BUSI, the contrast is even stronger: the teacher fails to adapt, while both distilled students fine-tune reliably and achieve markedly better segmentation and classification performance. This indicates that distillation not only compresses the teacher but also removes natural-image biases that hinder direct transfer to ultrasound. Although DINOv3 performed well on thyroid segmentation, it demonstrated significant overfitting on the breast ultrasound (BUSI) task. Training logs revealed that validation performance degraded rapidly despite continued improvements in training loss, likely due to the domain gap between natural images and the complex speckle noise of breast ultrasound. We identified the stabilization of foundation models on such high-variance medical datasets as an open challenge for future research.
% --- BUSI qualitative failures: 6 cases, 3 columns (Input | Input+GT | GT vs Pred) ---

% \subsection{Full fine-tuning on all labeled data} \label{subsec:results_full_finetune}

% As shown in Table~\ref{tab:full_finetune}, full fine-tuning substantially amplifies the benefits of distillation. Both distilled ResNet-50 models consistently outperform the randomly initialized baseline across all tasks. On the DDTI dataset, the ultrasound-aware student achieves performance parity with the massive ViT teacher despite having only a fraction of the parameters. A more striking contrast is observed on the BUSI dataset. While the DINOv3 teacher performed well on thyroid data, it exhibited significant overfitting on the breast ultrasound task; training logs revealed that validation performance degraded rapidly despite continued improvements in training loss. This instability likely stems from the pronounced domain gap between natural images and the complex speckle noise and class heterogeneity of breast ultrasound. In contrast, both distilled students fine-tuned reliably, achieving markedly better segmentation and classification performance than the teacher. This indicates that distillation not only compresses the model but also effectively filters out natural-image biases that hinder direct transfer, whereas stabilizing massive foundation models on such high-variance medical datasets remains an open challenge.
\subsection{Full fine-tuning on all labeled data}
\label{subsec:results_full_finetune}

As shown in Table~\ref{tab:full_finetune}, full fine-tuning amplifies the benefits of distillation. Both distilled ResNet-50 models consistently outperform the randomly initialized baseline across all tasks. On DDTI, the ultrasound-aware student achieves near-parity with the ViT teacher despite using a fraction of the parameters, indicating that distillation preserves performance while substantially reducing model size.

A sharper contrast emerges on BUSI. While DINOv3 adapts well to DDTI, it transfers poorly to BUSI and exhibits unstable fine-tuning behavior: training loss decreases while validation loss degrades and becomes unstable (Supplementary Fig.~\ref{fig:busi_dinov3_overfit_curve}), consistent with overfitting under limited and heterogeneous BUSI supervision (best-validation checkpoint reported). We further analyze this DDTI--BUSI gap in the Supplementary material (Supplementary Sec.~\ref{sec:supp_ddti_busi_gap}; Supplementary Figs.~\ref{fig:ddti_dinov3_qualitative}--\ref{fig:busi_dinov3_qualitative_6cases}), documenting frequent boundary mismatch, missed small/thin lesions, and false positives on negative (normal-class) images. In contrast, both distilled students fine-tune reliably on BUSI and achieve substantially higher segmentation Dice/IoU and classification accuracy/F1 (Table~\ref{tab:full_finetune}).

Collectively, these results suggest that domain-constrained distillation provides dual benefits: (i) model compression that improves deployability on resource-constrained medical devices, and (ii) an ultrasound-adapted initialization that mitigates natural-image transfer bias and improves robustness on challenging breast ultrasound data.

\begin{table}[t]
\centering
\caption{Full fine-tuning on all labeled data. Metrics as in Table~\ref{tab:linear_probing}.}
\label{tab:full_finetune}
\tiny
\begin{tabular}{lcccccc}
\toprule
\multirow{2}{*}{Model} &
\multicolumn{2}{c}{DDTI Seg.} &
\multicolumn{2}{c}{BUSI Seg.} &
\multicolumn{2}{c}{BUSI Cls.} \\
 & Dice & IoU & Dice & IoU & Acc & F1 \\
\midrule
R50-Rand & 0.6605 & 0.5297 & 0.5525 & 0.4629 & 0.6561 & 0.6526 \\
R50-Distill-Default & 0.7652 & 0.6608 & \textbf{0.6967} & \textbf{0.6209} & 0.8662 & 0.8572 \\
R50-Distill-US & 0.7872 & \textbf{0.6745} & 0.6930 & 0.6126 & \textbf{0.8790} & \textbf{0.8673} \\
DINOv3 ViT-B/16 & \textbf{0.7933} & 0.4790 & 0.2838 & 0.1957 & -- & -- \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Limited-label regimes}
\label{subsec:results_limited}

As shown in Table~\ref{tab:limited_data}, both distilled models maintain strong performance even when fine-tuned with only a small fraction of labeled data, whereas the randomly initialized baseline degrades quickly. The ultrasound-aware student is particularly stable in the lowest-label settings, indicating that domain-constrained distillation yields features that transfer more reliably under scarce supervision. These trends are most evident in BUSI classification, where the ultrasound-aware distilled encoder (R50-Distill-US) outperforms the alternatives at every label fraction. This highlights a key advantage of our approach: by embedding ultrasound-specific priors during distillation, the model becomes far less dependent on large annotated datasets. Such label efficiency is essential for point-of-care and resource-limited environments, where expert annotation is costly or unavailable.

\begin{table*}[t]
\centering
\caption{Limited-data performance for different label fractions. We report segmentation Dice on DDTI (thyroid) and BUSI (breast), and BUSI 3-way classification accuracy, when fine-tuning on 5\%, 10\%, 20\%, and 50\% of labeled data.}
\label{tab:limited_data}
\setlength{\tabcolsep}{3pt}
\tiny
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccccccccccc}
\toprule
& \multicolumn{4}{c}{DDTI Dice} & \multicolumn{4}{c}{BUSI Dice} & \multicolumn{4}{c}{BUSI Acc} \\
Model & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% & 5\% & 10\% & 20\% & 50\% \\
\midrule
R50-Rand &
0.4269 & 0.5237 & 0.5597 & 0.6007 &
0.1911 & 0.3381 & 0.3895 & 0.4038 &
0.2675 & 0.5605 & 0.6242 & 0.6561 \\
R50-Distill-Default &
0.4958 & 0.6536 & 0.7109 & \textbf{0.7672} &
0.2494 & \textbf{0.4857} & 0.5011 & \textbf{0.6185} &
0.4968 & 0.5860 & 0.6688 & 0.7452 \\
R50-Distill-US &
\textbf{0.5233} & \textbf{0.6574} & \textbf{0.7309} & 0.7661 &
\textbf{0.2752} & 0.4600 & \textbf{0.5036} & 0.5543 &
\textbf{0.7197} & \textbf{0.6242} & \textbf{0.8025} & \textbf{0.8726} \\
\bottomrule
\end{tabular}%
}
\end{table*}

\subsection{Deployment Feasibility for Point-of-Care Ultrasound}
\label{subsec:deployment}

Point-of-care ultrasound (POCUS) systems are often deployed on resource-constrained tablet platforms (e.g., EchoNous Kosmos on iPad Air and Philips Lumify on Android tablets), where available system memory is typically on the order of 4--12\,GB~\cite{EchoNous2025,knight20232d,jaremko2023ai}. To quantify deployment feasibility, Table~\ref{tab:deploy_compression_ddti} reports latency, throughput, peak memory, FLOPs, and parameter counts under a standardized protocol (batch size 1, $224{\times}224$ input, FP32; CPU timing averaged over 50 runs on an Intel CPU with 8\,GB RAM; peak memory measured as maximum GPU allocation on an NVIDIA L40). Compared to the teacher, the distilled student reduces peak GPU memory from 686\,MB to 372\,MB (a 45.8\% reduction) and achieves substantially faster CPU inference, with latency improving from 430\,ms to 33.76\,ms per image (12.75$\times$) and throughput from 2.30 to 27.64 images/s (12.02$\times$). The student further reduces FLOPs by 53.18\% and parameters by 62.81\%, while retaining 99.2\% of the teacher's Dice (0.7933 $\rightarrow$ 0.7872). Together, these results demonstrate a favorable accuracy--efficiency trade-off and support practical deployability under POCUS hardware constraints; on-device benchmarking on commercial scanners is left for future work.

% \subsection{Deployment Feasibility for Point-of-Care Ultrasound}
% \label{subsec:deployment}

% Point-of-care ultrasound (POCUS) systems are commonly deployed on resource-constrained tablet platforms, such as the EchoNous Kosmos on iPad Air and Philips Lumify on Android tablets, where available system memory is typically on the order of 4--12\,GB.~\cite{EchoNous2025,knight20232d,jaremko2023ai} 
% To quantify deployment feasibility, Table~\ref{tab:deploy_compression_ddti} reports latency, throughput, peak memory, FLOPs, and parameter counts under a standardized evaluation protocol.

% All efficiency metrics are measured with batch size 1 at $224{\times}224$ resolution in FP32, averaged over 50 runs on an Intel CPU system with 8\,GB RAM; peak memory reports maximum GPU memory allocation during inference on an NVIDIA L40 GPU.

% The teacher model requires 686\,MB of peak GPU memory, whereas our student reduces this to 372\,MB (45.8\% reduction), substantially alleviating memory pressure. For latency/throughput measured on CPU, the student achieves a 12.75$\times$ reduction in latency (430\,ms $\rightarrow$ 33.76\,ms per image) and a 12.02$\times$ improvement in throughput (2.30 $\rightarrow$ 27.64 images/s). Table~\ref{tab:deploy_compression_ddti} further shows reductions of 53.18\% in FLOPs and 62.81\% in parameters, which are important for power-limited, prolonged scanning sessions. Despite a small Dice decrease of 0.77\% (0.7933 $\rightarrow$ 0.7872), the student retains 99.2\% of the teacher model's performance. These results indicate a favorable accuracy--efficiency trade-off and support the feasibility of deploying the distilled model under constrained POCUS hardware budgets. While benchmarking on commercial POCUS devices is left for future work, the reported latency, memory, and compression metrics provide quantitative evidence that the proposed distillation meaningfully improves practical deployability.




% \subsection{Deployment Feasibility for Point-of-Care Ultrasound}
% Point-of-care ultrasound systems operate on resource-constrained tablets (e.g., EchoNous Kosmos on iPad Air with 6 GB shared system memory), where model inference competes with concurrent B-mode rendering (~50--200 MB) and segmentation decoders (~100--200 MB). To validate that our compression framework addresses real deployment barriers, Table~\ref{tab:deploy_compression_ddti} compares DINOv3 ViT-B16 (teacher) and R50-Distill-US (student) on critical inference metrics.

% The teacher model requires 686 MB peak GPU memory, consuming approximately 23\% of available iPad RAM and leaving insufficient headroom for concurrent processes. This forces either serialized processing (eliminating real-time feedback) or network offloading (violating POCUS low-resource philosophy). Our student reduces peak memory to 372 MB (45.8\% reduction, Table~\ref{tab:deploy_compression_ddti}), consuming only 15\% of available RAM and enabling concurrent real-time segmentation during live scanning.

% For CPU-based inference---the primary deployment path on handheld devices---our student achieves 12.75$\times$ speedup in latency (430 ms $\to$ 33.76 ms, Table~\ref{tab:deploy_compression_ddti}). This reduces acquisition-to-result time from 1.0--1.4 seconds (clinically unacceptable) to 0.6--0.8 seconds (meets clinical workflow expectations). Table~\ref{tab:deploy_compression_ddti} also demonstrates 53.18\% FLOPs reduction and 62.81\% parameter reduction, both critical for extended scanning sessions where power consumption directly impacts battery life.

% The 0.77\% accuracy drop (Dice 0.7933 $\to$ 0.7872, Table~\ref{tab:deploy_compression_ddti}) preserves 99.2\% of teacher performance---acceptable for 3$\times$ compression. These results validate that domain-constrained distillation bridges the deployment-performance trade-off in POCUS imaging, enabling foundation models to operate in resource-constrained clinical settings.

% Table~\ref{tab:deploy_compression_ddti} reports a deployment-oriented comparison between 
% the teacher and student models, including parameter count, FLOPs, measured latency, 
% throughput, and peak memory usage. The student is 2.7$\times$ smaller and requires 
% 2.1$\times$ fewer FLOPs, resulting in 2.1$\times$ faster GPU inference and 12.7$\times$ 
% faster CPU inference. Peak GPU memory is reduced by 45.8\%, while segmentation performance 
% is preserved with only a 0.77\% relative drop in Dice. These results demonstrate that the proposed distillation framework yields a compact and 
% computationally efficient model that is substantially more suitable for resource-constrained 
% POCUS settings, while maintaining near-teacher-level accuracy.

% \subsection{Capacity--performance trade-off}
% \label{subsec:capacity_tradeoff}
% Table~\ref{tab:capacity_tradeoff} highlights the deployment benefits of distillation. The ViT teacher is more than three times larger than the ResNet-50 student, yet the distilled model retains similar performance on DDTI and delivers substantially stronger results on BUSI. This indicates that distillation not only compresses the teacher but also yields representations better aligned with ultrasound, making the compact student a more practical choice for point-of-care deployment.
\begin{table}[t]
\centering
\tiny
\caption{Deployment-oriented comparison of teacher vs.\ student; Dice is reported on DDTI after full fine-tuning.}
\label{tab:deploy_compression_ddti}
\begin{tabular}{lccc}
\hline
Metric & Teacher (DINOv3 ViT-B) & Student (ResNet50) & Student vs Teacher \\
\hline
Parameters (backbone only)  (M) $\downarrow$ & 86 & 25 & 3.4$\times$ smaller\\
Parameters (backbone + head) (M) $\downarrow$ & 87.44 & 32.52 & 2.69$\times$ smaller ($-62.81\%$) \\
FLOPs (GFLOPs) $\downarrow$ & 22.80 & 10.67 & 2.14$\times$ less ($-53.18\%$) \\
GPU latency (ms, mean$\pm$std) $\downarrow$ & 8.63 $\pm$ 0.91 & 4.07 $\pm$ 1.26 & 2.12$\times$ faster \\
GPU throughput (img/s) $\uparrow$ & 86.21 & 174.88 & 2.03$\times$ higher \\
Peak GPU memory (MB) $\downarrow$ & 686.70 & 372.17 & 1.85$\times$ less ($-45.80\%$) \\
CPU latency (ms, mean$\pm$std) $\downarrow$ & 430.51 $\pm$ 3.02 & 33.76 $\pm$ 4.50 & 12.75$\times$ faster \\
CPU throughput (img/s) $\uparrow$ & 2.30 & 27.64 & 12.02$\times$ higher \\
\textbf{Dice Score} & \textbf{0.7933} & \textbf{0.7872} & \textbf{$-$0.0061} (\textbf{$-$0.77\%}) \\
\hline
\end{tabular}
\end{table}



% \begin{table}[t]
% \centering
% \caption{Model capacity vs.\ performance after full fine-tuning. Parameter counts are approximate.}
% \label{tab:capacity_tradeoff}
% \scriptsize
% \begin{tabular}{lcccc}
% \toprule
% Model & Params (M) & DDTI Dice & BUSI Dice & BUSI Acc \\
% \midrule
% DINOv3 ViT-B/16 & 86 & \textbf{0.7933} & 0.2838 & -- \\
% R50-Rand & 25 & 0.6605 & 0.5525 & 0.6561 \\
% R50-Distill-US & 25 & .7872 & 0.6930 & \textbf{0.8790} \\
% \bottomrule
% \end{tabular}
% \end{table}

\subsection{Representation analysis}
\label{subsec:repr_analysis}
t-SNE projections of the ultrasound corpus (Fig.~\ref{fig:tsne_representations}) show clear differences in how each model organizes the data. The randomly initialized encoder produces highly entangled embeddings with little anatomical separation, consistent with its weaker downstream performance. Distillation markedly improves structure: the default student forms more coherent clusters, while the ultrasound-aware student produces the most distinct and stable separation across datasets. In contrast, the ViT teacher retains broad natural-image structure but shows substantial overlap between ultrasound domains, mirroring its poor BUSI performance. These patterns support our central claim that domain-constrained distillation realigns feature space toward ultrasound-specific cues, enabling stronger generalization on heterogeneous clinical data.

\begin{figure}[t]
  \centering
  \includegraphics[width=\textwidth]{tsne_4models_plus_legend_row.png}
  \caption{t-SNE visualization of ultrasound corpus embeddings for four models:
  R50-Rand, R50-Distill-Default, R50-Distill-US, and ViT-B/16 (DINOv3 Teacher).
  Points are colored by dataset, with a shared color map across models; the
  right-most panel shows the legend for the top-$K$ most frequent datasets.
  R50-Distill-US forms the most compact and well-separated clusters across
  ultrasound domains.}
  \label{fig:tsne_representations}
\end{figure}


% \section{Discussion}
% \label{sec:discussion}

% Our results support three main claims. First, it is feasible to \emph{compress billion-scale ViT priors into a compact 25M-parameter CNN}. On DDTI thyroid segmentation, \textbf{R50-Distill-US} closely tracks the 86M-parameter \textbf{DINOv3 ViT-B/16} teacher under linear probing and slightly surpasses it after full fine-tuning (Dice 0.7933 vs.\ 0.7821). This shows that the core representational structure learned by a large ViT can be retained in a lightweight backbone that is better suited to real-time, resource-constrained deployment.

% Second, \emph{ultrasound-aware distillation and augmentations are most beneficial on heterogeneous breast data}. Across both linear probing and full fine-tuning, \textbf{R50-Distill-US} consistently delivers the best BUSI segmentation Dice and the strongest BUSI classification performance, while the ViT teacher underperforms due to a strong domain gap. The fact that \textbf{R50-Distill-US} outperforms both \textbf{R50-Rand} and \textbf{R50-Distill-Default} indicates that constraining distillation to ultrasound-only data and physically plausible transformations helps encode lesion appearance, speckle patterns, and probe-dependent artefacts that generic natural-image augmentations do not capture well.

% Third, \emph{the largest gains appear in label-efficient, POCUS-like classification regimes}. In BUSI three-way classification with limited labels, \textbf{R50-Distill-US} dominates the alternatives across all label fractions, with particularly large margins at 5--20\% of the labeled data. This suggests that domain-constrained distillation not only improves peak performance but also stabilizes optimization when supervision is scarce, which is precisely the regime faced by many point-of-care deployments.

% Overall, these findings indicate that domain-aware distillation from a large ViT into a compact ResNet is a practical recipe for building ultrasound foundation models that are both deployment-friendly and label-efficient. While our study is limited to two organs and 2D images, the same strategy could plausibly extend to larger multi-center corpora, video-based POCUS, and other safety-critical imaging domains.


% \section{Discussion}
% \label{sec:discussion}

% Our results support three main claims. First, it is feasible to \emph{compress billion-scale ViT priors into a compact 25M-parameter CNN}. On DDTI thyroid segmentation, the distilled ResNet--50 with ultrasound-specific augmentations closely tracks the 86M-parameter DINOv3 ViT-B/16 teacher under linear probing and slightly surpasses it after full fine-tuning (Dice 0.7933 vs.\ 0.7821). This shows that the core representational structure learned by a large ViT can be retained in a lightweight backbone that is better suited to real-time, resource-constrained deployment.

% Second, \emph{ultrasound-aware distillation is most beneficial on heterogeneous breast data}. On BUSI, the US-specific distilled encoder consistently achieves the best segmentation Dice and the strongest three-way classification performance, while the ViT teacher underperforms due to a strong domain gap. The fact that the US-specific student outperforms both a randomly initialized ResNet--50 and the default natural-image distilled student indicates that constraining distillation to ultrasound-only data and physically plausible augmentations helps encode lesion appearance, speckle patterns, and probe-dependent artefacts that generic natural-image priors do not capture well.

% Third, \emph{the largest gains appear in label-efficient, POCUS-like classification regimes}. In BUSI three-way classification with limited labels, the ultrasound-aware distilled encoder dominates the alternatives across all label fractions, with particularly large margins at 5--20\% of the labeled data. This suggests that domain-constrained distillation not only improves peak performance but also stabilizes optimization when supervision is scarce, which is precisely the regime faced by many point-of-care deployments.

% Overall, these findings suggest that domain-aware distillation from a large ViT into a compact CNN is a practical recipe for building ultrasound foundation models that are both deployment-friendly and label-efficient. While our study is limited to two organs and 2D images, the same strategy could plausibly extend to larger multi-center corpora, video-based POCUS, and other safety-critical imaging domains.
\begin{table}[htbp]
\centering
\tiny
\caption{Ablation: Augmentation Policy \& Mixup on BUSI Classification \& DDTI Segmentation.}
\label{tab:ablation-augmix}
% \resizebox{0.95\columnwidth}{!}{%
\begin{tabular}{lccccc}
\toprule
Augmentation & Mixup & BUSI Acc. & F1 & DDTI Dice & DDTI IoU \\
\midrule
Default & $\times$ & 0.8662 & 0.8572 & 0.7652 & 0.6608 \\
Default & $\checkmark$ & 0.8712 & 0.8602 & 0.7732 & 0.6658 \\
\midrule
US-aware & $\times$ & \textbf{0.8790} & \textbf{0.8738} & 0.7759 & 0.6705 \\
US-aware & $\checkmark$ & \textbf{0.8790} & 0.8673 & \textbf{0.7872} & \textbf{0.6745} \\
\bottomrule
\end{tabular}%
% }
\end{table}

% \subsection{Ablation Study on Augmentation and Mixup}
% Table~\ref{tab:ablation-augmix} analyzes the contributions of the augmentation policy and 
% mixup ($\alpha=0.2$, image-level within-batch applied during distillation only). Default augmentation includes standard ImageNet-style transformations (random crop, 
% horizontal flip, color jitter, Gaussian blur). The US-aware policy consistently outperforms the Default augmentation, including when 
% compared against Default with mixup, indicating that domain-constrained transformations 
% are the primary driver of performance improvements. Mixup acts as a regularizer but is not strictly required. Under the US-aware setting, 
% removing mixup does not degrade accuracy and slightly improves F1, suggesting that 
% physically grounded augmentations alone are sufficient to achieve strong generalization. 
% We therefore treat mixup as an optional stabilization mechanism in the distillation stage 
% rather than a core component of the proposed framework.
\subsection{Ablation Study on Augmentation Policy \& Mixup}
Table~\ref{tab:ablation-augmix} examines the effects of the augmentation policy and mixup on BUSI classification and DDTI segmentation; mixup is applied only during distillation and is not used during downstream fine-tuning or evaluation. The Default augmentation pipeline includes standard ImageNet-style transformations such as random crop, horizontal flip, color jitter, and Gaussian blur.
The US-aware policy, even without mixup, outperforms Default + Mixup on every metric. While mixup slightly improves BUSI accuracy and DDTI Dice when applied to the Default augmentation, these gains are smaller than those obtained by switching from the Default to the US-aware augmentation. This suggests that US-aware augmentations are the primary driver of performance improvements, and that mixup acts as a regularizer rather than a core component of the framework. Removing mixup under the US-aware setting leaves BUSI accuracy unchanged and yields a slightly higher F1 score, suggesting that ultrasound-aware augmentations alone are sufficient for strong generalization.

\subsection{Ablation Study on Different Distilled Architectures}

Using the same DINOv3 ViT-B/16 teacher, US corpus, and US-aware augmentations, we also distilled ResNet-18 and ConvNeXt-Tiny students in addition to ResNet-50. Table~\ref{tab:additional_backbones} compares their performance with randomly initialized baselines. All distilled models show substantial improvements in BUSI classification and DDTI segmentation. Relative to random initialization, ResNet-18 achieves 9.6\% higher BUSI accuracy and 8.9\% higher DDTI Dice, while ConvNeXt-Tiny shows a 60.0\% relative increase in BUSI accuracy and a 20.7\% relative improvement in DDTI Dice, validating the effectiveness of domain-constrained distillation. While ResNet-50 remains the top performer, these results confirm the framework's generalizability across architectures.

\begin{table}[t]
\centering
\tiny
\caption{Performance comparison of additional distilled models (ResNet-18 and ConvNeXt-Tiny) versus random initialization. R50-Distill-US is included for reference.}
\label{tab:additional_backbones}
\begin{tabular}{@{}lcccc@{}}
\toprule
Model & BUSI Acc & BUSI F1 & DDTI Dice & DDTI IoU \\
\midrule
R18-Rand & 0.5987 & 0.4144 & 0.6936 & 0.5657 \\
\textbf{R18-Distill-US} & 0.6561 & 0.4835 & 0.7552 & 0.6424 \\
ConvNeXt-Tiny-Rand & 0.5096 & 0.3386 & 0.6251 & 0.4828 \\
\textbf{ConvNeXt-Tiny-Distill-US} & 0.8153 & 0.7855 & 0.7543 & 0.6386 \\
\textbf{R50-Distill-US} & 0.8790 & 0.8673 & 0.7872 & 0.6745 \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Comparison against existing foundation models (FMs)}
Table~\ref{tab:fm_comparison_ddti} compares our model with representative FMs after full fine-tuning on DDTI under the same protocol. Temporal (SAM2/Hiera-B+)~\cite{wahd2025time} performs best (Dice $=0.8041$), while our R50-Distill-US is close (Dice $=0.7872$) with far fewer parameters (25M vs.\ 80.8M--86M). Our distilled ResNet-50 also comes within 0.01 Dice of the natural-image ViT FM baseline (DINOv3, Dice $=0.7933$), supporting our goal of achieving FM-level accuracy in a compact backbone. The ViT-based ultrasound-specific FM, USFM~\cite{jiao2024usfm}, performs substantially lower (Dice $=0.5760$) on this benchmark.
\begin{table}[t]
\centering
\caption{Comparison against existing foundation models (FMs) on \textbf{DDTI} segmentation after full fine-tuning.}
\label{tab:fm_comparison_ddti}
\tiny
\setlength{\tabcolsep}{4.5pt}
\begin{tabular}{l l l r c}
\toprule
Model & FM type & Backbone & Params (M) & Mean Dice \\
\midrule
DINOv3 ViT-B/16 & Natural-image FM (SSL) & ViT-B/16 & 86.0 & 0.7933 \\
Temporal (SAM2) & SAM2-based & Hiera-B+ & 80.8 & \textbf{0.8041} \\
USFM & Ultrasound-specific FM & ViT-B/16 & 86.0 & 0.5760 \\
R50-Distill-US (Ours) & Distilled compact FM (US corpus) & ResNet-50 & \textbf{25.0} & 0.7872 \\
\bottomrule
\end{tabular}

% \vspace{2pt}
% \footnotesize
% \textit{Protocol:} All models are fine-tuned on DDTI using the same data split, input resolution, and training schedule.
% \textit{Checkpoints:} Temporal uses the official SAM2 \texttt{Hiera-B+} checkpoint (80.8M params).
\end{table}





\section{Conclusion}

We introduced a domain-constrained distillation framework that transfers billion-scale ViT representations into a compact ResNet-50 suitable for ultrasound. Using a curated ultrasound corpus and ultrasound-aware augmentations, the distilled models offer stronger generalization and substantially better label efficiency than both a randomly initialized CNN and the original DINOv3 ViT teacher. These findings demonstrate that large vision priors can be preserved in a deployment-friendly backbone while mitigating the natural-image biases that limit direct ViT adaptation. Our approach provides a simple and practical recipe for building ultrasound foundation models that are compatible with point-of-care constraints. Limitations include the focus on 2D B-mode data and the absence of on-device latency evaluation. Future work will extend this framework to video-based POCUS, additional anatomies, and hardware-aware model design.



\bibliography{midl26_328}

\appendix
\renewcommand{\thefigure}{A\arabic{figure}}
\setcounter{figure}{0}

\section{Supplementary: Why DINOv3 Behaves Differently on DDTI vs.\ BUSI}
\label{sec:supp_ddti_busi_gap}

Figures~\ref{fig:ddti_dinov3_qualitative} and~\ref{fig:busi_dinov3_qualitative_6cases} show that DINOv3 transfers more reliably to DDTI than to BUSI under the same full fine-tuning protocol. On DDTI, predictions are spatially localized and largely follow the annotated nodule boundaries, with occasional errors mainly on very small or low-contrast nodules (Fig.~\ref{fig:ddti_dinov3_qualitative}, Case~5). In contrast, BUSI exhibits diverse failure modes, including boundary mismatch/over-segmentation (Cases~1--2), missed small/thin lesions (Cases~3--4), and false positives on \emph{normal-class} images where no lesion is annotated (Cases~5--6 in Fig.~\ref{fig:busi_dinov3_qualitative_6cases}).

These qualitative patterns are consistent with three dataset-specific factors that make BUSI harder for direct transfer: (i) larger lesion-scale variability and more ambiguous boundaries; (ii) explicit inclusion of negative (normal) cases, increasing false-positive risk; and (iii) higher acquisition and speckle/background variability, which likely amplifies domain shift for a high-capacity natural-image ViT. We further observe unstable optimization on BUSI (Fig.~\ref{fig:busi_dinov3_overfit_curve}), where training loss continues to decrease while validation loss degrades, suggesting overfitting under standard full fine-tuning. All reported DINOv3 results use the checkpoint selected by best validation performance (same selection rule for all models). Together, these observations motivate our domain-constrained distillation strategy to obtain a compact ultrasound-adapted initialization that fine-tunes more robustly on BUSI while remaining competitive on DDTI.




\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.4\linewidth]{qualitative_grid_compact_1.png}
    \caption{\textbf{Qualitative BUSI segmentation failure cases for fine-tuned DINOv3.}
    Columns show \textit{Input}, \textit{Input + GT} (ground-truth lesion contour in \textcolor{green}{green} when available), and \textit{GT vs Pred} (DINOv3 prediction in \textcolor{red}{red} overlaid with GT in \textcolor{green}{green}). 
    \textbf{Cases 1--2}: over-segmentation and boundary mismatch (prediction extends beyond the annotated lesion). 
    \textbf{Cases 3--4}: false negatives on thin/small targets (GT present but prediction absent). 
    \textbf{Cases 5--6}: false positives on \emph{normal-class} images (no lesion annotation in GT), where the model predicts spurious regions.}
    \label{fig:busi_dinov3_qualitative_6cases}
\end{figure}

\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.5\linewidth]{qualitative_grid_compact_2_no_original_keep_case_header.png}
    \caption{\textbf{Qualitative DDTI segmentation results for fine-tuned DINOv3.}
    Columns show \textit{Input}, \textit{Input + GT} (ground-truth nodule contour in \textcolor{green}{green}), and \textit{GT vs Pred} (DINOv3 prediction in \textcolor{red}{red} overlaid with GT in \textcolor{green}{green}). Across representative cases, predictions are spatially focused and closely match the annotated nodule boundaries, with only minor contour deviations. Case 5 illustrates a small-target false negative / under-segmentation: the nodule is small and low-contrast, and DINOv3 produces a weak or nearly absent prediction, missing most of the annotated region. This contrasts with BUSI, where DINOv3 produces frequent false negatives on small lesions and false positives on normal-class images (Figure~\ref{fig:busi_dinov3_qualitative_6cases}).}
    \label{fig:ddti_dinov3_qualitative}
\end{figure}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.75\linewidth]{busi_dinov3_train_curve_loss.png}
    \caption{\textbf{Optimization behavior of fine-tuned DINOv3 on BUSI.}
    Training loss decreases steadily over 50 epochs, while validation loss increases and becomes unstable after early training, indicating overfitting during full fine-tuning on BUSI under our standard protocol.}
    \label{fig:busi_dinov3_overfit_curve}
\end{figure}



\end{document}
