\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{booktabs}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{makecell}
\usepackage{placeins}
\usepackage{float}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 385}
\editors{Accepted for publication at MIDL 2026}

\title[HiPro-CT: Probabilistic 3D Medical VLM]{HiPro-CT: A Hierarchical Probabilistic Framework for 3D Medical Vision-Language Alignment}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Lin Lu\nametag{$^{1}$}} \Email{l-lu22@mails.tsinghua.edu.cn}\\
\Name{Zihan Liu\nametag{$^{1}$}} \Email{zh-liu22@mails.tsinghua.edu.cn}\\
\Name{Chaoxiang Tang\nametag{$^{1}$}} \Email{tang-cx24@mails.tsinghua.edu.cn}\\
\Name{Hui Zhang\midljointauthortext{Corresponding author}\nametag{$^{1}$}} \Email{hzhang@tsinghua.edu.cn}\\
\addr $^{1}$ School of Biomedical Engineering, Tsinghua University, Beijing, China
}

\begin{document}

\maketitle

\begin{abstract}
The adaptation of vision-language models (VLMs) to 3D medical imaging is currently impeded by two fundamental bottlenecks: 
the dilution of local features caused by the granularity mismatch between volumetric data and textual reports, and the inability of deterministic embeddings 
to capture the inherent semantic uncertainty of clinical descriptions. To address these challenges, we propose HiPro-CT, a novel hierarchical probabilistic framework for 3D medical vision-language alignment. 
Unlike traditional point-based approaches, HiPro-CT maps images and texts into Gaussian probability distributions, 
utilizing variance to explicitly quantify uncertainty and enhance robustness against incompleteness and polysemy. 
We introduce a soft masked pooling strategy that performs weighted feature aggregation guided by anatomical masks, 
enabling precise organ-level alignment while preserving boundary context. Furthermore, we devise a hierarchical inclusion loss to enforce geometric constraints within the embedding space, 
ensuring that the more certain (lower-variance) global representations are geometrically grounded within the more uncertain local distributions. 
Extensive experiments demonstrate that HiPro-CT significantly outperforms state-of-the-art deterministic baselines 
in zero-shot multi-abnormality detection and cross-modal retrieval, validating the efficacy of integrating fine-grained anatomical supervision with probabilistic representation learning.
\end{abstract}

\begin{keywords}
Vision-Language Models, Computed Tomography, Probabilistic Embedding, Uncertainty Modeling, Fine-grained Alignment
\end{keywords}

\section{Introduction}
In recent years, the application of deep learning in medical image analysis has evolved from single-modality, single-task approaches toward multi-modal, general-purpose Foundation Models. 
Vision-Language Models (VLMs), represented by CLIP (Contrastive Language-Image Pre-training) \cite{radford2021learning}, have demonstrated remarkable open-vocabulary recognition capabilities through pre-training on large-scale image-text pairs.
However, transferring this paradigm to three-dimensional medical imaging, such as CT and MRI, encounters two fundamental challenges.

First, the mismatch in information granularity between vision and text leads to feature dilution. There is a substantial disparity in information density between medical images and radiology reports. 
A single 3D CT volume contains millions of voxels replete with rich anatomical details, whereas the corresponding report typically consists of only hundreds of words, highly focused on pathological findings. 
Existing mainstream models (e.g., MedCLIP \cite{wang2022medclip}, CT-CLIP \cite{hamamci2024developing}) usually adopt a coarse-grained global alignment strategy, compressing high-dimensional volumetric data into a single vector via global pooling. 
This operation inevitably leads to feature dilution, causing critical local features to be overshadowed by background anatomical structures, thereby hindering precise semantic alignment.

Second, medical text is characterized by semantic incompleteness and uncertainty. Medical image-text pairs possess inherent ``one-to-many'' polysemy 
(for instance, the same imaging manifestation may be described as a ``nodule'' or a ``mass'', while reports from different patients may contain identical sentences). 
Traditional deterministic embeddings attempt to map complex clinical semantics to a single point in Euclidean space. Mathematically, this often leads to embedding space collapse, 
failing to effectively capture the inherent range of uncertainty in clinical diagnosis. 
Furthermore, such deterministic mapping struggles to handle the ``false negative'' supervision signals arising from the omission of normal organ descriptions in reports, 
thus limiting the model's ability to learn representations of normal anatomical structures.

To alleviate these challenges, this study proposes \textbf{HiPro-CT}, constructing the first 3D medical VLM that combines fine-grained strong supervision with probabilistic modeling. 
The core philosophy is to introduce probabilistic representations and hierarchical geometric constraints atop fine-grained visual alignment, thereby uniformly characterizing the semantics 
and uncertainty of both images and text while explicitly injecting anatomical hierarchical information. The specific innovations are as follows:

\begin{itemize}
    \item \textbf{Probabilistic embedding:} Images and texts are elevated from point vectors to Gaussian probability distributions. 
    The distribution mean carries the core semantics, while the variance quantifies uncertainty. This enhances the model's robustness to polysemy and semantic ambiguity and provides a more rational metric for cross-modal similarity.
    \item \textbf{Soft masked pooling:} Instead of hard cropping, organ or regional masks are utilized to perform soft weighting on features. 
    This approach preserves boundary and contextual information while calculating organ-level features, enabling finer-grained semantic alignment.
   \item \textbf{Hierarchical inclusion loss:} Geometric constraints are introduced within the probabilistic space to ensure that local organ distributions, 
   which contain less information, statistically ``encompass'' the global distribution. This mechanism explicitly encodes the ``global-organ'' anatomical semantic hierarchy and calibrates representations across different scales.
\end{itemize}

Subsequently, we will systematically review the relevant literature, focusing on the evolution of medical VLM architectures, cross-modal applications of probabilistic representation learning, 
fine-grained visual alignment strategies, and hierarchical semantic constraint mechanisms, to substantiate the theoretical basis and innovative value of HiPro-CT.

\section{Related Work}

\subsection{3D Medical Vision-Language Models: From Global to Fine-grained}

Early medical Vision-Language Models (VLMs) primarily focused on 2D modalities, such as X-rays \cite{shentu2024cxr, mamdouh2025advancements}. Pioneering works like MedCLIP and BiomedCLIP \cite{zhang2023biomedclip} adapted CLIP-style contrastive learning to the medical domain, 
employing decoupled dual-tower encoders to achieve efficient zero-shot classification and open-vocabulary recognition. 
To accommodate 3D modalities like CT and MRI, models such as CT-CLIP and Med3DVLM \cite{xin2025med3dvlm} introduced 3D Transformers or 3D CNNs to encode entire volumetric data into detailed global features for alignment with radiology reports. However, 
``Global-Global'' alignment reveals significant information asymmetry in three-dimensional scenarios: massive voxel details are compressed via coarse-grained pooling, 
causing subtle pathological features to be diluted by background noise, which hinders the model's ability to perceive minute anomalies.

To alleviate this bottleneck, researchers have begun exploring fine-grained alignment mechanisms. CT-GLIP \cite{lin2024ct}, drawing inspiration from Grounded Language-Image Pre-training, 
utilizes pre-computed segmentation masks to perform hard cropping. This forces the alignment of specific organ image patches with corresponding descriptive sentences, thereby enhancing organ-level recognition capabilities. 
Nevertheless, hard cropping relies heavily on high-quality boundaries and tends to discard peripheral context, limiting performance in scenarios involving invasive tumors or blurred boundaries. 
Anatomy-VLM \cite{gu2025anatomy} addresses this by extracting regions of interest (ROI) at the feature map level and concatenating local and global tokens to enhance context awareness. 
However, most such approaches remain based on deterministic bounding boxes or binary masks. They struggle to characterize the ``unmentioned $\neq$ non-existent'' nature of reports and terminological polysemy, 
failing to explicitly model uncertainty and anatomical hierarchy at the representation level.

\subsection{Probabilistic Representation and Uncertainty Modeling}

To address semantic ambiguity and the ``one-to-many'' nature of cross-modal matching, probabilistic embedding has progressively entered vision-language research. 
PCME \cite{chun2021probabilistic, chun2023improved} pioneered the embedding of images and texts as probability distributions (typically Gaussian), where the distribution mean represents central semantics and the variance characterizes range and uncertainty. 
By replacing point-to-point similarity with probabilistic similarity, this approach improves robustness in many-to-many matching. ProLIP \cite{chun2024probabilistic} further introduced an ``uncertainty token'' for efficient variance estimation 
and utilized an ``inclusion loss'' to learn geometric relationships between occluded and complete information, thereby improving cross-modal retrieval and descriptive consistency.

In the medical domain, however, probabilistic modeling remains in its nascent stages. ProbMED \cite{gao2025probmed} utilized Hellinger Distance instead of cosine similarity to measure distributional divergence, 
validating the efficacy of probabilistic methods in multi-modal alignment for chest X-rays and electrocardiograms. 
Anatomy-VLM \cite{gu2025anatomy}, functioning on the premise that ``text is more abstract,'' adopted von Mises-Fisher distributions for post-hoc modeling of text embeddings to mitigate textual uncertainty. 
Nevertheless, existing medical VLMs often limit probabilistic modeling to a single modality (predominantly text), overlooking visual uncertainty in 3D medical imaging caused by imaging quality, 
motion artifacts, or blurred lesion boundaries. Furthermore, they lack a unified probabilistic framework synergistic with fine-grained regional alignment and fail to explicitly inject anatomical hierarchical structures within the probabilistic space.

In summary, existing 3D medical VLMs are constrained by two primary bottlenecks. First, fine-grained alignment relies excessively on hard cropping, making it difficult to balance contextual information around lesions with boundary robustness. 
Second, probabilistic modeling has not yet successfully covered both image and text modalities, nor has it explicitly constrained the ``Global-Local'' anatomical hierarchy at the distributional level. 
Addressing these gaps, we propose a unified framework combining Soft Masked Pooling with Hierarchical Probabilistic Embeddings. 
This approach aims to transcend current performance ceilings in open-vocabulary recognition and small lesion detection through more precise semantic alignment and effective uncertainty characterization.

\section{Materials and Methods}
\subsection{Datasets}
In this study, we utilize the RadGenome-Chest CT dataset \cite{zhang2024radgenome}. Built upon 25,692 non-contrast 3D chest CT volumes and their corresponding reports from the CT-RATE repository, 
this dataset was originally constructed via a dual-stream pipeline designed to achieve region-level alignment and augmentation between visual and textual modalities.

On the imaging side, the dataset employed a text-prompted universal segmentation model (SAT) \cite{zhao2023one} to perform comprehensive 3D segmentation across 197 chest-related anatomical categories. 
The volumetric data were standardized to a unified voxel spacing of $1\,\text{mm}\times1\,\text{mm}\times3\,\text{mm}$, resulting in a multi-level anatomical structure tree with corresponding organ-level masks.

On the textual side, a report sentence splitting and classification model was applied to decompose the findings and impression sections into region-specific descriptions organized by anatomical hierarchy. 
This process established explicit anchoring relationships between individual sentences and their corresponding segmentation masks. 
Consequently, the dataset provides approximately 665{,}000 multi-granularity, sentence-level ``region-report'' pairs for training and evaluation.

\begin{figure}[!htbp]
 \floatconts
   {fig:1}
   {\caption{Framework of HiPro-CT}}
   {\includegraphics[width=0.95\linewidth]{figs/framework.pdf}}
 \end{figure}

\subsection{Framework}

As illustrated in \figureref{fig:1}, the proposed HiPro-CT is a unified framework designed for hierarchical probabilistic vision-language alignment. 
The architecture consists of parallel 3D visual and textual encoders that map input CT volumes and radiology reports into a shared probabilistic embedding space, modeling them as Gaussian distributions to capture inherent uncertainty. 
To bridge the granularity gap, the framework incorporates two distinct alignment levels: a global branch for aligning the entire volume with the full report, 
and a fine-grained branch utilizing Soft Masked Pooling to align specific organ features with their corresponding sentence descriptions. 
The entire model is optimized end-to-end using a hybrid objective that combines Probabilistic Contrastive Loss for semantic alignment and Hierarchical Inclusion Loss to enforce geometric constraints between global and local distributions.

\subsection{Probabilistic Encoders and Uncertainty Estimation}

Our framework consists of a 3D visual encoder $f_v$ and a text encoder $f_t$. Instead of deterministic vectors, we model the output embeddings as diagonal Gaussian 
distributions $p(z|x) = \mathcal{N}(z; \mu(x), \Sigma(x))$, where $\Sigma(x)$ is a diagonal covariance matrix representing the uncertainty.

\textbf{3D Visual Encoder.}
Given a 3D CT volume $V \in \mathbb{R}^{D \times H \times W}$, we employ a 3D Vision Transformer (ViT) \cite{dosovitskiy2020image} as the backbone. The input volume is divided into non-overlapping patches 
and linearly projected into patch embeddings $E_v = \{e_1, e_2, \ldots, e_N\}$, where $N$ is the number of patches.

To estimate the distribution parameters $(\mu_v, \Sigma_v)$, we utilize a specific \textbf{Probabilistic Projection Head}:
\begin{itemize}
    \item \textbf{Mean ($\mu_v$):} We perform global average pooling on $E_v$ followed by a linear projection and $L_2$ normalization to map the representation onto a hypersphere:
    \begin{equation}
        \mu_{v} = \text{Normalize}(\text{Linear}(\text{MeanPool}(E_v)))
    \end{equation}
    \item \textbf{Variance ($\Sigma_v$):} Unlike previous works that use a static token, we introduce a learnable \textbf{Variance Query} $Q_{var} \in \mathbb{R}^{1 \times C}$ to 
    dynamically aggregate uncertainty information from the patch features via a Multi-Head Attention (MHA) mechanism:
    \begin{equation}
        h_{var} = \text{MHA}(\text{query}=Q_{var},\ \text{key}=E_v,\ \text{value}=E_v)
    \end{equation}
    \begin{equation}
        \log \sigma^2_{v} = \text{Linear}_{var}(h_{var})
    \end{equation}
    where $\Sigma_v = \text{diag}(\exp(\log \sigma^2_{v}))$. This query-based mechanism allows the model to attend to specific ambiguous regions (e.g., blurry boundaries) when estimating the global uncertainty.
\end{itemize}

\textbf{Text Encoder.}
Similarly, for the input text $T$, we use a Transformer-based encoder. The global text distribution $\mathcal{N}(\mu_{t}, \Sigma_{t})$ is derived from the last hidden states using a parallel mechanism: 
the mean is derived from the \texttt{[CLS]} token, and the variance is computed using a learnable text variance query attending to the sequence outputs.

\subsection{Fine-grained Alignment via Soft Masked Pooling}

A critical limitation of standard CLIP on 3D data is the dominance of background information. To enforce fine-grained alignment, we utilize anatomical segmentation masks. 
However, since masks are defined at the voxel level whereas ViT features are patch-level, hard cropping leads to information loss. To avoid this, we propose soft masked pooling.

Let $M \in \{0, 1\}^{D \times H \times W}$ be the binary mask for a specific organ. We first downsample $M$ to match the patch resolution of the ViT output, 
resulting in a soft weight map $W \in [0, 1]^{N}$, where $W_i$ represents the proportion of the organ within the $i$-th patch. The organ-specific visual mean $\mu_{v\_local}$ is computed as:
\begin{equation}
\tilde{w}_i = \frac{W_i}{\sum_{j=1}^N W_j + \epsilon}, \quad \mu_{v\_local} = \text{Normalize}\left(\text{Linear}\left(\sum_{i=1}^N \tilde{w}_i \cdot e_i\right)\right)
\end{equation}
For the organ-specific variance $\Sigma_{v\_local}$, we apply the Variance Query $Q_{var}$ to the masked patch features, ensuring the uncertainty estimate focuses only on the relevant anatomical region. 
The corresponding local text embedding $\mathcal{N}(\mu_{t\_local}, \Sigma_{t\_local})$ is encoded from the specific sentence describing that organ.

\subsection{Optimization Objectives}

We optimize the network using a hybrid loss function that governs both alignment and distributional relationships.

\subsubsection{Probabilistic Pairwise Contrastive Loss (PPCL)}
We employ the Closed-form Sampled Distance (CSD) \cite{chun2023improved} to measure the distance between two distributions. For a vision distribution $z_v$ and a text distribution $z_t$:
\begin{equation}
    \text{CSD}(z_v, z_t) = \|\mu_v - \mu_t\|^2_2 + \text{Tr}(\Sigma_v + \Sigma_t)
\end{equation}
Following SigLIP \cite{zhai2023sigmoid}, we implement the contrastive loss using a sigmoid-based formulation. The logit for a pair is defined as:
\begin{equation}
    s(z_v, z_t) = a \cdot \left(\mu_v^\top \mu_t - \frac{1}{2}\text{Tr}(\Sigma_v + \Sigma_t)\right) + b
\end{equation}
where $a$ and $b$ are learnable scale and bias parameters. We minimize the binary cross-entropy loss $\mathcal{L}_{PPCL}$ on these logits for both global-global pairs and local-local (organ-sentence) pairs.

\subsubsection{Hierarchical Inclusion Loss}
To model the probabilistic relationship between global information and local information, we introduce a \textbf{Hierarchical Inclusion Loss}. 
We model the hierarchical relationship based on information certainty, a principle applicable to both visual and textual modalities.
Global inputs (e.g., the full CT volume or the complete radiology report) provide comprehensive context that resolves semantic ambiguity, yielding more deterministic representations with lower variance.
Conversely, local fragments (e.g., organ patches or isolated sentences) often lack boundary context or specific references, leading to inherent uncertainty and broader distributions.
Therefore, we enforce a constraint where the concentrated global distribution is probabilistically contained within the diffuse local distribution ($z_{global} \subset z_{local}$).
We define an inclusion score $H(z_{global} \subset z_{local})$ based on the log-integral of the probability density functions:
\begin{equation}
    H(z_1 \subset z_2) = \log \int p_1^2(x)p_2(x)dx - \log \int p_1(x)p_2^2(x)dx
\end{equation}
A positive $H$ indicates that $z_1$ is likely included in $z_2$. We use this score to enforce:
\begin{enumerate}[label=(\arabic*)]
    \item Visual Inclusion: Global CT distribution $\subset$ Organ CT distribution.
    \item Textual Inclusion: Full report distribution $\subset$ Sentence distribution.
\end{enumerate}

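The loss is formulated as $\mathcal{L}_{hier} = -\log \sigma(H(z_{global} \subset z_{local}))$, where $\sigma(\cdot)$ denotes the sigmoid function (not to be confused with the standard deviation $\sigma_i$ of the embeddings).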

\subsubsection{Cross-modal Inclusion Loss}
Additionally, following the intuition that text descriptions are often more abstract (and thus more uncertain) than specific images, 
we apply a cross-modal inclusion loss $\mathcal{L}_{cross}$ to encourage the text distribution to encompass the image distribution (Image $\subset$ Text), 
further regularizing the uncertainty estimation.

\subsubsection{Total Objective}
Finally, to prevent variance collapse, we add a Variational Information Bottleneck (VIB) regularization term $\mathcal{L}_{vib}$:
\begin{equation}
    \mathcal{L}_{vib} = \mathrm{KL}\left(\mathcal{N}(\mu, \Sigma) \,\|\, \mathcal{N}(0, I)\right) = \frac{1}{2} \sum_{i=1}^{D} \left(\mu_i^2 + \sigma_i^2 - 1 - \log \sigma_i^2\right)
\end{equation}
where KL is the Kullback-Leibler divergence and $D$ here denotes the dimensionality of the embedding space (not the depth of the CT volume).
The total objective is:
\begin{equation}
    \mathcal{L} = \mathcal{L}_{PPCL} + \lambda_1 \mathcal{L}_{hier} + \lambda_2 \mathcal{L}_{cross} + \lambda_3 \mathcal{L}_{vib}
\end{equation}
where $\lambda_1$, $\lambda_2$, $\lambda_3$ are hyperparameters balancing the contributions. This multi-objective optimization ensures that our model learns discriminative features for zero-shot classification while maintaining a structured, 
interpretable probabilistic embedding space.

\section{Experiments and Results}
\subsection{Implementation Details}

The 3D visual encoder consists of a 6-layer ViT trained from scratch, and the text encoder is initialized with the pre-trained BiomedVLP-CXR-BERT-specialized weights \cite{boecking2022making}. 
Input CT volumes are standardized to a resolution of $336 \times 336 \times 96$ via center cropping or padding, with a patch size of $16 \times 16 \times 8$. During training, each sample consists of a global volume-report pair. 
Additionally, we employ a stochastic sampling strategy that randomly selects one organ mask and its corresponding sentence per step to compute the fine-grained (organ-level) alignment and inclusion losses. 
The model is trained for $50{,}000$ steps on 4 NVIDIA H800 GPUs with a per-GPU batch size of 4. 
We optimize the network using the AdamW optimizer \cite{loshchilov2017decoupled} with a learning rate of $1 \times 10^{-5}$ and a cosine decay scheduler, utilizing bfloat16 mixed-precision to enhance efficiency.
Finally, the loss balancing coefficients $\lambda_1$, $\lambda_2$, and $\lambda_3$ are set to $0.1$, $0.0001$, and $0.1$, respectively.
All experimental results reported in the following tables are averaged over 1{,}000 bootstrap runs on the test set, with 95\% confidence intervals shown in parentheses.

\subsection{Zero-shot multi-abnormality detection}\label{sec:zeroshot}
To evaluate the generalization capability and clinical validity of the learned representations, we conducted zero-shot multi-abnormality detection on the RadGenome-Chest test set, 
with ground truth labels derived from the original CT-RATE annotations. We compared our framework against state-of-the-art baselines, including the supervised CT-Net \cite{draelos2021machine} and ViT-3D models and the zero-shot CT-CLIP. 
Critically, to ensure a fair comparison that isolates the contribution of architectural design from external pre-training benefits, 
we re-initialized the visual encoder of the CT-CLIP baseline and retrained it from scratch using identical hyperparameters and sampling strategies as HiPro-CT. 
During inference, we employed the text prompts ``\textit{\{Abnormality\} is present}'' and ``\textit{\{Abnormality\} is not present}'' to compute similarity scores, 
and as shown in \tableref{tab:1}, our proposed probabilistic framework significantly outperforms the standard CLIP paradigm trained under the same conditions, 
demonstrating superior zero-shot transferability.

\subsection{Cross-modal Retrieval}
To further evaluate the granularity of semantic alignment beyond simple classification, 
we conducted cross-modal retrieval experiments on a randomly sampled subset of 100 volume-report pairs from the test set. We performed both Text-to-Image (T2I) and Image-to-Text (I2T) retrieval, 
reporting Recall@K (R@1, R@5, R@10) metrics. For the deterministic CT-CLIP baseline, rankings were generated using standard cosine similarity. 
In contrast, HiPro-CT utilizes the negative Closed-form Sampled Distance to rank candidates, leveraging the learned variance to measure probabilistic overlap. As detailed in \tableref{tab:2}, 
HiPro-CT consistently outperforms the baseline. This superiority suggests that our probabilistic framework effectively captures distinct pathological details via soft masked pooling 
and handles the semantic ambiguity of reports better than point-based embeddings, resulting in more precise matching in the open retrieval space.

\begin{table}[H]
 \floatconts
   {tab:1}%
   {\caption{Zero-shot multi-abnormality detection on four metrics: Accuracy, Precision, F1 Score (Weighted), AUROC}}%
 {\resizebox{0.85\textwidth}{!}{\begin{tabular}{cccccc}
 \toprule
 \bfseries Method & \bfseries Type & \bfseries Accuracy & \bfseries Precision & \bfseries F1 Score & \bfseries AUROC \\
 \midrule
CT-Net & Supervised & \makecell{0.617\\[-4pt]{\scriptsize (0.609, 0.624)}} & \makecell{0.264\\[-4pt]{\scriptsize (0.257, 0.272)}} & \makecell{0.657\\[-4pt]{\scriptsize (0.646, 0.667)}} & \makecell{0.629\\[-4pt]{\scriptsize (0.620, 0.638)}}\\
ViT-3D & Supervised & \makecell{\textbf{0.815}\\[-4pt]{\scriptsize (0.809, 0.822)}} & \makecell{0.235\\[-4pt]{\scriptsize (0.223, 0.248)}} & \makecell{\textbf{0.777}\\[-4pt]{\scriptsize (0.770, 0.785)}} & \makecell{0.709\\[-4pt]{\scriptsize (0.697, 0.719)}}\\
\midrule
CT-CLIP & Zero-shot & \makecell{0.643\\[-4pt]{\scriptsize (0.632, 0.653)}} & \makecell{0.290\\[-4pt]{\scriptsize (0.279, 0.301)}} & \makecell{0.680\\[-4pt]{\scriptsize (0.670, 0.690)}} & \makecell{0.679\\[-4pt]{\scriptsize (0.668, 0.689)}}\\
HiPro-CT & Zero-shot & \makecell{0.684\\[-4pt]{\scriptsize (0.670, 0.695)}} & \makecell{\textbf{0.326}\\[-4pt]{\scriptsize (0.315, 0.337)}} & \makecell{0.716\\[-4pt]{\scriptsize (0.704, 0.727)}} & \makecell{\textbf{0.729}\\[-4pt]{\scriptsize (0.720, 0.738)}}\\
  \bottomrule
  \end{tabular}}}
 \end{table}

\begin{table}[H]
 \floatconts
   {tab:2}%
   {\caption{Comparison of retrieval accuracy between CT-CLIP and HiPro-CT. Results are reported as Recall@K (\%) on the test subset ($N=100$).}}%
 {\resizebox{\textwidth}{!}{\begin{tabular}{ccccccc}
  \toprule
  \multirow{2}{*}{\bfseries Method} & \multicolumn{3}{c}{\bfseries Image-to-Text} & \multicolumn{3}{c}{\bfseries Text-to-Image}\\
  \cmidrule(lr){2-4} \cmidrule(lr){5-7}
   & \bfseries R@1 & \bfseries R@5 & \bfseries R@10 & \bfseries R@1 & \bfseries R@5 & \bfseries R@10\\
  \midrule
  CT-CLIP & \makecell{9.77\\[-4pt]{\scriptsize (8.82, 10.74)}} & \makecell{33.06\\[-4pt]{\scriptsize (31.97, 34.14)}} & \makecell{49.77\\[-4pt]{\scriptsize (48.72, 50.77)}} & \makecell{9.95\\[-4pt]{\scriptsize (9.08, 10.87)}} & \makecell{33.20\\[-4pt]{\scriptsize (32.16, 34.27)}} & \makecell{49.85\\[-4pt]{\scriptsize (48.85, 50.83)}}\\
  HiPro-CT & \makecell{\textbf{10.05}\\[-4pt]{\scriptsize (9.08, 11.06)}} & \makecell{\textbf{36.13}\\[-4pt]{\scriptsize (34.97, 37.28)}} & \makecell{\textbf{54.55}\\[-4pt]{\scriptsize (53.52, 55.56)}} & \makecell{\textbf{10.26}\\[-4pt]{\scriptsize (9.27, 11.25)}} & \makecell{\textbf{35.66}\\[-4pt]{\scriptsize (34.40, 36.83)}} & \makecell{\textbf{54.69}\\[-4pt]{\scriptsize (53.64, 55.75)}}\\
   \bottomrule
   \end{tabular}}}
 \end{table}

\subsection{Ablation Study on Hierarchical Probabilistic Constraints}
To investigate the individual contributions of our proposed optimization objectives, we conducted a progressive ablation study with five configurations: 
(1) using solely the Global Probabilistic Pairwise Contrastive Loss (PPCL) as a baseline; 
(2) combining Global and Local PPCL to introduce fine-grained organ-level supervision; 
(3) further adding the Hierarchical Inclusion Loss $\mathcal{L}_{hier}$; 
(4) further adding the Cross-modal Inclusion Loss $\mathcal{L}_{cross}$; and 
(5) the full HiPro-CT framework, which additionally incorporates the VIB regularization term $\mathcal{L}_{vib}$. 
Importantly, this experimental design also provides a direct ablation on replacing the standard deterministic CLIP objective with our probabilistic formulation. Specifically, the CT-CLIP baseline in \tableref{tab:1} serves as the standard point-embedding CLIP loss benchmark (retrained from scratch with identical hyperparameters; see Section~\ref{sec:zeroshot}), while ``PPCL (Global)'' in \tableref{tab:3} applies PPCL only at the global level (distributional embeddings) without any local (mask/organ-level) alignment or hierarchical inclusion constraints. 
Their comparison therefore isolates the effect of mapping inputs to distributions, which improves zero-shot detection, whereas for cross-modal retrieval the global-only probabilistic objective alone does not improve over the deterministic baseline (see \tableref{tab:2} vs.\ \tableref{tab:4}), motivating the added fine-grained and hierarchical constraints.
\begin{table}[ht!]
  % The first argument is the label.
  % The caption goes in the second argument, and the table contents
  % go in the third argument.
 \floatconts
   {tab:3}%
   {\caption{Ablation study on zero-shot multi-abnormality detection performance across different loss configurations.}\vspace{-10pt}}%
 {\resizebox{0.8\textwidth}{!}{\begin{tabular}{ccccc}
  \toprule
  \bfseries Loss Function & \bfseries Accuracy & \bfseries Precision & \bfseries F1 Score & \bfseries AUROC\\
  \midrule
   {\small PPCL (Global)} & \makecell{0.655\\[-4pt]{\scriptsize (0.643, 0.668)}} & \makecell{0.303\\[-4pt]{\scriptsize (0.291, 0.314)}} & \makecell{0.691\\[-4pt]{\scriptsize (0.680, 0.702)}} & \makecell{0.702\\[-4pt]{\scriptsize (0.692, 0.713)}}\\[8pt]
   {\small PPCL (Global \& Local)} & \makecell{0.673\\[-4pt]{\scriptsize (0.660, 0.685)}} & \makecell{0.325\\[-4pt]{\scriptsize (0.315, 0.337)}} & \makecell{0.707\\[-4pt]{\scriptsize (0.695, 0.718)}} & \makecell{0.720\\[-4pt]{\scriptsize (0.711, 0.730)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\)}} & \makecell{0.679\\[-4pt]{\scriptsize (0.665, 0.692)}} & \makecell{0.321\\[-4pt]{\scriptsize (0.310, 0.333)}} & \makecell{0.711\\[-4pt]{\scriptsize (0.699, 0.722)}} & \makecell{0.724\\[-4pt]{\scriptsize (0.715, 0.734)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\) + \(L_{\mathrm{cross}}\)}} & \makecell{0.681\\[-4pt]{\scriptsize (0.668, 0.693)}} & \makecell{\textbf{0.327}\\[-4pt]{\scriptsize (0.316, 0.338)}} & \makecell{0.713\\[-4pt]{\scriptsize (0.701, 0.723)}} & \makecell{0.726\\[-4pt]{\scriptsize (0.717, 0.736)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\) + \(L_{\mathrm{cross}}\) + \(L_{\mathrm{vib}}\)}} & \makecell{\textbf{0.684}\\[-4pt]{\scriptsize (0.670, 0.695)}} & \makecell{0.326\\[-4pt]{\scriptsize (0.315, 0.337)}} & \makecell{\textbf{0.716}\\[-4pt]{\scriptsize (0.704, 0.727)}} & \makecell{\textbf{0.729}\\[-4pt]{\scriptsize (0.720, 0.738)}}\\
   \bottomrule
   \end{tabular}}}
 \end{table}
\vspace{-20pt}
\begin{table}[H]
 \floatconts
   {tab:4}%
   {\caption{Ablation study on cross-modal retrieval performance across different loss configurations.}\vspace{-10pt}}%
{\resizebox{1\textwidth}{!}{%
\begin{tabular}{ccccccc}
  \toprule
  \multirow{2}{*}{\bfseries Loss Function} & \multicolumn{3}{c}{\bfseries Image-to-Text Retrieval} & \multicolumn{3}{c}{\bfseries Text-to-Image Retrieval}\\
  \cmidrule(lr){2-4} \cmidrule(lr){5-7}
   & \bfseries R@1 & \bfseries R@5 & \bfseries R@10 & \bfseries R@1 & \bfseries R@5 & \bfseries R@10\\
  \midrule
   {\small PPCL (Global)} & \makecell{8.62\\[-4pt]{\scriptsize (7.74, 9.59)}} & \makecell{30.54\\[-4pt]{\scriptsize (29.35, 31.71)}} & \makecell{47.80\\[-4pt]{\scriptsize (46.87, 48.85)}} & \makecell{9.01\\[-4pt]{\scriptsize (8.06, 10.04)}} & \makecell{32.23\\[-4pt]{\scriptsize (31.14, 33.31)}} & \makecell{48.56\\[-4pt]{\scriptsize (47.51, 49.62)}}\\[8pt]
   {\small PPCL (Global \& Local)} & \makecell{10.04\\[-4pt]{\scriptsize (9.08, 11.06)}} & \makecell{33.80\\[-4pt]{\scriptsize (32.67, 34.85)}} & \makecell{51.50\\[-4pt]{\scriptsize (50.45, 52.56)}} & \makecell{9.90\\[-4pt]{\scriptsize (8.95, 10.87)}} & \makecell{33.89\\[-4pt]{\scriptsize (32.74, 35.04)}} & \makecell{51.89\\[-4pt]{\scriptsize (50.90, 52.88)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\)}} & \makecell{10.10\\[-4pt]{\scriptsize (9.08, 11.10)}} & \makecell{34.96\\[-4pt]{\scriptsize (33.82, 36.13)}} & \makecell{53.06\\[-4pt]{\scriptsize (52.05, 54.16)}} & \makecell{\textbf{10.64}\\[-4pt]{\scriptsize (9.65, 11.64)}} & \makecell{35.10\\[-4pt]{\scriptsize (33.89, 36.25)}} & \makecell{53.28\\[-4pt]{\scriptsize (52.30, 54.28)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\) + \(L_{\mathrm{cross}}\)}} & \makecell{\textbf{10.12}\\[-4pt]{\scriptsize (9.08, 11.13)}} & \makecell{35.56\\[-4pt]{\scriptsize (34.34, 36.77)}} & \makecell{54.17\\[-4pt]{\scriptsize (53.13, 55.18)}} & \makecell{10.32\\[-4pt]{\scriptsize (9.27, 11.32)}} & \makecell{\textbf{36.16}\\[-4pt]{\scriptsize (34.97, 37.34)}} & \makecell{54.40\\[-4pt]{\scriptsize (53.32, 55.56)}}\\[8pt]
  {\small\makecell{PPCL (Global \& Local)\\+ \(L_{\mathrm{hier}}\) + \(L_{\mathrm{cross}}\) + \(L_{\mathrm{vib}}\)}} & \makecell{10.05\\[-4pt]{\scriptsize (9.08, 11.06)}} & \makecell{\textbf{36.13}\\[-4pt]{\scriptsize (34.97, 37.28)}} & \makecell{\textbf{54.55}\\[-4pt]{\scriptsize (53.52, 55.56)}} & \makecell{10.26\\[-4pt]{\scriptsize (9.27, 11.25)}} & \makecell{35.66\\[-4pt]{\scriptsize (34.40, 36.83)}} & \makecell{\textbf{54.69}\\[-4pt]{\scriptsize (53.64, 55.75)}}\\
   \bottomrule
   \end{tabular}}}
\end{table}
 As presented in \tableref{tab:3} and \tableref{tab:4},
 performance improves on most metrics as the loss functions become more comprehensive, although a few metrics (e.g., Precision and R@1) fluctuate slightly between the later configurations. Specifically,
 the introduction of Local PPCL outperforms the global-only baseline on every metric by mitigating feature dilution,
 while the final integration of inclusion losses yields the best overall performance (highest Accuracy, F1 Score, AUROC, and R@10 in both retrieval directions) by explicitly enforcing geometric logical constraints within the probabilistic space. Detailed zero-shot multi-abnormality detection results are presented in Appendix~\ref{app:results}.
 This overall upward trend supports our hypothesis that modeling both multi-granularity semantics and their hierarchical interrelations is essential for robust 3D medical vision-language alignment.

\subsection{Qualitative Analysis}
To complement the quantitative evaluation, we provide two qualitative analyses in the appendix.
Appendix~\ref{app:viz} visualizes the learned Gaussian embeddings, confirming that the hierarchical inclusion relationships are preserved across both visual and textual modalities.
Appendix~\ref{app:loc} presents Grad-CAM-based localization comparisons, showing that HiPro-CT attends to more clinically relevant regions than CT-CLIP.

 \section{Conclusion}
 In this paper, we introduced HiPro-CT, a hierarchical probabilistic framework designed to overcome the critical limitations of feature dilution and semantic ambiguity in 3D medical vision-language alignment. 
 By synergizing Gaussian distributional embeddings with a novel Soft Masked Pooling mechanism, 
 our approach enables precise organ-level alignment while explicitly modeling the inherent uncertainty of clinical data through variance and hierarchical geometric constraints. 
 Extensive experiments on the RadGenome-Chest CT dataset demonstrate that HiPro-CT achieves superior performance over state-of-the-art deterministic baselines in both zero-shot abnormality detection and cross-modal retrieval. 
 These results collectively validate the efficacy of integrating fine-grained anatomical supervision with probabilistic representation learning, 
 offering a robust and interpretable paradigm for the advancement of 3D medical foundation models.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by the Institute for Intelligent Healthcare, Tsinghua University (No. 2022ZLB001) and the Tsinghua-Foshan Innovation Special Fund (No. 2021THFS0104).}


\bibliography{midl26_385}

\clearpage
\appendix

\setlength{\textfloatsep}{8pt plus 2pt minus 2pt}
\setlength{\floatsep}{6pt plus 2pt minus 2pt}
\setlength{\intextsep}{8pt plus 2pt minus 2pt}
\setlength{\abovecaptionskip}{4pt}
\setlength{\belowcaptionskip}{0pt}

\section{Results of zero-shot multi-abnormality detection}\label{app:results}
As shown in \figureref{fig:heatmap}, HiPro-CT outperforms CT-CLIP on all four evaluation metrics (Accuracy, Precision, F1 Score, and AUROC) for the vast majority of abnormalities,
including medical material, arterial and coronary artery wall calcification, cardiomegaly, pericardial and pleural effusion, lymphadenopathy, atelectasis, lung opacity, pulmonary fibrotic sequela,
mosaic attenuation pattern, peribronchial and interlobular septal thickening, and consolidation.
In contrast, CT-CLIP consistently outperforms HiPro-CT across all metrics in the detection of lung nodules and bronchiectasis.
\begin{figure}[!htbp]
\floatconts
  {fig:heatmap}
  {\caption{Heatmap of zero-shot multi-abnormality detection results.}}
  {\includegraphics[width=0.8\linewidth]{figs/heatmap_combined.png}\vspace{-6pt}}
\end{figure}

\FloatBarrier
\section{Visualizing Distributions}\label{app:viz}
\begingroup

\begin{figure}[!htbp]
\floatconts
  {fig:gaussian_projection_image}
  {\caption{Gaussian projection visualization for image embeddings.}}
  {\includegraphics[width=0.70\linewidth]{figs/gaussian_projection_image.pdf}}
\end{figure}

\begin{figure}[!htbp]
\floatconts
  {fig:gaussian_projection_text}
  {\caption{Gaussian projection visualization for text embeddings.}}
  {\includegraphics[width=0.70\linewidth]{figs/gaussian_projection_text.pdf}}
\end{figure}

These visualization results further corroborate the core intuition underlying our hierarchical probabilistic constraints. 
Specifically, to facilitate visualization, we project the original multi-dimensional Gaussian embeddings onto a single axis, so that each embedding becomes a one-dimensional Gaussian distribution.
By selecting a projection axis along which the projected means are as close as possible, we can directly compare the resulting variances and inspect their inclusion relationships.

As illustrated in \figureref{fig:gaussian_projection_image}, the Gaussian embedding derived from the masked input (e.g., utilizing a stomach mask to perform soft masked pooling)
yields a local distribution that statistically encompasses the distribution encoded by the corresponding global volume.
This phenomenon indicates that local anatomical semantics carry higher uncertainty; conversely, as global information is acquired,
the probability density becomes notably more concentrated.

Similarly, \figureref{fig:gaussian_projection_text} demonstrates the textual hierarchy: the distribution of a shorter, more abstract description (e.g., ``lung'') encompasses that of a longer, more specific one (e.g., ``right lung lower lobe'').
Collectively, these results validate that our learned probabilistic space preserves hierarchical inclusion relationships across both visual and textual embeddings.

\endgroup

\section{Localization results}\label{app:loc}
\begingroup
To qualitatively assess localization behavior, we visualize model attention using Grad-CAM~\citep{2020Grad}.
As shown in \figureref{fig:localization_result}, HiPro-CT exhibits more precise and clinically relevant focus on fine-grained regions corresponding to the target findings, whereas CT-CLIP tends to produce more diffuse or partially misplaced activations. 
This comparison suggests that the proposed fine-grained supervision together with probabilistic hierarchical constraints improves the model's ability to attend to the correct anatomical details.
\begin{figure}[!htbp]
\floatconts
  {fig:localization_result}
  {\caption{Localization result for the text: \textit{``In the left lung, there is linear density consistent with band atelectasis-sequelae changes in the inferior lingular segment.''}}}
  {\resizebox{0.65\linewidth}{!}{\includegraphics{figs/localization_compare.pdf}}\vspace{-10pt}}
\end{figure}
\endgroup
\end{document}