\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs,array}
\usepackage{comment} 
\usepackage[strings]{underscore} % makes _ usable in text and \texttt
\usepackage{seqsplit}   
\usepackage{chngcntr} 
\usepackage{xcolor}
\usepackage{graphicx}
\newcommand{\lbl}[1]{\texttt{\small\seqsplit{#1}}}
\newcommand{\fengbei}[1]{\color{black}#1\color{black}}
%\newcommand{\Edited}[1]{\color{blue}[ED: #1]\color{black}}
\newcommand{\Edited}[1]{\color{black}#1 \color{black}}
% \newcommand{\lbl}[1]{\ttfamily\small\seqsplit{#1}}
\jmlrvolume{-- 133}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}
\title[Cross-Modal Alignment for Opportunistic Cardiac Screening]{X-Cardia: Phenotype-Guided Cross-Modal Alignment for Opportunistic Cardiac Screening on Routine Chest CT}  %XCardia - cross cardiac, aligning cardiac to non-cardiac 
%\title[Short Title]{CANON: phenotype-guided framework for Aligning Cardiac and Non-cardiac Modalities} 
%phenotype-guided Multimodal Alignment: Bridging Cardiac and Non-Cardiac Modalities
% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Nusrat Binta Nizam\nametag{$^{1}$}} \Email{nn284@cornell.edu}\\
\Name{Fengbei Liu\nametag{$^{1}$}} \Email{fl453@cornell.edu}\\
\Name{Sunwoo Kwak\nametag{$^{1}$}} \Email{sk3355@cornell.edu}\\
\Name{Ilan Richter\nametag{$^{2}$}} 
\Email{ir2498@cumc.columbia.edu}\\
\Name{Jayant K Raikhelkar\nametag{$^{2}$}} 
\Email{jkr2146@cumc.columbia.edu}\\
\Name{Ashley Beecy\nametag{$^{3}$}} 
\Email{ashleybeecy@gmail.com}\\
\Name{Nir Uriel\nametag{$^{2}$}} 
\Email{nu2126@cumc.columbia.edu}\\
\Name{Deborah Estrin\nametag{$^{1}$}} \Email{destrin@cornell.edu}\\
\Name{Mert R Sabuncu\nametag{$^{1,4}$}} \Email{msabuncu@cornell.edu}\\
\addr $^{1}$ Cornell Tech, New York, USA\\
\addr $^{2}$ Columbia University Irving Medical Center, New York, USA\\
\addr $^{3}$ Sutter Health, California, USA\\
\addr $^{4}$ Weill Cornell Medicine, New York, USA
}

\begin{document}
\raggedbottom
\maketitle

\begin{abstract}
\begin{comment}
Deep learning models for cardiac prognostics often operate within single-modality frameworks, limiting their ability to capture physiologically meaningful cross-modal relationships. In particular, we focus on non-gated, non-contrast chest computed tomography (CT) scans that are typically acquired for entirely non-cardiac indications, rather than for dedicated cardiac assessment. We introduce X-Cardia, a phenotype-guided multimodal alignment framework that transfers structural cardiac phenotypes from echocardiography (ECHO) and electrocardiography (ECG) into CT representations by enforcing explicit phenotype-level consistency. This setting is intrinsically challenging because the lack of cardiac gating obfuscates the cardiac phase and the absence of contrast limits the visibility of cardiovascular structures, but these scans represent a rich resource for opportunistic cardiac screening. The approach combines CLIP-style contrastive pre-training to align image and tabular embeddings with a non-parametric Nadaraya--Watson phenotype head, which uses a support-bank to guide the latent space toward clinically meaningful axes. This enables the image encoder to learn physiological features that are generalized beyond the modality boundaries. We pre-train using data from 20,574 patients and fine-tune the resulting image encoder on ten cardiac abnormality prediction tasks. The proposed method consistently outperforms both the standard contrastive learning and the baseline without pre-training, achieving a gain of up to 8\% of AUROC on the test set. In the 5-shot setting, phenotype-guided alignment improves AUROC by an average of 9.8\% over baselines, demonstrating strong data efficiency and generalization from few labeled samples. Our results show that explicit phenotype-guided alignment yields interpretable, data-efficient representations that transfer cardiac knowledge to non-cardiac CTs, defining a promising paradigm for multimodal medical imaging.
\end{comment}

% Deep learning models for cardiac diagnostics and prognostics are often trained within a single modality, limiting their ability to capture physiologically meaningful cross-modal structure. We instead focus on leveraging non-gated, non-contrast chest computed tomography (CT) scans, typically acquired for non-cardiac indications for opportunistic cardiac risk assessment. 
Multimodal medical data offer an opportunity to learn general-purpose representations for cardiovascular diagnosis. We introduce X-Cardia, a cardiac phenotype-guided multimodal framework that uses structured data as intermediate supervision during pre-training. X-Cardia learns to extract cardiac information from non-contrast, non-gated chest CT scans by aligning CT features with tabular measurements derived from echocardiography (ECHO) and electrocardiography (ECG).
Our method combines CLIP-style contrastive pre-training with a non-parametric Nadaraya--Watson (NW) prediction head that enforces phenotype-level similarity via exemplar-based alignment. Pre-training on 20,574 patients, followed by fine-tuning on ten cardiac abnormality prediction tasks, yields substantial performance gains. X-Cardia improves AUROC by up to 8\% on the held-out test set and delivers an average 11.8\% AUROC improvement in a 5-shot regime.
These results demonstrate that explicit phenotype alignment produces interpretable, data-efficient representations and enables routine chest CT to support opportunistic cardiac screening. Code is available at: \href{https://github.com/sumona00/X-Cardia}{\texttt{https://github.com/sumona00/X-Cardia}}.

\end{abstract}

\begin{keywords}
Multimodal Alignment, Phenotype-Guided, Nadaraya--Watson head, Chest CT, Echocardiography, Electrocardiography.
\end{keywords}

\section{Introduction}
Modern cardiac diagnostics routinely use volumetric scans such as computed tomography (CT), together with structured clinical measurements (e.g., chamber dimensions, pressure estimates) derived from other modalities, such as echocardiography (ECHO) and electrocardiography (ECG)~\cite{ota2001real, jenkins2009left}. These modalities provide complementary information: imaging captures morphological patterns, while tabular measurements encode expert-derived phenotypes~\cite{henry1980echocardiographic,lau2023deep,ghorbani2020deep,castrejon2016learning}. Despite this, current deep learning pipelines typically operate on a single modality or use simple late fusion, leaving the rich semantic correspondence between modalities largely untapped. This limits robustness and underutilizes valuable clinical structure.
Routine non-gated, non-contrast chest CT scans represent an especially challenging but impactful setting for opportunistic cardiac screening. These studies are acquired for non-cardiac indications, lack cardiac gating, and attenuate cardiovascular detail---making cardiac interpretation difficult even for experts. Yet many clinically meaningful phenotypes, such as chamber dilation or valvular dysfunction, manifest jointly as geometric patterns on CT and quantitative deviations in structured measurements. Leveraging these natural correspondences could enable models to extract cardiac information from routine thoracic imaging.

Cross-modal alignment~\cite{wang2022multi,jiang2023cross} provides a mechanism to learn such shared semantic structure. Aligning imaging and tabular representations allows gradients from cleaner, more structured phenotypes to regularize the image encoder, guiding it toward physiologically grounded features that generalize across tasks and modalities. Prior multimodal contrastive learning (MMCL)~\cite{yuan2021multimodal,radford2021learning} approaches have demonstrated the promise of contrastive alignment, but often produce global, mixed embeddings that offer limited interpretability and weak few-shot generalization. Moreover, they rarely enforce per-phenotype similarity, which is crucial in cardiology where findings are sparse and modalities may disagree.

These challenges are amplified when the goal is to infer cardiac status from routine chest CT. Without explicit phenotype guidance, models may rely on shortcut features or collapse toward modality-specific biases, especially in data-scarce settings. Effective cardiac--non-cardiac transfer therefore requires an approach that (i) tightly couples CT and structured measurements, (ii) grounds representations in clinically meaningful phenotypes, and (iii) supports exemplar-based reasoning.
In this work, we introduce X-Cardia, a cardiac phenotype--guided multimodal framework for aligning chest CT with ECHO and ECG-derived measurements. X-Cardia integrates a CLIP-style contrastive objective with a non-parametric Nadaraya--Watson (NW) head~\cite{cai2001weighted,wang2023learning,wang2022flexible} that enforces phenotype-level similarity via a support bank of exemplar embeddings. This hybrid objective produces interpretable, data-efficient representations that transfer effectively to downstream cardiac prediction tasks on non-gated chest CT.

We pre-train on a large cohort of 20,574 patients and fine-tune the CT encoder on ten clinically relevant abnormalities derived from ECHO. X-Cardia substantially improves performance over strong baselines---including standard multimodal contrastive learning---achieving AUROC gains of up to 8\% in full-data settings and an average improvement of 11.8\% in the 5-shot regime. These results indicate that explicit phenotype alignment is key to unlocking cardiac information from routine CT and enabling scalable opportunistic screening.
\begin{comment}
Pre-training X-Cardia on multimodal image--tabular data from 20{,}574 patients and fine-tuning the image encoder on ten ECHO-derived abnormalities markedly improves performance over strong baselines, showing that phenotype-aligned pre-training can unlock cardiac information from routine CT for opportunistic screening.
\end{comment}

% Despite recent progress in multimodal contrastive learning (MMCL)~\cite{yuan2021multimodal,radford2021learning}, existing approaches often produce global, mixed representations that offer limited interpretability and weak support for few-shot generalization. Furthermore, they do not explicitly encourage per- clinical phenotype similarity that can transfer across tasks or modalities. These limitations are particularly challenging in cardiology, where clinically important findings are rare and modalities occasionally disagree, leading models to find out shortcut features and exhibit failure modes. These issues are further compounded when attempting to infer cardiac status from routine non-gated, non-contrast chest CTs. We target this challenging setting of non-gated, non-contrast chest CTs acquired for non-cardiac indications, transferring cardiac phenotypes into this domain to enable opportunistic cardiovascular screening from routine thoracic imaging.

% To address these challenges, we propose a cardiac to non-cardiac aligned representation learning framework that integrates: (i) cross-modal contrastive objectives (CLIP-style) to align volumetric and tabular embeddings, and (ii) a Nadaraya--Watson head that enforces per-clinical phenotype similarity using a support bank of exemplar embeddings, enabling exemplar-based reasoning. Together, these components yield phenotype-guided, interpretable representations that are data-efficient in low-shot regimes, and aligned with clinically meaningful semantic labels.
\section{Related Works}

\subsection{Learning with Tabular Data}
Conventional tabular models such as XGBoost~\cite{chen2016xgboost} and LightGBM~\cite{ke2017lightgbm} remain strong baselines, often outperforming early neural networks~\cite{shwartz2022tabular}. Recent attention-based architectures better capture feature dependencies: TabTransformer~\cite{huang2020tabtransformer} introduces contextual embeddings, and TabNet~\cite{arik2021tabnet} applies sequential attention for feature selection. Foundation models like TabPFN~\cite{hollmann2022tabpfn} and TabLLM~\cite{hegselmann2023tabllm} enable few-shot reasoning through meta-learning or language-model serialization, producing transferable structured embeddings.

In medical applications, tabular representations offer complementary physiological information. Prior work has shown that contrastively coupling image and tabular encoders can enhance unimodal performance~\cite{hager2023best}. Jiang et al.~\cite{jiang2024tabular,jiang2024transferring} extend this idea by aligning visual feature channels with clinical phenotypes using optimal transport and mutual information. These methods highlight the value of tabular data as a source of structured, expert-derived signals for guiding representation learning.

\subsection{Cross-Modal Transfer}
Cross-modal alignment learns a shared embedding space. For example, vision--language models such as CLIP~\cite{radford2021learning}, ALBEF~\cite{li2021align}, IRRA~\cite{jiang2023cross}, CUSA~\cite{huang2024cross}, and UNITER~\cite{chen2019uniter} use contrastive or masked objectives to link image and text representations. Liang \emph{et al.}~\cite{liang2022mind} revealed a persistent modality gap formed by each encoder due to initialization and temperature dynamics. 
%Other metadata-enhanced approaches (e.g., EXIF as a language-like modality) show that non-image channels can act as useful alignment signals~\cite{zheng2023exif}.

In medical imaging, multimodal contrastive methods such as MMCL~\cite{yuan2021multimodal, hager2023best}, SimCLR-based approaches~\cite{chen2020simple, tang2020exploring}, and clinically grounded frameworks like CHARMS~\cite{jiang2024tabular,jiang2024transferring}  demonstrate that structured signals can regularize visual encoders and improve data efficiency. The modality-focusing hypothesis~\cite{xue2022modality} further suggests that cross-modal transfer succeeds when modalities share causal, modality-general features. Recent work on large vision--language models~\cite{li2025survey} underscores the importance of aligning both latent spaces and model behavior.

% For medical imaging, CHARMS~\cite{jiang2024tabular,jiang2024transferring} and contrastive learning based on SimCLR and SCARF~\cite{hager2023best} exemplify embedding-based alignment that maps structured and visual data into a joint space through contrastive or transport-based losses. The modality-focusing hypothesis~\cite{xue2022modality} argues that cross-modal transfer succeeds when modalities share causal and modality-general features. Recent works on large vision--language models further highlights the importance of aligning both latent representations and model behavior~\cite{li2025survey}.  

%Motivated by these insights, our work performs cardiac-to-non-cardiac transfer by aligning chest CT with ECHO- and ECG-derived phenotypes using a contrastive objective coupled with phenotype-level supervision. This explicit guidance helps reduce modality gaps and supports more interpretable and data-efficient multimodal representations.
Motivated by these findings, we perform cardiac-to-non-cardiac transfer by aligning chest CT with ECHO- and ECG-derived phenotypes via a contrastive, phenotype-supervised objective, reducing modality gaps and yielding more data-efficient multimodal representations.

% Motivated by these insights, we perform cardiac to non-cardiac modality transfer by pre-training on paired cardiac images and tabular features, aligning modalities through phenotype-guided contrastive learning objectives. The alignment of the grounding with tabular features helps mitigate the effects of the modality gap and supports stronger and interpretable multimodal learning.

\section{Methodology}\label{sec:methods}
% An overview of the proposed cross-modal pre-training and fine-tuning framework is shown in Figure~\ref{fig:overview}.
We start with the basic setup in Sec.~\ref{sec:problem_setup}, followed by the encoders and representation fusion in Sec.~\ref{sec:encoder_fusion}.  We then describe the Nadaraya--Watson head in Sec.~\ref{sec:nadaraya_watson_head}, the support bank construction in Sec.~\ref{sec:support_bank_construction}, and the cross-modal alignment in Sec.~\ref{sec:cross_modal_alignment}. The training and evaluation details are described in Sec.~\ref{sec:train_eval}. Finally, we present the supervised fine-tuning on cardiac binary targets in Sec.~\ref{sec:supervised_fine_tuning}. An overview of the proposed cross-modal pre-training and fine-tuning framework is shown in Figure~\ref{fig:overview}.
\begin{figure*}[h!]
  \centering
  \includegraphics[width=\textwidth]{Multimodal_Overview_F.pdf}
  \caption{
Overview of the proposed X-Cardia framework.
During pre-training (left), multimodal alignment is learned across chest CT and tabular features (ECHO and ECG) using a CLIP-style contrastive loss $\mathcal{L}_{\mathrm{clip}}$ and cardiac supervision via a Nadaraya--Watson (NW) head with loss $\mathcal{L}_{\mathrm{nw}}$. The flame icon denotes learnable components, whereas the snowflake icon denotes frozen ones.
During fine-tuning (right), only the last layer of the pre-trained CT encoder is optimized to predict cardiac abnormalities from non-cardiac chest CTs, leveraging the aligned and phenotype-guided latent space obtained during pre-training. %\fengbei{Change "Transformer encoder" to "Tabular encoder" to maintain consistency.}
}
   \label{fig:overview}
\end{figure*}
\subsection{Problem Setup}
\label{sec:problem_setup}
We denote our multimodal training dataset as $\mathcal{D} = \{(\mathbf{x}^{\mathrm{img}}_i, \mathbf{x}^{\mathrm{tab}}_i, \mathbf{y}_i)\}^{|\mathcal{D}|}_{i=1}$, where $\mathbf{x}^{\mathrm{img}}_i \in \mathcal{X}^{\mathrm{img}} \subset \mathbb{R}^{D \times H \times W}$ is a 3D chest CT volume with $D$ slices of height $H$ and width $W$, $\mathbf{x}^{\mathrm{tab}}_i \in \mathcal{X}^{\mathrm{tab}} \subset \mathbb{R}^{F}$ is the structured tabular feature vector derived from ECHO and ECG with $F$ features, and $\mathbf{y}_i \in \{0, 1\}^{C}$ is the multi-hot label vector over $C$ classes.


% Our multimodal input comprises 3D chest CT volumes $x^{\mathrm{img}}\in\mathbb{R}^{1\times D\times H\times W}$ combined with structured echocardiographic morphometric features and electrocardiographic electrical measurements, represented as $x^{\mathrm{tab}}\in\mathbb{R}^{F}$. We derive $C$ binary phenotype labels $y\in[0,1]^C$ corresponding to structural echocardiographic findings defined by clinically meaningful normal ranges. Categorical phenotypes are label-encoded, and continuous features are imputed using $k$-nearest neighbors ($k{=}5$). For each dimension of the phenotype $f$ with reference interval $[l_f, h_f]$, we define phenotype label $y_f$ with indictor function $\mathbbm{1}$:
% \begin{equation}
% y_f = \mathbbm{1}\big[x^{\mathrm{tab}}_f \notin [l_f, h_f]\big].
% \end{equation}

\subsection{Encoders and Representation Fusion}
\label{sec:encoder_fusion}
\paragraph{Image encoder.}

\begin{comment}
A 3D ResNet--50~\cite{he2016deep} backbone is used for volumetric feature extraction, with the classification head removed. For a batch of size $B$, the final convolutional feature tensor is,
\[
\Phi(x^{\mathrm{img}})\!\in\!\mathbb{R}^{B\times2048\times D'\times H'\times W'}.
\]
Global average pooling (GAP) over $(D',H',W')$ produces compact image features 
$\mathbf{f}^{\mathrm{img}}\!\in\!\mathbb{R}^{B\times2048}$, which are linearly reduced to 
$\mathbf{h}^{\mathrm{img}}\!\in\!\mathbb{R}^{B\times128}$ and passed through a two-layer projection 
head to yield the final embedding 
$\mathbf{e}^{\mathrm{img}}\!\in\!\mathbb{R}^{B\times256}$.
\end{comment}
We denote the image encoder (3D ResNet-50~\cite{he2016deep, hara2017learning, wang2018non}) as $f_\theta\colon \mathbb{R}^{D \times H \times W} \to \mathbb{R}^{d}$, which maps $\mathbf{x}^{\mathrm{img}}_i$ to a $d$-dimensional embedding:
\begin{equation}
  \mathbf{e}^{\mathrm{img}}_i = f_\theta(\mathbf{x}^{\mathrm{img}}_i)
\end{equation}
where $\theta$ represents the learnable parameters of the image encoder and $\mathbf{e}^{\mathrm{img}}_i \in \mathbb{R}^{d}$ is the image embedding obtained after global average pooling (GAP) and an MLP head.



% We use a 3D ResNet--50~\cite{he2016deep} backbone as the image encoder. Let 
% \(\Phi_{\mathrm{img}}\) denote this encoder, mapping a batch 
% of input volumes, \(x^{\mathrm{img}}\), to convolutional feature maps,
% \[
% \Phi_{\mathrm{img}}(x^{\mathrm{img}}) \in \mathbb{R}^{B \times C \times D' \times H' \times W'}.
% \]
% Global average pooling over the spatial dimensions \((D', H', W')\) yields image-level feature
% vectors \(\mathbf{f}^{\mathrm{img}} \in \mathbb{R}^{B \times d_f}\), where \(d_f\) is the feature
% dimension. These features are passed through a linear layer to obtain 
% \(\mathbf{h}^{\mathrm{img}} \in \mathbb{R}^{B \times d_h}\), and then a two-layer projection head
% produces the final image embeddings 
% \(\mathbf{e}^{\mathrm{img}} \in \mathbb{R}^{B \times d_e}\), where \(d_h\) and \(d_e\) denote the
% hidden and embedding dimensions, respectively. 

\paragraph{Tabular encoder.}
We adopt an FT-Transformer-based architecture~\cite{gorishniy2021revisiting}. We define a tokenizer $h_\psi$ and an encoder $g_\phi$ that together map the input $\mathbf{x}^{\mathrm{tab}}_i \in \mathbb{R}^{F}$ to an embedding $\mathbf{e}^{\mathrm{tab}}_i \in \mathbb{R}^{d}$. The tokenizer $h_\psi$ projects each scalar feature into a token embedding, yielding a sequence of embeddings. The encoder $g_\phi$ processes this sequence via Transformer layers, aggregates the output via GAP, and applies a final projection head:
\begin{equation}
  \mathbf{e}^{\mathrm{tab}}_i = g_\phi\big(h_\psi(\mathbf{x}^{\mathrm{tab}}_i)\big),
\end{equation}
where $\psi$ and $\phi$ are learnable parameters.
% The structured feature vectors, \(x^{\mathrm{tab}} \in \mathbb{R}^{B \times F}\), where \(F\) denotes the number of tabular features, are first mapped
% through a linear layer into a sequence of token embeddings, which are processed by a Transformer
% encoder. Mean pooling across the token dimension yields tabular feature vectors
% \(\mathbf{f}^{\mathrm{tab}} \in \mathbb{R}^{B \times d_f^{\mathrm{tab}}}\). These features are
% normalized and passed through a lightweight linear head to obtain the final tabular embeddings
% \(\mathbf{e}^{\mathrm{tab}} \in \mathbb{R}^{B \times d_e^{\mathrm{tab}}}\).
%In our implementation, we use a 3-layer, 8-head Transformer with feed-forward width 1024, and set \(d_f^{\mathrm{tab}} = d_e^{\mathrm{tab}} = 256\).

\paragraph{Embedding fusion.}
We fuse $\mathbf{e}^{\mathrm{img}}_i$ and $\mathbf{e}^{\mathrm{tab}}_i$ to obtain a shared representation for phenotype prediction. We use sum fusion because it allows balanced optimization of both encoders and performs comparably to more complex strategies (see the analysis in Appendix~\ref{sec:embedding}). The fusion operation is defined as
\begin{equation}
  \mathbf{z}_i = \frac{\mathbf{e}^{\mathrm{img}}_i \oplus \mathbf{e}^{\mathrm{tab}}_i}{\left\lVert \mathbf{e}^{\mathrm{img}}_i \oplus \mathbf{e}^{\mathrm{tab}}_i \right\rVert_2}
  \label{eq:fused_representation}
\end{equation}
where $\oplus$ denotes element-wise summation and $\left\lVert \cdot \right\rVert_2$ is $\ell_2$ norm.

% Both modality embeddings occupy the same latent space and are fused by summation followed by $\ell_2$ normalization \cite{hsieh2023mdf}:
% \begin{equation}
% \mathbf{z} = 
% \frac{\mathbf{e}^{\mathrm{img}}+\mathbf{e}^{\mathrm{tab}}}
% {\left\lVert \mathbf{e}^{\mathrm{img}}+\mathbf{e}^{\mathrm{tab}}\right\rVert_2},
% \qquad 
% \mathbf{z}\!\in\!\mathbb{R}^{B\times256}.
% \end{equation}
% The fused representation $\mathbf{z}$ serves as a shared embedding for phenotype prediction.

\subsection{Non-Parametric Phenotype Prediction (Nadaraya--Watson Head)}
\label{sec:nadaraya_watson_head}
\begin{comment}
We adopt a Nadaraya--Watson (NW) estimator~\cite{cai2001weighted,wang2023learning,wang2022flexible} built on the fused embedding, $\mathbf{z}$ for phenotype prediction. 
The NW head stores a support bank of normalized embeddings paired with multi-hot phenotype labels and computes temperature-scaled similarities to derive attention weights over supports. 
Each query embedding is predicted as a weighted average of support labels, yielding $\hat{\mathbf{p}}(\mathbf{z})\!\in\![0,1]^C$, where the temperature $\tau_{\mathrm{nw}}{=}1.0$ controls similarity sharpness. 
Predictions are trained using a numerically stabilized binary cross-entropy loss, encouraging smooth, non-parametric phenotype inference without additional classifier parameters.
\end{comment}
We adopt a Nadaraya--Watson (NW) estimator~\cite{cai2001weighted,wang2023learning,wang2022flexible} on the fused embedding $\mathbf{z}_i$ for phenotype prediction. 
The NW head is non-parametric and has no learnable parameters; it uses a support bank $\mathcal{D}_{sup} = \{(\hat{\mathbf{z}}_k, \hat{\mathbf{y}}_k)\}^{|\mathcal{D}_{sup}|}_{k=1}$,
% of $N_s$ support examples
% $\{(\mathbf{s}_k, \mathbf{y}_k)\}_{k=1}^{N_s}$
where $\hat{\mathbf{z}}_k$ is the fused embedding and $\hat{\mathbf{y}}_k$ is the multi-hot phenotype label of support sample $k$.  
% $\mathbf{s}_k\in\mathbb{R}^d$ is an $\ell_2$-normalized fused embedding and $\mathbf{y}_k\in\{0,1\}^C$ is a multi-hot phenotype vector indicating which cardiac phenotypes are present. 
Given a query embedding $\mathbf{z}_i$, we first compute temperature-scaled, softmax-normalized similarities to the support embeddings,
  \begin{equation}
    \alpha_k({\mathbf{z}_i}) = \frac{\exp\big( (\mathbf{z}_i \cdot \hat{\mathbf{z}}_k) / \tau_{\mathrm{nw}} \big)}{ \sum_{j=1}^{|\mathcal{D}_{sup}|} \exp\big( (\mathbf{z}_i \cdot \hat{\mathbf{z}}_j) / \tau_{\mathrm{nw}}  \big) },
  \end{equation}
% \begin{equation}
% \alpha_k(\mathbf{z}) 
% = 
% \frac{\exp\big(\langle \tilde{\mathbf{z}}, \tilde{\mathbf{s}}_k\rangle / \tau_{\mathrm{nw}}\big)}
%      {\sum_{j=1}^{N_s} \exp\big(\langle \tilde{\mathbf{z}}, \tilde{\mathbf{s}}_j\rangle / \tau_{\mathrm{nw}}\big)},
% \quad
% \tilde{\mathbf{z}} = \frac{\mathbf{z}}{\lVert \mathbf{z}\rVert_2},\;
% \tilde{\mathbf{s}}_k = \frac{\mathbf{s}_k}{\lVert \mathbf{s}_k\rVert_2},
% \end{equation}
where $\tau_{\mathrm{nw}}$ is a temperature hyperparameter controlling the sharpness of the attention distribution over the support bank, and $\alpha_k({\mathbf{z}_i})$ denotes the weight assigned to the $k$-th support example for query embedding $\mathbf{z}_i$. 
The predicted phenotype probabilities are then given by:
\begin{equation}
\hat{\mathbf{p}}_i = \sum_{k=1}^{|\mathcal{D}_{sup}|} \alpha_k(\mathbf{z}_i)\, \hat{\mathbf{y}}_k,
\end{equation}
where $\hat{\mathbf{p}}_i$ is the $C$-dimensional vector of phenotype probabilities for query $\mathbf{z}_i$, which can be viewed as a weighted average of the support-label vectors in phenotype space.
We supervise the NW predictions with a binary cross-entropy (BCE) loss $\mathcal{L}_{\mathrm{nw}}$, applied to all cardiac phenotypes and averaged over the training set; since the NW head itself has no learnable parameters, this loss shapes the embeddings produced by the encoders.
% \begin{equation}
% \mathcal{L}_{\mathrm{nw}} = - \frac{1}{|\mathcal{D}|} \sum_{i=1}^{|\mathcal{D}|} \sum_{c=1}^{C} \Big[ \mathbbm{1}(\mathbf{y}_{i,c}=1) \log \big( \hat{\mathbf{p}}_{i,c} \big) + \mathbbm{1}(\mathbf{y}_{i,c}=0) \log \big( 1 - \hat{\mathbf{p}}_{i,c} \big) \Big],
% \end{equation}
% where $\hat{\mathbf{p}}_{i,c}$ represents the predicted probability of the $c$-th phenotype for the $i$-th sample, and $\mathbbm{1}(\cdot)$ is the indicator function.


% At training time, the NW head is queried with the current fused embeddings and produces phenotype probabilities $\hat{\mathbf{p}}(\mathbf{z}_i)$ for each sample $i$. We train the model using binary cross-entropy loss, $\mathcal{L}_{\mathrm{nw}}$ applied to all cardiac phenotypes.

\begin{comment}
\begin{equation}
\mathcal{L}_{\mathrm{nw}}
=
-\frac{1}{BC}\sum_{i=1}^{B}\sum_{c=1}^{C}
\big[
y_{ic}\, \log(\hat{p}_{ic})
+
(1-y_{ic})\, \log(1-\hat{p}_{ic})
\big],
\end{equation}
where $B$ is the batch size, $C$ is the number of phenotypes, $\mathbf{y}_i\in\{0,1\}^C$ are ground-truth phenotypes, $\hat{\mathbf{p}}_i$ are NW predictions.
\end{comment}
\subsection{Support Bank Construction}
\label{sec:support_bank_construction}
%\fengbei{TBD}
We maintain a non-parametric support bank for Nadaraya--Watson inference on embeddings. At initialization, all training samples are encoded and up to $K$ \emph{positive} examples are selected per phenotype: if a phenotype has fewer than $K$ positives, we keep all of them; otherwise, we run $k$-means with $K$ clusters on its positive embeddings and retain the sample closest to each centroid. The resulting normalized support embeddings are stored in the NW head and used to compute cosine similarities between query and support embeddings. The bank is rebuilt every $M$ epochs to track the evolving representation.

\subsection{Cross-Modal Alignment}
\label{sec:cross_modal_alignment}
\begin{comment}
To align modalities in a shared representation space, we adopt a symmetric contrastive objective following the CLIP formulation~\cite{radford2021learning}. 
For a batch of $B$ paired samples, let 
$E_{\mathrm{img}}, E_{\mathrm{tab}}\!\in\!\mathbb{R}^{B\times d}$ 
be the $\ell_2$-normalized image and tabular embedding matrices. 
We compute the similarity matrix 
$S = E_{\mathrm{img}} E_{\mathrm{tab}}^{\top} / \tau$, 
where $\tau{=}0.07$ is a fixed temperature. 
The cross-modal loss is defined as
\begin{equation}
\mathcal{L}_{\mathrm{clip}}
=
-\tfrac{1}{2B}\!\sum_{i=1}^{B}
\!\left[
\log\frac{\exp(S_{ii})}{\sum_{j=1}^{B}\exp(S_{ij})}
+
\log\frac{\exp(S_{ii})}{\sum_{j=1}^{B}\exp(S_{ji})}
\right].
\end{equation}
This objective maximizes similarity between each image--tabular pair while minimizing similarity to non-matching samples within the batch.
\end{comment}
To align image and tabular modalities in a shared embedding space, we employ a CLIP-style symmetric contrastive loss~\cite{oord2018representation,  radford2021learning, hager2023best}. 
% For a batch of \(B\)
% paired samples, let \(\mathbf{e}^{\mathrm{img}}, \mathbf{e}^{\mathrm{tab}} \in \mathbb{R}^{B \times d}\)
% denote the \(\ell_2\)-normalized image and tabular embeddings. 
We define scaled similarities as $s_{ij} = (\mathbf{e}^{\mathrm{img}}_{i} \cdot \mathbf{e}^{\mathrm{tab}}_{j} ) / \tau_{\mathrm{clip}}$, where $\tau_{\mathrm{clip}}$ is a temperature hyperparameter controlling the sharpness of the similarity distribution.
% \[
% s_{ij} = \frac{\langle \mathbf{e}^{\mathrm{img}}_{i}, \mathbf{e}^{\mathrm{tab}}_{j} \rangle}{\tau},
% \]
The cross-modal loss is
% \begin{equation}
% \mathcal{L}_{\mathrm{clip}}
% =
% -\frac{1}{2B}\sum_{i=1}^{B}
% \left[
% \log\frac{\exp(s_{ii})}{\sum_{j=1}^{B}\exp(s_{ij})}
% +
% \log\frac{\exp(s_{ii})}{\sum_{j=1}^{B}\exp(s_{ji})}
% \right],
% \end{equation}
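% First term: image-to-tabular direction (image i fixed, normalize over tabular samples j).
% Second term: tabular-to-image direction (tabular sample j fixed, normalize over images i),
% hence the denominator uses s_{ij}, not s_{ji}.
\begin{equation}
    \begin{split}
      \mathcal{L}_{\mathrm{clip}}= -\frac{1}{2|\mathcal{D}|}\left[ \sum_{i=1}^{|\mathcal{D}|}\log \frac{\exp(s_{ii})}{\sum_{j=1}^{|\mathcal{D}|}\exp(s_{ij})}+ \sum_{j=1}^{|\mathcal{D}|}\log \frac{\exp(s_{jj})}{\sum_{i=1}^{|\mathcal{D}|}\exp(s_{ij})}\right],
    \end{split}
    \label{eq:loss_clip}
  \end{equation}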
which encourages each matched image--tabular pair (the diagonal terms \(s_{ii}\)) to have
higher similarity than all non-matching pairs; in practice, the sums are evaluated over the samples within each mini-batch.

\subsection{Training and Evaluation}
\label{sec:train_eval}
The overall objective combines cross-modal alignment and phenotype supervision as:
\begin{equation}
\mathcal{L}_{\mathrm{total}}
=
\mathcal{L}_{\mathrm{clip}}
+
\lambda_{\mathrm{nw}}\,\mathcal{L}_{\mathrm{nw}}.
\label{eq:total_loss}
\end{equation}

The weighting term $\lambda_{\mathrm{nw}}$ is tuned to balance alignment and phenotype learning. Training uses AdamW with a cosine-annealed learning rate. Optimization runs for up to 100 epochs with early stopping. In all comparison studies, MMCL denotes our model (X-Cardia) without the NW head, trained using only the $\mathcal{L}_{\mathrm{clip}}$ loss.

\subsection{Supervised Fine-Tuning on Cardiac Binary Targets}
\label{sec:supervised_fine_tuning}
%We fine-tuned the pre-trained CT encoder (3D ResNet--50~\cite{he2016deep, wang2018non, hara2017learning}) on the labeled cohort to predict cardiac abnormalities directly from non-cardiac chest CT scans. The model outputs ten binary targets capturing key structural and functional phenotypes (e.g., reduced ejection fraction, increased wall thickness, valvular disease). To exploit the pre-trained alignment between cardiac and non-cardiac representations, we froze the lower convolutional blocks and jointly optimized the remaining layers and task-specific classification heads under a multi-task objective. Training used a masked binary cross-entropy loss to accommodate missing labels, and performance was reported using AUROC across all cardiac targets.

We fine-tuned the pre-trained image encoder (3D ResNet-50~\cite{he2016deep, wang2018non, hara2017learning}) on the labeled cohort to predict ten binary cardiac phenotypes directly from non-cardiac chest CT. To leverage the learned alignment, we froze the lower convolutional blocks and jointly optimized the remaining layers and task-specific heads with a multi-task, masked binary cross-entropy loss, reporting AUROC for all targets.
\begin{comment}
\begin{table}[hb!]
\centering
\small
\caption{Performance (AUROC) by label on the test set under different training strategies (\textbf{bolded} values indicate best result; \underline{underlined} values indicate second-best).}
\label{tab:main_results}
\setlength{\tabcolsep}{3pt}
\begin{tabular}{>{\raggedright\arraybackslash}p{0.3\textwidth} *{7}{c}}
\toprule
Label & No pre-training & MMCL & SimCLR & NW+MMCL\\
\midrule
LVEF $\leq$ 45\% & \underline{0.76} & 0.75 & 0.60  & \textbf{0.77} \\
LVWT $\geq$ 13 flag & 0.67 & \underline{0.69} & 0.62   & \textbf{0.72}\\
Aortic Stenosis & 0.73 & \underline{0.76} & 0.54 &  \textbf{0.79} \\
Aortic Regurgitation & \underline{0.66} & \underline{0.66} & 0.61 & \textbf{0.74} \\
Mitral Regurgitation & \textbf{0.76} & 0.73 & 0.61  & \underline{0.75} \\
Tricuspid Regurgitation & \underline{0.74} & 0.72 & 0.65  & \textbf{0.75}\\
Pulmonary Regurgitation & 0.73 & \textbf{0.80} & 0.57  & \underline{0.79} \\
PASP $\geq$ 45 flag & \textbf{0.70} & \underline{0.67} & 0.63  & \textbf{0.70}\\
TR$_{\max}$ $\geq$ 32 flag & \textbf{0.69} & \underline{0.66} & \underline{0.66} & \textbf{0.69} \\
SHD flag & \textbf{0.74} & \textbf{0.74} & \underline{0.63}  & \textbf{0.74} \\
\bottomrule
\end{tabular}
\end{table}
\end{comment}
\begin{table}[hb!]
\centering
\small
\caption{Performance (AUROC; mean $\pm$ standard deviation) by label on the test set under different training strategies (\textbf{bolded} values indicate best result; \underline{underlined} values indicate second-best).}
\label{tab:main_results}
\setlength{\tabcolsep}{3pt}
\begin{tabular}{>{\raggedright\arraybackslash}p{0.3\textwidth} *{7}{c}}
\toprule
Label & No pre-training & MMCL & SimCLR & NW+MMCL\\
\midrule
LVEF $\leq$ 45\% & $\underline{0.73} \pm 0.036$ & $\underline{0.73} \pm 0.027$ & $0.58 \pm 0.023$  & $\textbf{0.76} \pm 0.005$\\
LVWT $\geq$ 13 flag & $0.65 \pm 0.015$ & $\underline{0.67} \pm 0.014$ & $0.60 \pm 0.012$   & $\textbf{0.71} \pm 0.009$\\
Aortic Stenosis & $0.70 \pm 0.032$ & $\underline{0.74} \pm 0.022$ & $0.55 \pm 0.020$ &  $\textbf{0.85} \pm 0.042$ \\
Aortic Regurgitation & $0.63 \pm 0.020$ & $\underline{0.64} \pm 0.018$ & $0.58 \pm 0.023$ & $\textbf{0.72} \pm 0.015$ \\
Mitral Regurgitation & $\textbf{0.73} \pm 0.032$& $\underline{0.72} \pm 0.014$ & $0.57 \pm 0.025$ & $\underline{0.72} \pm 0.017$ \\
Tricuspid Regurgitation & $\textbf{0.72} \pm 0.015$ & $\underline{0.70} \pm 0.017$ & $0.62 \pm 0.020$ & $\textbf{0.72} \pm 0.021$\\
Pulmonary Regurgitation & $0.69 \pm 0.032$ & $\underline{0.76} \pm 0.028$ & $0.57 \pm 0.008$  & $\textbf{0.77} \pm 0.027$\\
PASP $\geq$ 45 flag & $\textbf{0.68} \pm 0.015$ & $\underline{0.66} \pm 0.005$ & $0.61 \pm 0.019$ & $\textbf{0.68} \pm 0.017$\\
TR$_{\max}$ $\geq$ 32 flag & $\textbf{0.67} \pm 0.020$ & $0.63 \pm 0.022$ & $0.62 \pm 0.027$ & $\underline{0.66} \pm 0.018$ \\
SHD flag & $0.70 \pm 0.032$ & $\underline{0.73} \pm 0.009$ & $0.62 \pm 0.022$  & $\textbf{0.76} \pm 0.014$ \\
\textbf{Average} & $0.69 \pm 0.022$ & $\underline{0.70} \pm 0.012$ & $0.59 \pm 0.012$  & $\textbf{0.73} \pm 0.007$ \\
\bottomrule
\end{tabular}
\end{table}
\section{Experiments and Results}
\subsection{Dataset Overview}
\begin{comment}
We assembled a large-scale multimodal cardiac imaging dataset comprising non-gated chest CT volumes, ECHO, ECG, and patient demographic data collected from Columbia University Irving Medical Center (CUIMC) and Weill Cornell Medicine (WCM). 
Each CT study was temporally matched to its corresponding ECHO and ECG examinations using unique patient identifiers, ensuring multimodal consistency within each clinical encounter.
The pre-training cohort included 20,574 patients, each with non-contrast chest CT, ECHO study, and 12-lead ECG acquired within six months of the CT scan. 
The dataset was randomly divided at the patient level into 16,459 for training and 4,115 for validation (80/20 split). 
This paired multimodal dataset enabled pre-training of the cross-modal alignment and phenotype prediction framework described in Section~\ref{sec:methods}.
For supervised downstream training, we curated a separate labeled cohort of 7,553 patients with 16,357 chest CT studies annotated for cardiac structural and functional abnormalities derived from ECHO. 
Ten binary cardiac targets were defined, including reduced left ventricular ejection fraction (LVEF~$\leq$~45\%), increased wall thickness (LVWT~$\geq$~13~mm), valvular stenosis and regurgitation (aortic, mitral, tricuspid, pulmonary), elevated pulmonary artery systolic pressure (PASP~$\geq$~45~mmHg), increased tricuspid regurgitant velocity (TR$_\mathrm{max}$~$\geq$~32~m/s), and structural heart disease (SHD) presence \cite{poterucha2025detecting}. 
The fine-tuning set was split (80/20) into training and validation subsets, and an independent test set of 2,266 patients with 4,861 CT studies was reserved for final evaluation. 
\end{comment}
We assembled a large-scale multimodal cardiac imaging dataset from Columbia University Irving Medical Center and Weill Cornell Medicine comprising non-gated chest CT, ECHO, ECG, and demographic data. CT studies were temporally matched to corresponding ECHO and ECG exams using unique patient identifiers, yielding a pre-training cohort of $20{,}574$ patients with non-contrast chest CT, ECHO, and ECG acquired within six months of the CT scan. This cohort was split at the patient level into $16{,}459/4{,}115$ patients (80/20) for training and validation, and used to pre-train the cross-modal alignment and phenotype prediction framework in Section~\ref{sec:methods}.
For supervised downstream training, we curated a separate labeled cohort of $7{,}553$ patients with $16{,}357$ chest CT studies annotated for cardiac structural and functional abnormalities derived from ECHO. We defined ten binary cardiac targets, including reduced LVEF~($\leq 45\%$), increased LV wall thickness~($\geq 13$~mm), valve stenosis/regurgitation (aortic, mitral, tricuspid, pulmonary), elevated PASP~($\geq 45$~mmHg), increased TR$_\mathrm{max}$~($\geq 32$~m/s), and structural heart disease (SHD) presence~\cite{poterucha2025detecting}. The fine-tuning cohort was split 80/20 into training and validation subsets, with an independent test set of $2{,}266$ patients ($4{,}861$ CT studies) held out for final evaluation.

\subsection{Implementation Details}
Training and validation were conducted using PyTorch on NVIDIA A100 GPUs with mixed-precision optimization. All CT volumes were resampled to $2$ mm isotropic resolution and center-cropped to $164^3$ voxels. Both the image encoder and tabular encoder were randomly initialized (Xavier for all linear layers) and trained end-to-end during pre-training. Tabular features were standardized per column. Missing entries in $\mathbf{x}^{\mathrm{tab}}_i$ were then imputed with a $k$-nearest neighbors ($k = 5$) imputer fitted on the training split and subsequently applied to the validation and test splits. For contrastive pre-training, batch size was set to 12 with temperature $\tau_{\mathrm{clip}}{=} 0.07$. The Nadaraya--Watson head was initialized with $K{=}20$ exemplars per phenotype and refreshed every $M{=}5$ epochs. All reported metrics were computed on held-out patient-level splits to ensure independence across train, validation, and test cohorts.
\subsection{Results}
Table~\ref{tab:main_results} summarizes performance across ten cardiac prediction tasks on the held-out chest CT test set. Our proposed framework (NW+MMCL) outperformed both no-pre-training and standard multimodal contrastive learning approaches~\cite{hager2023best, chen2020simple} on most of the prediction tasks (statistical significance analysis in Appendix~\ref{app:statsig}). Here, the MMCL baseline denotes an ablation of X-Cardia without the NW head.
Our method improved AUROC by $0.03$--$0.15$ over the no-pre-training baseline on six of the ten tasks, matched it on tricuspid regurgitation and PASP~($\geq 45$~mmHg), and was within $0.01$ of it on mitral regurgitation and TR$_\mathrm{max}$~($\geq 32$~m/s). Improvements were also observed relative to standard MMCL, particularly in valvular disease tasks where physiological phenotypes yield clearer cross-modal correspondence.
In the few-shot evaluation (Table~\ref{tab:few_results}), NW+MMCL exhibited the largest relative gains, achieving an average AUROC of $0.68$ versus $0.60$ for MMCL and $0.52$ for SimCLR, i.e., relative improvements of $13.3\%$ and $30.8\%$, respectively. Few-shot gains were most pronounced for Aortic Stenosis and SHD Flag, demonstrating the effectiveness of phenotype-guided pre-training for detecting structural abnormalities. Additional experiments varying the fraction of labeled CT studies used for fine-tuning (Appendix~\ref{sec:traning_size}, Table~\ref{tab:training_fraction_ablation}) show that NW+MMCL consistently outperforms the baseline across all data regimes, highlighting its strong sample efficiency. We also evaluate X-Cardia under zero-, one-, and two-shot supervision to assess representation quality and data efficiency (Appendix~\ref{app:few_results}). Lower absolute performance in the zero-shot setting is expected, as many cardiac phenotypes manifest subtly and are not directly observable on non-gated chest CT. With limited supervision, performance improves substantially, and X-Cardia consistently outperforms other methods across tasks. These results indicate that phenotype-guided multimodal alignment yields CT representations that generalize and adapt effectively with minimal downstream supervision.

\begin{table}[hb!]
\centering
\small
\caption{Test performance (AUROC; mean $\pm$ standard deviation) by label across different training strategies in 5-shot learning (\textbf{bolded} values indicate best result; \underline{underlined} values indicate second-best).}
\label{tab:few_results}
\setlength{\tabcolsep}{3pt}
\begin{tabular}{>{\raggedright\arraybackslash}p{0.3\textwidth} *{7}{c}}
\toprule
Label & No pre-training & MMCL & SimCLR & NW+MMCL\\
\midrule
LVEF $\leq$ 45\% & $0.54 \pm 0.033$ & $\underline{0.64} \pm 0.066$ & $0.51 \pm 0.027$  & $\textbf{0.68} \pm 0.049$\\
LVWT $\geq$ 13 flag & $0.53 \pm 0.019$ & $\underline{0.59} \pm 0.033$ & $0.51 \pm 0.045$   & $\textbf{0.63} \pm 0.039$\\
Aortic Stenosis & $0.54 \pm 0.015$ & $\underline{0.64} \pm 0.026$ & $0.54\pm 0.045$ &  $\textbf{0.86} \pm 0.018$ \\
Aortic Regurgitation & $0.53 \pm 0.025$ & $\underline{0.58} \pm 0.037$ & $0.52 \pm 0.037$ & $\textbf{0.64} \pm 0.049$ \\
Mitral Regurgitation & $0.57 \pm 0.029$& $\underline{0.60} \pm 0.051$ & $0.49 \pm 0.026$ & $\textbf{0.67} \pm 0.005$ \\
Tricuspid Regurgitation & $0.53 \pm 0.025$ & $\underline{0.58} \pm 0.017$ & $0.48 \pm 0.033$ & $\textbf{0.65} \pm 0.022$\\
Pulmonary Regurgitation & $0.61 \pm 0.053$ & $\underline{0.67} \pm 0.029$ & $0.52 \pm 0.037$  & $\textbf{0.69} \pm 0.008$\\
PASP $\geq$ 45 flag & $\underline{0.55} \pm 0.026$ & $\underline{0.55} \pm 0.025$ & $0.50 \pm 0.016$ & $\textbf{0.58} \pm 0.026$\\
TR$_{\max}$ $\geq$ 32 flag & $\underline{0.61} \pm 0.017$ & $0.52 \pm 0.041$ & $0.53 \pm 0.015$ & $\textbf{0.62} \pm 0.017$ \\
SHD flag & $0.59 \pm 0.035$ & $\underline{0.63} \pm 0.046$ & $0.52 \pm 0.041$  & $\textbf{0.75} \pm 0.019$ \\
\textbf{Average} & $0.56 \pm 0.013$ & $\underline{0.60} \pm 0.013$ & $0.52 \pm 0.013$  & $\textbf{0.68} \pm 0.002$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{comment}
\begin{table*}[h!]
\centering
\small
\caption{Test performance (AUROC) by label across different training strategies in 5-shot learning. (\textbf{bolded} values indicate best result; \underline{underlined} values are second-best).}
\label{tab:fewshot_results}
\setlength{\tabcolsep}{2pt}
\begin{tabular}{>{\raggedright\arraybackslash}p{0.3\textwidth} *{7}{c}}
\toprule
Label &  No pre-training & MMCL & SimCLR & NW+MMCL\\
\midrule
LVEF $\leq$ 45\%  & 0.52 & \underline{0.63} & 0.50  & \textbf{0.65}  \\
LVWT $\geq$ 13 flag & 0.53 & \underline{0.61} & 0.51 & \textbf{0.65}  \\
Aortic Stenosis  & 0.54 & \underline{0.63} & 0.51 & \textbf{0.85}\\
Aortic Regurgitation  & 0.54 & \textbf{0.61} & 0.51  & \underline{0.60}  \\
Mitral Regurgitation  & 0.58 & \underline{0.59} & 0.50 & \textbf{0.68} \\
Tricuspid Regurgitation  & 0.56 & \underline{0.59} & 0.44 & \textbf{0.67} \\
Pulmonary Regurgitation  & \underline{0.68} & 0.67 & 0.48 & \textbf{0.70}  \\
PASP $\geq$ 45 flag  & \underline{0.59} & 0.58 & 0.50 & \textbf{0.60} \\
TR$_{\max}$ $\geq$ 32 flag  & \underline{0.63} & 0.50 & 0.53 & \textbf{0.64}  \\
SHD flag  & \underline{0.64} & 0.62 & 0.49 & \textbf{0.74}  \\
\bottomrule
\end{tabular}
\end{table*}
\end{comment}
\begin{table*}[h!]
\centering
\caption{Phenotype prediction performance with NW head versus standard linear head.}
\label{tab:nw_vs_linear}
\small
\setlength{\tabcolsep}{5pt}
\renewcommand{\arraystretch}{0.6}
\begin{tabular}{lcc}
\toprule
Evaluation Metric & NW Head & Linear Head \\
\midrule
AUROC             & 0.66 & 0.63 \\
F1 Score          & 0.60 & 0.55 \\
Cosine Similarity & 0.72 & 0.76 \\
\bottomrule
\end{tabular}
\end{table*}
\subsection{Ablation Study}
We conducted ablations to evaluate
(i) training with the NW head vs.\ a linear head,
(ii) the quality of cross-modal embedding alignment,
(iii) the interpretability of the support-bank representations, and
(iv) pre-training on cardiology vs.\ radiology data.
%(iii) the interpretability of the phenotype labels.

\paragraph{NW Head.}
Empirically, Table~\ref{tab:nw_vs_linear} shows that a learnable linear head on the fused embeddings underperforms the non-parametric, parameter-free NW head on phenotype prediction, even though it achieves similar or slightly higher global cosine similarity. This suggests that the NW objective promotes more discriminative, phenotype-aligned decision boundaries: because the NW head has no learnable parameters, the encoders themselves must adapt to align CT and tabular embeddings, whereas a parametric head can partially bypass one modality by relying more heavily on the cleaner signal. Qualitative Grad-CAM comparisons between the NW head and a linear head further support
this effect, with the NW head producing more focused cardiac attention maps
(Appendix~\ref{sec:nw_gradcam}, Figure~\ref{fig:nw_vs_linear_gradcam_appendix}).
\begin{figure*}[t!]
  \centering
  \includegraphics[width=0.9\textwidth]{PCA_Visual.pdf}
  \caption{
PCA visualization of image (blue) and tabular (orange) modality embeddings under different pre-training strategies. (a), (b), and (c) are SimCLR, MMCL, and NW + MMCL, respectively. SimCLR and MMCL show weak alignment, whereas NW + MMCL enhances alignment by integrating phenotype-level supervision, promoting structured semantic consistency across modalities and reducing modality gaps.
}
  \label{fig:pca}
\end{figure*}
\begin{figure*}[hb!]
  \centering
    \includegraphics[width=0.7\textwidth]{10tasks.pdf}
  \caption{
{Task-specific score distributions for multimodal CT representations using the proposed NW+MMCL-based pre-training.
For each task, we plot the distribution of signed classifier scores for positive (teal, filled) and negative (gray, outlined) cases, where the score denotes the signed distance of CT embeddings to the task-specific linear classifier head. Across tasks, positive cases are consistently right-shifted relative to negatives, indicating task-aligned separation, while overlap reflects clinical heterogeneity and continuous disease severity.
}}
  \label{fig:10tasks}
\end{figure*}
\begin{figure*}[h!]
  \centering
\includegraphics[width=0.6\textwidth]{Gradcam_support.pdf}
  \caption{
\Edited{Qualitative Grad-CAM maps on cardiac CT slices from the final support set after cross-modal pre-training. Each column corresponds to one phenotype: IVS (interventricular septal thickness), LVPW (left ventricular posterior wall thickness), LVD (left ventricular internal diameter in diastole), and LVS (left ventricular internal diameter in systole), and each image shows a different case from the final support set. The pre-trained encoder attends to anatomically relevant cardiac regions, indicating transferable structural priors before downstream fine-tuning.}
}
  \label{fig:gradcam}
\end{figure*}
\paragraph{Quality of embedding alignment.}
PCA visualization (Figure~\ref{fig:pca}) demonstrates that only the NW + MMCL pre-training strategy yields substantial embedding alignment between chest CT and tabular features (ECHO and ECG), forming a coherent cross-modal latent space. In contrast, SimCLR and MMCL exhibit weak or partial alignment, highlighting the necessity of phenotype-level supervision to bridge modality-specific representation gaps. These results support our hypothesis that integrating physiological phenotypes enables more effective cross-modal embedding fusion. To further assess phenotype-level discriminability in the fused multimodal space, we analyze LDA (Linear Discriminant Analysis) score distributions for key cardiac phenotypes (Appendix~\ref{app:kde}). While partial overlap is observed, the consistent separation trends suggest that the fused embedding encodes clinically relevant information despite inherent phenotype variability.
To evaluate whether the shared CT representation encodes task-relevant information, we examine task-specific classifier score distributions (Figure \ref{fig:10tasks}, Appendix \ref{fig:MMCL_10}, and Appendix \ref{fig:SimCLR_10}). Across tasks, positive cases consistently exhibit higher scores than negatives, indicating that the NW+MMCL based pre-trained CT model captures better class-aligned evidence along task-specific directions. Although the distributions overlap, this rightward shift of positives demonstrates meaningful task-specific separation in the learned CT representation.
\paragraph{Interpretability of the support-bank representations.}
%\paragraph{Interpretability of the phenotype representations.}
%Support examples formed coherent and interpretable manifolds in the latent space, including smooth gradients in wall thickness, chamber dilation, and valvular severity.
%Qualitative Grad-CAM maps (Figure~\ref{fig:gradcam}) of the abnormal range of phenotypes reveal that the pre-trained CT encoder attends to cardiac regions, despite the CT scans being non-cardiac and non-gated. This indicates that multimodal alignment transfers cardiac information into the image encoder. Notably, accurate prediction of the chamber diameter measurements, particularly LVD and IVD, remains more challenging than the wall-thickness tasks.
Support examples formed coherent manifolds with smooth phenotype gradients, and Grad-CAM maps (Figure~\ref{fig:gradcam}) show that the pre-trained CT encoder focuses on cardiac regions despite non-cardiac, non-gated scans, indicating successful multimodal transfer of cardiac information, though chamber diameters remain harder to predict than wall thickness. Although samples within the same phenotype class share a common label, perfect intra-class spatial consistency is not expected in this setting. Phenotypes correspond to continuous anatomical measurements whose spatial manifestations vary across patients due to differences in cardiac morphology, disease severity, and imaging plane, particularly in non-gated, non-contrast chest CT. Additional variability arises because the Nadaraya--Watson support bank aggregates multiple representative exemplars per phenotype rather than enforcing a single canonical template, encouraging the model to attend to a range of anatomically plausible regions. Consequently, Grad-CAM visualizations may exhibit heterogeneous yet phenotype-consistent attention patterns within a class, reflecting clinically meaningful variability rather than instability in the learned representation.
\paragraph{Pre-training on cardiology vs.\ radiology.}
We compared our cardiac modality--based pre-training approach against a CT model pretrained on radiology reports using a CLIP-style framework~\cite{hamamci2024developing}. During fine-tuning, the CT encoder was kept frozen while only task-specific heads were optimized. Both models were evaluated on the same ECHO-derived labels using CT volumes as input. As shown in Figure~\ref{fig:bar}, our proposed method (NW+MMCL) consistently outperforms CT-CLIP across all ten downstream tasks, achieving higher AUROC for cardiac labels. The largest performance gains are observed for valvular disease classification, where cardiac-specific multimodal pre-training leveraging ECHO and ECG signals provides substantial improvements over radiology-based pre-training. These results suggest that cardiac-focused multimodal alignment yields more informative and task-relevant representations for ECHO-derived clinical variables than generic CT--report pre-training.
\begin{figure}[tbh!]
  \centering
  \includegraphics[width=0.8\textwidth]{Barplot_auroc.pdf}
  \caption{
AUROC performance of CT-CLIP and NW+MMCL across ten cardiovascular prediction tasks on the test set. NW+MMCL consistently outperforms CT-CLIP across all tasks, with particularly large gains observed for valvular disease classification tasks. Error bars denote variability across repeated runs (standard deviation).
}
  \label{fig:bar}
\end{figure}
\subsection{Discussion and Limitations}
Our proposed phenotype-guided multimodal alignment framework demonstrates several significant advantages. By enforcing consistency between cardiac measurements and CT-derived embeddings during pre-training, the model learns physiologically relevant representations that transfer effectively to non-gated chest CT. This leads to gains across most cardiac prediction tasks and is especially beneficial in data-scarce scenarios, where our approach consistently outperforms standard contrastive pre-training and no pre-training baselines. 

Additionally, the non-parametric Nadaraya--Watson head serves as a structural safeguard against modality collapse. In contrast to parametric classifiers that can implicitly down-weight the noisier signal in favor of cleaner features, the NW head has no learnable parameters and therefore cannot internalize the supervision itself. This places the optimization burden entirely on the encoders, encouraging the CT backbone to shape its embedding geometry to align with the phenotype-support prototypes. The exemplar-based formulation also offers a more transparent link between predictions and representative cases, helping to anchor the latent space in clinically meaningful phenotypes. Notably, despite the CT scans not being acquired for cardiac indications, the pre-trained model learns to focus on cardiac regions, suggesting effective transfer of structural information from ECHO and ECG into CT-based representations.

Despite these strengths, several limitations should be considered. The multimodal alignment depends on exams occurring within six months of the CT scan, during which a patient’s cardiac condition may change, potentially introducing misalignment. Phenotype labels were derived using threshold-based binarization, which may simplify complex physiological conditions. Furthermore, the current support bank does not dynamically adapt to rare or evolving phenotypes. Finally, our evaluation focused on binary classification tasks; future extensions could explore continuous severity prediction and temporal modeling for deeper clinical insight. 
Additionally, paired CT--ECG--ECHO data are required only during pre-training as a one-time supervision cost and are not needed at inference, where the model operates solely on CT, enabling scalable deployment for opportunistic cardiac screening. Because retrospectively collected multimodal cohorts may represent a clinically enriched population, their distribution may differ from that of patients undergoing routine CT alone; future work will investigate bias-aware pre-training and domain adaptation strategies to mitigate this potential distribution shift. Future extensions could also explore more expressive image encoder architectures to further improve representation capacity and downstream performance.
\section{Conclusion}
\Edited{This work presents a phenotypically supervised multimodal alignment framework that unifies cardiac and non-cardiac imaging by leveraging tabular cardiac features as alignment signals. By integrating CLIP-style cross-modal contrastive learning and a Nadaraya--Watson non-parametric head, the method produces semantically grounded, interpretable embeddings that transfer effectively to chest CT for cardiac abnormality prediction. Extensive experiments demonstrate consistent improvements across ten cardiac tasks, strong data efficiency, and substantial few-shot gains. Together, these results highlight phenotype-level alignment as a promising direction for multimodal representation learning in medical imaging, particularly when labeled data are scarce or modalities differ in diagnostic intent.}

\midlacknowledgments{This work was fully supported by funding from NewYork-Presbyterian for the NYP-Cornell Cardiovascular AI Collaboration. We gratefully acknowledge the contributions of the data team and the clinicians involved in this work.}

\bibliography{midl26_133}
\newpage
\appendix
\counterwithin{table}{section}
\counterwithin{figure}{section}
\setcounter{table}{0}
\setcounter{figure}{0}
\renewcommand{\thetable}{\thesection.\arabic{table}}
\renewcommand{\thefigure}{\thesection.\arabic{figure}}
\section{Test set overview}
\begin{table}[h!]
\centering
\small
\caption{Label distribution and missingness rate in the test cohort across ten cardiac abnormalities. Counts represent positive (pos), negative (neg), and rate of missing labels per class.}
\label{tab:test_label_distribution}
\setlength{\tabcolsep}{5pt}
\begin{tabular}{lcccc}
\toprule
\textbf{Label} & \textbf{Pos} & \textbf{Neg}  & \textbf{Missing (\%)} \\
\midrule
LVEF~$\leq$~45\% & 553 & 4304  & 0.1 \\
LVWT~$\geq$~13~flag & 624 & 3691  & 11.2 \\
Aortic stenosis & 126 & 3470 &  26.0 \\
Aortic regurgitation & 75 & 4575 &  4.3 \\
Mitral regurgitation & 196 & 4428 &  4.9 \\
Tricuspid regurgitation & 363 & 4267 &  4.8 \\
Pulmonary regurgitation & 15 & 3334 & 31.1 \\
PASP~$\geq$~45~flag & 886 & 1920 & 42.3 \\
TR$_{\max}$~$\geq$~32~flag & 422 & 1733  & 55.7 \\
SHD~flag & 1942 & 170  & 56.6 \\
\bottomrule
\end{tabular}
\end{table}
\newpage
\section{Effect of NW Head Loss Weighting ($\lambda_{\mathrm{nw}}$)}
Varying the NW loss weight (Table~\ref{tab:bylamda}) revealed that excessively small values underutilize phenotype structure, while very large values can overconstrain the representation. The optimal range lies between $0.4$ and $0.6$, balancing alignment and phenotype smoothing.
\begin{table*}[h!]
\centering
\small
\caption{Performance (AUROC) on the test set by label using different $\lambda_{nw}$ values in the loss.}
\label{tab:bylamda}
\setlength{\tabcolsep}{2pt}
\begin{tabular}{>{\raggedright\arraybackslash}p{0.4\textwidth} *{7}{c}}
\toprule
Label & $\lambda_{nw}$ = 0.2  & $\lambda_{nw}$ = 0.4   & $\lambda_{nw}$ = 0.6   & $\lambda_{nw}$= 0.8 & $\lambda_{nw}$ = 1.0  \\
\midrule
LVEF $\leq$ 45\%  & 0.74 & 0.74 & 0.73  & 0.74  & 0.74 \\
LVWT $\geq$ 13 flag & 0.72 & 0.69 & 0.70 & 0.69 & 0.71\\
Aortic Stenosis  & 0.89 & 0.90 & 0.87 & 0.88 & 0.88\\
Aortic Regurgitation  & 0.72 & 0.67 & 0.65  & 0.68 & 0.73\\
Mitral Regurgitation  & 0.73 & 0.73 & 0.72 & 0.72 & 0.75\\
Tricuspid Regurgitation  & 0.66 & 0.69 & 0.66 & 0.65 & 0.70 \\
Pulmonary Regurgitation  & 0.81 & 0.73 & 0.77 & 0.72 & 0.66\\
PASP $\geq$ 45 flag  & 0.67 & 0.68 & 0.67 & 0.68 & 0.69\\
TR$_{\max}$ $\geq$ 32 flag  & 0.64 & 0.65 & 0.65 & 0.67 & 0.65\\
SHD flag  & 0.77 & 0.76 & 0.76 & 0.75 & 0.75\\
\bottomrule
\end{tabular}
\end{table*}
\newpage
\section{Effect of Training Size}\label{sec:traning_size}
Table~\ref{tab:training_fraction_ablation} reports performance as the labeled training set is reduced to $2\%$, $4\%$, $6\%$, and $10\%$ of available CTs. NW+MMCL outperformed the baseline at every fraction and for every label. In extremely low-supervision settings (e.g., $2\%$), the method yielded relative AUROC improvements over the no-pre-training baseline of up to $57\%$ for Aortic Stenosis and $48\%$ for Mitral Regurgitation. These results indicate that phenotype-guided multimodal alignment yields strong sample efficiency.
\begin{table*}[h!]
\centering
\small
\caption{Performance (AUROC) by disease label across models at different fractions of the training set. For each label, the best result at each training fraction is bolded.}
\label{tab:training_fraction_ablation}
\setlength{\tabcolsep}{3pt}

\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccccc}
\toprule
& \multicolumn{2}{c}{\textbf{2\%}} & \multicolumn{2}{c}{\textbf{4\%}} & \multicolumn{2}{c}{\textbf{6\%}} & \multicolumn{2}{c}{\textbf{10\%}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
\textbf{Label} & Base & NW+MMCL & Base & NW+MMCL & Base & NW+MMCL & Base & NW+MMCL \\
\midrule
Aortic Regurgitation                & 0.56 & \textbf{0.73} & 0.51 & \textbf{0.71} & 0.56 & \textbf{0.72}  & 0.62 & \textbf{0.66} \\
Aortic Stenosis                     & 0.49 & \textbf{0.77} & 0.53 & \textbf{0.79} & 0.43 & \textbf{0.78}  & 0.62 & \textbf{0.78}  \\
LVEF $\leq$ 45\%                    & 0.58 & \textbf{0.69} & 0.61 & \textbf{0.73} & 0.54 & \textbf{0.73}  & 0.50 & \textbf{0.74} \\
LVWT $\geq$ 13 flag                 & 0.56 & \textbf{0.64} & 0.63 & \textbf{0.69} & 0.52 & \textbf{0.69} & 0.54 & \textbf{0.68} \\
Mitral Regurgitation                & 0.49 & \textbf{0.73}  & 0.56 & \textbf{0.74} & 0.56 & \textbf{0.73} & 0.56 & \textbf{0.71}  \\
PASP $\geq$ 45 flag                 & 0.47 & \textbf{0.65}  & 0.50 & \textbf{0.67}  & 0.54 & \textbf{0.66} & 0.63 & \textbf{0.67} \\
Pulmonary Regurgitation             & 0.72 & \textbf{0.73} & 0.58 & \textbf{0.69}  & 0.55 & \textbf{0.61}  & 0.65 & \textbf{0.69} \\
SHD flag                            & 0.46 & \textbf{0.72}  & 0.61 & \textbf{0.71}  & 0.52 & \textbf{0.73}  & 0.63 & \textbf{0.72}\\
TR$_{\max}$ $\geq$ 32 flag              & 0.46 & \textbf{0.64}  & 0.51 & \textbf{0.63} & 0.52 & \textbf{0.63}  & 0.62 & \textbf{0.63} \\
Tricuspid Regurgitation             & 0.57 & \textbf{0.71} & 0.50 & \textbf{0.70} & 0.57 & \textbf{0.69}  & 0.63 & \textbf{0.69}\\
\bottomrule
\end{tabular}
}
\end{table*}
\newpage
\section{Non-Parametric vs. Learnable Heads for Phenotype Prediction} \label{sec:nw_gradcam}
In this appendix (Figure \ref{fig:nw_vs_linear_gradcam_appendix}), we qualitatively examine how the choice of prediction head influences the spatial focus of the pre-trained CT encoder. Grad-CAM visualizations for four key phenotypes show that the non-parametric NW head, when combined with multimodal contrastive pre-training, consistently drives the encoder to attend to anatomically relevant myocardial and chamber regions, in line with the underlying ECHO-derived measurements. In contrast, replacing the NW head with a linear classifier yields more scattered and occasionally extra-cardiac attention patterns. These examples support our hypothesis that the non-parametric NW head better enforces phenotype-level alignment between CT and tabular representations, leading to more interpretable and physiologically grounded image features.
\begin{figure}[hb!]
  \centering
  \includegraphics[width=1.0\textwidth]{Gradcam__compare.pdf}
  \caption{
\Edited{Comparison of Grad-CAM attention maps on cardiac CT volumes for the non-parametric Nadaraya--Watson (NW) head versus a learnable linear head after cross-modal pre-training. Each column corresponds to one of four phenotypes---IVS (interventricular septal thickness), LVPW (left ventricular posterior wall thickness), LVD (left ventricular internal diameter in diastole), and LVS (left ventricular internal diameter in systole). The NW + MMCL model (top row) produces sharper, more localized attention over ventricular walls and chambers, whereas the Linear Head + MMCL model (bottom row) exhibits more diffuse and off-target responses, indicating weaker phenotype alignment in the learned image features.}
}
\label{fig:nw_vs_linear_gradcam_appendix}
\end{figure}
\newpage
\section{Embedding Combination Strategies}\label{sec:embedding}

We ablate three strategies for combining image and tabular embeddings for phenotype prediction, while keeping the encoders, training schedule, and Nadaraya--Watson (NW) head fixed across settings.
\paragraph{(i) Sum fusion.}
Given the image embedding $e^{\text{img}} \in \mathbb{R}^d$ and tabular embedding $e^{\text{tab}} \in \mathbb{R}^d$, we compute the fused representation via element-wise summation followed by $\ell_2$ normalization:
\begin{equation}
z = \frac{e^{\text{img}} + e^{\text{tab}}}{\left\lVert e^{\text{img}} + e^{\text{tab}} \right\rVert_2}.
\end{equation}
This corresponds to the fusion operation defined in Eq.~(3) of the main text.
\paragraph{(ii) Concatenation.}
We concatenate modality-specific embeddings and project them back to the shared embedding dimension:
\begin{equation}
z = \frac{[e^{\text{img}}; e^{\text{tab}}]}{\left\lVert [e^{\text{img}}; e^{\text{tab}}] \right\rVert_2}
\end{equation}
where $[\cdot;\cdot]$ denotes concatenation. This ensures dimensional compatibility with other fusion strategies.
\paragraph{(iii) Gated fusion.}
We employ a learned scalar gating mechanism to adaptively weight the two modalities. Specifically, we compute a gating coefficient
\begin{equation}
a = \sigma\!\left(g\!\left([e^{\text{img}}; e^{\text{tab}}]\right)\right),
\end{equation}
where $g(\cdot)$ is a small multilayer perceptron and $\sigma(\cdot)$ is the sigmoid function. The fused embedding is then given by
\begin{equation}
z = \frac{a\, e^{\text{img}} + (1-a)\, e^{\text{tab}}}{\left\lVert a\, e^{\text{img}} + (1-a)\, e^{\text{tab}} \right\rVert_2}.
\end{equation}
This strategy can be interpreted as adaptive scalar-gated late fusion, related to gated multimodal fusion approaches~\cite{arevalo2017gated}.
Table~\ref{tab:fusion} reports performance for each fusion strategy using AUROC, F1 score, and cosine similarity between predicted and ground-truth phenotypes. All results are computed on the same evaluation split using an identical support-bank construction for the NW head.

We adopt sum fusion to ensure balanced and stable training of both the image and tabular encoders, as it enforces equal contribution from each modality and avoids optimization bias toward a single encoder. Empirically, sum fusion achieves performance comparable to more complex fusion strategies across AUROC, F1, and cosine similarity (Table~\ref{tab:fusion}), motivating its use as a simple and robust default.
\begin{table*}[h!]
\centering
\caption{Comparison of embedding combination strategies. Best performance per row is bolded.}
\label{tab:fusion}
\small
\setlength{\tabcolsep}{5pt}
\renewcommand{\arraystretch}{0.6}
\begin{tabular}{lccc}
\toprule
Evaluation Metric & Sum Fusion & Concatenation & Gated Fusion \\
\midrule
AUROC             & \textbf{0.66} & 0.65 & 0.62\\
F1 Score          & \textbf{0.60} & 0.58  & 0.58 \\
Cosine Similarity & 0.72 & 0.72 & \textbf{0.74} \\
\bottomrule
\end{tabular}
\end{table*}
\newpage
\section{Statistical Significance Analysis}
\label{app:statsig}
We assessed statistical significance using paired, two-sided t-tests across repeated runs with identical experimental settings. For each task, AUROC values from the same run were treated as paired observations. A significance level of $\alpha = 0.05$ was used. Table~\ref{tab:statsig_pvalues} reports the p-values and significance outcomes for comparisons with the alternative pre-training approaches.

\begin{table*}[t]
\centering
\small
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
Task &
\multicolumn{2}{c}{NW+MMCL vs. MMCL} &
\multicolumn{2}{c}{NW+MMCL vs. SimCLR} &
\multicolumn{2}{c}{NW+MMCL vs. No pre-training} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
 & p-value & Sig. & p-value & Sig. & p-value & Sig. \\
\midrule
LVEF & 0.080 & No & $3.9{\times}10^{-4}$ & Yes & 0.314 & No \\
LVWT & 0.004 & Yes & $2.2{\times}10^{-4}$ & Yes & 0.044 & Yes \\
Aortic stenosis & 0.026 & Yes & 0.0077 & Yes & 0.050 & Yes \\
Aortic regurgitation & 0.024 & Yes & 0.0011 & Yes & 0.009 & Yes \\
Mitral regurgitation & 0.215 & No & $4.9{\times}10^{-4}$ & Yes & 0.286 & No \\
Tricuspid regurgitation & 0.102 & No & $5.8{\times}10^{-4}$ & Yes & 0.667 & No \\
Pulmonary regurgitation & 0.239 & No & 0.0010 & Yes & 0.081 & No \\
PASP & 0.141 & No & 0.0022 & Yes & 0.226 & No \\
TR$_{\max}$ & 0.030 & Yes & 0.030 & Yes & NA & NA \\
SHD & 0.062 & No & 0.0021 & Yes & 0.184 & No \\
\midrule
\textbf{Macro-average} & \textbf{0.020} & \textbf{Yes} &
\textbf{$3.1{\times}10^{-5}$} & \textbf{Yes} &
\textbf{0.037} & \textbf{Yes} \\
\bottomrule
\end{tabular}
}
\caption{p-values from paired statistical tests. We report p-values from paired two-sided t-tests comparing NW+MMCL against each alternative strategy. ``Sig.'' denotes statistical significance at $p<0.05$. The macro-average row corresponds to a paired test on run-wise macro-averaged AUROC (averaged across all tasks within each run). ``NA'' indicates a degenerate case with zero variance in paired differences.}
\label{tab:statsig_pvalues}
\end{table*}
NW+MMCL shows statistically significant improvements in macro-averaged AUROC over all other strategies, with particularly strong gains relative to SimCLR and training from scratch. Several individual tasks do not reach significance when compared to MMCL, reflecting the limited number of paired runs rather than the absence of an improvement.
\newpage
\section{Discriminative Analysis of Features}
\label{app:kde}
Relative to MMCL (Figure~\ref{fig:MMCL_10}) and SimCLR (Figure~\ref{fig:SimCLR_10}), the NW+MMCL model (Figure~\ref{fig:10tasks}) produces more consistent right-shifted score distributions for positive cases across tasks, indicating improved task-aligned separation. MMCL shows moderate but less stable separation, while SimCLR exhibits substantial overlap, highlighting the benefit of phenotype-guided multimodal pre-training. In addition, to quantify phenotype-level separability in the fused multimodal representation using NW+MMCL pre-training, we projected the latent embeddings onto a one-dimensional LDA axis for each concept and visualized the resulting score distributions using kernel density estimates (Figure~\ref{fig:kde}). Unlike unsupervised dimensionality reduction methods, this approach directly measures how well the fused embedding linearly separates negative (in-range) and positive (out-of-range) samples. Across all four concepts (IVS, LVPW, LVD, and LVS), the fused representation exhibits systematic shifts in the LDA score distributions, with varying degrees of overlap between classes. Phenotypes with more clearly separated distributions demonstrate stronger alignment, whereas increased overlap suggests reduced discriminability. Overall, these results indicate that the fused embedding captures relevant structure and supports interpretable assessment of multimodal feature alignment at the phenotype level.
\begin{figure}[t!]
  \centering
  \includegraphics[width=0.7\textwidth]{MMCL_10.pdf}
  \caption{
{Task-specific score distributions for multimodal CT representations using MMCL pre-training.
For each task, we plot the distribution of signed classifier scores for positive (teal, filled) and negative (gray, outlined) cases, where the score denotes the signed distance of CT embeddings to the task-specific linear classifier head.}
}
\label{fig:MMCL_10}
\end{figure}
\begin{figure}[t!]
  \centering
  \includegraphics[width=0.7\textwidth]{SimCLR_10.pdf}
  \caption{
{Task-specific score distributions for multimodal CT representations using SimCLR pre-training.
For each task, we plot the distribution of signed classifier scores for positive (teal, filled) and negative (gray, outlined) cases, where the score denotes the signed distance of CT embeddings to the task-specific linear classifier head. }
}
\label{fig:SimCLR_10}
\end{figure}
\begin{figure}[t!]
  \centering
  \includegraphics[width=0.7\textwidth]{KDE.pdf}
  \caption{
{Phenotype separability in the fused latent space.
Kernel density estimates of LDA projection scores for four cardiac concepts (IVS, LVPW, LVD, LVS) derived from the fused embedding. Each panel shows the one-dimensional LDA axis that maximizes class separation between negative and positive samples. Clear shifts between distributions indicate stronger phenotype discriminability in the fused representation.}
}
\label{fig:kde}
\end{figure}
\newpage
\section{Tabular Feature Specification and Embedding Construction}
\label{app:tabular_features}
This appendix provides details on the structured ECG and ECHO features used to construct the tabular embeddings in X-Cardia, including feature composition, embedding dimensionality, and feature selection considerations.

\subsection{ECG and ECHO Feature Set}

The tabular input consists of routinely reported clinical measurements derived from ECHO, ECG, and basic demographics. These features reflect standard cardiac structure, function, and conduction parameters commonly available in clinical workflows. Specifically, the structured feature set includes:

\begin{itemize}
    \item \textbf{Demographics and clinical context:} patient age, sex, race, ventricular assist device (VAD) flag.
    \item \textbf{ECG-derived measurements:} ventricular rate, atrial rate, PR interval, QRS duration, QT interval, corrected QT (QTc), ventricular pacing flag.
    \item \textbf{Cardiac function and pressure estimates:} left ventricular ejection fraction (LVEF), pulmonary artery systolic pressure (PASP), PASP excluding right atrial pressure, right atrial pressure (RAP), pericardial effusion indicator, tricuspid regurgitation maximum velocity.
    \item \textbf{Diastolic function parameters:} E-wave velocity, A-wave velocity, E/A ratio, mitral valve tissue Doppler indices (E$^\prime$ and A$^\prime$ velocities: lateral, medial, and averaged).
    \item \textbf{Left ventricular outflow tract (LVOT) measurements:} LVOT area, diameter, peak and mean velocities, velocity--time integral (VTI), peak and mean pressure gradients.
    \item \textbf{Valvular disease and prosthesis indicators:} global and valve-specific prosthetic flags (aortic, mitral, tricuspid, pulmonary), mitral regurgitation VTI and peak velocity.
    \item \textbf{Aortic valve hemodynamics:} aortic valve peak velocity, peak gradient, mean gradient, and VTI.
\end{itemize}

\subsection{Tabular Embedding Dimensionality}

All structured features are standardized and encoded using an FT-Transformer-based tabular encoder. The resulting tabular embedding is projected into a shared latent space of dimension $d = 256$, where both CT and tabular embeddings are aligned and fused via element-wise summation.

\subsection{Feature Selection Considerations}
We do not perform explicit feature selection or subset optimization. This design choice is intentional, as the tabular encoder is used to provide physiologically meaningful intermediate supervision rather than to optimize standalone tabular prediction. Using a broad set of routinely available cardiac measurements encourages the CT encoder to align with diverse structural, functional, and hemodynamic signals during multimodal pre-training. Exploring learned or task-adaptive feature selection remains an interesting direction for future work.

\section{Additional Experimental Details: Hyperparameters, Selection, and Compute}
\label{app:exp_details_short}

\subsection{Hyperparameters}
The hyperparameters used for pre-training and fine-tuning are listed in Table~\ref{tab:pretrain_hparams} and Table~\ref{tab:finetune_hparams}, respectively.
\label{app:hparams}
\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{6pt}
\begin{tabular}{l l}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Epochs & 30 \\
Batch size & 8 \\
Optimizer & AdamW \\
Learning rate & $1\times10^{-4}$ \\
Weight decay & $1\times10^{-5}$ \\
LR scheduler & CosineAnnealingWarmRestarts \\
Scheduler params & $T_0{=}10$, $T_{\text{mult}}{=}2$, $\eta_{\min}{=}1\times10^{-6}$ \\
CLIP temperature ($\tau$) & 0.07 \\
NW softmax temperature ($\tau_{\text{NW}}$) & 1.0 \\
NW loss weight ($\lambda_{\text{NW}}$) & 0.5 \\
Support size cap per concept ($K$) & 5 \\
Support refresh interval & every 5 epochs \\
K-means ($k$) for support selection & up to 5 \\
\bottomrule
\end{tabular}
\caption{Pre-training hyperparameters used in all experiments.}
\label{tab:pretrain_hparams}
\end{table*}


\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{6pt}
\begin{tabular}{l l}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Epochs (max) & 100 \\
Batch size & 12 \\
Optimizer & AdamW \\
Learning rate & $5\times10^{-4}$ \\
Weight decay & $1\times10^{-3}$ \\
LR scheduler & OneCycleLR (cosine anneal) \\
Early stopping patience & 6 \\
\bottomrule
\end{tabular}
\caption{Fine-tuning hyperparameters and evaluation protocol.}
\label{tab:finetune_hparams}
\end{table*}

\subsection{Selection Criteria for Key Parameters}
\label{app:selection}

\begin{itemize}
    \item \textbf{K-means support selection ($k$).} We cap the number of support exemplars per phenotype at $K{=}5$ to balance diversity and retrieval cost. For phenotype $p$ and number of positive cases $N_p^+$, we set $k=\min(K, N_p^+)$; when $N_p^+\le K$, we use all positives.
    \item \textbf{Contrastive temperature ($\tau$).} We use $\tau{=}0.07$ (CLIP-style default), which was stable across runs; we did not observe meaningful gains from additional tuning in our setting.
    \item \textbf{Support refresh interval.} Updating the support bank every 5 epochs provided a good trade-off between adaptivity and overhead.
\end{itemize}

\subsection{Compute and Complexity}
\label{app:compute_short}
We report wall-clock training time for pre-training and fine-tuning, along with the GPU type and number of GPUs used, in Table~\ref{tab:compute}. Overall computational cost is dominated by the 3D CNN backbone, while the additional overhead from multimodal fusion and support-set retrieval is minimal due to the small, fixed-size support bank.

\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{6pt}
\begin{tabular}{l l}
\toprule
\textbf{Setting} & \textbf{Value} \\
\midrule
GPU & NVIDIA A100 40GB \\
GPUs used & 2 \\
Pre-training time / epoch & $\sim$ 55 min \\
Fine-tuning time / epoch & $\sim$ 40 min \\
Precision & FP32 \\
\bottomrule
\end{tabular}
\caption{Hardware and wall-clock training time.}
\label{tab:compute}
\end{table}
\section{Additional Results and Robustness Analysis}
\label{app:few_results}
\paragraph{Few-shot evaluation.}
We report performance under $K$-shot settings ($K\in\{0,1,2\}$) in Figure~\ref{fig:fewshot_auc}. In the $0$-shot case, the pre-trained CT encoder is evaluated with a frozen backbone. For the $1$- and $2$-shot settings, the model is fine-tuned using $K$ labeled samples per class with fixed data splits across methods. NW+MMCL consistently outperforms other methods across tasks and shot settings. Performance improves with increasing supervision for all methods; however, NW+MMCL shows steeper gains, indicating better sample efficiency and more transferable CT representations.
\begin{figure*}[t!]
  \centering
  % jmlr defines \subfigure as a command taking the contents as its argument,
  % so each panel is wrapped directly and receives its (a)/(b)/(c) label.
  \subfigure{\includegraphics[width=0.8\textwidth]{0shot.pdf}}% 0-shot
  \subfigure{\includegraphics[width=0.8\textwidth]{1shot.pdf}}% 1-shot
  \subfigure{\includegraphics[width=0.8\textwidth]{2shot.pdf}}% 2-shot
  \caption{
  Few-shot downstream performance across cardiac tasks. AUROC (mean $\pm$ std over repeated runs) for 10 cardiac phenotype prediction tasks under (a) 0-shot, (b) 1-shot, and (c) 2-shot supervision.
  }
  \label{fig:fewshot_auc}
\end{figure*}
\end{document}
