\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution


% \usepackage[color]{changebar}

\usepackage{multirow}
\usepackage{threeparttable}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{mwe} % to get dummy images
%  THIS IS FOR TRACK CHANGES...


\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 240}
\editors{Accepted for publication at MIDL 2025}

\title[Contrastive Patient-level Pretraining for Lung Cancer Risk Prediction]{Contrastive Patient-level Pretraining Enables Longitudinal and Multimodal Fusion for Lung Cancer Risk Prediction}

% Use \Name{Author Name} to specify the name.
% If the surname contains spaces, enclose the surname
% in braces, e.g. \Name{John {Smith Jones}} similarly
% if the name has a "von" part, e.g \Name{Jane {de Winter}}.
% If the first letter in the forenames is a diacritic
% enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

% Two authors with the same address
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
%  \Name{Author Name2} \Email{xyz@sample.edu}\\
%  \addr Address}

% Three or more authors with the same address:
% \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
%  \Name{Author Name2} \Email{an2@sample.edu}\\
%  \Name{Author Name3} \Email{an3@sample.edu}\\
%  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Thomas Z. Li\nametag{$^{1, 2}$}} \orcid{0000-0001-9950-4679} \Email{thomas.z.li@vanderbilt.edu}\\
\addr $^{1}$ Department of Biomedical Engineering, Vanderbilt University, Nashville, TN \\
\addr $^{2}$ Medical Scientist Training Program, Vanderbilt University, Nashville, TN \\
\Name{Lianrui Zuo\nametag{$^{3}$}} \orcid{0000-0002-5923-9097} \Email{lianrui.zuo@vanderbilt.edu}\\
\addr $^{3}$ Department of Electrical and Computer Engineering, Vanderbilt University, Nashville, TN \\
\Name{Yihao Liu\nametag{$^{3}$}} \Email{yihao.liu@vanderbilt.edu}\\
\Name{Aravind R. Krishnan\nametag{$^{3}$}} \Email{aravind.r.krishnan@vanderbilt.edu}\\
\Name{Kim L. Sandler\nametag{$^{4}$}} \Email{kim.sandler@vumc.org}\\
\addr $^{4}$ Department of Radiology, Vanderbilt University Medical Center, Nashville, TN \\
\Name{Thomas A. Lasko\nametag{$^{3, 5}$}} \orcid{0000-0003-2300-9529} \Email{tom.lasko@vanderbilt.edu}\\
\addr $^{5}$ Department of Biomedical Informatics, Vanderbilt University, Nashville, TN \\
\Name{Fabien Maldonado\nametag{$^{6}$}} \Email{fabien.maldonado@vumc.org}\\
\addr $^{6}$ Department of Medicine, Vanderbilt University Medical Center, Nashville, TN \\
\Name{Bennett A. Landman\nametag{$^{1,3,4,5}$}} \Email{bennett.landman@vanderbilt.edu}\\
\\
}

\begin{document}

\maketitle

\begin{abstract}
Leveraging longitudinal and multimodal data is important for clinical predictive tasks. Contrastive language-image pretraining (CLIP) has been successful in learning multimodal representations by aligning paired images and captions, e.g., medical images and their corresponding radiology reports. However, in real clinical settings, the alignment of unpaired modalities, such as medical images and clinical notes collected at different times, is an open challenge, even though such data are ubiquitous in practice. This study conducts contrastive pretraining between longitudinal chest CTs and clinical variables on the patient level using a large public lung cancer screening dataset. Leveraging a time-distance transformer to encode longitudinal imaging and an open-source text embedding model to encode clinical variables, we optimize a contrastive loss between the embedded modalities from the same patient (positive pair) against those from different patients (negative pair). We find that finetuning the CLIP representation significantly improves prediction of lung cancer risk in two types of clinical populations (0.895 and 0.893 AUC) compared to conventional multimodal fusion (0.873 and 0.875 AUC) and single-modality baselines. These results demonstrate how contrastive patient-level pretraining can enable longitudinal and multimodal fusion without additional training data. We released our code and pre-trained weights at \href{https://github.com/MASILab/lung-cplp}{https://github.com/MASILab/lung-cplp}.

\end{abstract}


\begin{keywords}
contrastive language-image pretraining (CLIP), multimodal, chest CT, lung cancer
\end{keywords}

\section{Introduction}
\label{sec:intro}
Leveraging information across time and modalities is crucial for many clinical problems on which machine learning can have a significant impact. However, multimodal models often struggle with overfitting and are more susceptible to both intra- and inter-modality variation than simpler imaging models~\cite{wang2020makestrainingmultimodalclassification, 10484420}. Furthermore, modalities collected from routine clinical practice are irregularly sampled and asynchronous with one another. Because of these challenges, researchers often find that longitudinal multimodal models are prohibitively data-hungry~\cite{udandarao2024zeroshotexponentialdatapretraining} and frequently fail to generalize to different clinical settings~\cite{li2024winnersperformancelungcancer, Lasko2024}. Unsupervised and self-supervised learning offer promising solutions by leveraging large-scale data to enhance multimodal representation and improve model robustness~\cite{li2023multimodalfoundationmodelsspecialists}.

Contrastive pretraining is a prominent self-supervised paradigm that leverages a contrastive loss to encourage one embedding to be similar to a related embedding while being less similar to an arbitrarily selected embedding~\cite{oord2019representationlearningcontrastivepredictive, jia2021scalingvisualvisionlanguagerepresentation}. In contrastive language-image pretraining (CLIP)~\cite{radford2021learningtransferablevisualmodels, zhang2022contrastivelearningmedicalvisual}, the related pairs consist of an image and its natural language description, such as a CT study and its radiology report, but the pairs can be defined using any useful relationship. \citet{lee2022uniclipunifiedframeworkcontrastive} included intra-modality pairs by aligning pairs of augmentations of the same image, and~\citet{10.1007/978-3-031-43993-3_51} included figure-caption pairs from web-scale biomedical documents.

Applications of CLIP to medical data are concentrated in radiology and pathology, where image-text pairs are abundant, resulting in promising downstream performance~\cite{9710099, chen2024mammoclipleveragingcontrastivelanguageimage, 10376801, Jang_2024, Lu2024}. Many challenging clinical problems, however, such as prediction of future lung cancer risk, rely on multiple modalities not because they describe each other well but because \textit{they do not}~\cite{Acosta2022}. Solving such tasks, to some degree, may rely on integrating probabilistically independent sources that are exclusively found in different modalities~\cite{lasko2024unsuperviseddiscoveryclinicaldisease}. For the lung cancer domain, family history of lung cancer is one of many well-known risk factors that do not have a direct CT correlate. Beyond radiologist- and pathologist-annotated reports, most of the clinical record does not directly describe medical images. This raises the largely unexplored question of whether unpaired modalities can be aligned simply based on being derived from the same patient and the same relative time.

In this study, we leverage a time-distance chest CT encoder and an open-source text representation model to compute embeddings for consecutive screening chest CTs and clinical variables, respectively, for each patient. We conduct contrastive pretraining across the image and text embeddings by defining positive and negative pairs on the patient ID. This encourages embeddings of the unpaired modalities to be similar, which admittedly is counterintuitive. Nevertheless, we hypothesized that the representation learned from CLIP would achieve better separation in downstream classification than supervised multimodal representations. We show that contrastive patient-level pretraining leads to improved lung cancer risk prediction after finetuning compared to supervised training on the same training set using the same model architecture.

% APPROACH ================================================
\begin{figure}[tb!]
  \includegraphics[width=\linewidth]{MIDLLatexTemplate/patient_level_clip.pdf}
  \caption{Our approach ingests up to three annual chest CTs and cross-sectional clinical variables. Top: The chest CTs collected at times $(p_0, p_1, p_2)$ are transformed into feature vectors $(s_0, s_1, s_2)$ using a pretrained ResNet (Sybil). Each feature vector is added to its corresponding time encoding $\hat{p}_i$ to form a sequence for input into a time-distance transformer (TDViT). The embedding corresponding to the latest time point was used as the image embedding, $I^i$, for CLIP and finetuning. Bottom: Tabular variables are transformed into a natural language form using sentence templates. We used a pretrained text representation model (TRM) and a multi-layer perceptron~(MLP) to compute the text embedding, $T^i$, for CLIP and finetuning. During both stages, weights of Sybil and TRM were frozen.}
  \label{fig:1}
\end{figure}

\section{Approach}
\label{sec:approach}
% I feel like this paragraph is redundant in a conference paper, so I removed it. Feel free to add it back if you have a different thought. --LZ
% Our approach to contrastive patient-level pretraining began with leveraging pretrained encoders for each modality to compute image and text embeddings (Sec.~\ref{subsec:patient_level_embed}) and applying CLIP to patient-level embeddings (Sec.~\ref{subsec:clip_pretrain}). 
% We examined the learned representation through its zero-shot and fine-tuned prediction of two-year lung cancer risk (Sec.~\ref{subsec:zero_shot}), comparing against single-modal and multi-modal ablative baselines (Sec.~\ref{subsec:finetune}).

\subsection{Patient-level Embeddings}
\label{subsec:patient_level_embed}
This study focused on learning patient-level embeddings of public chest CTs and clinical variables from the \href{https://www.cancerimagingarchive.net/collection/nlst/}{National Lung Screening Trial (NLST)}~\cite{doi:10.1056/NEJMoa1102873}. We reproduced the test set from~\citet{Ardila2019} and used all other subjects for training. The clinical variables were collected through surveys and chart reviews at the time of trial randomization. Following randomization, patients received up to three annual low-dose chest CTs (Table~\ref{tab:1}). Unlike the paired modalities conventionally seen in CLIP studies, the clinical variables are chronologically out of sync with the collected imaging in the NLST by up to four years in some cases. Moreover, they are semantically unpaired, meaning that the clinical variables do not directly describe the CT image. Following CLIP, we leverage modality-specific encoders (Figure~\ref{fig:1}).

\begin{table}[tb]
    \small
    \centering
        \caption{Training and Testing Sets from NLST. \texttt{COPD}: Presence of chronic obstructive pulmonary disease; \texttt{phist}: Personal history of cancer, \texttt{fhist}: Family history of lung cancer.}
        \label{tab:1}%
            \begin{threeparttable}[b]
                \begin{tabular}{*{1}{l}*{1}{c}*{1}{c}p{0.3\linewidth}}
                    \toprule
                    & \textbf{\# Patients (\# Cases)} & \textbf{\# Chest CTs} & \textbf{Clinical Variables} \\
                    \midrule
                    Training\textsuperscript{\textdagger} & $22,571~(598) $ & $61,631$ & \multirow{5}{\linewidth}{age, sex, race, BMI*, smoking quit time, smoking duration, smoking pack years, COPD*, emphysema, phist., fhist., smoking status} \\
                    Screening Test & $2,315~(94)$ & $6,615$ & \\
                    Nodule Test & $2,057~(93) $ & $5,879$ & \\
                    & & & \\
                    & & & \\
                    \bottomrule
                \end{tabular}
                \begin{tablenotes}
                \small
                    \item \textsuperscript{\textdagger}Used in both pretraining and finetuning stages. *Variable missing in $<$ 1\% of subjects 
                \end{tablenotes}
            \end{threeparttable}
\end{table}
\subsubsection{Image Encoder}
The image encoder is composed of two modules. First, CT slices were normalized to a Hounsfield window of $[-600, 1500]$ and resized to $512 \times 512$. These were given to Sybil, a ResNet developed to detect lung cancer on lung screening chest CTs~\cite{mikhael2023sybil}, which produced a feature vector in $\mathbb{R}^{256}$ for each chest CT. Second, we follow the strategy of time-distance transformers~\cite{10.1117/12.2653911, 10.1007/978-3-031-43895-0_61}, which add time embeddings, representing the relative days from the earliest scan, to the corresponding feature vectors. This allows the encoder to learn longitudinally, such as observing the change in lung nodule features over time. We set the sequence length to 3 and used attention masks to avoid attending to missing tokens in the sequence (i.e., when the patient received fewer than three chest CTs). During training, the ResNet weights were frozen while the time-distance transformer weights were optimized (\figureref{fig:1}, top).

\subsubsection{Text Encoder}
To encode the clinical variables, we leveraged a pretrained BERT-style~\cite{zhang2024mgtegeneralizedlongcontexttext} text representation model (TRM) from Hugging Face, \href{https://huggingface.co/dunzhang/stella_en_400M_v5}{\texttt{\detokenize{dunzhang/stella_en_400M_v5}}}, selected for its small memory footprint and strong performance on the Massive Text Embedding Benchmark~\cite{muennighoff2023mtebmassivetextembedding}. 
The text encoder's input consisted of the patient's tabular variables transformed into a natural language paragraph using sentence templates (Figure~\ref{fig:1},~bottom). Units of measurement were included with numerical variables, and missing variables were simply left out of the paragraph. In our experiments, the natural language transform outperformed transformation into JSON format, but it was unclear whether this was specific to the TRM we selected. The set of clinical variables included the length of lung cancer-free follow-up and whether the patient ultimately developed lung cancer. The TRM tokenized the natural language paragraph~\cite{conneau2020unsupervisedcrosslingualrepresentationlearning, kudo2018sentencepiecesimplelanguageindependent} and computed text embeddings. Lastly, a multi-layer perceptron (MLP) processed the resulting text embeddings. Although this study only accessed cross-sectional clinical variables, the MLP can be replaced with a transformer in future studies where the non-imaging data are longitudinal. During training, the weights of the TRM were frozen while the MLP weights were optimized (\figureref{fig:1},~{bottom}).

\subsection{Contrastive Patient-level Pretraining}
\label{subsec:clip_pretrain}
We hypothesized that representations learned through contrastive patient-level pretraining would be more useful in lung cancer classification compared to representations learned through supervised learning. In order to align imaging and clinical variables using contrastive pretraining, we arranged image and text embedding pairs for each patient. The image embedding was the token corresponding to the chronologically latest chest CT while the text embedding was simply the output of the text encoder. Within a batch, an image embedding and a text embedding were considered a positive pair if they corresponded to the same patient, or a negative pair if they were derived from different patients. All pairs were multi-modal (i.e., we did not allow pairs to be of image-to-image or text-to-text embeddings). We performed CLIP to maximize the cosine similarity between positive pairs via a symmetric cross-entropy loss. We employed Adam optimization~\cite{kingma2017adammethodstochasticoptimization} and a learning rate with cosine annealing and warm restart cycles~\cite{loshchilov2017sgdrstochasticgradientdescent}. Training was stopped when no improvement in the loss was observed after 1000 epochs. This stage of training was highly sensitive to batch size. On one hand, a large batch size increased the number of contrasted pairs and reduced variation within the batch. On the other hand, batch sizes beyond a certain limit underperformed, perhaps due to the model taking very few steps per epoch~\cite{zhang2024doescriticalbatchsize}. We found that the critical batch size for our training dataset was 2401, which corresponded to 10 steps per epoch. We froze the aforementioned parts of the image and text encoders and trained with mixed precision~\cite{micikevicius2018mixedprecisiontraining} to accommodate our desired batch size. Code and pre-trained models are released at \href{https://github.com/MASILab/lung-cplp}{https://github.com/MASILab/lung-cplp}.

% EXPERIMENTS ================================================

\section{Experiments and Results}
We examined the CLIP embeddings by testing their zero-shot and fine-tuned prediction of 2-year lung cancer risk on a withheld test set. We compared against baselines to examine the benefit of contrastive patient-level pretraining. Each experiment was performed in two clinical cohorts: the entire test set (screening) and the subset of patients with a lung nodule detected (detected-nodule). In brief, the screening cohort tests the model's ability to estimate future lung cancer risk, regardless of whether a nodule is present, while the detected-nodule cohort evaluates the model's ability to discriminate between benign and malignant nodules. We report the mean AUC and 95\% confidence interval from 1000 bootstrapped samples of the predictions, sampling with replacement from the test set. A two-sided Wilcoxon signed-rank test was used to test whether the mean AUC of each approach was significantly different from the rest at $p<0.05$.

All experiments were conducted in Python 3.12 using PyTorch 2.5 and CUDA 12.4 on an Ubuntu 22.04 server with a single NVIDIA RTX 6000.

\subsection{Zero-shot Classification}
\label{subsec:zero_shot}
We were interested in evaluating how useful the pretrained patient-level embeddings were for predicting 2-year lung cancer risk out-of-the-box. In contrast to other zero-shot tasks, the concept of lung cancer is not entirely unseen during pretraining since the lung cancer label and follow-up period were included during CLIP training. However, the task of predicting two-year lung cancer risk still presents a challenging domain shift both in terms of longitudinal semantics and class prevalence in the test set, as demonstrated by the zero-shot results presented below. To evaluate the performance of zero-shot classification on the test set, we converted each diagnostic label into a templated sentence, ``The patient [developed/did not develop] lung cancer after 2 years''. This sentence took the place of the diagnostic label in the patient's natural language paragraph, while the other clinical variables were preserved. We then computed the cosine similarity between each image embedding and both possible text embeddings. The text embedding with the higher similarity score was chosen as the model's prediction in a zero-shot manner. In this setting, the CLIP approach struggled to predict 2-year lung cancer risk, with AUCs of 0.540 and 0.523 in the screening and detected-nodule settings, respectively (\tableref{tab:2}, CLIP-zeroshot).

\subsection{Finetune Classification}
\label{subsec:finetune}
We asked how the pretrained patient-level embeddings would perform in a supervised classification setting. Because lung cancer labels were available for each patient, the fine-tune phase used the same training set as the pretraining phase. We removed sentences pertaining to lung cancer and follow-up period, but other preprocessing steps remained unchanged. The embeddings from the image and text encoders were concatenated and fed through an MLP and sigmoid classifier. We set the learning rate of the image and text encoders to 1/10th of the MLP classifier's learning rate. This finetuned approach achieved a mean AUC of 0.895 in the lung screening setting and 0.893 in the detected-nodule setting (\tableref{tab:2}, CLIP-finetune).

In addition, we finetuned and evaluated four checkpoints along the course of pretraining CLIP to probe how classification performance changed with length of training (\figureref{fig:2}) in the screening cohort. When evaluating at different checkpoints over 10,000 pretraining epochs, we observed that finetuning as early as epoch 2500 led to improved performance over supervised baselines (0.887 AUC, 95\% CI: [0.886, 0.888]). The best performance was achieved at 9000 epochs (\tableref{tab:2}, CLIP-finetune), although the finetune performance did not increase monotonically over training epochs.


\begin{table}[tb]
    \small
    \centering
        \caption{Prediction of 2-year lung cancer risk on test cohort (mean AUC [95\% CI]). \texttt{cross-sec.}: cross-sectional, \texttt{long.}: longitudinal. }
        % \begin{adjustbox}{width=\te}
        \label{tab:2}%
            \begin{threeparttable}[b]
                \resizebox{\columnwidth}{!}{
                \begin{tabular}{*{1}{l} p{0.24\linewidth} p{0.24\linewidth} *{1}{l}*{1}{l}}
                    \toprule
                    & \textbf{Screening} ($n=2,315$) & \textbf{Detected-Nodule} ($n=2,057$) & \textbf{Imaging type} & \textbf{Non-imaging type} \\
                    \midrule
                    Sybil-cs & 0.877 [0.872, 0.881] & 0.869 [0.864, 0.874] & cross-sec. & N/A \\
                    Sybil-TDViT & 0.861 [0.860, 0.863]& 0.858 [0.856, 0.859] & long. & N/A \\
                    MM-tabular & 0.871 [0.869, 0.872]& 0.876 [0.871, 0.880] & long. & tabular \\
                    MM-lang & 0.873 [0.871, 0.874]& 0.875 [0.873, 0.876] & long. & language \\
                    CLIP-tabular & 0.849 [0.843, 0.854] & 0.837 [0.831, 0.842] & long. & tabular \\
                    CLIP-zeroshot & 0.540 [0.538, 0.541]& 0.523 [0.521, 0.524] & long. & language \\
                    \textbf{CLIP-finetune} & \textbf{0.895 [0.894, 0.896]\textsuperscript{*}} & \textbf{0.893 [0.892, 0.894]\textsuperscript{*}} & long. & language \\
                    \bottomrule
                \end{tabular}}
                \begin{tablenotes}
                    \item \textbf{\textsuperscript{*}}$p<0.05$ against all other methods.
                \end{tablenotes}
            \end{threeparttable}
        % \end{adjustbox}
\end{table}

\subsection{Baselines and Ablations}
Lung cancer labels are available for all patients in the NLST, which allowed us to train all baselines with the same set used in the contrastive pretraining and finetuning phases. As a direct ablation of CLIP, we reproduced the architecture from the finetune stage and trained the model from random weights (\tableref{tab:2}, MM-lang), preserving weights that were originally frozen during CLIP training. Specifically, we optimized the TDViT and the MLP following the TRM while the ResNet and TRM remained frozen. All supervised training was conducted with a batch size of 2401 and training was stopped when the validation loss failed to improve for 40 epochs. Our CLIP approach outperformed its supervised counterpart by about 0.02 AUC, with MM-lang underperforming at 0.873 AUC on the screening cohort and 0.875 AUC on the detected-nodule cohort, indicating that CLIP pretraining was responsible for the observed performance gains.

We also ablated the TRM from the text encoder, using the tabular clinical variables as input to a CLIP approach (\tableref{tab:2}, CLIP-tabular) and a purely supervised approach (\tableref{tab:2}, MM-tabular). We employed multiple linear imputation~\cite{murray2018multipleimputationreviewpractical} for missing variables in tabular form. We trained a longitudinal imaging baseline from random weights using the image encoder itself (\tableref{tab:2}, Sybil-TDViT) appended to an MLP classifier. Lastly, we trained a cross-sectional imaging baseline using Sybil features from the most recent chest CT and an MLP classifier (\tableref{tab:2}, Sybil-cs).

MM-tabular (0.871 and 0.876 AUC) and the cross-sectional imaging model Sybil-cs (0.877 and 0.869 AUC) both matched the performance of MM-lang on the screening and detected-nodule cohorts, respectively. The longitudinal imaging model Sybil-TDViT slightly underperformed the other supervised baselines, at 0.861 and 0.858 AUC in the screening and detected-nodule cohorts, respectively.


\begin{figure}[tb]
  \includegraphics[width=\linewidth]{MIDLLatexTemplate/performance_over_epoch.png}
  \caption{Four checkpoints were finetuned and evaluated over contrastive pretraining epochs. Above-baseline performance was observed at 2500 epochs and the best performance was achieved at 9000 epochs. Error bars represent 95\% confidence intervals.}
  \label{fig:2}
\end{figure}

% Discussion ================================================
\section{Discussion and Limitations}

This work investigates contrastive pretraining with repeat chest CTs and clinical variables that are matched on patient and time but not paired semantically. The learned patient-level embeddings were evaluated based on zero-shot and finetune prediction of 2-year lung cancer risk in screening and detected-nodule cohorts. Without additional training examples, our approach, when finetuned, improved significantly over a supervised counterpart and single modality baselines.

To interpret the learned representation before and after CLIP, we computed image and text embeddings with Sybil and the TRM, respectively, using their original weights and compared them to the feature embeddings from the CLIP encoders. A 2D t-SNE~\cite{JMLR:v9:vandermaaten08a} map of the embeddings shows how the original embeddings cluster by modality while the image and text embeddings from CLIP are pulled together (\figureref{fig:3}).

Interestingly, the supervised multimodal models, MM-lang and MM-tabular, were not able to leverage the predictive signal that we suspect is present in clinical variables to outperform the imaging classifier, Sybil-cs. Overfitting on the multimodal feature space and additional model parameters is a probable explanation for this~\cite{wang2020makestrainingmultimodalclassification, 10484420}. Contrastive patient-level pretraining overcame these challenges to some degree, suggesting that it learned a more effective joint representation of longitudinal imaging and non-imaging modalities. Even when operating with purely labeled data, this may be a useful pretraining step for warming up downstream supervised training.

\begin{figure}[tb]
    \includegraphics[width=\linewidth]{MIDLLatexTemplate/tsne_embeddings.png}
    \caption{Image and text embeddings before (left) and after (right) CLIP were visualized with t-SNE. Without CLIP, partly-disjoint clusters can be observed within the same modality. After CLIP, image and text embeddings are pulled together. In both examples, lung cancer cases are not easily separable from controls, but the CLIP representation outperformed supervised baselines after finetuning.}
    \label{fig:3}
\end{figure}
The poor zero-shot performance after CLIP warrants discussion. Given that we only included 13 variables, we did not expect our zero-shot performance to be similar to that of large-scale foundation models. Still, these results suggest that the patient-level embeddings are not easily separable into the diagnostic classes by cosine similarity alone. Instead, a non-linear classifier, e.g., a finetuning stage, is required to fully leverage the utility of the learned representation. \figureref{fig:3} visually corroborates this, as the lung cancer cases and controls do not separate into noticeable clusters. These findings could be a consequence of aligning unpaired modalities, and the unseen task-learning capability of the model would be improved by covering a larger and more diverse set of clinical concepts during pretraining.

In conclusion, this paper demonstrates that contrastive pretraining enhances the fusion of longitudinal and multimodal medical data without requiring semantically paired modalities or additional training examples. To ensure applicability to real clinical data, we integrate time encodings for longitudinal information and a TRM for embedding non-imaging modalities. In data-limited settings, contrastive patient-level pretraining facilitates multimodal fusion, mitigating the overfitting challenges typically encountered with supervised approaches.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was funded by the NIH through F30CA275020, 2U01CA152662, and R01CA253923-02, as well as NSF CAREER 1452485 and NSF 2040462. This study was also funded by the Vanderbilt Institute for Surgery and Engineering through T32EB021937-07, the Vanderbilt Institute for Clinical and Translational Research through UL1TR002243-06, and the Pierre Massion Directorship in Pulmonary Medicine. We utilize generative AI to generate code segments based on task descriptions, as well as to assist with debugging, editing, and autocompleting code. Additionally, generative AI has been employed to refine sentence structure and ensure grammatical accuracy. However, all conceptualization, ideation, and prompts provided to the AI stem entirely from the authors' creative and intellectual efforts. We take full responsibility for reviewing and verifying all AI-generated content in this work.}


\bibliography{midl25_240}

\end{document}
