\documentclass{midl} 
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{mwe} 
\usepackage{ulem}  % Supports underline and strikethrough
\usepackage{xcolor}  % Allows color customization
\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}\jmlrvolume{-- 78}\editors{Accepted for publication at MIDL 2025}

\title[Beyond the Prompt]{Beyond the Prompt: Deploying Medical Foundation Models on Diverse Chest X-ray Populations}

\midlauthor{\Name{Louisa Fay\nametag{$^{1,2, 3}$}}  \orcid{0009-0005-5071-5519}\Email{lfay@stanford.edu}\\
\Name{Jean-Benoit Delbrouck\nametag{$^{1}$}} \Email{jbdel@stanford.edu}\\
\Name{Thomas K{\"u}stner\nametag{$^{2}$}} \Email{thomas.kuestner@med.uni-tuebingen.de}\\
\Name{Bin Yang\nametag{$^{3}$}} \Email{bin.yang@iss.uni-stuttgart.de}\\
\Name{Noel C. F. Codella\nametag{$^{4}$}} \Email{ncodella@microsoft.com}\\
\Name{Matthew P. Lungren\nametag{$^{4}$}} \Email{mlungren@microsoft.com}\\
\Name{Curtis P. Langlotz\nametag{$^{1}$}} \Email{langlotz@stanford.edu}\\
\Name{Sergios Gatidis\nametag{$^{1}$}} \Email{sgatidis@stanford.edu}\\
\addr $^{1}$ Department of Radiology, School of Medicine, Stanford University, CA, USA\\
\addr $^{2}$ Medical Image and Data Analysis, University Hospital of T\"ubingen, Germany\\
\addr $^{3}$ Institute of Signal Processing and System Theory, University of Stuttgart, Germany\\ 
\addr $^{4}$ Microsoft Health and Life Sciences, Redmond, WA, USA
}

\begin{document}


\maketitle

\begin{abstract}
Foundation models (FMs) have shown impressive performance in medical image analysis tasks, but their deployment in real-world clinical settings, especially across diverse patient populations such as adult and pediatric cases, remains challenging. Key open questions include optimal prompting techniques and strategies for model adaptation or fine-tuning for clinical use. In this study, we evaluated different approaches for deploying FMs in clinical scenarios for diverse patient populations. We use the lightweight, embedding-based vision-language FM \textit{MedImageInsight} to predict pneumonia from chest X-rays, a condition common in both adult and pediatric patients.
We observed a large variation in model predictive performance depending on the chosen prompt design, highlighting the importance of text prompt design for successful zero-shot (ZS) application. On in-domain datasets, we found performance differences of up to 46\% in Matthews correlation coefficient (MCC) and 56\% in true positive rates across different text prompts.
By introducing text and vision embedding ensembles, we achieved substantial ZS improvements, outperforming training-based methods (fine-tuning, linear probe) in low-data scenarios by up to 43\% for adults and 35\% for pediatric populations (MCC).  
This ensembling strategy also promotes resource-efficient, equitable clinical use by supporting diverse demographic subgroups, achieving MCC improvements of 6\% by sex, 17\% by age, and 10\% by race compared to linear probe. \end{abstract}

\begin{keywords}
multimodal foundation model, bias, zero-shot, pneumonia, ensembles
\end{keywords}

\section{Introduction}


Foundation Models (FMs) that have been trained on extensive web-based datasets have demonstrated great promise and remarkable generalizability across a variety of tasks in different domains, including natural language processing, computer vision, and text and image generation \cite{brown2020language, radford2021learning}. Similarly, their medical counterparts, trained on domain-specific datasets such as PubMed, electronic health records, and medical imaging, have shown significant potential to advance healthcare applications \cite{zhang2022contrastive,singhal2023large}. 
However, their reliable implementation in clinical settings without further adjustments remains challenging due to the severe consequences of incorrect diagnoses or treatment plans \cite{huang2023visual}.
While an increasing number of FMs are developed using medical data, their clinical application often experiences performance drops on out-of-distribution data, such as in new patient populations (e.g., transitioning from adult to pediatric cases) \cite{chen2024chexagent, zhang2023biomedclip, huang2024multimodal}.

Moreover, since many FMs are trained to derive predictions from vision-language similarities, their effective training-free, zero-shot (ZS) application depends not only on the input image but also on the given text prompt. Determining an optimal text prompt to achieve the best ZS performance in various environments, particularly in new distributions, still poses a major challenge.

A common strategy for applying vision-language FMs in clinical settings relies on adapting and fine-tuning image encoders \cite{chambon2022adapting, hu2021lora}. However, this approach requires additional diverse and labeled data, which is expensive and difficult to acquire. Furthermore, most FMs are based on large transformer models, which require significant computational resources to fine-tune. As many healthcare facilities lack the necessary infrastructure, these approaches are unsuitable for integration into clinical workflows.

Our study aims to address these challenges by identifying effective strategies for the successful clinical application of FMs, focusing on the state-of-the-art, open-source, lightweight, embedding-based vision-language FM, \textit{MedImageInsight} \cite{codella2024medimageinsight}, which showed superior performance and suitability across multiple tasks and domains.
Since \textit{MedImageInsight} was predominantly trained on adult data, this study examines the prediction of pneumonia from chest X-rays in adult (in-domain) and pediatric (out-of-domain) cases using training-free ZS and training-based approaches. Key contributions of our work include:



\begin{itemize}[noitemsep]
    \item \textbf{ZS ensemble for training-free FM deployment:} Enhancing ZS prediction by introducing text and vision ensembles for medical tasks.
    \item \textbf{Analysis of FM adaptation trade-offs:} Comparative analysis of ZS, LoRA fine-tuning, and lightweight adapters (linear probe, k-NN), showing that fine-tuning requires sufficient data to be effective, while linear probing can introduce biases.
    \item \textbf{Multi-site evaluation} using MIMIC-CXR (in-domain, adults, part of training data), CheXpert (external data, adults) and VinDr-PCXR (out-of-domain, pediatric) datasets.
    \item \textbf{Bias assessment} of ZS and training-based methods across sex, age, and race groups.
\end{itemize}
\noindent Related work and limitations are provided in Appendices~\ref{RelatedWork} and~\ref{Limitations}. Code is publicly available\footnote{\url{https://github.com/loufay/Beyond-the-prompt}}.
\section{Methods}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.7\linewidth]{figures/Fig1_Framework.png}
    \caption{Overview of our Beyond-the-prompt pipeline using a zero-shot ensemble method. (top) Text embeddings are averaged over prompt templates and radiology reports and (bottom) compared to image ensembles generated by augmented views. A confidence-score filter is applied to select the most reliable image embeddings. Cosine similarity between text and image ensembles defines predicted classes.} 
    \label{fig:framework}
\end{figure}

We evaluated the open-source FM \textit{MedImageInsight} for pneumonia detection in three different domains by assessing its ZS capability using nine different prompt types and enhanced prompt and vision ensembles (Figure \ref{fig:framework}) as well as by exploring training-based methods.

\textit{MedImageInsight} \cite{codella2024medimageinsight} comprises three parts: an image encoder (360M parameters), a text encoder (250M parameters), and an optional text decoder (70M parameters). We excluded the text decoder, resulting in a lightweight FM with 610M parameters. The model was trained on image-text pairs from 14 modalities, including adult chest X-rays, along with radiology reports from the MIMIC-CXR database \cite{johnson2019mimic}, and image-label pairs from NIH-CXR-LT \cite{holste2022long} and the Mass General Brigham database.

\subsection{Zero-shot Evaluation} \label{sec2.2:ZS}
Embedding-based contrastive vision-language FMs compute the cosine similarity between an image embedding and multiple text embeddings to perform training-free ZS classification.

\subsubsection{Prompt input}

\paragraph{Templates as prompt.} 
A common approach to constructing a prompt in ZS classification combines a \textit{text-prompt template} with a placeholder,
$\{class\}$, that represents the class name (here: \textit{No Finding} or \textit{Pneumonia}).
The image is assigned to the class of the closest text prompt in the embedding space.
We evaluated the FM's performance in predicting pneumonia using the following text prompts: 1) $\{class\}$, 2) $\textit{chest}+$\{class\}, \linebreak 3) $\textit{X-ray}+$\{class\}, 4) $\textit{chest X-ray}+$\{class\}, 5) \textit{X-ray chest anteroposterior}+\{class\}.

\paragraph{Radiology reports as prompt.}
Since \textit{MedImageInsight} is trained on radiology reports, we additionally evaluate our model by generating text embeddings using textual information extracted from the following parts of the radiology reports: 6) findings section, 7) impression section, 8) both, findings and impression sections, or 9) full radiology reports. We randomly sample ten reports from each class in the training set and compute the distance between the text embeddings generated from these reports and an image embedding from the test set. The predicted class is determined based on the majority of the five closest text embeddings.

\paragraph{Template and radiology report ensembles as prompt.}
As introduced by \cite{radford2021learning}, generating averaged text embeddings can enhance ZS results and reduce computational complexity, as a general text ensemble is created once and reused during inference. An averaged text embedding $\overline{t_c} = \frac{1}{P}\sum^P_{p=1}t_{cp}$ is computed for each class $c$ using all $P$ prompt embeddings $t$.
Most medical FMs are trained on template prompts and radiology reports. Hence, merely averaging the template prompts may not be sufficient. Therefore, we propose an extension by generating: 10) averaged template-based embeddings using the templates (1-5), 11) report-based embeddings using ten reports per class of each report type (6-9), and 12) embeddings that incorporate both template- and report-embeddings (1-9).


\subsubsection{Vision input}

\paragraph{Original X-ray as vision input.}
The most common approach to generate an input embedding is to use the original image, which, in our case, represents a chest X-ray.

\paragraph{Vision ensemble as vision input.}
By augmenting an input image $B$ times and creating $B$ image embeddings, a single, representative embedding can be generated by averaging these $B$ embeddings.
This embedding is compared to a given text embedding \cite{shu2022test, dobler2024lost}. This method aims to enhance the robustness and diversity of the image embeddings, potentially improving the alignment with text representations.
We chose $B=64$ \cite{shu2022test} and applied a random selection of the following augmentation techniques: random rotation within a range of $\pm10^\circ$, random affine transformations with translation up to 10\% of the image dimensions, color jittering (brightness=0.2, contrast=0.2), Gaussian blurring (kernel size = 5), and automatic contrast enhancement.

\paragraph{Confidence score filtered (CF) vision ensemble as vision input.}
In addition to averaging all augmented views, we implement a CF method as in \cite{shu2022test}. This approach identifies the $N$ most confident augmented images using entropy-based confidence filtering by determining the 10\% of samples with the lowest entropy.


\begin{figure}
    \centering
    \includegraphics[width=0.49\linewidth]{figures/Fig2_left_image_examples.png}
    \hfill
    \includegraphics[width=0.49\linewidth]{figures/Fig2_right_embedding_distribution.png}
    \caption{(left) Chest X-ray examples of ``No Finding'' and ``Pneumonia'' cases across adult (MIMIC, CheXpert) and pediatric (VinDr-PCXR) domains. (right) Comparison of X-ray embeddings of \textit{MedImageInsight} across the three domains using t-SNE. Embeddings of the test set, separated by class, are provided in Appendix \ref{Extended Dataset}.}
    \label{fig:dataset}
\end{figure}

\subsection{Training-based adaptation} \label{sec2.3:Train}
To compare against the training-free ZS methods, we investigated training-based adaptation strategies that leverage the image encoder, either by adding lightweight adapters or by fine-tuning the image encoder itself.
Unless stated otherwise, we ran each experiment five times with randomly sampled training data and used the cross-entropy loss and the AdamW optimizer \cite{loshchilov2017decoupled} with a learning rate of $3\times10^{-4}$. If validation performance did not improve for five epochs, the learning rate was reduced by a factor of 10.
\paragraph{Lightweight adapter training.}
In adapter training, the image encoder remains frozen while the image embeddings are further processed using subsequent lightweight adapter heads. We applied Linear Probing ($R^{1024\times2}$) and $k$-nearest neighbor (kNN) ($k=5$).

\paragraph{Fine-tuning using Low-Rank Adaptation (LoRA).}
The parameters of the image encoder are adapted using LoRA \cite{hu2021lora}, which modifies the parameters with updates of low intrinsic rank (rank $r=8$).

Hyperparameters $k$ and $r$ were empirically selected (Appendix \ref{Extended Results}, Tables \ref{tab:knn_hyperparameter} and \ref{tab:lora_hyperparameter}).
\paragraph{Baseline Models.} \label{sec2.4}
We used four baseline models, each of a different model type.
(1) DenseNet-121 \cite{huang2017densely}, a simple CNN with 7M parameters, which is 98\% smaller than the image encoder of \textit{MedImageInsight} but has shown competitive results in various medical applications \cite{singh2024efficient}. (2) CheXagent \cite{chen2024chexagent}, an instruction-tuned FM (8B parameters) trained exclusively on chest X-rays to generate free text. The training of CheXagent included all three datasets used in this study. (3) \textit{RAD-DINO} \cite{perez2024rad}, an image encoder (86.6M parameters) that generates embeddings of size 768. To enable classification, we trained a linear head ($R^{768\times2}$; Linear Probing). This model was trained only on medical data, including the MIMIC and CheXpert datasets. (4) BiomedCLIP \cite{zhang2023biomedclip} (0.09B parameters), a contrastive vision-language model trained on PubMed data. Its architecture supports the direct application of our zero-shot (ZS) methods.




\subsection{Datasets} \label{sec2.5}
\begin{figure}
    \centering 
    \includegraphics[width=0.49\linewidth]{figures/Fig3_left_Best_MCC_ZS_FT.png}
    \hfill
    \includegraphics[width=0.49\linewidth]{figures/Fig3_right_Bias.png}
    \caption{(left) Mean MCC for ZS experiments across different prompt types: (1-5) Template, (6-9) Report, and most effective Ensemble using (12) Template and Report (MIMIC, CheXpert); (10) Templates (VinDr-PCXR) compared to fine-tuning. (right) Bias Assessment: Absolute mean MCC difference across subgroups: sex (Male, Female), age ($\leq$62,$>$62 years), race (Asian, Black, White).}
    \label{fig:best_mcc}
\end{figure}


We used three publicly available chest X-ray datasets to evaluate pneumonia prediction in three different environments. Figure \ref{fig:dataset} (left) shows representative examples for each dataset. To ensure consistency and fairness during training under varying amounts of training data, we balanced all training datasets using 744 samples. We also balanced all test datasets.

\begin{itemize}[noitemsep]
   \item \textbf{MIMIC-CXR (in-domain, adults)} \cite{johnson2019mimic} was part of the FM training. Our test set included 8,186 X-rays with labels and radiology reports.
    \item \textbf{CheXpert (external validation, adults)} \cite{irvin2019chexpert} was not part of the training set for \textit{MedImageInsight}, but comprises adult subjects, similar to MIMIC. Our balanced test set contains 2,508 samples. We also generated balanced test sets for the demographic subgroups: sex (male/female; 1,151 samples per group), age (young: $\leq62$ years/old: $>62$ years; 1,128 samples per group), and race (White/Asian/Black; 171 samples per group) to assess performance differences across subgroups.
    \item \textbf{VinDr-PCXR (new domain, pediatrics)} \cite{pham2022vindr} represents a new domain, as it exclusively contains pediatric cases, which were not part of the training of \textit{MedImageInsight}. The balanced test set contains 178 samples.
    \end{itemize}


\section{Results and Discussion}


\subsection{Zero-shot Evaluation}
As highlighted in Figure \ref{fig:best_mcc} (left), ZS performance of \textit{MedImageInsight} highly depends on the specific text prompt to which a given X-ray is compared. A detailed evaluation of all prompt types and metrics is depicted in Table \ref{tab:results}. Similar ZS behavior was detected for the baseline FM \textit{BiomedCLIP} (Appendix \ref{Extended Results}, Table \ref{tab:biomedclip}).

\paragraph{Template as prompt.}
Among our five types of prompt templates, we obtained the highest accuracy (Acc) on all adult and pediatric datasets by using the prompt template (2) \textit{chest}+\{class\}. Most other templates resulted in either TPR or TNR below $50\%$. 

\paragraph{Radiology reports as prompt.}
Comparing the image embeddings to parts of the radiology reports revealed notable performance boosts for the adult datasets. MIMIC performed best using (6) \textit{Findings} as prompt ($\mathrm{Acc}=78.0\%$, $\mathrm{MCC}=0.57$). CheXpert performed slightly better using (7) \textit{Impression} ($\mathrm{Acc}=80.5\%$, $\mathrm{MCC}=0.62$).
While MIMIC and CheXpert achieved comparable results using either (6) \textit{Findings} or (7) \textit{Impression}, operating on full reports led to performance drops of up to $7\%$ in Acc.
In contrast, when comparing pediatric cases (VinDr-PCXR) to any part of MIMIC reports, TPRs were consistently $<20\%$.

\begin{table}[t] 
\centering
\caption{ZS performance for different text and vision prompts across adult (MIMIC, CheXpert), and pediatric (VinDr-PCXR) datasets. (Acc, TNR, TPR in [\%].)}
\resizebox{1\textwidth}{!}{%
\begin{tabular}{c|c|l|c|cccc|cccc|cccc}
\hline
\multicolumn{3}{c|}{\textbf{Prompt}}&\textbf{Vision}& \multicolumn{4}{c|}{\textbf{MIMIC}} & \multicolumn{4}{c|}{\textbf{CheXpert}} & \multicolumn{4}{c}{\textbf{VinDr-PCXR}} \\
\multicolumn{3}{c|}{\textbf{}}&\textbf{Ens.}$^+$ & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC}\\
\hline
 \multirow{5}{*}{\rotatebox{90}{\textbf{Template}}}& 1&&\multirow{4}{*}{-}& 59.9 & 98.5 & 21.4 & 0.31& 62.9 & 99.3 & 26.6 & 0.38& 60.1 & 80.9 & 39.3 & 0.22\\
&2&\textit{chest}&& 74.8 &  88.9 &  60.7 &  0.52&  76.4&  97.2&  55.6&  0.58& 69.1 &  85.4 &  52.8 &  0.40\\
&3&\textit{X-ray}&& 67.8 & 96.5 & 39.1 & 0.43& 65.5& 99.1 & 31.8 & 0.42&68.0 & 94.4 & 41.6 & 0.42\\
&4&\textit{x. c.*}&& 61.3 & 98.4 & 24.1 & 0.34& 66.2 & 99.0& 33.5 & 0.43&61.8 & 100 & 23.6 & 0.37\\
&5&\textit{x. c. a.*}&&51.2 & 3.0 & 99.7 & 0.1& 70.3 & 84.7& 55.8 & 0.42&50.0 & 0 & 100 & 0\\

\hline
\multirow{4}{*}{\rotatebox{90}{\textbf{Report}}}& 6& Findings (F)& \multirow{4}{*}{-}& 78.0&  83.8&  72.2&   0.57 &80.1&87.9&72.4&0.61&58.0& 98.7& 17.2&0.27\\
&7&Impression (I)&& 77.7 & 83.7 & 71.8 &  0.56& 80.5& 88.0& 72.9& 0.62&  58.1& 98.9 & 17.3&0.28\\
&8&F+I&& 74.6 & 84.7 & 64.5 & 0.50 &73.6&90.3&56.9&0.50&53.3& 100 & 7.0 & 0.19\\
&9&Full Report && 74.7 & 84.8 & 64.5 & 0.50&   73.5&90.3&56.6&0.50& 53.9& 100& 8.0 & 0.2\\
\hline \hline
\multirow{9}{*}{\rotatebox{90}{\textbf{Prompt Ensemble}}}&10&Templates&\multirow{3}{*}{-}& 64.2 & 97.5 & 30.8 & 0.38 & 73.7 & 98.3 & 48.5 & 0.54 &  63.5 &  66.9 &  60.1 &  0.27\\
&11&Reports&& 74.4 & 71.2 & 77.6 & 0.49 &  82.4 &  85.4 &  79.5 &  0.65 & 55.9 & 25.8& 86.0 & 0.15 \\
&12&Both&& 76.3 &  80.4 &  72.2 & 0.53 & 82.8 & 89.0 & 76.6 & 0.66 & 56.2 & 28.1 & 84.3 & 0.15 \\ \cline{2-16}
&10&Templates &All& 64.3 & 97.6 &31.0 & 0.38 & 70.1 & 99.2 & 40.1 & 0.49 &  66.9 &  77.5 &  56.2 &  0.35 \\
&11&Reports &All&  75.0 &  73.2 &  76.7 &  0.5 &  82.9 &  83.8 &  82.0 &  0.66 & 53.9 & 18.0 & 90.0 & 0.11 \\
&12&Both &All& 76.5 & 81.8 & 71.3 & 0.53 & 83.7 & 89.0 & 78.3 & 0.68 & 54.8 & 21.9 & 87.6 & 0.12 \\  \cline{2-16}
&10&Templates&CF$^{++}$& 65.5 & 97.3 & 33.6 & 0.40 & 71.5 & 99.0 & 44.1 & 0.51 &  66.3 &  74.2 &  58.4 &  0.33 \\
&11&Reports&CF$^{++}$&  75.1 &  74.3 & 76.0 &  0.50 &  82.7 &  83.9&  81.6 &  0.65& 55.6 & 18.0 & 93.2 & 0.17 \\
&12&Both&CF$^{++}$& 76.8 & 82.3& 71.1 & 0.54 &83.4 & 88.8 & 78.0 & 0.67 & 54.5 & 20.2 & 88.8 & 0.12 \\ \hline
\multicolumn{15}{l}{*x.=X-ray, c.=chest, a.=anteroposterior; $^+$Ens.= Ensemble; $^{++}$CF=confidence-score filter}\\

\end{tabular}
} \label{tab:results}
\end{table}




\paragraph{Prompt Ensembles.}
Using prompt ensembles further improved ZS performance.
For adult datasets, using prompt ensembles of (11) \textit{Reports} or (12) \textit{Templates + Reports} was most effective. 
Although, for MIMIC, using the simple (6) \textit{Findings} prompt yielded slightly higher accuracy and MCC, using prompt ensembles enhanced the TPR by up to $5.4\%$ while maintaining a strong TNR.
On VinDr-PCXR, significant improvements were observed using (10) \textit{Template} ensembles, resulting in a TPR increase of 7.3\% compared to the best valid TPR of (2) \textit{chest}+$\{class\}$. 

\paragraph{Prompt and Vision Ensembles.} 
By additionally generating vision ensembles with augmented views, slight improvements in accuracy were achieved for MIMIC and CheXpert. Specifically, for the new adult domain, CheXpert, TPR was further improved by up to 2.5\%.
For the pediatric dataset, TPR slightly dropped when using vision ensembles.



\paragraph{Discussion.}

We found that using prompt ensembles is highly valuable and improves performance when applying FMs in ZS settings. Overall, we achieved the best performance on CheXpert, followed by MIMIC and the pediatric dataset VinDr-PCXR.
Our results showed that if a given X-ray image belongs to a distribution similar to the training distribution (i.e., MIMIC and CheXpert; Figure \ref{fig:dataset}), prompt ensembles that include radiology reports enhance ZS performance. In this case, using vision ensembles further improved performance as more variability was added, better reflecting the known distribution.
For new domains (e.g., VinDr-PCXR), where X-ray embeddings deviated from the learned distribution (Figure \ref{fig:dataset}), and X-rays do not align with known radiology reports of adults, results were more reliable when using \textit{Template} ensembles without report information.
Similarly, image ensembles did not improve the results, as the augmented views failed to align with the known distribution. To further assess the impact of our ensembling strategy, we additionally evaluated it on the FM \textit{BiomedCLIP}. The results confirmed its effectiveness on adult datasets, while performance remained consistently low on pediatric cases in all ZS scenarios.



\subsection{Comparing training-based methods to ZS ensemble method} \label{Subsec:Comparing_training_based_methods}
In Figure \ref{fig:training_comparison}, we compare the MCC of the best ZS ensemble method of \textit{MedImageInsight} against our baselines (\textit{DenseNet, CheXagent, RAD-DINO, BiomedCLIP}) and all training-based methods across all three datasets. Appendix \ref{Extended Results}, Figure \ref{app_fig:training_comparison} and Table \ref{tab:comparison} provide a detailed comparison of all metrics for the baselines and training-based methods. Table \ref{tab:costs} presents a quantitative analysis of the associated computational costs.

\paragraph{MIMIC.} Among \textit{MedImageInsight} methods, ZS ensembling performed best in low-data regimes (1\% training data). However, \textit{CheXagent}, trained on MIMIC, achieved the highest overall performance. BiomedCLIP and DenseNet consistently underperformed compared to ZS ensembling. Although \textit{RAD-DINO} was pre-trained on MIMIC, it required at least 50\% of training data to surpass our ZS approach.  



\paragraph{CheXpert.} Besides \textit{CheXagent} (pre-trained on CheXpert), which is only outperformed by linear probing with 100\% training data, ZS ensemble performed best when less than 10\% of training data was available. With 50\% data, linear probing exceeded ZS by 2\% in MCC, while LoRA required 80\% to surpass it by just 1\%. \textit{RAD-DINO}, despite being pre-trained on CheXpert, needed 100\% data to outperform ZS ensemble. DenseNet, BiomedCLIP, and kNN performed worse than ZS ensemble regardless of training data availability.


\paragraph{VinDr-PCXR.} 
While \textit{CheXagent} and linear probing with 1\% of training data achieved the highest MCC, their TPR remained $<50\%$, indicating that they failed to reliably predict pneumonia. Hence, with only 1\% training data, the ZS ensemble method still performed better than other methods. In general, LoRA fine-tuning achieved the highest Acc (86.6\%) using 50\% of the training data.


\paragraph{Discussion.}
For \textit{MedImageInsight}, in low data regimes, the training-free ZS ensemble method led to best performance in all domains. It remained competitive even as training data increased, especially in adult datasets. With $>$10\% annotated training data, linear probing improved performance, highlighting that \textit{MedImageInsight} effectively captures clinically meaningful features. 
Notably, linear probing on \textit{MedImageInsight} outperformed that of \textit{RAD-DINO}, despite \textit{RAD-DINO} being pre-trained on both MIMIC and CheXpert.
On adult data, fine-tuning the image encoder caused catastrophic forgetting in low-data regimes and yielded only marginal improvements over ZS ensembling with more training data. In contrast, for pediatric cases, which are from an entirely new domain, fine-tuning with $\geq50\%$ of training data captured the distribution shift from adults to pediatrics and outperformed other methods. The baselines \textit{CheXagent} and \textit{BiomedCLIP} achieved TPR $<50\%$ on VinDr-PCXR. Only the baseline DenseNet trained with $>80\%$ of training data achieved a TPR and TNR $>50\%$ and exceeded ZS ensemble performance in MCC. However, LoRA fine-tuning with $50\%$ training data performed best overall on VinDr-PCXR.
These findings highlight the robustness of \textit{MedImageInsight} and the effectiveness of ZS ensemble method in low-data scenarios. 
If more than 10\% (MIMIC, VinDr-PCXR) or 50\% (CheXpert) of training data is available, along with sufficient computing resources, linear probing can provide slight improvements over ZS ensemble. However, in clinical settings, computational constraints often limit training-based adaptation. Only on VinDr-PCXR does LoRA fine-tuning remain the best option when training is feasible.
Table \ref{tab:costs} in Appendix \ref{Extended Results} compares the computational costs of ZS and training-based methods and DenseNet-121. ZS methods require no training time or labeled data, making them practical for resource-limited settings. Notably, ZS prompt ensembling is even faster than standard ZS, as embeddings are computed once from MIMIC and reused across experiments. As expected, training-based methods require labeled data and significantly more computational resources.



\subsection{Bias Assessment}
Figure \ref{fig:best_mcc} (right) illustrates the absolute MCC differences across the demographic subgroups, sex, age, and race, in the CheXpert dataset. All evaluated methods exhibited varying levels of bias, reflected in performance differences across subgroups. ZS ensembling demonstrated a notable bias reduction compared to linear probing across all variables.
For linear probing, differences ranged from 10\% for sex and 14\% for race up to 24\% for age. In contrast, the ZS ensemble method achieved considerably lower bias levels, with the highest observed difference of 10\% for race, while bias for sex and age remained below 5\%.
Although the ZS ensemble method demonstrated improved fairness compared to linear probing, methods such as LoRA and kNN exhibited even smaller differences across all variables.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/Fig4_MCC_Train_data.png}
    \caption{Comparison of the most effective ZS (Ensemble) method ((12) Template and Report (MIMIC, CheXpert); (10) Templates (VinDr-PCXR)), training-based methods, and baseline models across (left) MIMIC, (middle) CheXpert, and (right) VinDr-PCXR using varying amounts of training data. ZS (Ensemble) shows competitive performance, especially in low-data regimes. Only the baseline model \textit{CheXagent}, which was trained on all three datasets, consistently outperforms ZS (Ensemble).}
    \label{fig:training_comparison}
\end{figure}
\section{Conclusion}
In this work, we evaluated strategies for an effective application of a state-of-the-art vision-language FM in clinical settings based on pneumonia prediction across diverse populations. By applying text and vision ensembles for ZS prediction, we achieved up to 43\% improvement in MCC in adults and 35\% in pediatrics compared to training-based methods in low-data scenarios. Moreover, compared to linear probing, ZS ensembling reduced biases related to sex, age, and race by 6\%--17\% (MCC).
These findings demonstrate the potential of ZS ensembles as a resource-efficient alternative to training-based adaptation methods, especially in low-data and computationally constrained environments. In scenarios where data and computational resources are abundant, training-based methods such as linear probing or fine-tuning with LoRA are preferable for optimal performance. This work contributes to the equitable and accessible integration of FMs into clinical workflows, supporting diverse patient populations.



\clearpage  
\midlacknowledgments{
LF is funded by the Global Glimpse Program of the University of Stuttgart, which is supported by the Deutsche Forschungsgemeinschaft (DFG) as part of the Excellence Strategy of the Federal and State Governments, as well as by the Carl Duisberg Fellowship of the Bayer Foundation.}


\bibliography{midl25_78.bib}

\newpage
\appendix
\section{Related Work} \label{RelatedWork}
The applied FM, \textit{MedImageInsight}, builds upon large-scale contrastive multimodal pre-training, which spans 14 different domains, including chest X-rays. Compared to other FMs such as \cite{zhang2023biomedclip, hyland2023maira, bannur2024maira}, \textit{MedImageInsight} is trained on images, text, and labels, enabling adaptation to diverse distributions, such as adult and pediatric cases. While some Large Language Models (LLMs), like Med-Gemini \cite{saab2024capabilities} or Med-PaLM-M \cite{tu2024towards}, are trained on text and labels, they are more than 10 times larger than \textit{MedImageInsight}.

Foundation models trained in a contrastive manner generalize well on zero-shot (ZS) tasks by aligning image and text embeddings without task-specific training. However, the success of ZS performance depends on the quality of the text prompt. Radford et al. \cite{radford2021learning} highlighted the sensitivity of ZS performance to text prompts in general natural tasks and, therefore, introduced the idea of text prompt ensembles using up to 80 different templates and averaging them over the embedding space. They demonstrated improvements of almost 5\% on the natural image dataset ImageNet \cite{deng2009imagenet}.

To this end, Shu et al. \cite{shu2022test} introduced Test-time Prompt Tuning (TPT), which generates image embedding ensembles on the fly based on multiple augmented versions of one input image. To exclude noisy augmentations, they added a confidence-based filter. Döbler et al. \cite{dobler2024lost} combined both approaches, creating text ensembles from templates and vision ensembles from augmented images, and tested them on general domains. However, their effectiveness in medical contexts, particularly for diverse patient populations, remains unexplored.

The application of FMs in clinical studies has revealed significant biases in their feature embeddings \cite{glocker2023risk, santomartino2024evaluating}. Glocker et al. \cite{glocker2023risk} found that FMs often encode demographic factors, which might lead to performance differences across subgroups. Mitigation strategies include adversarial training \cite{ganin2016domain}, fairness-aware loss functions \cite{zafar2017fairness}, or the reduction of shortcut learning \cite{fay2023avoiding}. In medical tasks, such biases are particularly concerning and need to be addressed to provide fair healthcare \cite{larrazabal2020gender}.

Pneumonia detection from chest X-rays was studied for various model architectures in \cite{singh2024efficient}. They presented that different types of convolutional-based models, such as VGGs \cite{simonyan2015a}, ResNets \cite{he2016deep}, InceptionV3 \cite{szegedy2016rethinking}, and DenseNets \cite{huang2017densely}, perform worse than Vision Transformers (ViTs) \cite{alexey2020image}. However, in comparison, a Vision Transformer (ViT) has more than 85B trainable parameters, while the convolutional-based models have fewer than 200M. Likewise, the image encoder of \textit{MedImageInsight} operates on 360M trainable parameters, offering a balance between efficiency and accuracy.


\section{Limitations} \label{Limitations}
While in our study, we evaluated a variety of text prompts and ensemble strategies, it is possible that more effective templates or more complex prompt designs might further improve performance. Especially, the lack of pediatric-specific radiology reports likely constrained the performance on pediatric cases. Incorporating such domain-specific reports might yield better alignment and performance in pediatric populations.
Additionally, while our study demonstrates promising approaches for deploying embedding-based vision-language FMs in clinical settings, this study does not explore other FMs on a large scale besides the three baseline FMs, \textit{CheXagent}, \textit{RAD-DINO}, and \textit{BiomedCLIP}.
We limited our work to the prediction of pneumonia from chest X-rays, as this condition appears in both adult and pediatric patients. However, exploring further diseases as well as modalities could provide broader insights and reliability regarding the application of FMs in clinical settings and is a crucial next step of our work.
Although our work explores the FM in three different environments, this may not fully reflect the heterogeneity of real-world clinical populations.
In our upcoming work, we aim to address these limitations, especially by exploring more complex prompting strategies, as well as different domains and modalities, to enable a reliable and fair application of FMs in clinical workflows.
Moreover, our future work will analyze misclassified cases, investigate failure patterns, and compare errors across different methods to provide further insights into model behavior and reliability.
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.4\textwidth]{figures/Fig5_Appendix_All_embeddings_test.png}
    \vspace{0.5cm} 
    \includegraphics[width=0.8\textwidth]{figures/Fig5_Appendix_Train_embeddings_test.png}
    \includegraphics[width=0.8\textwidth]{figures/Fig5_Appendix_Test_embeddings_test.png}
    \caption{(Top) Comparison of X-ray embeddings of \textit{MedImageInsight} across the test datasets of MIMIC (adults), CheXpert (adults), VinDr-PCXR (pediatrics) using t-SNE. (Middle) Comparison of the X-ray embeddings distribution of \textit{No Finding} and \textit{Pneumonia} across the training datasets of MIMIC (left), CheXpert (middle), VinDr-PCXR (right). (Bottom) Comparison for all test datasets.}
    \label{app_fig:dataset}
\end{figure}

\section{Extended Dataset} \label{Extended Dataset}

In Figure \ref{app_fig:dataset} (top), we present the image embeddings of our balanced test datasets, in addition to those shown in Figure \ref{fig:dataset} (right). Furthermore, Figure \ref{app_fig:dataset} (middle/bottom) displays t-SNE plots of the training and test distributions, separated by the investigated classes \textit{No Finding} and \textit{Pneumonia}, across the adult datasets, MIMIC (left) and CheXpert (middle), as well as the pediatric dataset, VinDr-PCXR (right).


\section{Extended Results} \label{Extended Results}
\subsection{Evaluation Metrics}
In our experiments, all test datasets are balanced to ensure fair evaluation of pneumonia prediction. We evaluate the performance focusing on accuracy (Acc), true-negative rate (TNR), true-positive rate (TPR), and Matthews correlation coefficient (MCC).


\subsection{Zero-shot Evaluation of FM \textit{BiomedCLIP}}

To additionally evaluate the ZS ensembling strategy, we performed all ZS experiments on the embedding-based vision-language FM \textit{BiomedCLIP} and provide the results in Table \ref{tab:biomedclip}.



\begin{table} 
\centering
\caption{ZS performance of BiomedCLIP model for different text and vision prompts across adult (MIMIC, CheXpert), and pediatric (VinDr-PCXR) datasets (Acc, TNR, TPR in [\%]).}
\resizebox{1\textwidth}{!}{%
\begin{tabular}{c|c|l|c|cccc|cccc|cccc}
\hline
\multicolumn{3}{c|}{\textbf{Prompt}}&\textbf{Vision}& \multicolumn{4}{c|}{\textbf{MIMIC}} & \multicolumn{4}{c|}{\textbf{CheXpert}} & \multicolumn{4}{c}{\textbf{VinDr-PCXR}} \\
\multicolumn{3}{c|}{\textbf{}}&\textbf{Ens.}$^+$ & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC}\\
\hline
 \multirow{5}{*}{\rotatebox{90}{\textbf{Template}}}& 1&&\multirow{5}{*}{-}& 51.4&28.8&74.0&0.03&48.4&12.0&84.7&-0.05&48.3&94.4&2.2&-0.09\\
&2&\textit{chest}&&38.0&29.1&47.0&-0.24&32.1&17.0&47.2&-0.38&38.2&67.4&9.0&-0.29\\
&3&\textit{X-ray}&&43.0&44.4&41.9&-0.14&37.6&33.2&42.0&-0.24&49.4&94.4&4.5&-0.03\\
&4&\textit{x. c.*}&&40.4&29.1&51.7&-0.2&31.8&28.4&35.2&-0.37&34.3&28.1&40.4&-0.32\\
&5&\textit{x. c. a.*}&&40.2&19.4&60.7&-0.22&31.2&22.9&39.5&-0.38&45.5&3.4&87.7&-0.17\\

\hline
\multirow{4}{*}{\rotatebox{90}{\textbf{Report}}}& 6& Findings (F)& \multirow{4}{*}{-}&48.5&67.6&29.4&-0.03&48.8&73.9&23.6&-0.03&50.6&98.9&2.2&0.04\\
&7&Impression (I)&&59.5&94.5&24.5&0.26&61.4&91.8&31.0&0.29&53.4&94.4&12.4&0.12\\
&8&F+I&&55.0&41.2&68.8&0.1&53.8&40.2&67.3&0.08&55.1&97.8&12.4&0.19\\
&9&Full Report &&52.7&60.3&45.1&0.05&47.9&70.5&25.4&-0.05&39.3&48.3&30.3&-0.22\\
\hline \hline
\multirow{9}{*}{\rotatebox{90}{\textbf{Prompt Ensemble}}}&10&Templates&\multirow{3}{*}{-}&69.5&79.9&59.1&0.4&73.9&90.9&56.9&0.51&52.2&100&4.0&0.15\\
&11&Reports&&61.7&64.6&58.8&0.23&69.0&82.7&55.3&0.4&53.4&100&6.7&0.19\\
&12&Both&&64.9&61.9&67.9&0.3&72.5&81.3&63.7&0.46&53.4&100&6.7&0.19\\ \cline{2-16}
&10&Templates &All&70.1&80.7&59.6&0.41&74.6&92.1&57.1&0.52&52.2&100&4.4&0.15 \\
&11&Reports &All&62.2&66.5&57.8&0.24&70.5&86.8&54.3&0.43&53.4&100&6.7&0.19\\
&12&Both &All&65.7&63.1&68.2&0.31&74.4&84.8&64.0&0.5&52.8&100&5.6&0.17\\  \cline{2-16}
&10&Templates&CF$^{++}$&70.0&80.6&59.4&0.41&74.6&92.1&57.1&0.52&52.2&100&4.5&0.15 \\
&11&Reports&CF$^{++}$&62.3&66.7&57.8&0.25&70.4&86.4&54.4&0.43&52.8&100&5.6&0.17\\
&12&Both&CF$^{++}$&65.7&63.3&68.1&0.31&74.3&85.0&63.6&0.5&52.8&100&5.6&0.17 \\ \hline
\multicolumn{15}{l}{*x.=X-ray, c.=chest, a.=anteroposterior; $^+$Ens.= Ensemble; $^{++}$CF=confidence-score filter}\\

\end{tabular}
} \label{tab:biomedclip}
\end{table}

\newpage

\subsection{Hyperparameter optimization}

For our training-based methods k-NN and LoRA, we investigated a suitable number of k-neighbors as well as rank r for LoRA fine-tuning. We provide the results of our ablation study in Table \ref{tab:knn_hyperparameter} and Table \ref{tab:lora_hyperparameter}, respectively. Based on this ablation study, we chose $k=5$ and $r=8$.

\begin{table}[h]
\begin{center}
\caption{Hyperparameter optimization: Comparison of KNN performance (accuracy in [\%]) with different values of \( k \) across MIMIC, CheXpert, and VinDr datasets. We chose \( k=5 \) as it achieves best accuracy in most scenarios.}
\small
    \begin{tabular}{c|ccc|ccc|ccc}
        \hline
        \textbf{Training Data} & \multicolumn{3}{c|}{\textbf{MIMIC}} &  \multicolumn{3}{c|}{\textbf{CheXpert}}  & \multicolumn{3}{c}{\textbf{VinDr}} \\
        &\( k=5 \) & \( k=10 \) & \( k=100 \)  & \( k=5 \) & \( k=10 \) & \( k=100 \) &  \( k=5 \) & \( k=10 \) & \( k=100 \) \\
        \hline
        0.01  & 67.30 & - & -  &  76.65 & - & - &  56.75 & - & - \\
        \hline
        0.1  & 75.70 & 75.05 & - &  81.10 & 80.25 & - & 70.25 & 67.40 & - \\
        \hline
        0.5 & 76.95 & 77.50 & 77.80 &   82.60 & 82.30 & 82.00 & 73.60 & 71.35 & 67.40 \\
        \hline
        0.8 &  76.85 & 78.30 & 77.90 &   83.00 & 82.70 & 82.15 &  74.70 & 75.20 & 69.65 \\
        \hline
        1.0 &  77.20 & 78.75 & 77.80 &  83.90 & 83.30 & 82.60 &   76.95 & 75.25 & 70.25 \\
        \hline
\end{tabular}
\label{tab:knn_hyperparameter}
\end{center}
\end{table}

\begin{table}[h]
\begin{center}
\caption{Hyperparameter optimization: Comparison of LoRA performance (accuracy in [\%]) with different rank values \( r \) across MIMIC, CheXpert, and VinDr datasets. We chose \( r=8 \) as it achieves the best accuracy in most scenarios.} 
\small
    \begin{tabular}{c|ccc|ccc|ccc}
        \hline
         & \multicolumn{3}{c|}{\textbf{MIMIC}} &  \multicolumn{3}{c|}{\textbf{CheXpert}}  & \multicolumn{3}{c}{\textbf{VinDr}} \\
        Training data & \( r=8 \) & \( r=16 \) & \( r=32 \)  & \( r=8 \) & \( r=16 \) & \( r=32 \) &  \( r=8 \) & \( r=16 \) & \( r=32 \) \\
        \hline
         0.01& 69.95 & 60.80 & 56.30  &  56.10 & 67.35 & 68.10 &  50.00 & 50.00 & 64.00 \\
        \hline
        0.1& 75.45 & 74.30 & 76.40  &  80.70 & 80.10 & 79.10 & 71.35 & 73.60 & 74.20 \\
        \hline
        0.5  & 77.85 & 77.70 & 76.90  &  83.30 & 79.74 & 83.10 & 80.90 & 79.20 & 80.30 \\
        \hline
        0.8  & 77.90 & 78.50 & 77.70  &  84.50 & 84.95 & 84.60 & 79.80 & 80.30 & 79.20 \\
        \hline
        1.0  & 77.95 & 79.15 & 78.70  &  84.70 & 83.60 & 84.20 & 80.90 & 79.80 & 82.60 \\
        \hline
\end{tabular}
\label{tab:lora_hyperparameter}
\end{center}
\end{table}


\subsection{Results of Baselines and Training-based Methods}

In Figure \ref{app_fig:training_comparison} and Table \ref{tab:comparison}, we provide additional results of the experiment shown in Section \ref{Subsec:Comparing_training_based_methods}. The figure shows Acc, TNR, and TPR of our baselines \textit{CheXagent}, \textit{RAD-DINO}, and DenseNet-121 as well as the training-based methods explored on \textit{MedImageInsight} using a varying amount of training data compared to our ZS (Ensemble) Method. We did not retrain CheXagent as it is already pre-trained on all three datasets.




\begin{figure}[ht]
    \centering
    \includegraphics[width=1\textwidth]{figures/Fig6_Appendix_Acc.png}
    \vspace{0.5cm} 
    \includegraphics[width=1\textwidth]{figures/Fig6_Appendix_TNR.png}
    \vspace{0.5cm} 
    \includegraphics[width=1\textwidth]{figures/Fig6_Appendix_TPR.png}
    \caption{Comparison of Acc, TNR, TPR of ZS ensemble and baselines as well as training-based method across MIMIC, CheXpert, and VinDr-PCXR dataset.}
    \label{app_fig:training_comparison}
\end{figure}

\begin{table}[t] 
\centering
\caption{Comparison of Acc, TNR, TPR of ZS ensemble and baselines as well as training-based method across MIMIC, CheXpert, and VinDr-PCXR dataset.}
\resizebox{1\textwidth}{!}{%
\begin{tabular}{c|c|cccc|cccc|cccc}
\hline
\textbf{Method} & \textbf{Percentage} & \multicolumn{4}{c|}{\textbf{MIMIC}} & \multicolumn{4}{c|}{\textbf{CheXpert}} & \multicolumn{4}{c}{\textbf{VinDr-PCXR}} \\
 & & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} & \textbf{Acc} & \textbf{TNR} & \textbf{TPR} & \textbf{MCC} \\
\hline
\textit{CheXagent}&&80.8&84.7&77.0&0.62&84.9&95.7&74.1&0.72&69.7&92.1&47.2&0.44\\ \hline
\textit{RAD-DINO}&0&50.2&51.3&49.1&0.0&50.3&48.7&52.0&0.0&56.7&86.5&27.0&0.17\\
&0.01&67.9&67.5&68.3&0.36&70.7&80.1&61.3&0.42&61.2&82.0&40.4&0.25\\
&0.1&75.2&75.1&75.4&0.50&80.4&82.1&78.6&0.61&72.5&84.3&60.7&0.46\\
&0.5&77.6&78.4&76.9&0.55&83.4&83.7&83.1&0.68&78.1&88.8&67.4&0.58\\
&0.8&77.8&78.9&76.7&0.55&83.9&82.8&85.0&0.68&79.8&86.5&73.0&0.6\\
&1.0&77.9&76.6&79.3&0.56&84.3&82.4&86.2&0.69&76.4&82.0&70.8&0.53\\ \hline
\textit{Linear Probe} & 0.01 & 73.1 & 65.0 & 81.2 & 0.47 & 82.8 & 87.5 & 78.1 & 0.66 & 65.7 & 94.8 & 36.6 & 0.39 \\
 & 0.1 & 77.3 & 74.2 & 80.4 & 0.55 & 83.5 & 87.5 & 79.4 & 0.67 & 72.0 & 86.5 & 57.5 & 0.46 \\
 & 0.5 & 79.8 & 79.8 & 79.7 & 0.60 & 85.0 & 87.2 & 82.9 & 0.70 & 73.1 & 93.0 & 53.3 & 0.50 \\
 & 0.8 & 79.9 & 78.4 & 81.4 & 0.60 & 86.1 & 86.3 & 85.9 & 0.72 & 75.4 & 91.0 & 59.8 & 0.53 \\
 & 1.0 & 80.4 & 79.5 & 81.2 & 0.61 & 86.2 & 86.6 & 85.9 & 0.73 & 76.0 & 81.6 & 70.3 & 0.52 \\
\hline
\textit{LoRA} & 0.01 & 69.9 & 81.8 & 58.1 & 0.41 & 56.1 & 99.8 & 12.4 & 0.25 & 50.0 & 100.0 & 0.0 & 0.00 \\
 & 0.1 & 75.4 & 70.8 & 80.1 & 0.51 & 80.7 & 92.1 & 69.3 & 0.63 & 71.3 & 68.5 & 74.2 & 0.43 \\
 & 0.5 & 77.9 & 79.3 & 76.4 & 0.56 & 83.3 & 83.7 & 82.9 & 0.67 & 80.9 & 82.0 & 79.8 & 0.62 \\
 & 0.8 & 77.9 & 72.1 & 83.7 & 0.56 & 84.5 & 86.8 & 82.2 & 0.69 & 79.8 & 84.3 & 75.3 & 0.60 \\
 & 1.0 & 78.0 & 74.5 & 81.4 & 0.56 & 84.7 & 85.6 & 83.8 & 0.69 & 80.9 & 84.3 & 77.5 & 0.62 \\
\hline
\textit{KNN} & 0.01 & 67.3 & 59.3 & 75.3 & 0.35 & 76.7 & 92.3 & 61.0 & 0.56 & 56.7 & 100.0 & 13.5 & 0.27 \\
 & 0.1 & 75.7 & 79.6 & 71.8 & 0.52 & 81.1 & 87.0 & 75.2 & 0.63 & 70.2 & 93.3 & 47.2 & 0.46 \\
 & 0.5 & 77.0 & 78.6 & 75.3 & 0.54 & 82.6 & 88.7 & 76.5 & 0.66 & 73.6 & 86.5 & 60.7 & 0.49 \\
 & 0.8 & 76.9 & 75.8 & 77.9 & 0.54 & 83.0 & 87.9 & 78.1 & 0.66 & 74.7 & 86.5 & 62.9 & 0.51 \\
 & 1.0 & 77.2 & 76.8 & 77.6 & 0.54 & 83.9 & 88.5 & 79.3 & 0.68 & 77.0 & 89.9 & 64.0 & 0.56 \\
\hline
\textit{DenseNet-121} & 0.01 & 60.1 & 51.8 & 68.5 & 0.21 & 65.0 & 81.2 & 48.9 & 0.32 & 49.4 & 73.0 & 25.8 & -0.01 \\
 & 0.1 & 61.3 & 66.0 & 56.5 & 0.23 & 69.5 & 78.0 & 61.0 & 0.40 & 64.0 & 65.2 & 62.9 & 0.28 \\
 & 0.5 & 67.9 & 53.6 & 82.3 & 0.37 & 76.0 & 79.6 & 72.5 & 0.52 & 67.4 & 85.4 & 48.3 & 0.36 \\
 & 0.8 & 69.7 & 63.2 & 76.3 & 0.40 & 77.0 & 83.5 & 70.5 & 0.54 & 71.9 & 68.5 & 75.3 & 0.44 \\
 & 1.0 & 67.6 & 51.7 & 83.4 & 0.37 & 79.3 & 80.9 & 77.8 & 0.59 & 69.7 & 57.3 & 82.0 & 0.41 \\
\hline
\end{tabular}
}
\label{tab:comparison}
\end{table}


\newpage


\subsection{Computational Costs}

Table \ref{tab:costs} compares the computational costs of different MedImageInsight methods, highlighting the efficiency of zero-shot (ZS) approaches versus training-based methods. While ZS methods require no training, their inference costs vary, with ZS vision ensemble being the most computationally intensive. In contrast, training-based methods like linear probing and LoRA introduce training overhead.

\begin{table}[!h]
\begin{center}
\caption{Computational cost comparison for training and inference of MedImageInsight's ZS and training-based methods using one Nvidia GeForce RTX 3090 - 24GB GPU.}
\small
\begin{tabular}{l| c| c| c| c| c | c| c}
    \hline
    Model & ZS &ZS Prompt&ZS Vision& Linear& LoRA &k-NN& DenseNet-\\ 
    & &Ens.$^*$& Ens.$^*$ &Probe & & &121\\
    \hline \hline
   Train Parameters &0&0&0&0.0021M&360M&0&7M \\ \hline
    Training [per epoch] &0s&0s&0s&39.4s&157.5s&38.5s&25.2s\\ \hline 
    Inference [per sample] &0.06s&0.056s&35.3s&0.056s&0.056s&0.06s&0.01s\\ \hline 
    GFLOPs &352.9&342.6&21926&342.6&342.6&342.6&26.6\\  \hline
    \multicolumn{8}{l}{$^*$Ens.=Ensemble}\\
\end{tabular}
\label{tab:costs}
\end{center}
\end{table}
\end{document}
