\documentclass{midl} % Include author names

\usepackage{booktabs}
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{natbib}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{xcolor}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 202}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[Balancing Classification and Retrieval in Cross-modal Vision Models]{A Balancing Act: Optimizing Classification and Retrieval in Cross-Modal Vision Models}


\midlauthor{\Name{Judith Lefkes\nametag{$^{1,2}$}} \Email{judith.lefkes@radboudumc.nl}\\
\Name{Cl\'ement Grisi\nametag{$^{1,2}$}} \Email{clement.grisi@radboudumc.nl} \\
\Name{Geert Litjens\nametag{$^{1,2}$}} \Email{geert.litjens@radboudumc.nl}\\
\addr $^{1}$ Computational Pathology Group, Radboudumc, Nijmegen, Netherlands \\
\addr $^{2}$ Oncode Institute, Utrecht, the Netherlands
}

%\footnotetext[1]{Contributed equally}


\begin{document}

\maketitle

\begin{abstract}
Despite the promising capabilities of vision-language models (VLMs) in diverse tasks, recent studies reveal that they struggle with the fundamental task of image classification. In this study, we explore leveraging state-of-the-art task-specific classification models as a foundation for VLMs, aiming to preserve strong classification performance. Specifically, we assess the impact of contrastive tuning to enable cross-modal retrieval capabilities on a Vision Transformer (ViT) model trained for multi-label classification on natural images and a Hierarchical Vision Transformer (H-ViT) trained for prostate cancer grading in Whole-Slide Images (WSIs). Our results demonstrate that contrastive fine-tuning creates a clear trade-off: classification accuracy rapidly deteriorates toward zero as vision-text alignment improves. By balancing task-specific and contrastive objectives in the loss function during fine-tuning, we achieve competitive slide-level retrieval performance while maintaining classification accuracy. Our code is available on \url{https://github.com/DIAGNijmegen/tradeoff_classification_alignment.git}.

\end{abstract}

\begin{keywords}
Multi-task Learning, Vision-Language Models, Representation disentanglement, Computational Pathology
\end{keywords}

\section{Introduction}
The field of computational pathology is seeing an increase in the development of foundation models (FMs) \citep{vorontsovVirchowMillionSlideDigital2024, ikezogwoQuilt1MOneMillion2023}. Large-scale pretraining using self-supervised learning (SSL) on thousands of histopathological slides spanning diverse tissue types and diseases can provide foundation models with advantages over task-specific models. They can serve as a general foundation for various downstream tasks in pathology, such as cancer subtyping and prognostication \citep{wangPathologyFoundationModel2024a, chenGeneralpurposeFoundationModel2024b}. 

Recently, vision-language models (VLMs), a subset of foundation models, have emerged to leverage the inherently multimodal nature of medical data by integrating textual sources such as pathology reports, educational materials, and PubMed, enabling them to learn cross-modal associations \citep{luVisualLanguageFoundationModel2023}. These studies have demonstrated strong potential of VLMs in various
medical imaging tasks, including zero-shot and few-shot cancer classification and cancer
subtyping \citep{luVisualLanguageFoundationModel2023,shaikovskiPRISMMultiModalGenerative2024, ahmedPathAlignVisionlanguageModel2024, zhangContrastiveLearningMedical2022a}.
Additionally, they have shown promising multi-modal capabilities such as cross-modal retrieval \citep{luVisualLanguageFoundationModel2023}, image captioning \citep{luVisualLanguageFoundationModel2023,shaikovskiPRISMMultiModalGenerative2024}, and
report generation \citep{tran2024generating}.


Despite their successes, recent computer vision research highlights VLMs' critical limitations. In particular, VLMs significantly underperform on standard image classification benchmarks compared to state-of-the-art (SOTA) task-specific classification models \citep{laurenconWhatMattersWhen2024, karamchetiPrismaticVLMsInvestigating2024a, zhangWhyAreVisuallyGrounded2024a, tongEyesWideShut2024, zhaiInvestigatingCatastrophicForgetting2023}.
\citet{zhangWhyAreVisuallyGrounded2024a} attribute this shortfall primarily to the limited availability of classification-focused data during pretraining of VLMs. \citet{zhaiInvestigatingCatastrophicForgetting2023} demonstrate that fine-tuning VLMs with classification-focused data enhances in-domain performance but causes catastrophic forgetting, leading to reduced performance on out-of-domain datasets and compromised generalizability. Catastrophic forgetting is a well-studied phenomenon in multi-task learning \citep{kirkpatrickOvercomingCatastrophicForgetting2017, perkoniggDynamicMemoryAlleviate2021, bandiContinualLearningStrategies2023}. Existing mitigation strategies include Elastic Weight Consolidation \citep{kirkpatrickOvercomingCatastrophicForgetting2017}, dynamic architectures \citep{rusuProgressiveNeuralNetworks2022}, and rehearsal approaches \citep{rebuffiICaRLIncrementalClassifier2017}. However, these studies primarily focus on catastrophic forgetting in single-modality multi-task learning. To our knowledge, cross-modal forgetting---where a model is adapted for a novel task in a different modality---remains unexplored.


In high-stakes domains like medicine, where diagnosis guides treatment decisions and directly impacts patient outcomes, even slight declines in classification performance can have serious consequences. This raises a key question: can task-specific vision models be adapted for multi-modal tasks without compromising their classification performance? How much classification-specific information do we sacrifice in favor of cross-modal alignment?

To address this question, we begin with SOTA task-specific image classification models and explore the impact of contrastive tuning for enabling cross-modal tasks like image-to-text retrieval. Without any mitigation strategy, we hypothesize that the model will suffer from catastrophic forgetting while adapting to the cross-modal task. To mitigate this, we introduce a balancing parameter, $\lambda$, which modulates the relative emphasis on classification and vision-language alignment in the loss function. We summarize our contributions as follows:
\begin{enumerate}
   \item We show that contrastive fine-tuning without a classification objective leads to catastrophic forgetting, where classification accuracy deteriorates rapidly in favor of vision-text alignment in general vision and the medical domain.
   \item To address this trade-off, we propose fine-tuning with a dual-objective loss function weighted by a balancing parameter, $\lambda$, which controls the trade-off between classification and contrastive objectives. 
   \item We show that $\lambda$ selection is task-specific and that we can achieve competitive retrieval performance through careful tuning while preserving classification accuracy on a prostate cancer grading task.
\end{enumerate}


\section{Methods}
\subsection{Experimental setup}
To demonstrate that our results are applicable and transferable to both natural and medical images, we conduct experiments on two distinct datasets: the Microsoft Common Objects in Context (COCO) \citep{linMicrosoftCOCOCommon2015} dataset and a curated medical dataset of prostate biopsies and corresponding pathology reports.

We start with a high-performing vision model for a specific classification task and a frozen language encoder to test our hypothesis that classification performance is traded away when fine-tuning for cross-modal performance. We then use a dual-objective loss function that weights a classification and contrastive objective by a parameter $\lambda$, defined as follows:

\[
\mathcal{L}_{\text{total}} = \lambda \mathcal{L}_{\text{contrastive}} + (1 - \lambda) \mathcal{L}_{\text{classification}}
\]

Thus, a $\lambda$ of $0.0$ implies disregarding the contrastive objective and continuing fine-tuning for classification, while a $\lambda$ of $1.0$ is equivalent to purely focusing on the contrastive objective. We hypothesize that the higher the $\lambda$, the more classification performance is lost. Additionally, we assume that the optimal value for $\lambda$ is task- or dataset-specific. 

%We analyze performance across five folds.
We analyze the trade-off between classification and cross-modal alignment by tracking validation performance metrics over epochs for different $\lambda$ values during contrastive tuning. To fully assess the trade-off rather than maximize peak performance, we intentionally avoided early stopping. Please refer to Table~\ref{tab:implementation_details} for details on losses, tasks, and implementation. %Our code is available on \url{https://github.com/DIAGNijmegen/tradeoff_classification_alignment.git}.


\subsection{COCO Experiments}
\textbf{Dataset}\\
We select $30,000$ image-caption pairs from the 2014 MS COCO release for contrastive tuning. Of these, $4,952$ pairs are held out for independent testing, while the remaining pairs are divided into five cross-validation folds. Figure \ref{training_sample}A shows an example of an image-caption pair.
\\
\\
\textbf{Models}\\
\sloppy
The COCO experiments use a ViT-Base architecture, 
\texttt{google/vit-base-patch16-224} \citep{wuVisualTransformersTokenbased2020} fine-tuned for multi-label classification on the $80$ classes in the dataset. Fine-tuning details are provided in  Appendix \ref{finetuning_coco_vision_encoder}. We experimented with two publicly available SentenceTransformer models \citep{reimersSentenceBERTSentenceEmbeddings2019a} for the language encoder. We report results using the RoBERTa base model (\texttt{roberta-base-nli-stsb-mean-tokens}, Section \ref{roberta_coco_results}) and the MPNet model architecture (\texttt{multi-qa-mpnet-base-dot-v1}, Appendix \ref{appendix_mpnet_results}).
\\
\begin{figure}
    \centering
 \includegraphics[width=0.9\textwidth]{figures/training_sample_label_distribution.pdf}
    \caption{\textbf{(A)} An example case from the COCO dataset comprising a natural image, one or multiple label(s), and five reference captions. \textbf{(B)} An example case from the prostate biopsy data comprising a thumbnail of a WSI, its corresponding ISUP grade, and the pathology report. \textbf{(C)} Label distribution of the prostate biopsy data.}
    \label{training_sample}
\end{figure}
\\
\textbf{Evaluation Metrics}\\
We evaluate multi-label classification using mean average precision (mAP) and vision-language alignment through image-to-text retrieval. Retrieval performance is measured by Recall@$K$, where the top $K$ captions are retrieved from $4,690$ validation captions based on the cosine similarity between image and text embeddings. A retrieval is considered correct if at least one of the $K$ retrieved captions matches any of the five reference captions associated with the image.

\begin{table}[ht]
    \centering
    \resizebox{0.8\textwidth}{!}{ % Adjust width (70% of original)
        \begin{tabular}{lll}
            \toprule
             & \textbf{COCO} & \textbf{Prostate Biopsies} \\ 
            \midrule
            \textbf{Task Type} &  &  \\
            Classification & Multi-label classification (80 classes) & ISUP Grade (6 classes) \\
            Cross-modal Alignment & Image-to-Text Retrieval & WSI-to-Report Retrieval \\
            \midrule
            \textbf{Losses} &  &  \\
            $\mathcal{L}_{\text{classification}}$ & Binary Cross-Entropy (BCE) & Mean Squared Error (MSE) \\
            $\mathcal{L}_{\text{contrastive}}$ & CLIP \citep{radfordLearningTransferableVisual2021b} & TripletMarginLoss \citep{schroffFaceNetUnifiedEmbedding2015} \\ 
            % \midrule
            % \textbf{Evaluation Metrics} &  &  \\
            % Classification & Mean Average Precision (mAP) &  Cohen's quadratic kappa ($k^2$) \\
            % Cross-modal Alignment & Recall@K (k=5, 10) & Retrieval $k^2$ \\ 
            \midrule
            \textbf{Hyperparameters} &  &  \\
            Number of epochs & 50 & 30 \\
            Learning rate & \(10^{-4}\) & \(10^{-5}\) \\
            LR scheduler & - & StepLR \\
            Weight decay & 0.001 & 0.001 \\
            Optimizer & AdamW & AdamW \\
            Batch size & 64 & 1 \\
            Gradient accumulation & - & 16 \\
            \bottomrule
        \end{tabular}
    }
    \caption{Implementation Details for COCO and Prostate Biopsy Grading Experiments. Full loss formulations per task are provided in Appendix \ref{losses}.}
    \label{tab:implementation_details}
\end{table}
\subsection{Prostate biopsy grading experiments}
\textbf{Dataset}\\
We curated a dataset of $425$ WSIs containing a single prostate biopsy together with the corresponding ISUP grade and pathology report from the Radboud University Medical Center in Nijmegen. Each pathology report consists of a microscopy and conclusion section. An example is shown in Figure~\ref{training_sample}B, and the label distribution of the dataset in Figure~\ref{training_sample}C. We reserve $35$ cases for independent testing and partition the remaining data into five cross-validation folds, stratifying on ISUP grade. 
\\
\\
\textbf{Models}\\
For the task-specific vision model, we leveraged $10,616$ H\&E-stained prostate WSIs from the PANDA dataset \citep{bultenArtificialIntelligenceDiagnosis2022} to train an H-ViT \citep{grisiHierarchicalVisionTransformers2023}. This model achieves state-of-the-art performance in multi-class ISUP grade classification with a quadratic kappa score of $0.892$ on the combined PANDA test set ($938$ cases). Given the small tuning dataset size, we freeze the first two transformers and update only the weights of the last transformer. For the language encoder, we report results in the main paper using a model pretrained on Dutch clinical reports and fine-tuned for the task of predicting the ISUP grade from microscopic sections of a pathology report (see Appendix~\ref{finetuning details roberta language model} for details) \citep{joeranbosmaDragonrobertabasedomainspecific2024}. Additional results using the \texttt{BioBERT} model \citep{leeBioBERTPretrainedBiomedical2020}, pretrained on English biomedical text, are presented in Appendix~\ref{appendix_bioebert_results} for comparison. 
\\
\\
\textbf{Evaluation Metrics}\\
We evaluate ISUP grade classification performance on prostate biopsies using the quadratic kappa score ($\kappa^2$). For retrieval, we introduce a new metric, \textit{retrieval $\kappa^2$}, to measure WSI-level image-to-text retrieval. \textit{Retrieval $\kappa^2$} assesses the agreement between each slide's original labels and the labels of the top-one retrieved report by calculating Cohen's quadratic kappa. 

\section{Results}
\subsection{COCO} \label{roberta_coco_results}
\begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth]{figures/tradeoff_l=1.0.pdf}
    \caption{Validation performance metrics during contrastive tuning using $\lambda = 1.0$ for the COCO dataset in \textbf{(A)} and the prostate biopsy grading experiments in \textbf{(B)}. Lines represent the medians across five folds, with shaded areas indicating the interquartile range.}
    \label{fig:lambda_1.0 results}
\end{figure}

\textbf{Contrastive tuning without classification objective ($\lambda = 1.0$) }\\
We first evaluate the most natural choice for the hyperparameter $\lambda$, namely $\lambda = 1.0$, in Figure~\ref{fig:lambda_1.0 results}A. We observe a clear trade-off: classification performance declines immediately after the first epoch, as reflected by a steep increase in classification loss. At the same time, alignment improves significantly, as indicated by a moderate Recall@$K$ achieved at around $25$ epochs of fine-tuning. Finally, the mAP reaches zero after approximately $30$ epochs, reflecting the complete loss in classification capabilities when tuning without a classification objective.
\\
\\
\textbf{Balancing classification and alignment}\\
Before contrastive tuning, the vision model achieves a median mAP of $0.768$ on the validation sets. To address the trade-off, we evaluate intermediate values of the hyperparameter $\lambda$ to balance classification and contrastive objectives during tuning. Figure~\ref{fig:combined_results_coco}A shows the results. The training classification loss decreases linearly across all values of $\lambda$, with lower values (e.g., $\lambda=0.1$) achieving a lower final loss, as they prioritize classification, compared to higher values (e.g., $\lambda=0.9$), which favor vision-language alignment. In contrast, the validation classification loss increases more rapidly for lower $\lambda$ values, suggesting that the model starts overfitting on the classification task. The validation contrastive loss converges quickly and displays similar trajectories across all $\lambda$ values, highlighting that a stronger emphasis on classification does not severely hinder contrastive learning performance.
\begin{figure}
    \centering
    \includegraphics[width=0.9\textwidth]{figures/roberta_experiments_coco.pdf}
    \caption{Impact of $\lambda$ on the classification-alignment trade-off for COCO with \textbf{(A)} $\lambda \in [0.1, 0.9]$ and \textbf{(B)} $\lambda \in [0.9, 1.0)$.}  
    \label{fig:combined_results_coco}
\end{figure}
\\
\\
\textbf{Optimizing $\lambda$ to minimize catastrophic forgetting}\\
In our third experiment, we restrict $\lambda$ to the range $[0.9, 1.0)$ to isolate its impact from the previously observed overfitting effect, as shown in Figure~\ref{fig:combined_results_coco}B. Selecting $\lambda$ closer to $1.0$ should maximize multi-modal alignment, mitigating overfitting and identifying the point at which classification performance begins to decline. Indeed, classification loss increases, but less sharply than when no mitigation is applied ($\lambda=1.0$), and this is accompanied by a slight decline in mAP. Contrastive loss and retrieval performance remain largely unaffected by the choice of $\lambda$, stabilizing around $25$ epochs. Importantly, lower values (e.g., $0.9$) achieve marginally better mAP compared to higher values like $0.98$, indicating that values around $\lambda=0.9$ may be ideal for this task, as they maintain the highest classification performance while obtaining similar retrieval performance. 
\subsection{Prostate biopsy grading}
\textbf{Contrastive tuning without classification objective ($\lambda = 1.0$) }\\
As illustrated in Figure \ref{fig:lambda_1.0 results}B, we observe a rise in classification loss alongside continued contrastive alignment optimization, confirming a similar trade-off for prostate cancer grading as in COCO. Consistent with prior observations, fine-tuning with $\lambda = 1.0$ results in a complete loss of classification performance for prostate biopsies within $20$ epochs, trading it for a \textit{retrieval} $\kappa^2$ of approximately $0.8$.
\\
\\
\textbf{Balancing classification and alignment}\\
Before contrastive tuning, the trained H-ViT model achieves a median $\kappa^2 = 0.839$ across five validation folds. Contrastive tuning using $\lambda \in [0.1, 0.9]$ results in a consistent rise in classification loss with minimal variation across $\lambda$ values, as illustrated in Figure~\ref{fig:combined_results_medical}A.

However, after $30$ epochs the classification loss stabilizes around $1.0$, significantly lower than the approximately $3.5$ observed with $\lambda=1.0$ after $30$ epochs. Both training and validation classification losses exhibit higher variability compared to natural images. 
\begin{figure}
    \centering
\includegraphics[width=0.87\textwidth ]{figures/roberta_experiments_medical.pdf}
    \caption{Impact of $\lambda$ on the classification-alignment trade-off for the prostate biopsy grading experiments with \textbf{(A)} $\lambda \in [0.1, 0.9]$ and \textbf{(B)} $\lambda \in [0.9, 1.0)$.}
\label{fig:combined_results_medical}
\end{figure}
\\
\\
\textbf{Optimizing $\lambda$ to minimize catastrophic forgetting}\\
Figure~\ref{fig:combined_results_medical}B displays results using $\lambda \in [0.9, 1.0)$. Regarding the losses for the two objectives, there is no clear difference between the intermediate and higher ranges of $\lambda$. Lower $\lambda$ values, such as $0.9$, appear more advantageous, achieving comparable retrieval performance while maintaining higher classification accuracy. However, the high variability across folds complicates the precise interpretation of performance scores.

\subsection{Test performances COCO and Prostate grading experiments}
For both datasets, we conducted additional experiments with higher values of $\lambda$, using peak image-to-text retrieval performance as an early stopping criterion to assess the impact on classification accuracy. As shown in Table~\ref{tab:performance}, prioritizing retrieval in COCO results in a drop of $7$--$10$ percentage points in mAP compared to the baseline, while gaining competitive retrieval scores. In contrast, balancing objectives for multi-modal learning not only preserves but also enhances classification performance in the medical task, improving $\kappa^2$ by up to $2$ percentage points while gaining a \textit{retrieval} $\kappa^2$ of $0.63$.

\begin{table}[]
\centering
\caption{Test performance on COCO ($N=4,690$) and Prostate Biopsies ($N=35$) with early stopping at peak retrieval performance. $\Delta\%$ denotes the absolute change, in percentage points, in test-set classification performance w.r.t.\ the baseline for COCO ($0.77$) and Prostate Biopsies ($0.80$). We report median values (Q1–Q3) across five folds.}
\label{tab:performance}

% Resizing the table to fit within page width
\resizebox{0.9\textwidth}{!}{ 
\begin{tabular}{lcccc|ccc}
\toprule
\multicolumn{5}{c|}{ \textbf{COCO}} & \multicolumn{3}{c}{ \textbf{Prostate Biopsies}} \\  
\midrule
\textbf{$\lambda$} & \textbf{mAP} & \textbf{Recall@5} & \textbf{Recall@10} & \textbf{$\Delta\%$ mAP} & \textbf{$\kappa^2$} & \textbf{Retrieval $\kappa^2$} & \textbf{$\Delta\%$ $\kappa^2$ } \\  
\midrule

0.9  & 0.695 & 0.330 & 0.465 & -7.5  & 0.800 & 0.633 & 0  \\ 
     & \tiny{(0.693–0.695)} & \tiny{(0.326–0.335)} & \tiny{(0.459–0.468)} &  & \tiny{(0.788–0.818)} & \tiny{(0.596–0.643)} & \\ 
\midrule

0.92 & 0.688 & 0.335 & 0.470 & -8.2  & 0.800 & 0.633 & 0 \\  
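     & \tiny{(0.687–0.691)} & \tiny{(0.331–0.338)} & \tiny{(0.460–0.463)} &  & \tiny{(0.788–0.814)} & \tiny{(0.584–0.635)} & \\  % NOTE(review): the Recall@10 median reported in the row above (0.470) lies outside this IQR (0.460-0.463); verify against the raw results.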
\midrule

\textbf{0.94} & \textbf{0.693} & \textbf{0.336} & \textbf{0.469} & \textbf{-7.7} & 0.814 & 0.633 & +1.4 \\  
                  & \tiny{(0.690–0.710)} & \tiny{(0.326–0.348)} & \tiny{(0.455–0.474)} &  & \tiny{(0.800–0.820)} & \tiny{(0.602–0.644)} & \\  
\midrule

0.96 & 0.685 & 0.329 & 0.459 & -8.5  & \textbf{0.820} & \textbf{0.633} & \textbf{+2.0} \\  
     & \tiny{(0.682–0.685)} & \tiny{(0.328–0.330)} & \tiny{(0.454–0.459)}&  & \tiny{(0.818–0.827)} & \tiny{(0.622–0.644)} & \\  
\midrule

0.98 & 0.674 & 0.329 & 0.453 & -9.6  & 0.814 & 0.648 & +1.4 \\  
     & \tiny{(0.674–0.681)} & \tiny{(0.325–0.333)} & \tiny{(0.453–0.459)} &  & \tiny{(0.808–0.824)} & \tiny{(0.613–0.650)} & \\  
\midrule

1.0  & 0.061 & 0.330 & 0.457 & -70.9  & 0.760 & 0.650 & -4.0 \\  
     & \tiny{(0.060–0.062)} & \tiny{(0.326–0.333)} & \tiny{(0.454–0.458)} &  & \tiny{(0.408–0.767)} & \tiny{(0.584–0.650)} & \\  

\bottomrule
\end{tabular}
}
\end{table}
% 
\section{Discussion}

In the medical domain, where accurate classification underpins critical tasks such as clinical decision-making and treatment planning, task-specific algorithms remain the standard for AI systems implemented in the clinic. This paper explored whether task-specific classification models can serve as a foundation for multi-modal systems, aligning novel cross-modal objectives to vision models without sacrificing classification performance.

Our findings indicate that contrastive tuning of a task-specific vision model without a classification objective results in catastrophic forgetting. The classification performance declined to nearly zero within fewer than $30$ epochs in both the COCO experiments and the medical task as the model increasingly prioritized vision-text alignment. These findings highlight that catastrophic forgetting also extends to cross-modal settings. They may also explain why VLMs often fail to surpass SOTA vision classifiers in classification tasks. 
 
We proposed a simple yet effective approach to address this trade-off by integrating a classification objective into the loss function during contrastive tuning, similar to rehearsal strategies for catastrophic forgetting, where past task examples are retained or generated and interleaved with new data during training. Our test results show that by carefully tuning the weighting factor $\lambda$, we effectively reduced the decline in classification performance from a drop of roughly $71$ percentage points ($\lambda=1.0$) to just $8$ percentage points with $\lambda=0.94$, thus retaining approximately $90\%$ of the baseline mAP performance in COCO. Interestingly, balancing objectives in the medical task improved classification performance by up to $2$ percentage points while attaining a \textit{retrieval} $\kappa^2$ of $0.63$, suggesting that the classification performance can even benefit from cross-modal alignment. The consistent trend is independent of the language encoder, as shown in Appendices~\ref{appendix_mpnet_results} and~\ref{appendix_bioebert_results}.

Our study has some limitations. First, a single dataset per domain was used. 
Second, although we propose a simple solution to mitigate the loss of classification performance, a more thorough investigation and the development of more sophisticated methods could improve and simplify the management of the trade-off between classification and retrieval. 

To ensure effective loss balancing, we confirmed in our experiments that the gradient magnitudes of $\mathcal{L}_{\text{contrastive}}$ and $\mathcal{L}_{\text{classification}}$ are in the same order of magnitude. For instance, if classification gradients were two orders of magnitude larger, even a high $\lambda$ favoring vision-language alignment may not prevent the classification from dominating and thereby hindering multi-modal learning. 
Therefore, if the gradient magnitudes differ significantly, the range of $\lambda$ needs to be adjusted accordingly to successfully balance the  two loss functions. 
Additionally, our experiments show that the optimal value of $\lambda$ varies across tasks and loss functions. Therefore, tuning $\lambda$ per task while ensuring comparable gradient magnitudes is crucial.

Third, our approach highlights catastrophic forgetting in cross-modal learning but lacks a direct comparison with existing mitigation strategies. Future work should assess whether single-modality mitigation strategies translate to cross-modal settings and benchmark our method against them.


Fourth, while the variability in the prostate cancer grading dataset is relatively high, we anticipate that this variability could be reduced and that higher overall retrieval performance will be achieved with a larger dataset. 

In summary, this study calls for a renewed focus on catastrophic forgetting as a critical challenge in multi-task model development in the medical field. By developing strategies that ensure that 
fundamental classification capabilities are preserved, we can pave the way for building more robust models that are better suited for clinical implementation.


\bibliography{midl25_202}

\appendix
\newpage


\section{Fine-tuning Details for the Task-specific Vision Encoder in COCO} \label{finetuning_coco_vision_encoder}
We utilize the MS COCO 2014 dataset, which consists of $123,287$ images, each paired with five reference captions (training + validation). For vision-only fine-tuning, we randomly select $93,813$ image-label pairs stratified across $80$ classes, transforming the \texttt{google/vit-base-patch16-224} architecture into a task-specific multi-label classification model. The remaining $29,474$ image-caption pairs are reserved for the contrastive tuning experiments in the main paper. The data is split into training, validation, and test sets (80/10/10). Fine-tuning is performed for a maximum of $50$ epochs using the binary cross-entropy (BCE) loss, with early stopping applied (patience = $10$). Optimization is conducted using the AdamW optimizer with a learning rate of $10^{-4}$, a weight decay of $0.001$, and a batch size of $64$. The fine-tuned model achieves a mAP of $0.77$ on the test set, and the resulting weights are used as initialization for contrastive tuning.

\section{Fine-Tuning Details for the Language Encoder Pretrained on Dutch Medical Reports} 
\label{finetuning details roberta language model}

    We further fine-tuned the \texttt{joeranbosma/dragon-bert-base-domain-specific} \citep{joeranbosmaDragonrobertabasedomainspecific2024} model for the task of predicting ISUP grade from the microscopic sections of pathology reports. This fine-tuning ensures that the [CLS] token acts as a meaningful sentence embedding of dimension $768$, as it is not inherently optimized for this during MLM pretraining. Additionally, fine-tuning was performed to meet the requirements of contrastive learning, where the output dimensions of the vision and language encoders need to be aligned.
\newpage

\section{Impact of Language Encoder Choice: Results with BioBERT and MPNet} \label{appendix_dual_plot_coco_medical}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/appendix_mpnet_biobert_1.0.pdf}
   \caption{Validation performance metrics during contrastive tuning with $\lambda = 1.0$. 
Panel \textbf{(A)} presents classification loss, mAP, and Recall@5 for the COCO dataset, where text embeddings are computed using the \texttt{MPNet} model. 
Panel \textbf{(B)} shows validation classification loss, $\kappa^2$  and \textit{retrieval} $\kappa^2$ for prostate cancer grading experiments, where report embeddings are derived from the \texttt{BioBERT} model. 
Both panels illustrate a clear trade-off, where classification performance is sacrificed in exchange for improved retrieval. 
In all figures, metrics are reported starting from the first epoch of fine-tuning. Lines represent the median across five folds, with shaded areas indicating the interquartile range (IQR).}
    \label{fig:lambda_1.0 results_different_language_encoder}
\end{figure}

\newpage

\subsection{Contrastive Tuning Experiments on COCO using \textbf{MPNet} as a Language Encoder} \label{appendix_mpnet_results}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/appendix_mpnet_coco_experiments.pdf}
    \caption{Impact of $\lambda$ on the classification-alignment trade-off for COCO with \textbf{(A)} $\lambda \in [0.1, 0.9]$ and \textbf{(B)} $\lambda \in [0.9, 1.0)$. We used the same ViT-Base model for the vision encoder while generating text embeddings using the frozen \texttt{multi-qa-mpnet-base-dot-v1} model as the language encoder. }
    \label{fig:mpnet_appendix}
\end{figure}


\newpage

\subsection{Contrastive Tuning Experiments on the Prostate Biopsy Data using \textbf{BioBERT} as a Language Encoder} \label{appendix_bioebert_results}

For the experiments utilizing \texttt{BioBERT}, the original Dutch reports were translated into English using the \texttt{Nous-Hermes-2-Mistral-7B-DPO.Q4-0.gguf} model and the GPT4All Python library, as \texttt{BioBERT} is primarily trained on English biomedical text.

\begin{figure}[h!]
    \centering
    \includegraphics[trim=10 10 10 10, clip, width=0.9\textwidth]{figures/appendix_biobert_med_experiments.pdf}
   \caption{Impact of $\lambda$ on the classification-alignment trade-off for the prostate cancer grading task with \textbf{(A)} $\lambda \in [0.1, 0.9]$ and \textbf{(B)} $\lambda \in [0.9, 1.0)$. We use the same H-ViT model as the vision encoder while generating text embeddings with the frozen \texttt{BioBERT} model as the language encoder.}

    \label{fig:mi}
\end{figure}

% \newpage
\section{Loss functions} \label{losses}
\subsection{COCO} \label{loss_coco}
\[
\mathcal{L}_{\text{total}} = \lambda \mathcal{L}_{\text{contrastive}} + (1 - \lambda) \mathcal{L}_{\text{classification}}
\]

\[
\mathcal{L}_{\text{total}} = \lambda \mathcal{L}_{\text{CLIP}} + (1 - \lambda) \mathcal{L}_{\text{BCE}}
\]


\[
\mathcal{L}_{\text{CLIP}} = -\frac{1}{2N} \sum_{i=1}^{N} \left[ \log \frac{\exp(\text{sim}(I_i, T_i) / \tau)}{\sum_{j=1}^{N} \exp(\text{sim}(I_i, T_j) / \tau)} + \log \frac{\exp(\text{sim}(T_i, I_i) / \tau)}{\sum_{j=1}^{N} \exp(\text{sim}(T_i, I_j) / \tau)} \right]
\]


\[
\mathcal{L}_{\text{BCE}} = -\frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log (\sigma(W I_i + b)) + (1 - y_i) \log (1 - \sigma(W I_i + b)) \right]
\]

\noindent\textbf{Where:} \( N \) denotes the batch size. For each sample \( i \), \( I_i \in \mathbb{R}^{1 \times 768} \) represents the image embedding, and \( T_i \) is the corresponding text embedding. The function 
\( \text{sim}(I_i, T_j) \) computes the cosine similarity between the \( i \)-th image embedding and the \( j \)-th text embedding.

The scalar \( \tau \) is a temperature parameter that scales the logits. In the \( \mathcal{L}_{\text{BCE}} \) loss, classification logits are produced using a weight matrix \( W \in \mathbb{R}^{80 \times 768} \) and a bias term \( b \in \mathbb{R}^{80} \). The predicted probabilities are obtained via the sigmoid activation function, defined as \( \sigma(x) = \frac{1}{1 + e^{-x}} \). Ground truth labels for each image are denoted by \( y_i \in \{0, 1\}^{80} \).


% \textbf{Where:}
% \begin{itemize}
%     \item \( N \) is the batch size.
%     \item \( I_i \) is the image embedding for the \( i \)-th image (shape: \( 1 \times 768 \)).
%     \item \( W \) is the classifier weight matrix (\( 80 \times 768 \)).
%     \item \( b \) is the bias term (\( 80 \)-dimensional).

%     \item \( T_i \) is the text embedding for the \( i \)-th text.
%     \item \( \text{sim}(I_i, T_j) \) is the cosine similarity between image \( I_i \) and text \( T_j \).
%     \item \( \tau \) is the temperature parameter that scales the logits.
%      \item  \( \sigma(x) = \frac{1}{1 + e^{-x}} \) is the sigmoid function.
    
% \end{itemize}



\subsection{Prostate biopsy grading experiments} \label{loss_prostate}

\[
\mathcal{L}_{\text{total}} = \lambda \mathcal{L}_{\text{contrastive}} + (1 - \lambda) \mathcal{L}_{\text{classification}}
\]

\[
\mathcal{L}_{\text{total}} = \lambda \mathcal{L}_{\text{triplet}} + (1 - \lambda) \mathcal{L}_{\text{MSE}}
\]

\[
\mathcal{L}_{\text{triplet}} = \frac{1}{N} \sum_{i=1}^{N} \max(0, \| I_i - T^+_i \|_2 - \| I_i - T^-_i \|_2 + \alpha)
\]

\[
 \mathcal{L}_{\text{MSE}}= \frac{1}{N} \sum_{i=1}^{N} (y_i - (W I_i + b))^2
\]

\noindent\textbf{Where:} \( N \) denotes the batch size, and \( y_i \) is the ground truth value for the \( i \)-th sample. The term \( I_i \) represents the image embedding, while \( T^+_i \) and \( T^-_i \) correspond to the positive (correct) and negative (incorrect) text embeddings for that image, respectively. The triplet loss encourages the image embedding \( I_i \) to be closer to its corresponding positive text embedding \( T^+_i \) than to the negative one \( T^-_i \), by at least a margin \( \alpha \). The Euclidean distances \( \| I_i - T^+_i \|_2 \) and \( \| I_i - T^-_i \|_2 \) measure how far apart the image and text embeddings are in the embedding space, with smaller distances indicating greater similarity.
For the MSE loss, the predicted output is computed as a linear transformation \( W I_i + b \), where \( W \) is the weight matrix and \( b \) is the bias term. This prediction is then compared to the true label \( y_i \).


% \textbf{Where:}
% \begin{itemize}
%     \item \( y_i \) is the ground truth value.
%     \item \( N \) is the batch size.
%     \item \( I_i \) is the image embedding for the \( i \)-th sample.
%     \item \( T^+_i \) is the positive text embedding (correct text for image \( I_i \)).
%     \item \( T^-_i \) is the negative text embedding (incorrect text for image \( I_i \)).
%     \item \( W \) is the classifier weight matrix.
%     \item \( b \) is the bias term.
%     \item \( \alpha \) is the margin that enforces separation between positive and negative pairs.
%     \item \( \| I_i - T^+_i \|_2 \) is the Euclidean distance between the image embedding and the positive text embedding.
%     \item \( \| I_i - T^-_i \|_2 \) is the Euclidean distance between the image embedding and the negative text embedding.
% \end{itemize}

\end{document}
