\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs}
\usepackage{tcolorbox}
\usepackage[table]{xcolor}
\usepackage{booktabs}
\usepackage{arydshln}

\usepackage{multirow}
\usepackage{colortbl}
\usepackage{makecell}
% Custom colours, given as 8-bit RGB triples.
% pastelblue is the base tint mixed by \heatblue for table-cell shading.
\definecolor{lightgreen}{RGB}{220, 255, 220}
\definecolor{pastelblue}{RGB}{170, 206, 232}

\usepackage{listings}

%\usepackage{changes}

\usepackage{hyphenat}
% Listing a word with no hyphen marks tells TeX it has no permitted break
% points, i.e. "contamination" is never hyphenated at a line end.
% NOTE(review): presumably intentional (keeps the word intact) -- confirm.
\hyphenation{contamination}


\usepackage[status=final,nomargin,inline,lang=spanish]{fixme}
\usepackage{placeins}
\usepackage{xcolor}
% Reviewer-note setup for the fixme package: select the colour-signature
% theme and register one set of author-specific note commands per co-author.
% Arguments: {command prefix}{environment prefix}{tag printed with the note},
% so e.g. the "jg" entry provides \jgnote{...}.
% NOTE(review): the tag for "dg" is an empty green \textcolor, unlike the
% other authors' coloured name boxes -- confirm whether a name is missing.
\fxusetheme{colorsig}
\FXRegisterAuthor{dg}{adg}{\textcolor{green}{}}
\FXRegisterAuthor{aa}{aanote}{\colorbox{orange!20}{\textcolor{black}{Anna}}}
\FXRegisterAuthor{kl}{klnote}{\colorbox{blue!20}{\textcolor{black}{KL}}}
\FXRegisterAuthor{jg}{jgnote}{\colorbox{purple!20}{\textcolor{black}{Jaume}}}
\FXRegisterAuthor{ms}{msnote}{\colorbox{red!20}{\textcolor{black}{Martin}}}

% \rebuttal{<text>}: marks text added or changed during the rebuttal phase.
% The colour is set to black, so marked text currently renders like the
% surrounding text; change the colour here to highlight all revisions at once.
\newcommand{\rebuttal}[1]{\textcolor{black}{#1}}

% Abbreviation macros, all typeset with \emph.
% \wrt, \eg and \ie supply their own trailing space (and \eg/\ie a trailing
% comma), so they are written directly before the next word: "\eg X-rays".
% \etal and \etc add nothing after the final period.
% \wrt ends in a control space ("\ ") so the abbreviation's period is not
% treated as the end of a sentence.
\newcommand{\wrt}{\emph{w.r.t.}\ } % with respect to
\newcommand{\eg}{\emph{e.g.}, } % for example
\newcommand{\ie}{\emph{i.e.}, } % that is
\newcommand{\etal}{\emph{et al.}} % and others
\newcommand{\etc}{\emph{etc.}} % et cetera

% \heatblue{<percent>}{<content>}: heat-map table cell. Shades the cell
% background with pastelblue at <percent> intensity (xcolor "colour!N" mix
% with white), then typesets <content>.
\newcommand{\heatblue}[2]{\cellcolor{pastelblue!#1} #2}

% \perfdrop{<reference>}{<value>}: prints <value> followed by a small bold
% superscript showing the rounded difference <reference> - <value>.
%   difference > 0 : superscript in dark red, showing the difference;
%   difference < 0 : superscript in dark blue, showing its absolute value;
%   difference = 0 : <value> alone, no superscript.
% The whole output is wrapped in an \mbox so it cannot break across lines.
\newcommand{\perfdrop}[2]{%
  % Round the difference and store it as an integer usable in \ifnum.
  \pgfmathtruncatemacro{\drop}{round(#1-#2)}%
  \ifnum\drop=0
    \mbox{#2}%
  \else
    \mbox{#2\,\textsuperscript{%
      \ifnum\drop<0
        \textcolor{blue!70!black}{\scriptsize\bfseries\the\numexpr-\drop\relax}%
      \else
        \textcolor{red!70!black}{\scriptsize\bfseries\drop}%
      \fi}}%
  \fi
}

\usepackage{tcolorbox}   % For beautiful boxes
\usepackage{booktabs}    % For professional tables
\usepackage{enumitem}
\tcbuselibrary{skins, breakable} % Extensions for tcolorbox





\usepackage{mwe} % to get dummy images
%\jmlrvolume{-- Under Review}
%\jmlryear{2026}
%\jmlrworkshop{Full Paper -- MIDL 2026 submission}
%\editors{Under Review for MIDL 2026}


\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 206}
\editors{Accepted for publication at MIDL 2026}


%\title[Aloe-Vision]{Aloe-Vision: Building Robust Vision-Language Models for Healthcare} %to be updated
\title[Aloe-Vision]{Aloe-Vision: Robust Vision-Language Models for Healthcare} %to be updated

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}
%\definechangesauthor[name={Kike}, color=blue]{Kike}
 % Three or more authors with the same address:
 \midlauthor{\Name{Jaume {Guasch-Mart\'i}} \Email{jaume.guasch@bsc.es}\\
  \Name{Enrique {Lopez-Cuena}} \Email{enrique.lopez@bsc.es}\\
  \Name{Mart\'in {Su\'arez-Fern\'andez}} \Email{martin.suarez@bsc.es}\\
  \Name{Jordi {Bayarri-Planas}} \Email{jordi.bayarri@bsc.es}\\
  \Name{Anna {Arias-Duart}} \Email{anna.ariasduart@bsc.es}\\
  \Name{Dario {Garcia-Gasulla}} \Email{dario.garcia@bsc.es}\\
  \addr Barcelona Supercomputing Center (BSC)}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
%\addr $^{1}$ Address 1 \\
%\addr $^{2}$ Address 2 \AND
%\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
%}



\begin{document}

\maketitle

% LONG VERSION
% \begin{abstract}
% Large Vision-Language Models (LVLMs) specialized in healthcare are emerging as a promising research direction due to their potential impact in clinical and biomedical applications. However, progress is constrained by the scarcity of high-quality medical multimodal data, concerns about robustness in safety-critical settings, and the narrow and potentially contaminated evaluation benchmarks that limit reliable assessment. To address these issues, the field requires the state-of-the-art solutions to be fully open and reproducible systems in which all components can be inspected, evaluated, and improved. This work introduces the \textbf{Aloe-Vision} family of medical LVLMs, openly released with full weights, training recipes and data, in two scales (7B and 72B). The models are trained on \textbf{Aloe-Vision-Data}, a quality-filtered mixture integrating both medical and general domains across multimodal and text-only sources. The corpus provides broad coverage over imaging modalities, clinical categories, and anatomical structures, and is designed for direct use in model fine-tuning. Through comprehensive benchmarking, we demonstrate that balanced training mixtures produce robust LVLMs which yield significant gains over the baseline models without compromising general capabilities, achieving competitive performance with state-of-the-art alternatives. We also introduce \textbf{CareQA-Vision}, a carefully curated vision benchmark derived from MIR and EIR exams, the residency entrance exams for medical and nursing specialists in Spain, offering novel vision questions with minimal contamination. Finally, we show that current LVLMs remain vulnerable to adversarial and misleading inputs, underscoring reliability challenges in clinical contexts. Although Aloe-Vision substantially improves robustness against these vulnerabilities, further progress is needed to advance toward trustworthy clinical LVLMs.
% \end{abstract}

\begin{abstract}
Large Vision-Language Models (LVLMs) specialized in healthcare are emerging as a promising research direction due to their potential impact in clinical and biomedical applications. However, progress is constrained by the scarcity of high-quality medical multimodal data, concerns about robustness in safety-critical settings, and the narrow and potentially contaminated evaluation benchmarks that limit reliable assessment. To address these issues, the field requires state-of-the-art solutions to be fully open and reproducible systems in which all components can be inspected, evaluated, and improved. \rebuttal{This work introduces \textbf{Aloe-Vision-Data}, a large-scale, quality-filtered mixture which integrates both medical and general domains across multimodal and text-only sources, designed for direct use in model fine-tuning. Building on this dataset, we train the \textbf{Aloe-Vision} family of medical LVLMs, openly released with full weights, training recipes and data, in two scales (7B and 72B).} Through comprehensive benchmarking, we demonstrate that \rebuttal{high-quality} training mixtures produce \rebuttal{balanced} LVLMs which yield significant gains over the baseline models without compromising general capabilities, achieving competitive performance with respect to state-of-the-art alternatives. To support reliable evaluation, we introduce \textbf{CareQA-Vision}, a carefully curated vision benchmark derived from MIR and EIR exams, the residency entrance exams for medical and nursing specialists in Spain, offering novel vision questions with \rebuttal{a low likelihood of} contamination. Finally, we show that current LVLMs remain vulnerable to adversarial and misleading inputs, underscoring reliability challenges in clinical contexts.
\end{abstract}

% \begin{abstract}
% Large Vision-Language Models (LVLMs) specialized in healthcare are emerging as a promising research direction due to their potential impact in clinical and biomedical applications. However, progress is constrained by the scarcity of high-quality medical multimodal data, concerns about robustness in safety-critical settings, and the narrow and potentially contaminated evaluation benchmarks that limit reliable assessment. To address these issues, the field requires state-of-the-art solutions to be fully open and reproducible systems in which all components can be inspected, evaluated, and improved. This work introduces the \textbf{Aloe-Vision} family of medical LVLMs, openly released with full weights, training recipes and data, in two scales (7B and 72B). The models are trained on \textbf{Aloe-Vision-Data}, our quality-filtered mixture which integrates both medical and general domains across multimodal and text-only sources, designed for direct use in model fine-tuning. Through comprehensive benchmarking, we demonstrate that balanced training mixtures produce robust LVLMs which yield significant gains over the baseline models without compromising general capabilities, achieving competitive performance with state-of-the-art alternatives. We also introduce \textbf{CareQA-Vision}, a carefully curated vision benchmark derived from MIR and EIR exams, the residency entrance exams for medical and nursing specialists in Spain, offering novel vision questions with minimal contamination. Finally, we show that current LVLMs remain vulnerable to adversarial and misleading inputs, underscoring reliability challenges in clinical contexts.
% \end{abstract}


\begin{keywords}
LVLMs, Healthcare, Adversarial evaluation
\end{keywords}

\section{Introduction}

Large Vision-Language Models (LVLMs) have achieved remarkable progress in general multimodal domains, and their ability to jointly process images and text makes them naturally aligned with the multimodal nature of medicine, where visual data (\eg X-rays, CT scans, histopathology slides) must be interpreted alongside clinical narratives, patient histories, and diagnostic reports. However, despite their strong potential, progress in medical LVLMs remains limited and still falls short of human-level performance \citep{sun2024pathmmu}, largely due to three key challenges. First, the availability of high-quality, medical image-text data remains severely limited, restricting both the scale and diversity required to train LVLMs that generalize across modalities, pathologies, and anatomical structures. Second, evaluation practices rely heavily on medical VQA benchmarks that are often noisy and have been publicly available long enough to risk contamination, resulting in unreliable or overly optimistic assessments of model performance. Third, even state-of-the-art LVLMs exhibit notable vulnerabilities to adversarial, ambiguous, or misleading prompts, exposing robustness failures that are unacceptable in safety-critical clinical environments. Together, these limitations highlight an urgent need for fully open, well-documented, reproducible LVLMs that can serve as trustworthy foundations for clinical and biomedical applications. 

To address these challenges, we introduce \textbf{Aloe-Vision}, a family of open and reproducible medical LVLMs achieving competitive performance with the state of the art. The models are trained on \textbf{Aloe-Vision-Data}, a balanced and quality-filtered mixture explicitly curated to be directly usable for LVLM fine-tuning. It integrates four types of data: (1) multimodal medical datasets for visual clinical reasoning, (2) general multimodal data to preserve visual capabilities, (3) medical text-only data for domain-specific knowledge, and (4) general text-only data to maintain conversational fluency. The proportion of each data category is determined by the number of loss-contributing tokens rather than raw sample counts, ensuring uniform influence across data modalities and preventing longer samples from dominating the training signal. Beyond standard cleaning, we apply a semi-automatic filtering procedure that removes low-quality or inconsistent annotations using LVLM-based scoring and answer perplexity. We prevent leakage into benchmark datasets by eliminating duplicates through perceptual hashing. To obtain a holistic view beyond standard medical VQA benchmarks, we assess model performance across medical multimodal, medical text-only, general multimodal, and general text-only tasks. We also incorporate a new medical vision benchmark, \textbf{CareQA-Vision}, curated to assess clinical reasoning on entirely unseen cases. Finally, we evaluate the robustness of medical LVLMs under adversarial conditions, testing their reliability when confronted with misleading or contradictory multimodal cues.

In summary, the core contribution of this work is delivering fully open and reproducible medical LVLMs that the community can build upon. This is enabled by the following three components, made publicly available\footnote{\url{https://huggingface.co/collections/HPAI-BSC/healthcare-vlms-aloe-vision}}:

\begin{itemize}
%to be reviewed with the final models and add link
\item \textbf{Aloe-Vision-Data}: a ready-to-train mixture balanced across modality (multimodal vs.\ text-only) and domain (medical vs.\ general).

\item \rebuttal{\textbf{Aloe-Vision}: a family of open medical LVLMs with improved robustness against adversarial attacks and competitive performance in healthcare and general domains.}

\item \textbf{CareQA-Vision}: a benchmark curated from Spanish residency entrance exams for evaluating model performance on unseen data. %add link
\end{itemize}


\section{Related Work}
\label{sec:related_work}

% \dgnote{what does that mean?} (capable)
% \dgnote{not a smooth way of starting a section. paragraph is weird, and MMFM requires an explanation}
\paragraph{Medical Multimodal Models} The development of Large Language Models (LLMs) established a foundation for processing clinical text~\citep{singhal2023towards,chen2023meditron}. This architecture was subsequently expanded into Multimodal LLMs by integrating vision encoders to enable the processing of mixed-modality inputs~\citep{liu2023visual,bai2023qwen}. These general advancements have been translated to the healthcare domain to handle medical imaging. Early adaptations such as MedFlamingo~\citep{moor2023med}, LLaVA-Med~\citep{li2023llava}, and MedGemini~\citep{saab2024capabilities} demonstrated that fine-tuning general LVLMs on medical images yielded strong visual-question-answering (VQA) abilities, yet they remained limited to modest parameter and data scales. Subsequent developments have employed more capable foundation models with substantially larger training corpora. HuatuoGPT-Vision~\citep{chen2024huatuogpt} integrates clean PubMed VQA pairs into a model based on Yi~1.5~\citep{young2024yi}, making it one of the few fully open efforts, although its performance now lags behind more recent systems. GMAI-VL~\citep{li2024gmai} couples a three-stage alignment pipeline with 5.5M image-text pairs, but its model remains closed. Finally, the most recent Lingshu~\citep{xu2025lingshu} and Hulu-Med~\citep{jiang2025hulu} improved the state of the art but are only partially open, limiting transparency and evaluation. Our work introduces \textbf{Aloe-Vision}, which matches the performance of these state-of-the-art systems while offering fully reproducible medical LVLMs at 7B and 72B scales, released with complete training data, recipes, and preprocessing steps.

\paragraph{Large-Scale Medical Multimodal Datasets} Training corpora have been scaled from thousands to millions of pairs. PubMedVision~\citep{chen2024huatuogpt} filters and refines existing PubMed image collections (PMC-OA, LLaVA-Med PMC, PMC-Inline), yielding 915K medical images that generate 1.3 million VQA pairs via GPT-4V synthesis. GMAI-VL-5.5M~\citep{li2024gmai} aggregates 219 expert datasets across 13 imaging modalities and 18 specialties, converting classification and detection annotations into 5.5 million caption and instruction samples using GPT-4o; however, the dataset itself is not publicly released. MedTrinity-25M~\citep{xiemedtrinity} employs retrieval-augmented generation with domain-specific segmentation models to auto-generate ROI-grounded triplets for 25 million images without requiring paired text descriptions. Recent works assemble four-way mixtures spanning medical multimodal, medical text, general multimodal, and general text: Lingshu~\citep{xu2025lingshu} curates such a mixture but does not disclose category ratios, while Hulu-Med~\citep{jiang2025hulu} reports a similar four-way composition but does not release the mixture. In contrast, we release the first \textbf{ready-to-train}, balanced mixture which uses a loss-token-based weighting scheme to avoid overfitting to longer samples.

\paragraph{Evaluation Benchmarks} Traditional medical VQA datasets such as SLAKE~\citep{liu2021slake}, PathVQA~\citep{he2020pathvqa}, and VQA-RAD~\citep{lau2018dataset} remain widely used but are too limited to evaluate modern LVLMs effectively. This gap has motivated the development of more comprehensive and challenging benchmarks. GMAI-MMBench~\citep{ye2024gmai} consolidates 284 expert datasets into 26k questions spanning region-, box-, mask-, and image-level reasoning across 38 imaging modalities and 18 clinical departments. OmniMedVQA~\citep{hu2024omnimedvqa} converts 73 medical classification datasets into 128k multiple-choice questions (MCQ) covering 12 modalities and over 20 anatomical regions. In pathology, PathMMU~\citep{sun2024pathmmu} provides 33k expert-validated QA pairs, demonstrating that LVLMs significantly underperform board-certified pathologists. ProbMed~\citep{yan2025worse} introduces adversarial evaluation by pairing ground-truth queries with negated hallucination versions. We assemble a comprehensive benchmark suite that unifies medical multimodal, medical text-only, general multimodal, and general text-only tasks to obtain a holistic view of model quality. To do so, we combine the benchmarks described above, introduce \textbf{CareQA-Vision} as a contamination-free medical vision benchmark, and evaluate LVLMs under adversarial conditions using the HEART framework~\citep{heart2025}\jgnote{update cite to HEART}.

% the scope of multimodal robustness evaluation in healthcare lags behind the general domain, where recent works study adversarial conditions such as misleading text \citep{zhu2024unraveling,zhang2025modalities,deng2025words}

% Maybe too many subsections?
\section{Training Data}
\label{sec:training_data}
%Prior medical LVLM work has largely relied on medical multimodal corpora, which can overfit to vision-heavy tasks and cause catastrophic forgetting of general and textual skills. \klnote{citation?} \jgnote{or maybe remove the whole sentence?} 

To preserve broad conversational competence while improving medical visual reasoning, we construct a balanced training mixture along two axes: modality (multimodal vs.\ text-only) and domain (medical vs.\ general). Within medical multimodal data, both global image understanding and fine-grained, region-referenced reasoning are included. The final \textbf{Aloe-Vision-Data} mixture draws from eight datasets, as detailed in Table \ref{tab:final_mixture}.

\begingroup
\setlength{\aboverulesep}{0pt}
\setlength{\belowrulesep}{0pt}
% (optional) add a touch of row breathing room since we removed rule padding:
\setlength{\extrarowheight}{.6ex}

\begin{table*}[t]
\centering
\small
\caption{Final composition of the SFT training mixture after preprocessing, leakage removal, quality filtering, and token-based rebalancing.}
\label{tab:final_mixture}
\resizebox{\textwidth}{!}{%
\begin{tabular}{p{5.3cm}ccccc}
\toprule
% \addlinespace[0.6ex]
\rowcolor{gray!5}
% \textbf{Dataset} & \textbf{Samples} & \textbf{\shortstack{Loss \\ tokens (M)}} & \textbf{Modality} & \textbf{Domain} & \textbf{\shortstack{Boxes\\Regions}} \\
\textbf{Dataset} & \textbf{Samples} & \textbf{\shortstack{Loss tokens (M)}} & \textbf{Modality} & \textbf{Domain} & \textbf{\shortstack{B. Boxes}} \\
\midrule
PubMedVision~\cite{chen2024huatuogpt}     & 1.26M & 175.3 & MM   & Medical & No  \\  %\rowcolor{blue!4}  
MedMax~\cite{bansal2024medmax}            & 409K  & 33.7  & MM   & Medical & No  \\ %\rowcolor{blue!4}
MeCoVQA~\cite{huang2025towards}           & 27.5K & 0.7   & MM   & Medical & Yes \\ %\rowcolor{blue!4}
Med-GRIT~\cite{huang2024refer}            & 17.7K & 2.6   & MM   & Medical & Yes \\ %\rowcolor{blue!4} 
MedTrinity-25M~\cite{xiemedtrinity}       & 330K  & 55.5  & MM   & Medical & Yes \\ %\rowcolor{blue!4} 
Cambrian-10M~\cite{tong2024cambrian}    & 668K  & 65.4  & MM   & General & No  \\ %\rowcolor{yellow!4}
Aloe~\cite{gururajan2024aloe}            & 756K  & 190.3 & Text & Medical & --  \\ %\rowcolor{green!4} 
Magpie-Ultra-v1.0~\cite{huggingface_magpie_ultra} & 100K  & 116.6 & Text & General & --  \\ %\rowcolor{red!7}
\midrule
\textbf{Total}                            & 3.57M & 640.0 & -- & -- & -- \\ %\rowcolor{gray!10}
\bottomrule
\end{tabular}%
}
\end{table*}
\endgroup



% Supervised fine-tuning (SFT) of LVLMs is formulated as a dialogue supervision over triplets \texttt{(image, question, answer)}.
\paragraph{Preprocessing and normalization.}
Samples are converted to a unified conversational schema (alternating \texttt{user}/\texttt{assistant} messages), enabling multi-turn dialogue. This structure supports interleaved multimodal inputs, allowing for sequences containing multiple images mixed with text. Region-level supervision is standardized using Qwen2-VL~\citep{wang2024qwen2} format, using the markers
\texttt{<|box\_start|>}(x\textsubscript{tl}, y\textsubscript{tl}), (x\textsubscript{br}, y\textsubscript{br})\texttt{<|box\_end|>} and normalizing coordinates to $[0,1000)$. Cleaning steps included (1) removal of missing/corrupted images, (2) a $50{\times}50$ minimum size, (3) capping at $5$ images/sample to avoid training instability and (4) sequence-length filtering at 4096 tokens. MedTrinity-25M~\citep{xiemedtrinity} is randomly subsampled to 400K sequences to avoid over-representation of images with rendered boxes.
% Within the conversation text, visual inputs are represented by a special \texttt{<image>} placeholder token.

\paragraph{Evaluation leakage prevention.}
\label{subsec:eval_leakage}
Leakage into evaluation sets is explicitly controlled using 64-bit perceptual hashing (pHash)~\citep{imagehash} matching between all training and evaluation images, which detects near-duplicate images even under resizing or compression. This removes 6{,}273 training samples, ensuring that reported evaluation gains are not inflated by overlap between the evaluation sets and the data used to fine-tune the models.


% A figure of the quality filtering process could be interesting
\paragraph{Semi-automatic quality filtering.}
\label{subsec:quality_filtering}
Manual inspection of medical multimodal datasets revealed low-quality cases (\eg answers written on the image, image irrelevant to the question, mismatched question and answer; see Figure~\ref{fig:filtering}). Given the scale, a two-signal semi-automatic filter is adopted:

\begin{itemize}
    \item \textbf{LVLM tagging.} Qwen2.5-VL-72B-Instruct~\citep{yang2025qwen2} is prompted to produce a 1--5 quality score per sample based on coherence and relatedness between image, question, and answer. See an excerpt of the prompt in Appendix~\ref{app:tagging_template}.
    \item \textbf{Answer perplexity.} Qwen2-VL-7B-Instruct~\citep{wang2024qwen2} is used to compute the perplexity of the answer conditioned on the image and question. Very low perplexity often flags trivial answers (\eg answers visible in the image), whereas very high perplexity indicates noisy or incorrect annotations.
\end{itemize}


\noindent Thresholds for both quality scores and perplexity are manually defined per source by reviewing high- and low-score/perplexity examples, ensuring that filtering adapts to the specific characteristics of each dataset. In total, 541{,}237 samples are excluded. % (Filtering process with low-quality examples in Figure~\ref{fig:filtering}).



\paragraph{Token-based rebalancing.}
Early training runs showed biases related to answer length even though sample counts were balanced. To correct this, mixtures are rebalanced by \emph{loss-contributing tokens} (assistant tokens only), rather than by examples. For each dataset, token statistics (total, loss, text, image) are computed and sources with long answers, such as Chain-of-Thought reasoning traces, are subsampled to equalize their effective gradient contribution. This procedure preserves the intended modality/domain proportions while mitigating bias from long-form datasets.



\begin{figure}[t]
\centering
\includegraphics[width=1.0\textwidth]{figures/output_5cat.png}
\caption{Category coverage analysis of the final training mixture across imaging modality (rows) and medical specialty (columns).}
\label{fig:mod_med}
\end{figure}

% TODO: Add body part figure to appendix
\paragraph{Coverage analysis.} We assess dataset diversity across three axes (image modality, medical specialty, and anatomical structure), focusing on the coverage of their combinations rather than balance along each individual axis. During the semi-automatic quality filtering (Section~\ref{subsec:quality_filtering}), Qwen2.5-VL-72B-Instruct is additionally prompted to tag each sample with categories along these axes using the information provided by the triplet (question, image, answer), enabling construction of coverage heatmaps. Figure~\ref{fig:mod_med} illustrates the \textit{image modality} vs.\ \textit{medical specialty} distribution. After excluding nonsensical cases (e.g., \emph{fundus-bones}, \emph{angiography-dentistry}), the dataset exhibits strong and balanced representation.

\paragraph{Final mixture. }
\label{subsec:final_mixture}
The final SFT mixture contains $\sim$\textbf{3.57M samples} and $\sim$\textbf{640M loss tokens} (see Table~\ref{tab:final_mixture}). By loss tokens, the allocation is:
\emph{medical multimodal} 41.8\% (267.8M),
\emph{medical text-only} 29.7\% (190.3M),
\emph{general text-only} 18.2\% (116.6M),
and \emph{general multimodal} 10.2\% (65.4M).
Overall, multimodal data contributes 52\% of loss tokens and medical data represents 71.5\%.

\section{Evaluation data}
\label{sec:eval_data}

% TODO: Add incorrect examples in appendix
We evaluate models using a comprehensive benchmark suite that spans medical multimodal, medical text-only, general multimodal, and general text-only tasks, providing a unified and reproducible assessment of overall model quality. We standardize execution using \texttt{VLMEvalKit}~\cite{duan2024vlmevalkit} and \texttt{lm-eval-harness}~\cite{eval-harness} to ensure consistent and reproducible evaluation across models. Prior studies focus on a limited subset of medical VQA benchmarks (\eg PathVQA, VQA-RAD, SLAKE), which not only narrow the evaluation scope but also risk data leakage, as many of these datasets have been publicly available for years. After manual inspection, we exclude PathVQA and VQA-RAD due to quality concerns (\eg incorrect references, image-independent questions) and adopt newer, higher-fidelity benchmarks that better capture clinical diversity and reasoning competence. The final benchmark suite is summarized in Table~\ref{tab:evaluation_suite_compact}.

\begingroup
\setlength{\aboverulesep}{0pt}
\setlength{\belowrulesep}{0pt}
\setlength{\extrarowheight}{.3ex}

\begin{table*}[t]
\centering
\footnotesize
\setlength{\tabcolsep}{6pt} % tighter spacing between columns
\caption{Evaluation suite. MCQ = multiple-choice; Y/N = yes/no; OE = open-ended (J = LLM-as-judge).}
\label{tab:evaluation_suite_compact}
%\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccr}
\toprule
\rowcolor{gray!5}
\textbf{Benchmark} & \textbf{Modality} & \textbf{Domain} & \textbf{Task} & \textbf{Samples} \\
\midrule
PathMMU~\cite{sun2024pathmmu}           & Multi & Medical & MCQ    & 1.1K \\ %\rowcolor{blue!4}
GMAI-MMBench~\cite{ye2024gmai}          & Multi & Medical & MCQ    & 4.5K \\ %\rowcolor{blue!4}
OmniMedVQA~\cite{hu2024omnimedvqa}      & Multi & Medical & MCQ    & 89K  \\ %\rowcolor{blue!4} 
ProbMed~\cite{yan2025worse}                 & Multi & Medical & Y/N    & 57K  \\ %\rowcolor{blue!4} 
SLAKE~\cite{liu2021slake}               & Multi & Medical & OE (J) & 2K   \\ %\rowcolor{blue!4} % Comment: Didn't SLAKE have a closed subset?
MMMU~\cite{yue2024mmmu}                 & Multi & General & MCQ    & 1.4K \\ % \rowcolor{yellow!4}
MultiMedQA~\cite{singhal2023publisher}  & Text  & Medical & MCQ    & 7K   \\ %\rowcolor{green!4} 
MMLU~\cite{hendrycksmeasuring}          & Text  & General & MCQ    & 14K  \\ %\rowcolor{red!4}
\bottomrule
\end{tabular}%
%}
\end{table*}
\endgroup



\paragraph{CareQA-Vision.} To ensure a contamination-free evaluation, we curate an extension of the CareQA dataset~\citep{arias2025automatic}. CareQA-Vision is derived from MIR and EIR exams, the residency entrance exams for medical and nursing specialists in Spain, where all questions are curated by medical experts. It incorporates vision-based questions in nursing and medicine from 2020 to 2024 exams. Originally in Spanish and presented in a multiple-choice format, the questions are translated to English and, where possible, converted into an open-ended format using Qwen2.5-72B-Instruct~\citep{yang2025qwen2}, followed by manual verification. Although some images include Spanish words, they are not required to solve the tasks. CareQA-Vision contains a total of 301 questions: 70 from nursing and 231 from medicine. Closed-ended questions represent 60\% of nursing and 53.2\% of medicine queries, with the remaining 136 questions being open-ended. Figure~\ref{fig:careqa_vision} shows both open-ended and closed-ended examples. \rebuttal{Despite its relatively limited size, this dataset provides a meaningful indicator of model performance in the healthcare domain, as it consists of high-quality, expert-reviewed questions with a low risk of training-set contamination.}

% This dataset offers a robust measure of model generalization to unseen data and it is also publicly available.
%Fewer questions could be converted to open-ended form because some require expert knowledge to rephrase without ambiguity.

\begin{figure}[tb] 
    \centering
    \includegraphics[width=0.745\linewidth]{figures/careqa_green.png}  
    \caption{CareQA-Vision examples. Top: a medical MCQ, with the correct option in \textbf{bold}. Bottom: a nursing question in an open-ended format.
    }
    \label{fig:careqa_vision}  
\end{figure}




\begin{figure}[thb] 
    \centering
    \includegraphics[width=0.745\linewidth]{figures/examples_adversarial_new.png}  
    \caption{Adversarial examples, correct option in \textbf{bold}. Left: sycophancy example based on a detection task. Right: caption example for a classification task.} % (image from the DENTEX dataset~\citep{hamamci2023diffusion, hamamci2023dentex})  (image from the ISIC 2017 dataset~\citep{codella2018skin})
    \label{fig:adv_example}  
\end{figure}

\paragraph{Adversarial Benchmark.} To assess the robustness of state-of-the-art LVLMs under misleading conditions, we evaluate models using the HEART adversarial benchmark~\citep{heart2025}. \rebuttal{Constructed from eight existing medical datasets spanning multiple imaging modalities (\eg X-ray, MRI, ultrasound), it} includes both detection and classification tasks (see Figure~\ref{fig:adv_example}, left and right, respectively) and introduces four types of adversarial attacks designed to probe whether model predictions remain grounded in the visual evidence. These include: (1) Sycophancy, where suggested labels or bounding boxes are inserted into the prompt, (2) Captions, where incorrect captions are embedded directly within the image, (3) Prompt, where incorrect captions are provided in the text prompt, and (4) Legends, where mismatched legends accompany the image. \rebuttal{In total, the benchmark contains 24,311 samples, out of which 2,979 represent a baseline that serves for comparison against each of the adversarial attack types.} These settings test whether models can accurately interpret the underlying visual content despite the presence of misleading distractors.



\section{Experiments}
\label{sec:experiments}

We evaluate \emph{Aloe-Vision-7B} and \emph{Aloe-Vision-72B} against state-of-the-art LVLMs using the benchmark suite described in Section~\ref{sec:eval_data}. In addition to standard performance comparisons, we report results on the adversarial robustness evaluation.  Comprehensive ablation studies are provided in Appendix~\ref{appendix:ablations}, covering the effect of (1) training-mixture composition, (2) evaluation-leakage control, and (3) semi-automatic quality filtering.

\subsection{Setup}
All experiments use single-stage supervised fine-tuning (SFT) with the \texttt{TRL} library~\citep{vonwerra2022trl} on alternating \texttt{user}/\texttt{assistant} dialogues, optimizing next-token cross-entropy over assistant turns. We fine-tune the Qwen2-VL-Instruct family~\citep{wang2024qwen2} at 7B and 72B parameters, retaining the native multi-image interface and the Qwen-style chat template. The resulting models are referred to as \emph{Aloe-Vision-7B} and \emph{Aloe-Vision-72B}. Compute runs were performed on two European HPC systems: Leonardo (CINECA, A100\,64GB\,$\times$4 per node) and MareNostrum~5 ACC (BSC, H100\,64GB\,$\times$4 per node). The full training configurations for both models are summarized in Appendix~\ref{appendix:training_config}. % The full training configurations for both models, including batch sizes, optimizer settings, and resource allocation, are summarized in Appendix~\ref{appendix:training_config}.

\paragraph{Models.} We benchmark our models against a set of open-source LVLMs, categorized into: (1) General-purpose LVLMs, specifically the Qwen2-VL~\citep{wang2024qwen2} and Qwen3-VL~\citep{bai2025qwen3} families, and Kimi-VL~\citep{team2025kimi}, which serve as strong reference for general visual capabilities; and (2) Specialized Medical LVLMs, including HuatuoGPT-Vision~\citep{chen2024huatuogpt}, Lingshu~\citep{xu2025lingshu}, Hulu-Med~\citep{jiang2025hulu} and Chiron-o1~\citep{sun2025enhancing}. Finally, we include MiMo-VL~\citep{yue2025mimo} and GLM-4.5V~\citep{hong2025glm}, representing the emerging class of \textit{reasoning} models trained to use extended thought processes for complex visual problems. \rebuttal{In addition, we include GPT-5.2~\citep{openai_gpt52} as a closed-source baseline to contextualize the performance of open and reproducible medical LVLMs against state-of-the-art proprietary systems.}

\paragraph{Adversarially Robust Aloe-Vision.} To mitigate the impact of adversarial attacks, we develop an adversarially robust (AR) variant of Aloe-Vision by introducing an additional post-training SFT stage using adversarial samples. The adversarial training set is created by applying all attack types described above \textbf{exclusively to a single imaging modality} (FracAtlas~\citep{abedeen2023fracatlas}), which is not included in the HEART benchmark. The purpose of this design is to test whether robustness learned from one modality can generalize to others (\ie whether fine-tuning on adversarial examples from a single source domain yields cross-modal and cross-specialty robustness). This second stage consists of a single SFT epoch over 17.2k samples.

\paragraph{Evaluation protocol.} All multimodal benchmarks are implemented and run within VLM\-EvalKit~\cite{duan2024vlmevalkit}, and all text-only benchmarks are evaluated with \textit{lm-evaluation-harness}. For every model, the benchmarks are run under identical settings to ensure fair and reproducible comparison. For multimodal tasks, inference uses greedy decoding, and accuracy is computed via exact string match for multiple-choice and yes/no formats. Open-ended answers in SLAKE and CareQA-Vision are scored with \rebuttal{a majority voting LLM-as-judge protocol, where Qwen2.5-VL-72B~\cite{yang2025qwen2}, Llama-3.3-70B~\citep{grattafiori2024llama} and Olmo-3-32B~\citep{olmo2025olmo}} assign one of three rubric-based scores \{0.0, 0.5, 1.0\}. For text-only tasks, prediction follows the standard multiple-choice evaluation in \texttt{lm-evaluation-harness}, selecting the option with the highest log-likelihood as the model's answer.

\paragraph{Human Evaluation.} Before reporting quantitative results, we assess the reliability of the judges (Qwen2.5-72B-Instruct~\citep{yang2025qwen2},  \rebuttal{Llama-3.3-70B~\citep{grattafiori2024llama} and Olmo-3-32B~\citep{olmo2025olmo}}) used in open-ended benchmarks. To do so, we sample the open-ended medical subset of CareQA-Vision and ask experts to evaluate Aloe-Vision-72B-AR's answers following the same criteria as the LLM judge. For each question, experts were shown the correct answer (from the original exam) and the model's answer, and were asked to classify the model output as \textit{correct}, \textit{partially correct}, or \textit{incorrect} (see Appendix~\ref{sec:expert_eval} for details). \rebuttal{Inter-evaluator agreement among human experts, computed independently of the LLM judges, is moderate (Krippendorff's $\alpha = 0.796$)~\cite{krippendorff2011computing}, reflecting the inherent difficulty and subjectivity of the task. We then assess agreement when incorporating LLM-based judging by treating the automatic evaluation as the majority vote of the three independent LLM judges, and comparing this aggregated decision against the human annotations. Under this setup, overall agreement increases to $\alpha = 0.812$, indicating that the LLM judges are largely consistent with human expert assessments.}
Based on these findings, we consider the judges' evaluation sufficiently reliable, while acknowledging potential biases, such as a tendency to favor longer answers.

%all reviewers agreed with the model's label. In contrast, in 5,71\% of the cases the reviewers agreed with each other bu disagreed with the model. In these situations, the experts were consistently more conservative: when experts labeled an answer as incorrect, the model labeled it as partially correct; when experts labeled an answer as partially correct, the model labeled it as correct. Additionally, there were three questions (2.86\%) where only one of the three evaluators agreed with the model, while the other two disagreed with each other. Based on these findings, we consider the judge's evaluation is sufficiently reliable, while acknowledging potential biases, such as a tendency to favor longer answers over more concise ones. 

\subsection{Results}\label{sec:eval_results}

\input{tables/results_table}

\paragraph{Evaluation Results}
\rebuttal{Table~\ref{tab:model_performance} summarizes the evaluation results, covering both general-purpose LVLMs (highlighted in gray) and medical LVLMs. Overall, the results reveal a clear separation between models trained with domain-specific medical data and general-purpose counterparts, as well as a consistent scaling trend with increasing model size. A notable exception is observed on the SLAKE benchmark, where Lingshu and Hulu-Med outperform the remaining models by approximately 15--20\%. This gap is largely explained by the inclusion of the SLAKE training split in their respective training mixtures, exposing these models to highly similar samples. In contrast, Aloe-Vision models explicitly exclude all SLAKE examples from training, resulting in what we consider to be a more conservative and fair estimate of generalization performance on this benchmark.}

\paragraph{Small models ($<$10B).}
\rebuttal{Within the small-scale parameter group, Hulu-Med-7B achieves the strongest overall results, ranking first on four out of the ten benchmarks. The remaining datasets are led by Qwen3-VL-8B-Instruct, Aloe-Vision-7B-AR, and Lingshu-7B. At this model scale, the results suggest that architectural and pretraining advances in newer base models significantly improve performance, with Qwen3-VL consistently outperforming its Qwen2-VL counterparts across benchmarks.}

\paragraph{Large models (20B--106B).}
\rebuttal{Increasing model size yields consistent performance improvements across all evaluated benchmarks. To the best of our knowledge, there are currently no other open-source medical LVLMs at the 70B parameter scale. As a result, Aloe-Vision-72B is compared against the strongest available medical models, which are limited to the 32B–34B range. Among all evaluated models, Aloe-Vision-72B-AR achieves the strongest overall performance, combining high accuracy on general and text-only benchmarks with robust results on medical-specific datasets. Interestingly, despite its substantially smaller parameter count, Hulu-Med-32B achieves comparable performance and in some benchmarks it even surpasses Aloe-Vision-72B, highlighting the impact of architectural choices, training data scale, and training strategy beyond model size alone.}

\paragraph{Reasoning-oriented models.}
\rebuttal{Finally, we observe that models explicitly post-trained to generate intermediate reasoning traces before producing a final answer (GLM-4.5V, MiMo-VL-7B-RL, and Chiron-o1-8B) do not consistently outperform other models across the benchmark suite. Among these, only GLM-4.5V achieves top performance on two benchmarks, an advantage that may be primarily attributable to its substantially larger scale (106B parameters). Overall, these findings suggest that the majority of current medical vision-language benchmarks are predominantly knowledge-based and can be effectively addressed without multi-step reasoning. This observation raises an important open question regarding the practical utility of reasoning-oriented models in the medical vision domain and highlights the need for dedicated benchmarks that require and evaluate structured reasoning capabilities.}

\paragraph{Closed-source baseline.}
\rebuttal{GPT-5.2 provides a useful reference point for the current performance level of proprietary multimodal systems. It achieves the strongest results on CareQA-V MCQ, CareQA-V VQA, MultimedQA, and MMLU, indicating particularly strong performance on text-only benchmarks. At the same time, several open models surpass GPT-5.2 on OMVQA, GMAI, PathMMU, ProbMed, SLAKE, and MMMU, showing that targeted domain adaptation can substantially narrow, and in some cases overcome, the gap to proprietary systems on medical vision-language evaluation. However, we view GPT-5.2 primarily as a contextual baseline rather than a directly comparable model, since the evaluated open-source baselines are standalone models and GPT-5.2 is a full system that may incorporate capabilities extending beyond the underlying model alone.}

\paragraph{Generalization and Robustness.}
\rebuttal{Both Aloe-Vision models perform particularly well on the CareQA-Vision benchmarks, indicating generalization to unseen medical distributions. Furthermore, Aloe-Vision and Aloe-Vision-AR exhibit nearly identical performance across all standard (non-adversarial) benchmarks, regardless of model size. This result indicates that the additional supervised fine-tuning stage introduced to improve robustness does not come at the cost of reduced accuracy on conventional evaluation settings.}


% REVISAR esta subsecci\'on
 
% The results are summarized in Table~\ref{tab:model_performance}, which presents general-purpose LVLMs (highlighted in gray) and medical LVLMs. Among the smaller models ($<$10B), Hulu-Med-7B achieves the best overall performance, ranking first in four of the ten evaluated benchmarks. The remaining six datasets are led by Qwen3-VL-8B-Instruct, Aloe-Vision-7B-AR and Lingshu-7B. Aloe-Vision-7B delivers consistently competitive results, typically ranking within the top three across benchmarks.

%When moving to larger models (20B-106B), we observe performance gains in all benchmarks. In this parameter range, two models stand out: Hulu-Med-32B and Aloe-Vision-72B-AR. Notably, Aloe-Vision-72B-AR achieves the strongest overall performance since it not only demonstrates competitive performance in multimodal-medical benchmarks but also  achieves the best results in text-only medical benchmarks (likely due to our token-based balancing approach). Additionally, Aloe-Vision-72B-AR also shows the strongest performance on general benchmarks among the healthcare-specialized models, demonstrating that high medical knowledge can be achieved without losing general capabilities.

% On a dataset-specific analysis, it can be observed that both Lingshu and Hulu-Med show greater performance than other models by a large margifn in the SLAKE benchmark, most likely because they include the training partition of such dataset in their training mixture, whereas we explicitly exclude it from our mixture. On the other hand, both Aloe-Vision models stand out on the CareQA-Vision datasets, achieving the best scores in the closed setting and second best only below GLM-4.5V in the open set, which performs best due to its size and thinking capabilities. This underscores their ability to generalize to entirely unseen medical data. 

%\jgnote{Comment training partition of SLAKE in training data increases performance. Comment 72B models are not available for comparison. Distinguish dataset sizes. Distinguish medical from general. Comment on reasoning models (MiMo, Chiron, GLM?). Comment the second finetuning to produce AR does not decrease performance.}


% TODO: Extend results comments

% "Worse" than Hulu-Med due to training set size (should we say this?)
% GLM better on open subsets due to size and thinking capabilities
% Thinking vs No Thinking (GLM & Kimi - Notable performance relative to model size in careqa VQA maybe?)

\input{tables/adv_results}

\paragraph{Adversarial Results}
While previous results show performance on standard benchmarks, we next assess model robustness under more challenging adversarial conditions. As shown in Table~\ref{tab:robustness}, strong performance on standard evaluations does not necessarily imply robustness when models face ambiguous or adversarial inputs. These results are obtained using the adversarial datasets described in the previous section (see Figure~\ref{fig:adv_example}). The first column (\textit{Base}) shows the results without any modifications, while the remaining columns correspond to evaluations with different adversarial attacks. To differentiate the effect of attacks on global versus region-specific predictions, we report results separately for classification tasks (using image-level labels) and detection tasks (using bounding-box-level labels).

Most models show substantial degradation when misleading information is introduced, indicating that high baseline accuracy (\eg Hulu-Med or Lingshu) does not necessarily guarantee robustness. Among the different adversarial subsets, embedding misleading captions in the image is the most damaging strategy, with nearly all models collapsing under this condition, especially in the classification tasks (\eg Qwen2-VL-7B-Instruct drops from 51.6 to 0.2, and Lingshu-7B from 78.8 to 1.2).
Regarding detection and classification tasks, detection generally proves more resilient than classification, suggesting that spatial grounding offers partial protection against textual manipulation. 
% In contrast, the sycophantic setting primarily exposes a weakness in smaller models, which suffer large drops when prompts encourage agreement, revealing a bias toward following misleading user intent rather than relying on visual evidence. 


As expected, Aloe-Vision-AR models consistently outperform their standard counterparts across the adversarial settings, confirming that explicit robustness training mitigates susceptibility to misleading or sycophantic cues. Furthermore, this confirms that robustness learned from FracAtlas transfers reliably to unseen medical specialties and image modalities.

Notably, GPT-5.2 shows significant performance drops under adversarial conditions, especially in caption-based attacks, indicating that even advanced proprietary models remain susceptible to misleading multimodal inputs.


\section{Conclusion}
\rebuttal{We introduced \textbf{Aloe-Vision-Data}, a large-scale, quality-filtered and token-balanced instruction mixture spanning medical and general domains across multimodal and text-only sources, together with a fully reproducible training and evaluation pipeline for medical LVLM fine-tuning. Building on this foundation, we released the \textbf{Aloe-Vision} model family at 7B and 72B scales and demonstrated competitive performance across a broad benchmark suite that jointly measures medical multimodal, medical text-only, and general capabilities. To support reliable assessment, we proposed \textbf{CareQA-Vision}, a contamination-resistant benchmark derived from expert-curated Spanish residency exams, enabling evaluation on previously unseen medical cases. Finally, our adversarial analysis shows that strong standard-benchmark performance does not guarantee reliability under misleading inputs, and that targeted robustness fine-tuning improves resistance to such attacks without degrading conventional accuracy. Together, these resources provide a fully open and reproducible foundation for advancing trustworthy medical vision-language modeling. Building on this foundation, an important direction for future research is to systematically disentangle the relative contributions of model scale, data scale and quality, and training curricula, in order to better characterize the performance trade-offs in medical vision-language models.}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{

Anna Arias-Duart, Jordi Bayarri-Planas, and Jaume Guasch-Mart\'i acknowledge their AI4S fellowship within the “Generaci\'on D” initiative by Red.es, Ministerio para la Transformaci\'on Digital y de la Funci\'on Pública, for talent attraction (C005/24-ED CV1), funded by NextGenerationEU through PRTR. This work was also partially supported by the %AI4EUROPE project (Grant Agreement No. 101070000) and the 
ELLIOT project funded by the European Union under Grant Agreement No. 10121439. We also acknowledge the computational resources provided by CINECA and the Barcelona Supercomputing Center (BSC). We are particularly grateful to the Operations department at BSC for their technical support. Finally, we would like to thank all the healthcare experts who participated, especially Anabel Antol\'inez Dueñas, Marina Arias Duart, Jordi Farguell Piulachs, Elena Toledo, and Andreu Pacheco Agust\'i for their time and expertise.}

\bibliography{midl26_206}

\clearpage
\appendix

\section{Quality Filtering}
\label{app:quality_filtering}

Before supervised fine-tuning, we apply a semi-automatic quality filtering pipeline to remove incoherent or low-value samples from the training pool, as explained in Section~\ref{subsec:quality_filtering}. Figure~\ref{fig:filtering} illustrates typical failure modes captured by this process.

\begin{figure}[htb] 
    \centering
    \includegraphics[width=0.8\linewidth]{figures/filtering.png}  
    \caption{Semi-automatic quality filtering process. Below are examples of low-quality samples identified during filtering. Left: answer appears in the image (low score, low perplexity). Right: answer unrelated to the image (low score, high perplexity).} \label{fig:filtering}  
\end{figure}

\subsection{Tagging Template}
\label{app:tagging_template}
The following prompt is used with Qwen2.5-VL-72B-Instruct~\cite{yang2025qwen2}. We utilize a structured template that defines the task and output format, followed by a constrained taxonomy of tags.

% --- THE PROMPT BOX ---
\begin{tcolorbox}[
    enhanced,
    title={\textbf{System Prompt: Medical Sample Extraction}},
    colframe=black!70,       % Dark grey frame
    colback=gray!5,          % Very light grey background
    coltitle=white,
    fonttitle=\bfseries\sffamily,
    boxrule=0.5mm,
    sharp corners=south,     % Sharp corners at bottom
    rounded corners=north,   % Rounded at top
    breakable                % Allows splitting across pages
]
\sffamily % Switch to Sans-Serif for the prompt text to distinguish from body
\small

You are an expert medical assistant designed to categorize samples in different ways. A sample is composed of an optional image (or list of images), a question (or list of questions), and an answer (or list of answers). \\

Determine for each of the following sections which of the possible tags provided best describe the sample. For each category, you must only use the tags explicitly listed. If none apply, use the \textit{Other} tag.

\begin{center}
    \textit{[...Taxonomy definitions omitted for brevity, see Table \ref{tab:taxonomy}...] }
\end{center}

Taking into account that for each category type, you must provide the tags in priority order, respond using the following format:

\texttt{MOD[Modality] MED[Medical Field] ST[Sample Type] SBP[Specific Body Part] GBP[General Body Part] SQ[Sample Quality]} \\

\textbf{Example:} \\
\textit{Image:} \{CT scan\} \\
\textit{Question:} Is there evidence of a fracture in the distal radius? \\
\textit{Answer:} Yes \\
\textit{Response:} \texttt{MOD[X-ray] MED[Orthopedics] ST[Abnormality Detection] SBP[Bones] GBP[Upper limbs] SQ[4]}

\begin{center}
\textit{[Additional examples omitted for brevity]}
\end{center}
\end{tcolorbox}

\subsection{Taxonomy Definitions}
Table~\ref{tab:taxonomy} details the five categorization dimensions and their vocabularies, together with the sample-quality score.

% --- THE CATEGORY TABLE ---
\begin{table}[h!]
    \centering
    \small
    \renewcommand{\arraystretch}{1.3} % Adds vertical breathing room
    % p{width} allows text wrapping without tabularx
    \begin{tabular}{@{} l p{0.75\textwidth} @{}} 
    \toprule
    \textbf{Dimension} & \textbf{Constrained Tags} \\
    \midrule
    \textbf{Modality} & X-ray, CT, MRI, Ultrasound, PET, SPECT, Microscopy, Dermatoscopy, Fundus, OCT, Endoscopy, Fluoroscopy, Angiography, Infrared, Figures (Graphs/Charts), Other \\
    \midrule
    \textbf{Medical Fields} & Cardiology, Neurology, Oncology, Orthopedics, Gastroenterology, Pulmonology, Dermatology, Ophthalmology, Pathology, Dentistry, Obstetrics \& Gyn., Endocrinology, Hematology, Nephrology, Surgery, Infectious Diseases, Other \\
    \midrule
    \textbf{Sample Types} & Diagnosis, Abnormality Detection, Modality ID, Anatomy ID, Comparison, Procedural Explanation, Visual Attributes, Severity Est., Prognosis, Treatment Suggestion, Etiology, Image Description, Result Analysis, Other \\
    \midrule
    \textbf{Specific Body Parts} & Cell, Brain, Lungs, Heart, Stomach, Intestines, Liver, Pancreas, Spleen, Kidneys, Spine, Pelvis, Bones, Skin, Eyes, Teeth, Blood Vessels, Muscles, Joints, Breasts, Ears, Nose, Throat, Reproductive Organs, Non-body part, Other \\
    \midrule
    \textbf{General Body Parts} & Head, Neck, Upper limbs, Lower limbs, Abdomen, Thorax, Pelvis, Non-body part, Other \\
    \midrule
    \textbf{Sample Quality} & Score (1--5) evaluating coherence among image, question, and answer. \\
    \bottomrule
    \end{tabular}
    \caption{Taxonomy used for metadata extraction.}
    \label{tab:taxonomy}
\end{table}

\section{Training}\label{appendix:training_config}

All training configurations are summarized in Table~\ref{tab:train_config}.


\begin{table}[h]
\centering
\caption{Training configuration for Aloe-Vision-7B and Aloe-Vision-72B.}
\label{tab:train_config}
\small
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Parameter} & \textbf{7B} & \textbf{72B} \\
\midrule
Stage & \multicolumn{2}{c}{Single-stage full SFT} \\
Precision & \multicolumn{2}{c}{BF16} \\
Max. Sequence length & \multicolumn{2}{c}{4096} \\
Epochs & \multicolumn{2}{c}{1} \\
LR schedule & \multicolumn{2}{c}{Cosine} \\
Gradient checkpointing & \multicolumn{2}{c}{Enabled} \\
Parallelism & \multicolumn{2}{c}{DeepSpeed ZeRO-3} \\
Warmup & \multicolumn{2}{c}{3\%} \\
Global batch size & 1024 & 2000 \\
Micro-batch size & 16 & 4 \\
Gradient accumulation & 2 & 5 \\
Optimizer & AdamW & AdamW 8-bit \\
Max. Learning rate & 3.75e-5 & 1.25e-5 \\
Total GPUs & 32 & 100 \\
Estimated GPU-hours & 500 & 4500 \\
\bottomrule
\end{tabular}
\end{table}

\section{Evaluation} \label{appendix:eval}

More details on the diverse suite of medical and general benchmarks and the protocol used to ensure fair comparison and reproducibility are provided here.

\paragraph{Benchmarks} 
Our medical multimodal category comprises \textbf{CareQA-Vision} (our contamination-free benchmark derived from Spanish national medical and nursing exams, covering both MCQ and open-ended questions), \textbf{PathMMU}~\cite{sun2024pathmmu} (expert-validated pathology QA), \textbf{GMAI-MMBench}~\cite{ye2024gmai} (26k MCQs spanning image/box/mask/image-set reasoning over 38 modalities and 18 departments), \textbf{OmniMedVQA}~\cite{hu2024omnimedvqa} (large-scale MCQ synthesized from 73 classification datasets), \textbf{ProbMed}~\cite{yan2025worse} (adversarial reliability checks), and \textbf{SLAKE}~\cite{liu2021slake} (physician-curated QA). To verify that SFT preserves general capabilities, we include \textbf{MMMU}~\cite{yue2024mmmu} (general multimodal MCQ), \textbf{MultiMedQA}~\cite{singhal2023publisher} (medical text-only MCQ), and \textbf{MMLU}~\cite{hendrycksmeasuring} (general text-only MCQ).

\section{Ablation Studies}
\label{appendix:ablations}
All ablation experiments are conducted on \emph{Aloe-Vision-7B} using the identical training configuration described in Table~\ref{tab:train_config}. Evaluation is performed following the protocol detailed in Appendix~\ref{appendix:eval}. For mixtures containing fewer samples than the one described in Section~\ref{subsec:final_mixture}, we proportionally increase the number of training epochs to ensure that the model receives at least as many gradient updates as the baseline.

\paragraph{Data mixtures.} We compare three compositions: (1) the final balanced mixture (Section~\ref{subsec:final_mixture}), (2) a multimodal-only mixture (medical+general multimodal), and (3) a medical-only mixture (medical multimodal+medical text-only). Table~\ref{tab:7b_data_mixtures} shows that medical-only underperforms consistently. The reason might be the higher percentage of medical text-only Chain-of-Thought samples (from 29.7\% to 41.5\% of loss tokens), which induces verbose outputs despite prompts requesting a single MCQ option. The multimodal-only model improves on several multimodal benchmarks but regresses on text-only tasks, indicating that general and medical text-only data are important to preserve language-domain competence.

\begin{table*}[tbp]
  \centering
    \caption{Comparison of different data mixtures across all evaluation benchmarks. Accuracy (\%) and LLM-as-judge score for SLAKE.}
    \label{tab:7b_data_mixtures}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{@{}lcccccccc@{}}
    \toprule
    \textbf{Model} & \textbf{PathMMU} & \textbf{OmniMed} & \textbf{GMAI} & \textbf{ProbMed} & \textbf{SLAKE} & \textbf{MMMU} & \textbf{MultimedQA} & \textbf{MMLU} \\
    \midrule
    Final & 61.8 & 75.9 & 52.8 & 76.5 & 65.4 & 45.1 & 58.5 & 65.9 \\
    Multimodal-only & 64.9 & 76.5 & 52.7 & 81.1 & 66.9 & 47.2 & 54.7 & 61.8 \\
    Medical-only & 55.0 & 71.0 & 48.5 & 53.8 & 64.6 & 44.8 & 49.7 & 64.2 \\
    \bottomrule
    \end{tabular}%
    }
\end{table*}

\paragraph{Evaluation leakage.} We measure the effect of removing evaluation images from training using exact image-hash matching (Section~\ref{sec:training_data}), which eliminates 6{,}273 samples ($\approx$0.18\% of the pool). Two models are trained identically, one without leakage (excludes matches) and the other with leakage (includes them). As shown in Table~\ref{tab:evaluation_leakage}, accuracy remains essentially unchanged across benchmarks (differences of at most 1.6 points). We hypothesise that at this leakage rate, memorization effects are negligible for a 7B model, and matched images with potentially differing text can further attenuate any direct answer leakage.

\begin{table*}[tbp]
  \centering
    \caption{With vs.\ without eval-image leakage. Accuracy (\%) / SLAKE LLM-as-judge.}
    \label{tab:evaluation_leakage}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{@{}lcccccccc@{}}
    \toprule
    \textbf{Model} & \textbf{PathMMU} & \textbf{OmniMed} & \textbf{GMAI} & \textbf{ProbMed} & \textbf{SLAKE} & \textbf{MMMU} & \textbf{MultimedQA} & \textbf{MMLU} \\
    \midrule
    Without Leakage & 61.8 & 75.9 & 52.8 & 76.5 & 65.4 & 45.1 & 58.5 & 65.9 \\
    With Leakage    & 61.0 & 74.3 & 52.2 & 75.8 & 65.9 & 45.6 & 59.6 & 66.1 \\
    \bottomrule
    \end{tabular}%
    }
\end{table*}

\paragraph{Filtering.} We test the effect of semi-automatic filtering (Section~\ref{subsec:quality_filtering}) by comparing training with vs.\ without removing low-quality samples. The non-filtered mixture contains 3{,}959{,}087 samples while the filtered mixture contains 3{,}571{,}622 samples (9.8\% reduction). The non-filtered run executes 3{,}866 vs.\ 3{,}487 steps due to the dataset's larger size. Performance is nearly identical overall (Table~\ref{tab:filtering}), with a notable gain on ProbMed (+2.4\% absolute) for the filtered model. Filtering achieves comparable aggregate accuracy with $\sim$10\% fewer samples.

\begin{table*}[t]
  \centering
    \caption{Filtered vs.\ non-filtered mixtures. Accuracy (\%) / SLAKE LLM-as-judge.}
    \label{tab:filtering}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{@{}lcccccccc@{}}
    \toprule
    \textbf{Model} & \textbf{PathMMU} & \textbf{OmniMed} & \textbf{GMAI} & \textbf{ProbMed} & \textbf{SLAKE} & \textbf{MMMU} & \textbf{MultimedQA} & \textbf{MMLU} \\
    \midrule
    With Filtering     & 61.8 & 75.9 & 52.8 & 76.5 & 65.4 & 45.1 & 58.5 & 65.9 \\
    Without Filtering  & 62.2 & 76.1 & 52.7 & 74.1 & 65.1 & 45.7 & 58.7 & 65.8 \\
    \bottomrule
    \end{tabular}%
    }
\end{table*}

\section{CareQA}
\label{appendix:careqa_results}

\subsection{Results}

Results for different models, organized by category (Medicine or Nursing) and by task type (multiple-choice or open-ended), are shown in Table \ref{fig:careqa_complete}. Across all models, performance on MCQ is consistently higher than on open-ended tasks, highlighting ongoing challenges in free-text medical reasoning. Larger models generally outperform smaller ones, with Aloe-Vision-72B achieving the highest MCQ scores and GLM-4.5V leading in the open format. Notably, medicine questions are answered more accurately than nursing questions, suggesting uneven domain coverage in training data. %Surprisingly, medical-specialized models such as HuatuoGPT, Linghsu or Hulu-Med do not surpass large generalist models, showing that strong free-text generation capabilities can compensate for a lack of explicit medical fine-tuning.

% \begin{table*}[tbp]
% \centering
% \small
% \caption{Performance of different models on the CareQA-Vision dataset. Models highlighted in gray are general-purpose, while the others are domain-specific. Top table shows smaller models, and the bottom table shows larger models. Results are reported separately for multiple-choice (MCQ) and open-ended (VQA) formats, and further divided by category (Nursing and Medicine). Best results are shown in \textbf{bold}, and second-best results are \underline{underlined}.}\label{fig:careqa_complete}
% \vspace{4pt}
% \begingroup
% \setlength{\aboverulesep}{0pt}
% \setlength{\belowrulesep}{0pt}
% \setlength{\extrarowheight}{.6ex}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{lccccccc}
% \toprule
% \multirow{2}{*}{\textbf{Model}} & 
% \multicolumn{3}{c}{\textbf{MCQ}} & & 
% \multicolumn{3}{c}{\textbf{VQA}} \\
% \cmidrule{2-4} \cmidrule{6-8}
%  & \textbf{Overall} & \textbf{Nursing} & \textbf{Medicine} & & \textbf{Overall} & \textbf{Nursing} & \textbf{Medicine} \\
% \midrule

% % ------------------- Small models -------------------
% \multicolumn{8}{l}{\textbf{Small models (\textless 10B)}} \\

% \rowcolor{gray!10} Qwen2-VL-7B-Instruct & 
% 51.52 & 35.71 & 56.91 & & 26.47 & 16.07 & 29.17  \\

% \rowcolor{gray!10} Qwen3-VL-8B-Instruct & 
% 58.79 & \underline{52.38} & 60.98 & & \underline{37.50} & \textbf{26.79} & \underline{40.28} \\

% \rowcolor{gray!10} MiMo-VL-7B-RL & 
% \underline{59.39} & 50.00 & 62.60 & & 29.41 & 16.07 & 32.87 \\

% Chiron-o1-8B & 
% 47.27 & \textbf{57.14} & 43.90 & & 20.96 & 17.86 & 21.76 \\

% Lingshu-7B & 
% 56.97 & 50.00 & 59.35 & & 31.62 & 21.43 & 34.26 \\

% HuatuoGPT-Vision-7B & 
% 50.30 & 38.10 & 54.47 & & 18.38 & 16.07 & 18.98 \\

% Hulu-Med-7B & 
% \underline{59.39} & 40.48 & \underline{65.85} & & \textbf{41.18} & \underline{23.21} & \textbf{45.83}  \\

% \noalign{\vskip 0.5ex}
% \hdashline[0.5pt/2pt]
% \noalign{\vskip 0.5ex}

% Aloe-Vision-7B & 
% 56.36 & 38.10 & 62.60 & & 36.03 & \textbf{26.79} & 38.43 \\

% Aloe-Vision-7B-AR & 
% \textbf{60.61} & 35.71 & \textbf{69.11} & & 36.03 & \underline{23.21} & 39.35 \\


% % ------------------- Separation row -------------------
% \noalign{\vskip 1ex}
% \hdashline[1pt/2pt]
% \noalign{\vskip 0.75ex}

% \multicolumn{8}{l}{\textbf{Large models (\textgreater 10B)}} \\

% % ------------------- Large models -------------------
% \rowcolor{gray!10} Qwen2-VL-72B-Instruct & 
% 72.73 & 64.29 & 75.61 & & 45.22 & 32.14 & 48.61 \\

% \rowcolor{gray!10} Qwen3-VL-30B-A3B-Instruct & 
% 71.52 & 57.14 & 76.42 & & 44.49 & 23.21 & 50.00 \\

% \rowcolor{gray!10} Kimi-VL-A3B-Instruct &  
% 55.15 & 47.62 & 57.72 & & 48.53 & 33.93 & 52.31 \\

% \rowcolor{gray!10} GLM-4.5V & 
% 72.73 & \underline{66.67} & 74.80 & & \textbf{67.65} & \textbf{53.57} & \textbf{71.30} \\

% HuatuoGPT-Vision-34B & 
% 54.55 & 47.62 & 56.91 & & 24.63 & 14.29 & 27.31 \\

% Lingshu-32B & 
% 64.85 & 52.38 & 69.11 & & 43.01 & 23.21 & 48.15 \\

% Hulu-Med-32B & 
% 63.64 & 38.10 & 72.36 & & 47.79 & 26.79 & 53.24 \\

% \noalign{\vskip 0.5ex}
% \hdashline[0.5pt/2pt]
% \noalign{\vskip 0.5ex}

% Aloe-Vision-72B & 
% \textbf{77.58} & \textbf{69.05} & \underline{80.49} & & \underline{51.10} & \underline{39.29} & \underline{54.17} \\

% Aloe-Vision-72B-AR & 
% \underline{75.76} & 59.52 & \textbf{81.30} & & 50.00 & 33.93 & \underline{54.17} \\

% \bottomrule
% \end{tabular}%
% }
% \endgroup
% \end{table*}


\begin{table*}[tbp]
\centering
\small
\caption{Performance of different models on the CareQA-Vision dataset. Models highlighted in gray are general-purpose, while the others are domain-specific. The upper block lists small models and the lower block lists large models. Results are reported separately for multiple-choice (MCQ) and open-ended (VQA) formats, and further divided by category (Nursing and Medicine). Within each block, the best results are shown in \textbf{bold} and the second-best results are \underline{underlined}.}\label{fig:careqa_complete}
\vspace{4pt}
\begingroup
\setlength{\aboverulesep}{0pt}
\setlength{\belowrulesep}{0pt}
\setlength{\extrarowheight}{.6ex}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccccc}
\toprule
\multirow{2}{*}{\textbf{Model}} & 
\multicolumn{3}{c}{\textbf{MCQ}} & & 
\multicolumn{3}{c}{\textbf{VQA}} \\
\cmidrule{2-4} \cmidrule{6-8}
 & \textbf{Overall} & \textbf{Nursing} & \textbf{Medicine} & & \textbf{Overall} & \textbf{Nursing} & \textbf{Medicine} \\
\midrule

% ------------------- Small models -------------------
\multicolumn{8}{l}{\textbf{Small models (\textless 10B)}} \\

\rowcolor{gray!10} Qwen2-VL-7B-Instruct & 
51.52 & 35.71 & 56.91 & & 26.47 & 16.07 & 29.17  \\

\rowcolor{gray!10} Qwen3-VL-8B-Instruct & 
58.79 & \underline{52.38} & 60.98 & & \underline{38.24} & \textbf{26.79} & \underline{41.20} \\

\rowcolor{gray!10} MiMo-VL-7B-RL & 
\underline{59.39} & 50.00 & 62.60 & & 28.31 & 14.29 & 31.94 \\

Chiron-o1-8B & 
47.27 & \textbf{57.14} & 43.90 & & 21.32 & 17.86 & 22.22 \\

Lingshu-7B & 
56.97 & 50.00 & 59.35 & & 31.25 & 17.86 & 34.72 \\

HuatuoGPT-Vision-7B & 
50.30 & 38.10 & 54.47 & & 16.91 & 16.07 & 17.13 \\

Hulu-Med-7B & 
\underline{59.39} & 40.48 & \underline{65.85} & & \textbf{43.38} & \underline{23.21} & \textbf{48.61}  \\

\noalign{\vskip 0.5ex}
\hdashline[0.5pt/2pt]
\noalign{\vskip 0.5ex}

Aloe-Vision-7B & 
56.36 & 38.10 & 62.60 & & 36.76 & \textbf{26.79} & 39.35 \\

Aloe-Vision-7B-AR & 
\textbf{60.61} & 35.71 & \textbf{69.11} & & 36.40 & 21.43 & 40.28 \\


% ------------------- Separation row -------------------
\noalign{\vskip 1ex}
\hdashline[1pt/2pt]
\noalign{\vskip 0.75ex}

\multicolumn{8}{l}{\textbf{Large models (\textgreater 10B)}} \\

% ------------------- Large models -------------------
\rowcolor{gray!10} Qwen2-VL-72B-Instruct & 
72.73 & 64.29 & 75.61 & & 45.59 & 32.14 & 49.07 \\

\rowcolor{gray!10} Qwen3-VL-30B-A3B-Instruct & 
71.52 & 57.14 & 76.42 & & 43.75 & 23.21 & 49.07 \\

\rowcolor{gray!10} Kimi-VL-A3B-Instruct &  
55.15 & 47.62 & 57.72 & & 40.44 & 32.14 & 42.59 \\

\rowcolor{gray!10} GLM-4.5V & 
72.73 & \underline{66.67} & 74.80 & & \textbf{64.71} & \textbf{51.79} & \textbf{68.06} \\

HuatuoGPT-Vision-34B & 
54.55 & 47.62 & 56.91 & & 24.26 & 16.07 & 26.39 \\

Lingshu-32B & 
64.85 & 52.38 & 69.11 & & 42.28 & 19.64 & 48.15 \\

Hulu-Med-32B & 
63.64 & 38.10 & 72.36 & & 48.53 & 28.57 & \underline{53.70} \\

\noalign{\vskip 0.5ex}
\hdashline[0.5pt/2pt]
\noalign{\vskip 0.5ex}

Aloe-Vision-72B & 
\textbf{77.58} & \textbf{69.05} & \underline{80.49} & & \underline{49.63} & \underline{37.50} & 52.78 \\

Aloe-Vision-72B-AR & 
\underline{75.76} & 59.52 & \textbf{81.30} & & 48.53 & 32.14 & 52.78 \\

\bottomrule
\end{tabular}%
}
\endgroup
\end{table*}




\subsection{Expert evaluation}\label{sec:expert_eval}

As described in the main paper, experts labeled each model response as \textit{correct}, \textit{partially correct}, or \textit{incorrect} (see the interface in Figure~\ref{fig:interface}). A response was considered \textit{correct} when it matched the reference answer, even if the level of detail varied. It was labeled \textit{partially correct} when it captured only part of the required information, and \textit{incorrect} when it failed to match the correct content.

Because the evaluation was performed on a dataset we created (CareQA-Vision), we introduced an additional label, \textit{ambiguous}, to identify samples that were unclear (for example, poorly rephrased questions for which multiple answers could be considered valid). This label was used exclusively for data cleaning: a sample was removed from the dataset if at least two experts marked it as ambiguous. Only three questions met this criterion and were therefore removed. The remaining 105 questions from the medical CareQA-Vision were evaluated by four medical experts, with each question receiving at least two and up to four independent assessments (not all experts annotated all questions).


\begin{figure}[tb] 
    \centering
    \includegraphics[width=0.85\linewidth]{figures/interface.png}  
    \caption{Interface used by experts to evaluate the model's answers.}
    \label{fig:interface}  
\end{figure}

%Beyond the results reported in the main paper, these are some additional observations. First, in the 5.71\% of cases where reviewers agreed with each other but disagreed with the LLM-as-a-judge, the experts were consistently more conservative: when experts labeled an answer as incorrect, the model labeled it as partially correct; when experts labeled an answer as partially correct, the model labeled it as correct. Second, there were three questions (2.86\%) where only one of the three evaluators agreed with the model, while the other two disagreed with each other, showing the difficulty of the task. Despite these disagreements, the overall alignment between experts and the LLM-as-a-judge supports the reliability of the automatic evaluation used.



\end{document}
