\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs}  % For \toprule, \midrule, \bottomrule
\usepackage{floatrow}  % Or whichever package provides \floatconts
\usepackage{mwe} % to get dummy images
\usepackage{fvextra}
\usepackage{footmisc}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 305}
\editors{Accepted for publication at MIDL 2026}

\title[An Open Pipeline and Dataset for Whole-Slide Vision-Language Modelling]{Democratising Pathology Co-Pilots: An Open Pipeline and Dataset for Whole-Slide Vision-Language Modelling}%: Introducting Polysome, HISTAI-Instruct and ANTONI-$\alpha$}

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Sander Moonemans\midljointauthortext{Equal contribution}} \Email{sander.moonemans@radboudumc.nl}\\
\Name{Sebastiaan Ram\midlotherjointauthor} \Email{sebastiaan.ram@radboudumc.nl}\\
\Name{Fr{\'e}d{\'e}rique Meeuwsen} \Email{frederique.meeuwsen@radboudumc.nl} \\
\Name{Carlijn Lems} \Email{carlijn.lems@radboudumc.nl} \\
\Name{Jeroen {van der Laak}} \Email{jeroen.vanderlaak@radboudumc.nl}\\
\Name{Geert Litjens} \Email{geert.litjens@radboudumc.nl}\\
\Name{Francesco Ciompi} \Email{francesco.ciompi@radboudumc.nl}\\
\addr Computational Pathology Group, Radboud University Medical Center, Nijmegen, The Netherlands
}

\begin{document}

\maketitle

\begin{abstract}
% The last decade has witnessed the growing development of task-specific deep learning to support pathology diagnostics via histopathology whole-slide image (WSI) analysis.
Vision-language models (VLMs) have the potential to become co-pilots for pathologists. However, most VLMs either focus on small regions of interest within whole-slide images, provide only static slide-level outputs, or rely on data that is not publicly available, limiting reproducibility. Furthermore, training data containing WSIs paired with detailed clinical reports is scarce, restricting progress toward transparent and generalisable VLMs. 
We address these limitations with three main contributions. First, we introduce \emph{Polysome}, a standardised tool for synthetic instruction generation. Second, we apply Polysome to the public HISTAI dataset, generating \emph{HISTAI-Instruct}, a large whole-slide instruction tuning dataset spanning 24,259 slides and over 1.1 million instruction-response pairs. Finally, we use HISTAI-Instruct to train \emph{ANTONI-$\alpha$}, a VLM capable of visual-question answering (VQA). We show that ANTONI-$\alpha$ outperforms MedGemma on WSI-level VQA tasks of tissue identification, neoplasm detection, and differential diagnosis. We also compare the performance of multiple incarnations of ANTONI-$\alpha$ trained with different amounts of data. All methods, data, and code are publicly available\footnote{\url{https://github.com/computationalpathologygroup/ANTONI-Alpha}\label{fn:antoni}}$^,$
\footnote{\url{https://github.com/computationalpathologygroup/Polysome}\label{fn:polysome}}$^,$
\footnote{\url{https://huggingface.co/datasets/SaltySander/HISTAI-Instruct} \label{fn:histai-instruct}}. 
\end{abstract}

\begin{keywords}
Instruction-tuning, visual question-answering, whole-slide images, digital assistant
\end{keywords}

\section{Introduction}
Recent years have seen the advent of vision-language models in computational pathology that facilitate robust image-text understanding. These models are evolving into general-purpose AI assistants: while currently explored for automated generation of pathology reports \cite{RubenSander2025,Tran2025,guo2024histgenhistopathologyreportgeneration}, they pave the way for interactive workflows such as slide pre-screening, question-answering, and visual analysis conditioned on text-based genomic and clinical records.

Despite the proliferation of general-purpose AI, the application of Vision-Language Models (VLMs) in computational pathology remains constrained by the need to integrate fine-grained morphological details with long-range spatial dependencies. Current state-of-the-art approaches generally fall into three categories, each with distinct limitations regarding context, reproducibility, and availability.

Patch-level models demonstrate strong local reasoning but lack global context. While instruction-tuned models like PathChat \cite{lu2024} show superior reasoning on small Regions of Interest (ROIs), they remain proprietary. Conversely, MedGemma \cite{sellergren2025medgemmatechnicalreport} offers a promising open-weight alternative; however, its histopathology performance was evaluated solely on internal data, leaving its efficacy on public benchmarks unverified. Crucially, none of these models support Whole Slide Image (WSI)-level analysis, failing to capture the broader tissue architecture.

Attempts to scale to WSI-level analysis such as PathChat+ paired with SlideSeek \cite{chen2025evidencebaseddiagnosticreasoningmultiagent} face methodological bottlenecks. These systems often rely on general-purpose planners (e.g., OpenAI o1) that utilize low-resolution thumbnails to guide search strategies. This approach risks overlooking critical fine-grained features, such as micrometastases, that fall outside the planned search area. Furthermore, the stochastic decision-making process of these agents makes ensuring reproducible diagnostic results a significant challenge.

Last, native slide-level models \cite{RubenSander2025,Tran2025,chen2024wsicaptionmultipleinstancegeneration} process WSIs efficiently but are largely confined to static report generation. They typically lack the open-ended Visual Question Answering (VQA) capabilities necessary for a collaborative ``co-pilot'' workflow. While emerging foundation models like PRISM2 \cite{vorontsov2025prism2unlockingmultimodalgeneral} and SmartPath-R1 \cite{xu2025versatilepathologycopilotreasoning} have begun to address the intersection of slide-level understanding and interactivity, they are not publicly available, precluding direct comparison and reproducibility.

Although public challenges such as CAMELYON \cite{litjens20181399} and TIGER \cite{van_Rijthoven2025.02.28.25323078}, along with initiatives such as TCGA \cite{weinstein2013cancer}, have led to a significant increase in public and transparent benchmarking, paired image-text datasets for vision-language pretraining remain scarce. Furthermore, the availability of instruction-tuning data, specifically structured as question-answer pairs or dialogues, remains limited, with SlideChat \cite{chen2024slidechat} being one of the few open-source instruction-tuning datasets leveraging the TCGA archive. While it establishes a precedent with 176k VQA pairs, its reliance on a relatively small cohort of 4.2k slides limits the model's ability to generalise across diverse tissue types and patient populations.

In this work, we address these lacunae with three contributions. First, we introduce Polysome\footref{fn:polysome}, a generic, domain-agnostic tool designed to transform any unstructured text into structured instruction-tuning data by prompting large language models (LLMs). Second, we demonstrate Polysome's capabilities by applying it to pathology reports to generate HISTAI-Instruct\footref{fn:histai-instruct}. Spanning 24,259 whole-slide images and 1,188,691 conversational attributes, this stands as the largest fully open-source instruction-tuning dataset currently available in computational pathology. Third, we present ANTONI-$\alpha$\footref{fn:antoni}, a vision-language model based on MedGemma 4B, trained on HISTAI-Instruct. To fully commit to open science, we release Polysome, HISTAI-Instruct, and ANTONI-$\alpha$, along with the corresponding training and validation pipelines.

\section{Methods}
This section describes a) the curation of the source dataset and WSI preprocessing pipeline used to extract visual features, b) the use of our synthetic generation pipeline Polysome to generate $>1.1$M conversational instruction pairs, released as HISTAI-Instruct, and c) the architecture and training protocol of ANTONI-$\alpha$, our slide-level vision-language model.

\subsection{Dataset}
% Second, as our proposed architecture is designed for single-image interpretation, we 
We use HISTAI \cite{nechaev2025histaiopensourcelargescaleslide}, a large-scale, open-source digital pathology archive comprising 112,801 WSIs from $>$47k medical cases. Of these cases, 46,128 include multimodal metadata such as patient demographics, diagnostic conclusions, and detailed microscopic descriptions.

To curate a high-quality subset for instruction tuning, we applied a multistage filtering pipeline (Figure \ref{fig:methods-sankey}). First, we performed deduplication and excluded multi-file cases as a heuristic to obtain a one-to-one mapping between slides and reports. Second, we filtered out cases lacking microscopic descriptions or pathological conclusions, as these fields serve as the reference standard for our VQA generation. This resulted in an intermediate dataset of 24,259 cases, which was used to create the instruction-tuning pairs. 

After this step, we excluded an additional 1723 cases. These comprised slides from the source dataset that were not stained with H\&E ($N=1360$), were significantly out of focus ($N=7$), or were shifted along the y-axis to the point that tissue structures could no longer be recognized ($N=4$). In addition, several slides ($N=29$) contained ``micro-biopsies'' with a mean area of $0.57\,\mathrm{mm}^2$ (HISTAI mixed) and $0.31\,\mathrm{mm}^2$ (HISTAI skin). These represent a known failure mode for automated segmentation, as models struggle with images with a high background-to-tissue ratio (debris-laden), drowning out the actual tissue signal \cite{kani2025}. Finally, we excluded slides digitised at 40x magnification ($N=323$) to maintain a uniform 20x resolution across the training set. The final curated cohort comprises 22,536 cases.

\begin{figure}
    \centering
    \includegraphics[width=1.0\linewidth]{images/methods/sankey.png}
    \caption{\textbf{HISTAI data preprocessing pipeline}. Blue: retained cases; orange: discarded cases.}
    \label{fig:methods-sankey}
\end{figure}

\paragraph{Visual Representation Pipeline}
% To process the WSIs in our curated cohort, we addressed a metadata deficit in the raw HISTAI TIFF files, which lacked native spacing information (microns per pixel). To enable resolution-agnostic processing, we injected the correct spacing metadata—0.25$\mu$m/px (40x) or 0.50$\mu$m/px (20x)—by inferring the scan magnification from the source filenames.

% After this essential step, 
We implemented our image processing pipeline using the open-source Trident toolkit \cite{trident}, which is structured into three stages: segmentation, tiling, and feature encoding. Foreground tissue segmentation was performed at 0.5$\mu$m/px (20x magnification) using the HEST model \cite{hest}. To maximise sensitivity and capture sparse tissue fragments, we empirically adjusted the segmentation probability threshold from 0.5 to 0.4 based on manual observation of several cases where tissue was not segmented with the default probability.

Finally, a hierarchical approach to feature extraction was used. Individual tissue tiles were encoded using Virchow \cite{vorontsov2024virchow}, providing tile embeddings for the given image. These embeddings were subsequently aggregated into slide-level representations using PRISM \cite{shaikovski2024prism}. We selected PRISM specifically for its Contrastive Captioning (CoCa \cite{yu2022cocacontrastivecaptionersimagetext}) architecture, which is explicitly pre-trained for both representation learning and text generation. The model outputs 513 latent variables: a single global token trained on an image-text contrastive (CLIP-like) objective and 512 latent tokens optimised for the generative decoding objective. We utilise this full set of latents to condition our downstream language model, making use of features that are already aligned with captioning tasks.

% \subsection{Conversational Data Generation, Quality Filtering and Diversification}
\subsection{Conversational Data Generation Using Polysome}
% \paragraph{Polysome}
To address the scarcity of structured biomedical instruction data, we developed \emph{Polysome}, a modular Python framework that uses large language models (LLMs) for reproducible text transformation. Polysome orchestrates the conversion of static metadata (e.g., CSV, JSON) into dynamic instruction-response pairs by injecting records into customisable prompt templates. This allows one to systematically ``rewrite'' raw clinical notes into diverse conversational formats without writing code. A simplified schematic of how Polysome ingests metadata and produces conversational data can be found in Appendix \ref{sec:polysome_figure}.

% To ensure scalability, Polysome operates on a Directed Acyclic Graph (DAG) architecture defined via declarative JSON configurations. This design supports backend-agnostic execution, enabling high-throughput and distributed inference via vLLM \cite{kwon2023efficientmemorymanagementlarge}, resource-efficient processing with \texttt{llama.cpp}\footnote{\url{https://github.com/ggml-org/llama.cpp}}, or standard Hugging Face Transformers \cite{DBLP:journals/corr/abs-1910-03771}.

Next, we used Polysome to generate the \emph{HISTAI-Instruct} dataset based on the metadata of the subset of 24,259 cases, preceding the filtering of non-H\&E and 40x magnification images.\footnote{For full reproducibility, the complete workflow configuration JSON file is available at \href{https://github.com/computationalpathologygroup/Polysome/blob/main/histai-instruct-workflows/histai_instruct_generate_workflow.json}{\texttt{histai-instruct-workflows/histai\_instruct\_generate\_workflow.json}} in the Polysome GitHub project.} Using the Gemma-3-27B-it (Instruction-Tuned) model with 4-bit integer quantisation\footnote{RedHatAI/gemma-3-27b-it-quantized.w4a16}, we designed a curriculum of seven conversational categories targeting specific competencies (Table \ref{tab:conv_tasks}; refer to Appendix \ref{appendix:prompts} for prompts). To increase linguistic diversity and maximise coverage of the target clinical user base, English instructions were generated first and subsequently translated into six additional high-resource languages (Dutch, French, German, Italian, Polish, Spanish) supported by our translation model. An additional reason for choosing these languages was the availability of native speakers within our research group for sanity checks on the translated conversational attributes. This yielded an unprecedented whole-slide instruction tuning dataset with 157k unique conversational attributes in 7 languages, totalling $>$1.1M conversational instances. This dataset is split into train, test and validation sets at the case level. Since each case contains the same seven conversational attributes across seven languages (49 attributes per case), the language distribution is identical across all splits.



\begin{table}[ht]
\floatconts
  {tab:conv_tasks}%
  {\caption{The seven conversational categories of HISTAI-Instruct.}}%
  {%
  \small 
  \setlength{\tabcolsep}{2pt} % Minimal padding to maximize text width
  \renewcommand{\arraystretch}{1.0} % Tightest vertical spacing
  \resizebox{\textwidth}{!}{
  \begin{tabular}{@{} l p{0.82\textwidth} @{}} 
  \toprule
  \bfseries Task Name & \bfseries Description \& Objective\\
  \midrule
  Advanced reasoning & \textbf{Chain-of-Thought.} Reasons about feature implications, linking morphology to pathophysiology. \\
   
  Clean report & \textbf{Structured Output.} Generates structured reports (e.g., Microscopy, Diagnosis) strictly grounded in visual evidence, ignoring other context. \\
   
  Detailed description & \textbf{Visual Grounding.} Single-turn, dense captioning of all visible features to build a complete slide representation. \\
   
  Differential diagnosis & \textbf{Discriminative Analysis.} Evaluates potential diagnoses, ruling them in or out based on microscopic criteria. \\
   
  Multi-turn conversations & \textbf{State Tracking.} Maintains context across exchanges, guiding users from general to specific findings. \\
   
  Negative reasoning & \textbf{Hallucination Mitigation.} Identifies unanswerable queries (e.g., genetics) and explicitly states uncertainty. \\
   
  Short VQA & \textbf{Classification Benchmarking.} Ultra-concise responses to standardized questions (e.g., Organ, Neoplasm) for exact-match evaluation. \\
  \bottomrule
  \end{tabular}%
  }}
\end{table}

To improve the quality of our dataset, and to prevent hallucinations by downstream models, we applied an automated ``LLM-as-a-Judge'' evaluation pipeline using Gemma 3. Via a standardised rubric (Appendix \ref{appendix:prompts}), the LLM scored the English versions of the conversational attributes on three dimensions: constraint adherence (microscopic focus), factual groundedness, and reasoning clarity. We discarded attributes that did not meet strict quality thresholds (accuracy $\le$3/5 (minimum acceptable quality) or constraint violations), along with their translated variants. This removed 13,167 instances (1.1\% of the total), resulting in a final high-quality corpus of 1,175,524 conversational attributes. 

To prevent overfitting to repetitive prompts, we further implemented a frequency-based diversification strategy. We identified high-frequency user queries (occurring $\ge$100 times) and replaced them with 20 linguistically diverse, semantically equivalent alternatives generated by Gemma 3. This replacement was applied progressively: frequent questions were stratified into four tiers, with replacement rates scaling from 30\% for common queries to 90\% for ubiquitous ones. In total, this approach diversified 25.4\% of all user messages ($N=547{,}333$).

\subsection{Vision-Language Model}
\begin{figure}[ht]
    \centering
    \includegraphics[width=.990\linewidth]{images/methods/architecture.png}
    %\vspace{-0.8cm}
    \caption{\textbf{Architecture of ANTONI-$\alpha$.} Image processing modules (blue) extract features via VIRCHOW and PRISM. These features are aligned with conversational data (green) via a Vision Projector. The MedGemma LLM (pink) generates responses using inputs from both modalities. Snowflake and flame icons denote frozen and trainable parameters, respectively, during the instruction-tuning stage (in contrast to pretraining, where the LLM is fully frozen).}

    \label{fig:figure_antoni}
\end{figure}

% \paragraph{Model architecture}
\noindent
\emph{ANTONI-$\alpha$} connects a domain-specific vision encoder with an LLM for slide-level histopathology analysis using the LLaVA framework~\cite{liu2023visualinstructiontuning}.
For language, we use the MedGemma-4B-IT~\cite{sellergren2025medgemmatechnicalreport} model optimised for medical domains and bypass its SigLIP vision encoder and multimodal projector by integrating our domain-specific features directly. 
We bridge vision and language modalities via a vision projector using attentional pooling that compresses 513 vision embeddings (dimension 1280) into 256 tokens matching MedGemma's expected visual input length.
This projector employs a single cross-attention layer where 256 learnable query tokens attend to input features, with queries and keys projected to the vision embedding's dimensionality while values project directly to the language model's hidden dimension (3072), eliminating additional projection layers. The complete architecture is presented in Figure \ref{fig:figure_antoni}. We use 8 attention heads for both queries and key-values, along with dropout (0.1) and layer normalisation.
The projected vision embeddings are inserted at the beginning of the user's first turn by replacing placeholder tokens in MedGemma's input template; specifically, \texttt{<start\_of\_image>} expands to 256 \texttt{<image>} tokens, which are then replaced by the projected vision embeddings.

% \paragraph{Training}
We adopted a two-stage training protocol distributed across 8$\times$NVIDIA H200 GPUs using Fully Sharded Data Parallel (FSDP) and bfloat16 mixed precision. Across both stages, we used the AdamW optimiser with a weight decay of 0.01 and cosine learning rate scheduling (10\% warmup). We employed a 4-step gradient accumulation strategy to achieve an effective batch size of 512 (from a batch size of 16 per GPU), ensuring robust convergence. The objective for both stages was to minimise cross-entropy loss exclusively on assistant response tokens: $\mathcal{L} = -\frac{1}{T}\sum_{t=1}^{T}\log P(y_t|\mathbf{x}_{1:t-1})$,
where $y_t$ represents assistant tokens and $\mathbf{x}_{1:t-1}$ denotes the preceding sequence, including conversation structure, vision embeddings, and user queries. Text attributes were sampled randomly with replacement throughout the process. In the first pretraining stage, we aligned visual features with the language representation space by training only the vision projector while keeping the language model frozen. We trained the model on the multilingual pathology reports (generated using the \emph{clean report} task), leveraging translated pairs across seven languages to enhance representation learning within the projector through linguistic and perceptual diversity \cite{nguyen2024multilingual, buettner-etal-2025-multimodal}. This pretraining stage proceeded for 35 epochs, utilising a learning rate of $3 \times 10^{-4}$.  In the second stage, we jointly trained the projector and the language model using Quantised Low-Rank Adaptation (QLoRA)~\cite{dettmers2023qloraefficientfinetuningquantized}, with a rank of 16 applied to all linear layers. The training data encompassed the English variants of the tasks summarised in Table \ref{tab:conv_tasks}. This stage ran for 21 epochs with a reduced learning rate of $3 \times 10^{-5}$.

\section{Experiments}
% \subsection{Experiments}
\noindent
To validate ANTONI-$\alpha$, we established a benchmark comprising three core diagnostic tasks: organ identification, neoplasm detection, and differential diagnosis (selecting the most likely diagnosis from a set of options). Figure \ref{fig:validation_pipeline} illustrates this evaluation framework: ANTONI-$\alpha$ processes native whole-slide images directly, while the baseline MedGemma model receives downsampled, whitespace-removed $892 \times 892$ pixel thumbnails. This preprocessing is necessary because MedGemma cannot handle the full resolution of WSIs or accept custom encoder embeddings such as those from PRISM. Both models are queried with identical questions, and responses are evaluated against pathologist-verified reference labels. Evaluation was performed on a held-out internal test set of 317 cases, predominantly consisting of skin ($N=141$), breast ($N=92$), and colon ($N=48$) specimens, comprising 218 neoplastic and 99 non-neoplastic samples. Reference labels were derived from the first two questions of the generated `short-vqa' and `differential diagnosis' attributes, resulting in 951 instruction-response pairs verified and corrected by a board-certified pathologist. Further details can be found in Appendix \ref{appendix:eval}.

\paragraph{Evaluation Metrics}
For \emph{organ identification}, the model was prompted to identify the tissue type or organ present. Responses were parsed and evaluated against a standardised hierarchical tissue taxonomy, found in the ANTONI-$\alpha$ repository, which organises organs and tissue types into a structured ontology with synonyms for each node (e.g., large intestine and colon tissue).
Furthermore, a hierarchical scoring scheme was applied, where a response received a score of 1.0 if it matched the reference node exactly, 0.75 if it was one step away in the hierarchy (i.e., a direct parent, child, or sibling), 0.5 if it was two steps away, and 0.0 otherwise. 

For \emph{neoplasm detection} and \emph{differential diagnosis}, we formulated the tasks as multiple-choice classification problems. Neoplasm detection was treated as a binary choice (Yes/No). In contrast, the differential diagnosis task required the model to distinguish between clinically similar options; it was presented with a specific differential of three conditions and prompted to select the most probable diagnosis after discussing each option. Generally, this resulted in the model producing a free-form text explanation considering each option, followed by a conclusive answer. Furthermore, we included an extra instruction to format the definitive answer in double brackets ([[ ]]). However, the models did not follow this instruction in many cases, and parsing the definitive answer proved to be difficult due to formatting deviations. Therefore, to ensure consistent evaluation, we introduced an automated scoring pipeline in which the free-text reasoning generated by each model was parsed using Gemini 2.5 Flash to extract the definitive answer. Performance metrics were tailored to the specific nature of each task. For \emph{neoplasm detection}, we evaluated the model using Precision, Recall, and F1-score to assess the trade-off between sensitivity and positive predictive value. For the \emph{differential diagnosis} task, performance was reported as overall accuracy, calculated as the percentage of instances where the model's extracted choice strictly matched the reference diagnosis.

\begin{figure}[ht]
    \centering
    \includegraphics[width=1.0\linewidth]{images/experiments/Evaluation.pdf}
    \caption{Validation pipeline for comparing ANTONI-$\alpha$ and MedGemma. Both models process the same WSI. For MedGemma, the WSI is first downscaled and packed to remove most whitespace. The evaluation consists of three questions: \textbf{Q1} targets organ or tissue identification, \textbf{Q2} detects the presence of a neoplasm, and \textbf{Q3} requires the most likely diagnosis selected from three possible candidate differentials.}
    \label{fig:validation_pipeline}
\end{figure}

\paragraph{Baselines and Scaling}
To investigate the impact of data scaling, we trained three versions of ANTONI-$\alpha$ on subsets of 2k, 9k, and 18k samples, respectively. We compared performance against the standard MedGemma (4B and 27B versions)~\cite{sellergren2025medgemmatechnicalreport}. Furthermore, we evaluated a pretrained-only (base) version of ANTONI-$\alpha$ to investigate the impact of the finetuning stage.

% \paragraph{External Validation}
% To assess generalisation beyond the source distribution, we performed an independent evaluation on SlideBench-TCGA \cite{chen2024slidechat}. We selected the multiple-choice question subsets corresponding to the tissue types represented in our training data (BRCA, COAD, READ, HNSC, and SKCM), excluding unseen organs (e.g., Brain/LGG, Lung/LUAD) to ensure fair comparison. 

\section{Results}
Table \ref{tab:results_hierarchical} reports the performance of the ANTONI-$\alpha$ variants against learnt and random baselines on 100\% data coverage of the hold-out test set. We report the average hierarchical score for organ identification, classification metrics for neoplasm detection, and accuracy for resolving the differential diagnosis.

For the baselines, we find that the larger MedGemma-27B generally outperformed the smaller 4B variant in differential diagnosis; however, it suffered from substantially lower performance in organ identification and neoplasm detection. For ANTONI-$\alpha$, increasing the number of training samples produced the best results, with the 18k configuration achieving the highest scores across most metrics and demonstrating the most consistent performance across all tasks.

Furthermore, the fine-tuned ANTONI-$\alpha$ models generally outperformed the MedGemma baselines. ANTONI-$\alpha$ (9k and 18k) achieved a score of 0.91 for organ identification, surpassing the best baseline score of 0.48. Regarding neoplasm detection, MedGemma-27B achieved the highest precision (85\%), but it missed a significant number of positive cases. In contrast, ANTONI-$\alpha$ (2k) demonstrated superior overall performance with near-perfect recall and an F1 score of 81\%, compared to 38\% for the baseline. Finally, while both model families struggled with differential diagnosis, ANTONI-$\alpha$ (18k) achieved more robust results, with an accuracy increase of 23 percentage points compared to MedGemma-27B (68\% vs 45\%).

Figure \ref{fig:conversation} illustrates these performance differences through a representative basal cell carcinoma case. ANTONI-$\alpha$ progressively builds a detailed morphological assessment, identifying proliferation of cells within the dermis, evidence of cell division, and tumor margins, which culminates in the correct diagnosis. In contrast, MedGemma consistently acknowledges its visual limitations due to the low-resolution thumbnail input, repeatedly stating it cannot assess margins or describe cellular details. Unable to extract the fine-grained morphological features necessary for diagnosis, MedGemma defaults to an incorrect diagnosis of a benign skin lesion.


% Furthermore, the ANTONI-$\alpha$ models outperformed the MedGemma baselines across all tasks. ANTONI-$\alpha$ (9k and 18k) achieved a score of 0.91 for organ identification, surpassing the best baseline score of 0.48 by a wide margin. In neoplasm detection, while MedGemma-27B achieved a higher precision (85\%), ANTONI-$\alpha$ (2k) demonstrated superior reliability with a near-perfect recall and an F1 score of 81\%, compared to 38\% for the baseline. Both model families faced challenges with the complex reasoning required for differential diagnosis. However, ANTONI-$\alpha$ (18k)'s accuracy for resolving the differential diagnosis was 23 percentage points higher than MedGemma-27B (68\% vs 45\%).

\begin{table}[ht]
\floatconts
  {tab:results_hierarchical}% Label
  {\caption{Performance comparison of ANTONI-$\alpha$ variants against learned and random baselines. We report the hierarchical score (0.00--1.00) for \emph{organ identification}, accuracy for \emph{differential diagnosis}, and precision, recall, and F1 for \emph{neoplasm detection}. 95\% confidence intervals are shown in brackets.}}% Caption
  {% Table content
  \begin{tabular}{l|c|ccc|c}
  \toprule
  \bfseries Model & \bfseries Organ & \multicolumn{3}{c|}{\bfseries Neoplasm Detection} & \bfseries Diff. Diag.\\
   & \bfseries Score & \bfseries Prec (\%) & \bfseries Rec (\%) & \bfseries F1 (\%) & \bfseries Acc (\%)\\
  \midrule
  % \textit{Theoretical Baselines} & & & & & \\
  Random Chance & -- & 68.77 & 50.00 & 57.90 & 26.89 \\
  \midrule
  \textit{Baselines} & & & & & \\
  MedGemma-4B & \shortstack{0.48 \\ \scriptsize [0.43--0.53]} 
              & \shortstack{71.43 \\ \scriptsize [65.26--77.39]} 
              & \shortstack{68.81 \\ \scriptsize [62.67--74.89]} 
              & \shortstack{70.09 \\ \scriptsize [64.94--74.89]} 
              & \shortstack{40.06 \\ \scriptsize [34.70--45.43]} \\
  \addlinespace
  MedGemma-27B & \shortstack{0.37 \\ \scriptsize [0.32--0.42]} 
               & \shortstack{\bfseries 85.48 \\ \scriptsize [76.00--93.75]} 
               & \shortstack{24.31 \\ \scriptsize [18.67--30.14]} 
               & \shortstack{37.86 \\ \scriptsize [30.30--44.97]} 
               & \shortstack{44.79 \\ \scriptsize [39.12--50.16]} \\
  \midrule
  \textit{Ours} & & & & & \\
  ANTONI-$\alpha$ (Base) & \shortstack{0.52 \\ \scriptsize [0.47--0.58]} 
                             & \shortstack{60.33 \\ \scriptsize [53.12--67.40]} 
                             & \shortstack{50.92 \\ \scriptsize [44.25--57.35]} 
                             & \shortstack{55.22 \\ \scriptsize [49.21--60.68]} 
                             & \shortstack{48.26 \\ \scriptsize [42.89--53.63]} \\
  \addlinespace
  ANTONI-$\alpha$ (2k) & \shortstack{0.66 \\ \scriptsize [0.60--0.71]} 
                       & \shortstack{68.67 \\ \scriptsize [63.61--73.73]} 
                       & \shortstack{\bfseries 99.54 \\ \scriptsize [98.56--100.00]} 
                       & \shortstack{\bfseries 81.27 \\ \scriptsize [77.61--84.73]} 
                       & \shortstack{52.68 \\ \scriptsize [47.00--58.36]} \\
  \addlinespace
  ANTONI-$\alpha$ (9k) & \shortstack{\bfseries 0.91 \\ \scriptsize [0.88--0.94]} 
                       & \shortstack{70.89 \\ \scriptsize [65.60--76.04]} 
                       & \shortstack{94.95 \\ \scriptsize [91.85--97.70]} 
                       & \shortstack{81.18 \\ \scriptsize [77.33--84.69]} 
                       & \shortstack{66.25 \\ \scriptsize [60.88--71.29]} \\
  \addlinespace
  ANTONI-$\alpha$ (18k) & \shortstack{\bfseries 0.91 \\ \scriptsize [0.88--0.94]} 
                        & \shortstack{72.89 \\ \scriptsize [67.54--78.18]} 
                        & \shortstack{91.28 \\ \scriptsize [87.44--94.82]} 
                        & \shortstack{81.06 \\ \scriptsize [77.12--84.74]} 
                        & \shortstack{\bfseries 68.45 \\ \scriptsize [63.09--73.50]} \\
  \bottomrule
  \end{tabular}}
\end{table}

\begin{figure}[ht]
    \centering
    \includegraphics[width=1.0\linewidth]{images/results/Conversation.pdf}
    \caption{Qualitative comparison of ANTONI-$\alpha$ and MedGemma on a dermatology case (basal cell carcinoma, case: histai-skin-b2/case 01406). ANTONI-$\alpha$ (left) processes the full-resolution WSI and synthesizes its findings into the correct diagnosis of basal cell carcinoma. In contrast, MedGemma (right) relies on a lower resolution thumbnail. It is unable to assess margins or describe cell details, leading to an incorrect diagnosis.}
    \label{fig:conversation}
\end{figure}
\section{Discussion and Conclusion}
In this work, we presented an end-to-end, open-source pipeline for whole-slide vision-language modelling, encompassing the \emph{Polysome} VQA instruction generator, the \emph{HISTAI-Instruct} dataset, and the \emph{ANTONI-$\alpha$} model. Thanks to its capability to process WSIs as native input rather than as downsampled thumbnails, our approach outperforms generalist medical VLMs like MedGemma across both identification and diagnostic tasks. This performance gap highlights the critical necessity of domain-specific encoders that preserve high-resolution morphological details, which are otherwise lost.

A key finding of our study is the impact of data scaling on model performance. While previous efforts such as SlideChat \cite{chen2024slidechat} have relied on smaller cohorts ($\approx$4k slides), our ablation study demonstrates that scaling instruction-tuning data from 2k to 18k samples yields substantial improvements in organ identification and differential diagnosis. These improvements highlight the important role of scalable and efficient synthetic data generation pipelines like Polysome in overcoming label scarcity. Furthermore, we address the reproducibility barrier inherent in proprietary models \cite{vorontsov2025prism2unlockingmultimodalgeneral, xu2025versatilepathologycopilotreasoning} by open-sourcing our entire pipeline and outputs. By democratising access to the underlying infrastructure, rather than just the final model, we aim to facilitate a shift towards more transparent and collaborative digital pathology research.

% Despite these strengths, the reliance on synthetic data generation introduces specific challenges. Since our instruction pairs are derived from clinical reports using LLMs, any ambiguity or noise in the source text is propagated to the model. A substantial issue we observed is the misalignment between reports and slide content within HISTAI; clinical reports often reference multiple slides and stains (e.g., IHC), while our model ``sees'' only a single H\&E slide. Although we use a heuristic to select single-file cases, this was not foolproof, leading to instances where the model was trained on descriptions of tissue not present in the input image. Previous research \cite{lucassen2025importancetextpreprocessingmultimodal} has shown that this disconnect contributes to the model's tendency to hallucinate non-visual features, such as specific genetic mutations or immunohistochemical results, as it attempts to correlate text with unrelated visual patterns.


Despite these strengths, the reliance on synthetic data generation introduces specific challenges. Since our instruction pairs are derived from clinical reports using LLMs, any ambiguity or noise in the source text propagates to the model. A substantial issue we observed is the misalignment between reports and slide content within HISTAI; clinical reports often reference multiple slides and stains (e.g., IHC), while our model ``sees'' only a single H\&E slide of that case. Although we use a heuristic to select single-file cases, this was not foolproof, leading to instances where the model was trained on descriptions of tissue not present in the input image. Previous research \cite{lucassen2025importancetextpreprocessingmultimodal} has shown that this disconnect contributes to the model's tendency to hallucinate non-visual features, such as specific genetic mutations or immunohistochemical results, as it attempts to correlate text with unrelated visual patterns. Furthermore, our qualitative stress testing (Appendix~\ref{appendix:hallucination_analysis}) reveals a pattern: the model demonstrates strong guardrails against non-visual clinical questions where it correctly refuses to predict prognosis or molecular status from H\&E alone, showing the benefit of including the ``negative reasoning'' task in training. However, it exhibits a ``positive compliance bias'' when queried about specific morphological features, tending to hallucinate their presence rather than acknowledging their absence. Future work should address these limitations through improved preprocessing strategies, such as filtering out non-H\&E features from the reports using supervised classification \cite{lucassen2025importancetextpreprocessingmultimodal}, investigating zero-shot filtering approaches using local LLMs, and matching textual descriptions from pathology reports to individual slides.
Additionally, incorporating negative constraint training through QA pairs about absent features could reduce visual hallucinations.

Furthermore, the current paradigm operates in a ``blinded'' setting, lacking the clinical context (e.g., patient age, gross description, prior history) that pathologists rely on for accurate diagnosis. This information gap helps explain the lower performance in complex differential diagnoses, where morphology alone is often insufficient. Additionally, while our use of pre-extracted PRISM embeddings provides a computationally efficient foundation by condensing high-level slide information, this static representation creates an inherent bottleneck for interactive analysis. Because the visual features are fixed prior to the conversation, the model cannot dynamically query fine-grained morphological details that may only become relevant during a specific line of questioning. Future work could explore integrating a slide encoding component directly into the model's projection layer---for instance, by ingesting raw tile embeddings rather than aggregated slide vectors.

Another important avenue for future work is the development of more clinically realistic evaluation frameworks. A preliminary external evaluation on the public COBRA dataset (Appendix~\ref{appendix:cobra}) provides early evidence of generalisability beyond our internal cohort. However, it also reveals notable prompt sensitivity in both ANTONI-$\alpha$ and MedGemma, highlighting the need for standardised evaluation protocols. Furthermore, our current benchmark focuses on closed-ended classification tasks (organ identification, neoplasm detection, differential diagnosis), which, while quantifiable, do not fully capture the open-ended, multi-turn reasoning that characterises real pathology workflows. The field lacks gold-standard benchmarks for slide-level clinical dialogue, and existing text-generation metrics (e.g., BLEU, ROUGE) correlate poorly with diagnostic accuracy. Similarly, our fine-tuning and evaluation remain English-only despite HISTAI-Instruct containing a wealth of multilingual instruction--response pairs. Extending the evaluation to multilingual benchmarks would be valuable for validating performance across clinical centres worldwide, yet no such standardised frameworks currently exist in digital pathology. We acknowledge these gaps as limitations of the present study, but anticipate that our pipeline will enable the community to establish and translate more sophisticated evaluation frameworks.

In conclusion, this study demonstrates that advancing vision-language modelling in pathology requires moving beyond generalist adaptations toward domain-specific systems capable of native whole-slide processing. By coupling the Polysome synthetic data engine with the ANTONI-$\alpha$ architecture, we have shown that performance scales directly with the quality and quantity of domain-aligned instructions. Our framework provides the community with the tools to scrutinise and expand upon our findings, whether by curating larger-scale datasets or designing architectures that better bridge the modality gap. Future work will include a larger variety of training data and a comprehensive external validation set. By releasing our code, data, and models, we aim to contribute to standardising the baseline for vision-language modelling in pathology, establishing the foundation for more reliable and interpretable clinical AI assistants.

% \begin{enumerate}
%     \item In this work we (name contributions)
% \end{enumerate}

% \begin{enumerate}
%     \item Future work: validation on external datasets
% \end{enumerate}

% \paragraph{Model}
% \begin{enumerate}
%     \item Dependence on PRISM -> not perfect
%     \item Only single-slide support (for now)
%     \item Not taking clinical context into account (for now)
%     \item No IHC
% \end{enumerate}

% \paragraph{HISTAI}
% During preprocessing, we identified several limitations within the HISTAI dataset that may have impacted model performance. A notable issue was the presence of cases where image-text pairs were mismatched or contained entirely incorrect diagnoses. This aligns with reports from other users, suggesting that the dataset occasionally provides incorrect correlations. Although likely limited to a small subset, we could not manually verify the entire dataset. Furthermore, several slides exhibited physical artefacts or processing errors, such as scanned glass slide labels, pen marks, or incorrect tiling that caused tissue overlap. Finally, we encountered inconsistencies where reports referenced slides that were missing from the physical dataset. For instance, a report might describe two slides, but only one is provided. This could cause the model to look for visual features described in the text that are not present in the available image.
% \begin{enumerate}
%     \item Some HISTAI cases with wrong diagnosis entirely
%     \item strange artifacts on slides (penmarks, name of glass slide present in scan, incorrect tiling showing on software)
%     \item mismatching micro reports and diagnosis
%     \item slides are missing (less cases available then are mentioned in the report so the model learns things from images that are not there).
% \end{enumerate}

% \paragraph{SlideBench}
% % ANTONI performance:
% % Diagnosis: 21.82\% (127/582)
% % Microscopy: 36.42\% (59/162)
% % Clinical: 52.24\% (35/67)

% % Why does ANTONI score bad in the diagnosis?
% We hypothesize that the performance degradation observed on SlideBench, which is characterised by strong results in general 'Clinical' reasoning but sub-random performance in 'Diagnosis' and 'Grading', originates to a modality misalignment introduced during instruction tuning on HISTAI. Our architecture integrates PRISM for generating slide-level embeddings, projecting these visual features into a fine-tuned MedGemma model. However, the presence of noisy data in HISTAI (as mentioned earlier) likely prevented the projection layer from learning a reliable mapping between visual and textual representations. Consequently, during training, the model may have learned to treat the visual embedding as unreliable noise, deciding instead to minimize loss by over-relying on MedGemma's textual shortcuts, which is known to be a characteristic of VLMs \cite{fu2025hiddenplainsightvlms}. Furthermore, we do not explicitly train on 'Grading' and 'Staging', nor did we follow the specific standards used in the benchmark for these tasks. This explains why the model succeeds at 'Risk Factors' and 'Treatment Guidance'. These tasks are often solvable via general medical knowledge. However, where precise visual features are key, for example, on 'Staging' and 'Grading' categories, answers cannot simply be derived from the textual prompt. Since we solely train on HISTAI data, our current setup appears insufficient for generalization to these external standards.

% \section{Conclusion}
% In this work we introduced three open-source contributions: 1) \emph{Polysome}, a domain-agnostic tool for transforming unstructured text into rich instruction-tuning data, 2) \emph{HISTAI-Instruct}, the largest pathology instruction-tuning dataset to date, spanning over 1.1M conversation pairs generated using Polysome, and 3) \emph{ANTONI-$\alpha$}, a set of VLMs trained on our instruction data using three different HISTAI slide subsets. We show that, despite inherent limitations in the HISTAI dataset, the fine-tuned ANTONI-$\alpha$ models consistently outperform MedGemma on organ identification, neuplasm detection and differential diagnosis tasks, with the 18k configuration showing the most consistent performance across all tasks. With this work, we lay the groundwork for future development of powerful and versatile VLMs in digital pathology and contribute to the open-source community.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is funded by the 2024 Ammodo Science Award for groundbreaking research.}


\bibliography{midl26_305}


\appendix


\section{Prompts}
\label{appendix:prompts}

The prompts used by Polysome, as well as the workflow configurations, to generate HISTAI-Instruct can be found in the \href{https://github.com/computationalpathologygroup/Polysome}{Polysome GitHub repository} (in the \verb|histai-instruct-prompts/| and \verb|histai_instruct_workflows| folders).

\subsection{LLM-as-a-Judge Prompt for Rating the Generated Data}
We used the following prompt to curate the generated instruction-tuning dataset. We removed any samples that failed to achieve a factual groundedness score of at least 3 or a constraint adherence score of 1. Although we did not use reasoning clarity to filter the data, it provides a metric for inspecting the logical quality of the instructions. For LLM inference on this prompt, Polysome was used.

\begin{Verbatim}[breaklines=true, breakanywhere=true]
You are an expert pathology data analyst. Your task is to evaluate the quality of a synthetically generated question-answering sample based on a source pathology report.

Carefully compare the Generated Conversation against the Source Report according to the detailed Evaluation Rubric below.Source Report:{
  "icd10": "{{ icd10 }}",
  "icd10_text": "{{ icd10_text }}",
  "micro_protocol": "{{ micro_protocol }}",
  "conclusion": "{{ conclusion }}",
  "diff_diagnostic": "{{ diff_diagnostic }}"
}
Generated Conversation:{{ generated_text }}

## Evaluation Rubric

1. Constraint Adherence (Binary Score)
Does the assistant's response strictly adhere to the negative constraint of using only microscopic findings? (Ignore the user's question for this rubric).
- Score 1 (Adherent): The assistant's answer exclusively discusses features visible under a microscope as provided in the source.
- Score 0 (Non-Adherent): The assistant's answer mentions any information not visible microscopically (e.g., patient demographics, clinical history, specimen dimensions, anatomical location).

2. Factual Groundedness and Accuracy (1-5 Scale)
How accurately does the assistant's answer reflect the facts provided in the Source Report, without adding or contradicting information?
- Score 5 (Excellent): Perfectly reflects all relevant source facts. The answer is completely grounded in the source; it does not omit key details, contradict the source, or introduce any information not present in the source.
- Score 4 (Good): All stated facts are correct and grounded, but there is a minor omission of a non-critical detail from the source.
- Score 3 (Acceptable): Contains a significant omission of a key finding from the source, OR introduces a minor, plausible piece of information not found in the source (minor hallucination).
- Score 2 (Poor): Contains a clear factual error that contradicts the source material OR introduces a significant piece of information not found in the source (significant hallucination).
- Score 1 (Very Poor): Contains multiple factual contradictions, dangerous hallucinations, or fundamentally misrepresents the source conclusion.

3. Reasoning Quality & Clarity (1-3 Scale)
How clear, logical, and well-structured is the assistant's reasoning?
- Score 3 (Excellent): The reasoning is clear, logically flows from observation to implication, and is easy to understand.
- Score 2 (Acceptable): The reasoning is generally correct but may be slightly confusing, verbose, or poorly structured.
- Score 1 (Poor): The reasoning is unclear, illogical, convoluted, or incoherent.

Task
First, provide a concise, step-by-step analysis comparing the Generated Conversation to the Source Report. In your reasoning, explicitly justify the score you will assign for each rubric item.
Second, provide the final JSON object with your ratings and justifications.

Final Output in this exact format, with the step-by-step reasoning inside of the json.:
{
  "step-by-step-reasoning": "<Your brief thinking process and justification for the scores goes here.>",
  "evaluation_scores": {
    "constraint_adherence": {
      "score": <integer_score_0_or_1>,
      "justification": "<Briefly justify the score. e.g., 'Adherent. Assistant response is purely microscopic.' or 'Non-adherent, assistant mentions patient age.'>"
    },
    "factual_groundedness_and_accuracy": {
      "score": <integer_score_1_to_5>,
      "justification": "<Briefly justify the score. e.g., 'Fully grounded and accurate.' or 'Introduces information about necrosis not found in the source report.'>"
    },
    "reasoning_clarity": {
      "score": <integer_score_1_to_3>,
      "justification": "<Briefly justify the score. e.g., 'Clear logical flow.' or 'Reasoning is convoluted and hard to follow.'>"
    }
  }
}

Start your response with the opening bracket `{` and end with the closing bracket `}`.

\end{Verbatim}

\section{Evaluation Dataset Statistics}
\label{appendix:eval}

The evaluation set consists of 317 total pathology cases covering 16 unique organs. The dataset is predominantly neoplastic (68.8\%). Regarding the difficulty of the diagnosis generation task, the questions contain an average of 3.72 options per case (predominantly 4 options), resulting in a random chance baseline accuracy of 26.89\%.

\begin{table}[h]
    \centering
    \caption{Complete Organ Distribution of the Evaluation Set (N=317).}
    \label{tab:organ_dist}
    \small % Slightly smaller font to fit columns neatly
    \begin{tabular}{lrlr}
        \toprule
        \textbf{Organ} & \textbf{N (\%)} & \textbf{Organ} & \textbf{N (\%)} \\
        \midrule
        Skin & 141 (44.5\%) & Bone marrow & 3 (0.9\%) \\
        Breast & 92 (29.0\%) & Tonsil & 3 (0.9\%) \\
        Colon & 48 (15.1\%) & Intestine & 2 (0.6\%) \\
        Lymph node & 10 (3.2\%) & Duodenum & 1 (0.3\%) \\
        Soft tissue & 6 (1.9\%) & Liver & 1 (0.3\%) \\
        Lung & 4 (1.3\%) & Brain & 1 (0.3\%) \\
        Rectum & 3 (0.9\%) & Bronchial epithelium & 1 (0.3\%) \\
         & & Ovary & 1 (0.3\%) \\
        \bottomrule
    \end{tabular}
\end{table}

\section{Polysome Simplified End-to-End Example}
\label{sec:polysome_figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=1\linewidth]{images/methods/polysome-end-to-end.drawio.png}
    %\vspace{-0.8cm}
    \caption{The main components of Polysome in a simplified flow: an input dataset (green) is processed through a JSON-configured workflow (blue) containing three nodes. A data loading node standardizes the input CSV into an internal structure. Two sequential text prompt nodes then generate an English conversation and translate it to Dutch, each guided by Jinja2 prompt templates (yellow). Finally, a combine node merges outputs into a JSONL dataset (red).}

    \label{fig:figure_polysome}
\end{figure}


\section{Qualitative Analysis of ANTONI-\texorpdfstring{$\alpha$}{alpha}'s Hallucination Tendencies}
\label{appendix:hallucination_analysis}

To evaluate ANTONI-$\alpha$'s behaviour when confronted with information beyond the scope of H\&E morphology, we conducted a targeted analysis using a dermatology case (histai-skin-b2/case\_02716). We queried the model about molecular markers, prognostic indicators, and absent visual features to assess its tendency to hallucinate or appropriately acknowledge limitations.

\subsection{Appropriate Acknowledgment of Limitations}

\textbf{KRAS Mutation Status.} When asked about KRAS mutation status, ANTONI-$\alpha$ correctly acknowledged that molecular genetic information cannot be determined from H\&E morphology alone:

\begin{quote}
\textit{``Based on the microscopic findings, we can observe features consistent with a keratoacanthoma [...] However, determining the KRAS mutation status [...] is not possible from microscopic examination alone. KRAS mutations are typically identified through molecular genetic testing, such as PCR or sequencing.''}
\end{quote}

Even under follow-up questioning with leading prompts (``Are you certain about this mutation status?''), the model maintained its position, demonstrating robustness against pressure to confabulate molecular data.

\vspace{1mm}
\textbf{5-Year Survival Prediction.} When queried about prognosis, the model appropriately declined to provide a survival rate:

\begin{quote}
\textit{``Determining a 5-year survival rate based solely on microscopic findings from this biopsy is not possible. While we can observe features like the presence of a nevus, the degree of atypia, and the presence of mitoses, these findings alone don't allow us to predict long-term outcomes.''}
\end{quote}

The model correctly identified that survival depends on clinical factors beyond histopathological examination, including patient health status, disease stage, and treatment response.

\subsection{Problematic Hallucinations}

\textbf{HER2 Status Confabulation.} When asked about HER2 status from H\&E images, ANTONI-$\alpha$ hallucinated the presence of immunohistochemical staining:

\begin{quote}
\textit{``Based on the microscopic findings, we can see evidence of HER2 protein overexpression in the tumor cells. Specifically, there's a noticeable staining pattern indicating a high level of HER2 protein expression.''}
\end{quote}

Although research shows HER2 can be predicted from H\&E via sub-visual patterns \cite{LU2022102486, labarbera2020detection}, these are not `visible' as the model claims. At a minimum, the model fabricated specific observations (``granular, dot-like pattern'') that mimic IHC/FISH terminology but are physically impossible to detect on H\&E morphology.

\vspace{1mm}
\textbf{Inflammatory Cell Subtyping.} When asked to specify immune cell types infiltrating the tumor, the model provided specific immunophenotypic information:

\begin{quote}
\textit{``The lymphocytes appear to be predominantly CD4+ T cells [...] I also observe a smaller number of CD8+ T cells, which are cytotoxic T cells.''}
\end{quote}

This level of detail is impossible to determine from H\&E staining alone, as CD4+ and CD8+ distinction requires immunohistochemistry. The model fabricated specific cell populations with apparent confidence.

\vspace{1mm}
\textbf{Quantitative Fabrication.} When asked about necrosis extent and mitotic count, the model provided specific percentages and counts despite these features not being prominent in the actual case:

\begin{quote}
\textit{``Based on my observations, the necrosis appears to be present in approximately 10\% of the tissue examined.'' [...] ``I estimate there are approximately 10 to 15 mitoses per high-power field.''}
\end{quote}

The case description noted only ``mitotic figures are typical and rare,'' making the claimed 10--15 mitoses per HPF a significant overestimation.

\section{External Validation on COBRA}
\label{appendix:cobra}
To assess the generalisability of our approach, we evaluated performance on an external public dataset. We used the publicly available COBRA dataset \cite{cobra2023}, which comprises over 7,000 histopathology WSIs related to the diagnosis of basal cell carcinoma. Initial results on a subset of 50 cases (50\% cancer) show that ANTONI-$\alpha$ outperforms MedGemma-4B on a binary classification prompt (cancer yes/no), achieving an F1 of .732 (ANTONI-$\alpha$) versus .667 (MedGemma).

In this analysis, we observed a large sensitivity to prompt formulation and to whether the model reasons prior to classification. To quantify this effect, we conducted a small-scale prompt variation study on the same subset. Starting from the baseline prompt \textit{``Is there cancer present in the image of the histological slide? Strictly answer with `Yes' or `No'.''}, we substituted the key term \textit{cancer} with \textit{cancerous tissue} and \textit{malignancy}, and additionally rephrased the prompt to \textit{``Can you identify cancer in the image [...]?''}. Table~\ref{tab:prompt_variation} reports F1 scores as percentage-point differences relative to each model's baseline.
For ANTONI-$\alpha$, all prompt variations led to improved F1 scores, with \textit{malignancy} yielding the largest gain (+10.2~pp) and \textit{cancerous tissue} and the rephrased prompt showing comparable improvements (+8.2~pp and +9.6~pp, respectively). MedGemma showed a more inconsistent pattern: \textit{malignancy} produced no change, \textit{cancerous tissue} yielded a slight gain (+3.3~pp), while the rephrased prompt led to the largest improvement (+9.5~pp). Notably, ANTONI-$\alpha$ outperformed MedGemma across all prompt formulations.

These findings highlight a well-known issue in the field regarding prompt sensitivity. While ANTONI-$\alpha$ appears more robust to lexical substitution than MedGemma, both models exhibit non-trivial performance variation across semantically equivalent prompts. We leave further investigation of prompt robustness for future research.

\begin{table}[h]
\centering
\caption{F1 scores for binary cancer classification across prompt variations. $\Delta$ denotes percentage-point difference from each model's baseline prompt (\textit{``cancer''}).}
\label{tab:prompt_variation}
\begin{tabular}{lcccc}
\toprule
& \multicolumn{2}{c}{ANTONI-$\alpha$} & \multicolumn{2}{c}{MedGemma} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
Prompt variant & F1 & $\Delta$ & F1 & $\Delta$ \\
\midrule
\textit{cancer} (baseline) & .732 & --- & .667 & --- \\
\textit{cancerous tissue} & .814 & +8.2 & .700 & +3.3 \\
\textit{malignancy} & .833 & +10.2 & .667 & 0.0 \\
\textit{can you identify cancer [...]?} & .828 & +9.6 & .762 & +9.5 \\
\bottomrule
\end{tabular}
\end{table}


% \section{Images}
% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.5\linewidth]{images/appendix/Artefacts.pdf}
%     \caption{Two cases of HISTAI images with a) incorrect tiling such that the tissue becomes distorted and b) glass slide artefacts covering parts of the tissue.}
%     \label{fig:appendix-artefacts}
% \end{figure}

% \section{Proof of Theorem 1}

% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recommend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}


\end{document}
