\documentclass{midl}

\usepackage{float}
\usepackage{multirow}
\usepackage{array}
\usepackage{tcolorbox}
\tcbuselibrary{listings, skins, breakable}
\usepackage{array, booktabs}
\usepackage[table,xcdraw]{xcolor}
\usepackage{graphicx}
\usepackage{enumitem}
\usepackage{pdflscape}
\usepackage{rotating}
% \tinysmall: switch the current font to 7.5pt type set on a 6.5pt baseline skip.
% NOTE(review): the baseline skip is smaller than the font size, so multi-line
% text will be set very tightly -- confirm this is intended.
\newcommand{\tinysmall}{%
  \fontsize{7.5pt}{6.5pt}%
  \selectfont
}
\usepackage{lipsum}
\usepackage{pifont}
% Tick and cross glyphs, produced with pifont's \ding.
\newcommand{\cmark}{%
  \ding{51}% check mark
}
\newcommand{\xmark}{%
  \ding{55}% cross mark
}
\usepackage{longtable}
\usepackage{placeins}

\usepackage{xcolor}
\definecolor{conceptPurple}{HTML}{39275B}
\definecolor{survivalGold}{HTML}{C79900}

\usepackage{arydshln} % dotted lines in tables

\usepackage{enumitem}

\usepackage{xcolor}
\usepackage{soul}  % for \sethlcolor and \hl

% --- Define highlight colors ---
\definecolor{correctgreen}{RGB}{210,255,210}
\definecolor{almostyellow}{RGB}{255,255,180}
\definecolor{incorrectred}{RGB}{255,210,210}
\definecolor{lightgrayhl}{RGB}{230,230,230}

% --- Highlight commands ---
% \correct{<text>}, \almost{<text>}, \incorrect{<text>}: highlight <text> with
% the green, yellow or red background colour defined above.
% \sethlcolor only redefines soul's current highlight colour, so without a
% group the last colour used would silently carry over to any later plain \hl.
% The \begingroup...\endgroup pair keeps each colour choice local to the call.
\newcommand{\correct}[1]{\begingroup\sethlcolor{correctgreen}\hl{#1}\endgroup}
\newcommand{\almost}[1]{\begingroup\sethlcolor{almostyellow}\hl{#1}\endgroup}
\newcommand{\incorrect}[1]{\begingroup\sethlcolor{incorrectred}\hl{#1}\endgroup}

% --- Neutral highlight command ---
% \lightgray{<text>}: highlight <text> with the lightgrayhl background colour.
% The group keeps \sethlcolor from changing the colour of later \hl calls.
\newcommand{\lightgray}[1]{\begingroup\sethlcolor{lightgrayhl}\hl{#1}\endgroup}

\definecolor{btpurple}{HTML}{39275B}
% btprompt: breakable, square-cornered box with a thin black frame and a very
% light grey background; the body is set in small monospace type and the box
% carries the fixed title "BTReport Prompt" in bold, large, white type.
\newtcolorbox{btprompt}{%
    enhanced, breakable,
    % --- frame and background ---
    colback=black!2,
    colframe=black,
    boxrule=0.7pt,
    arc=0pt,
    outer arc=0pt,
    % --- inner padding ---
    left=8pt,
    right=8pt,
    top=10pt,
    bottom=10pt,
    % --- body font ---
    fontupper=\ttfamily\small,
    % --- title ---
    title=BTReport Prompt,
    fonttitle=\bfseries\large\color{white},
    % NOTE(review): as far as I can tell, tcolorbox only applies
    % `boxed title style` when the title is attached with one of the
    % `attach boxed title to ...` keys, which is not set here -- confirm
    % whether the btpurple title styling below actually takes effect.
    boxed title style={%
        colback=btpurple, colframe=btpurple,
        arc=0pt, outer arc=0pt,
        top=3pt, bottom=3pt,
        left=8pt, right=8pt
    }
}

\usepackage{tcolorbox}
\usepackage{xcolor}
\usepackage{csquotes}

\usepackage{makecell}
\definecolor{headerblack}{HTML}{000000}
\definecolor{lightgraybg}{HTML}{F2F2F2}
\definecolor{dividergray}{HTML}{B5B5B5}

\usepackage{enumitem,amssymb}
\newlist{todolist}{itemize}{2}
\setlist[todolist]{label=$\square$}
\usepackage{pifont}

% \done: an empty square with a large, slightly raised \cmark drawn over it;
% the trailing negative space pulls the following material back by 2.5pt.
\newcommand{\done}{%
  \rlap{$\square$}%
  {\raisebox{2pt}{\large\hspace{1pt}\cmark}}%
  \hspace{-2.5pt}%
}
% \wontfix: an empty square with a large \xmark drawn over it.
\newcommand{\wontfix}{%
  \rlap{$\square$}%
  {\large\hspace{1pt}\xmark}%
}

\usepackage{tcolorbox}
\tcbuselibrary{breakable}
\tcbuselibrary{skins}
\tcbuselibrary{listings}
\tcbuselibrary{minted}

% FullReportBox: breakable white box with a thin black, rounded frame and no
% inner padding apart from a 6pt margin at the bottom.
\newtcolorbox{FullReportBox}{%
    enhanced, breakable,
    % frame and background
    colback=white, colframe=black,
    boxrule=0.8pt,
    arc=10pt, outer arc=10pt,
    % padding: flush on the left, right and top; 6pt at the bottom
    left=0pt, right=0pt,
    top=0pt, bottom=6pt,
    boxsep=0pt,
    interior style={left=0pt,right=0pt},
}

% FullReportBoxMP: variant of FullReportBox with the same frame, colours and
% padding, plus zero skip before and after the box and extra skin, breaking
% and title keys.
% NOTE(review): "MP" presumably stands for multi-page -- TODO confirm.
\newtcolorbox{FullReportBoxMP}{
    enhanced,
    breakable,
    % NOTE(review): both `enhanced` (above) and `enhanced jigsaw` are requested;
    % one of them is probably redundant -- confirm which skin is intended.
    enhanced jigsaw,
    colback=white,
    colframe=black,
    boxrule=0.8pt,
    arc=10pt,
    % NOTE(review): confirm the intended effect of `break at=0pt`.
    break at=0pt, 
    % NOTE(review): no title is set in this definition, so this key only
    % matters if a title is supplied elsewhere -- confirm it is needed.
    attach title to upper,
    % No extra vertical space before or after the box.
    before skip=0pt,
    after skip=0pt,
    outer arc=10pt,
    left=0pt, right=0pt,
    top=0pt, bottom=6pt,
    boxsep=0pt,
    interior style={left=0pt, right=0pt},
}

% \splitmodel{<top>}{<bottom>}: stack two bold labels on separate lines inside
% a top-aligned, centred, single-column tabular with no outer column padding.
\newcommand{\splitmodel}[2]{%
  \begin{tabular}[t]{@{}c@{}}\textbf{#1}\\\textbf{#2}\end{tabular}%
}

% \metrics{<ROUGE>}{<BLEU>}{<RATESCR>}{<BERTSCR>}{<TB-F1>}: typeset the five
% scores on one line in \scriptsize, each preceded by its gray label and
% separated by \quad.
\newcommand{\metrics}[5]{%
    {\scriptsize
    \textcolor{gray}{ROUGE:} #1\quad
    \textcolor{gray}{BLEU:} #2\quad
    \textcolor{gray}{RATESCR:} #3\quad
    \textcolor{gray}{BERTSCR:} #4\quad
    \textcolor{gray}{TB-F1:} #5}% <- protects the line end: no stray trailing space
}

\usepackage{mwe} % to get dummy images
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}

\jmlrvolume{-- 115}
% \editors{Under Review for MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[BTReport: A Framework for Brain Tumor Radiology Report Generation]{BTReport: A Framework for Brain Tumor Radiology Report Generation with Clinically Relevant Features}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Juampablo E. {Heras Rivera}\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \orcid{0000-0002-0205-6329} \Email{jehr@uw.edu}\\
\Name{Dickson T. Chen\midlotherjointauthor\nametag{$^{1}$}} \orcid{0000-0001-9433-077X} \Email{dtchen19@uw.edu}\\
\Name{Tianyi Ren\nametag{$^{1}$}} \orcid{0000-0001-9548-6645} \Email{tr1@uw.edu}\\
\Name{Daniel K. Low\nametag{$^{2}$}} \orcid{0000-0002-9519-8691} \Email{dalow@uw.edu} \\
\Name{Jacob Ruzevick\nametag{$^{2}$}} \Email{ruzevick@uw.edu}\\
\Name{Asma {Ben Abacha}\nametag{$^{3}$}}  \orcid{0000-0001-6312-9387} \Email{abenabacha@microsoft.com}\\
\Name{Alberto {Santamaria-Pang}\nametag{$^{3,4}$}} \orcid{0000-0003-4012-8394} \Email{alberto.santamariapang@microsoft.com}\\
\Name{Mehmet Kurt\nametag{$^{1}$}} \orcid{0000-0002-5618-0296} \Email{mkurt@uw.edu}\\ 
\addr $^{1}$ University of Washington \\
\addr $^{2}$  University of Washington School of Medicine \\
\addr $^{3}$  Microsoft Health AI \\
\addr $^{4}$  The Johns Hopkins University School of Medicine
}

\begin{document}

\maketitle
\begin{abstract}
Recent advances in radiology report generation (RRG) have been driven by large paired image-text datasets; however, progress in neuro-oncology RRG has been limited due to a scarcity of open paired image-report datasets. Here, we introduce BTReport, an open-source framework for brain tumor RRG that constructs natural language radiology reports using reliably extracted quantitative imaging features. Unlike existing approaches that rely on general-purpose or fine-tuned vision-language models for both image interpretation and report composition, BTReport performs deterministic feature extraction of clinically-relevant features, then uses large language models only for syntactic structuring and narrative synthesis. By separating RRG into deterministic feature extraction and report generation stages, synthetically generated reports are completely interpretable and contain reliable numerical measurements, a key component lacking in existing RRG frameworks. We validate the clinical relevance of BTReport-derived features, and demonstrate that BTReport-generated reports more closely resemble reference clinical reports when compared to existing baseline RRG methods. To further research in neuro-oncology RRG, we introduce BTReport-BraTS, a companion dataset that augments BraTS imaging with synthetic radiology reports generated with BTReport, and BTReview, a web-based platform for validating the clinical quality of synthetically generated radiology reports. Code for this project can be found at: \url{https://github.com/KurtLabUW/BTReport}.
\end{abstract}

\begin{keywords}
Brain MRI, Radiology report generation, VASARI, Midline shift, Open dataset, Multimodal learning, Neuro-oncology
\end{keywords}

\section{Introduction}

Radiology is a medical specialty that employs a variety of imaging modalities (e.g., X-ray, computed tomography (CT), multi-parametric magnetic resonance imaging (mpMRI)) for the detection and monitoring of human disease. The radiology report contains a detailed summary of imaging findings, providing insights into a patient's condition crucial for diagnosis and clinical decision making. With a growing aging population, the demand for radiology services is expected to increase by 16.9\% to 26.9\% by 2055, while attrition in radiology also continues to increase \cite{christensen2025projected}. As the gap between physician workload and available workforce widens, assisted radiology report generation (RRG) is positioned to help address this unmet clinical need.

RRG leverages advances in artificial intelligence (AI) to extract quantitative imaging markers from raw unstructured data in an automated manner. In clinical workflows, RRG promises to improve data quality, repeatability, factual completeness, and timeliness of radiology reporting. The adoption of vision-language models (VLMs) in RRG has led to substantial advances, allowing models to jointly reason over medical images and textual findings \cite{cxr_reportgen, cxr_reportgen2, sellergren2025medgemma}. These developments have been primarily driven by large-scale chest X-ray datasets such as MIMIC-CXR \cite{mimiccxr} and IU X-ray \cite{iuxray}, which provide over 300,000 chest X-ray images paired with clinical reports. However, accessible image-report datasets across other radiology specialties are limited. For instance, in neuro-oncology, large neuroimaging datasets are made openly available through efforts such as BraTS \cite{brats24}, but paired text reports for VLM training remain lacking. 

Advances in computer vision have automated glioblastoma (GBM) segmentation from mpMRI, enabling reliable quantification of the contrast-enhancing lesion, necrotic core, and peritumoral edema \cite{menze2014multimodal}. However, segmentations alone lack the clinical context needed to evaluate the impact of tumor on the surrounding brain environment. Neuro-radiologists supplement their interpretation with additional imaging-derived evidence, such as subregion involvement and midline shift (MLS) measurements, among others. Together, these descriptors provide comprehensive brain tumor characterization, and improve risk stratification, treatment planning, and outcome prediction. Neuro-oncology RRG models should reliably include these features to more closely align with radiologist expectations.

Here we introduce BTReport, a two-stage framework for neuro-oncology RRG grounded in clinically relevant quantitative features. BTReport first deterministically extracts imaging markers from mpMRI including patient metadata, VASARI (Visually AcceSAble Rembrandt Images) features, and automated 3D midline shift measurement. These markers are then provided as structured inputs to large language models (LLMs) for clinical reasoning and synthetic report generation. Because quantification is performed algorithmically upstream, BTReport enables measurement-grounded reporting without requiring task-specific fine-tuning of vision encoders or VLMs. This design addresses a key challenge in neuro-oncology RRG: clinical reports frequently include quantitative measurements such as lesion size, volume, and midline shift, which may not be reliably extracted by generic vision encoders without explicit training for measurement and precise spatial reasoning \cite{chen2024spatialvlm}. By grounding report generation in deterministically extracted imaging features, our framework is directly interpretable and reduces the likelihood of critical detail omission \cite{wu2025first}.

Our contributions are as follows: \textbf{a) }a scalable brain tumor mpMRI radiology report generation framework driven by deterministically extracted neuroimaging features (Section \ref{sec: btreport}), \textbf{b)} a robust, interpretable 3D midline shift (MLS) estimation algorithm (Section \ref{sec: midline}), \textbf{c) }clinical validation of BTReport-derived features demonstrated with semantic clustering of radiology concepts from reference reports, and retrospective modeling of overall survival (Section \ref{sec: survpred}), and \textbf{d)} release of BTReport-BraTS, an open-source image-report companion dataset augmenting BraTS cases with clinically grounded anatomical and pathological descriptors  (Appendix \ref{sec: btreportbrats}).

\section{Related Work}
\label{sec: related work}
A variety of approaches have been proposed for the task of image-paired RRG and generally fall into one- and two-stage frameworks. The leading paradigm for RRG involves training monolithic VLM foundation models to extract image features and generate reports in one step, such as in MedGemma \cite{sellergren2025medgemma} and MedPaLM-2 \cite{medpalm2}. Approaches in neuro-oncology following this paradigm include TextBraTS \cite{textbrats}, which directly prompts GPT-4 models \cite{achiam2023gpt} with videos of 2D mpMRI axial slices alongside tumor segmentation masks derived from FLAIR imaging.  Radiologists refine these annotations into structured textual labels, which are then used to guide tumor segmentation models toward clinically relevant regions, improving segmentation performance. Although these annotations improved segmentation performance, they do not contain quantitative measurements typically reported by radiologists.

Other image-paired RRG frameworks adopt a two-stage design. For example, \textit{From Segmentation to Explanation} (\textit{Seg-to-Exp}) \cite{seg2exp} first establishes descriptive tumor-ROI relationships based on the co-registration of tumor segmentation masks to an anatomical atlas. These relationships are extracted as structured features that describe the percent overlap of the brain tumor with ROIs it occupies, and then provided to an LLM for report generation. The generated reports provide details about regional tumor impact and offer insightful clinical implications based on the functional role associated with identified tumor-ROI relationships. However,  generated reports do not resemble the standard narrative structuring of radiology reports authored by fellowship trained neuro-radiologists. Additionally, \textit{Seg-to-Exp} reports do not describe additional clinically-relevant imaging observations beyond anatomical tumor location.

\textit{AutoRG-Brain} \cite{autorg} also adopts a two-stage framework.  In the first stage, anomaly ROI masks are generated and mapped to specific ROIs based on anatomical segmentations. In the second stage, a fine-tuned VLM takes embedded anomaly ROI masks as visual prompts for RRG. This approach uses deterministic features to focus the VLM, generating region-specific findings grounded in areas of the image where anomalies were detected. Based on this design, generated reports are largely descriptive, with findings expressed through signed magnitude operators (e.g., \enquote{enlarged}, \enquote{high signal}) that encode comparative attribute relationships between detected anomalies and their anatomical location. Importantly, describing an imaging abnormality in relation to anatomical context does not, by itself, establish the clinical context required for higher-order clinical interpretation. As a result, observations from generated reports are not framed in a clinically meaningful way, limiting their utility in informing downstream clinical decision making. Furthermore, reference reports used for fine-tuning contain a limited set of clinically-relevant features.

The framework proposed in \textit{RadGPT} \cite{bassi2025radgpt} first deterministically extracts clinically-relevant features from CT scans of abdominal tumors, then uses LLMs for syntactic structuring. This approach produces radiology reports that more closely resemble reference reports when compared to end-to-end report generation models. However, it remains unclear whether a framework similar to \textit{RadGPT} can be applied for neuro-oncology RRG based on deterministically extracted features from mpMRI.

\section{Data}

This study uses two complementary datasets to support radiology report generation and validation of BTReport-derived features. First, the HuskyBrain dataset is a retrospective cohort of fully de-identified pre-operative mpMRI studies paired with radiologist-authored reports, serving as the primary resource for training and evaluation of neuro-oncology RRG frameworks. The second dataset is a subset of BraTS'23 cases, which enable survival analyses with imaging and clinical metadata. Together, these datasets support the evaluation of synthetic report clinical quality and the use of deterministic neuroimaging features for survival outcome prediction.

\subsection{HuskyBrain Dataset}

\label{preprocessing}
We collected pre-operative mpMRI scans and radiology reports from a retrospective cohort of GBM patients (n=184) treated at the University of Washington Medical Center (UWMC), an academic medical institution serving patients across the WWAMI region (Washington, Wyoming, Alaska, Montana, and Idaho). Inclusion criteria were: 1) confirmed histopathologic diagnosis of GBM; 2) availability of pre-operative mpMRI sequences including T1, T2, T1c, and T2-FLAIR; and 3) a corresponding pre-operative diagnostic radiology report. Images were pre-processed using CaPTk \cite{Davatzikos2018CIPT, Pati2020CaPTk}, following BraTS 2017-2023 procedures: DICOM to NIfTI conversion, SRI24 co-registration, 1 mm isotropic resampling, and skull-stripping. Reference reports were authored by fellowship-trained radiologists and selected as the clinical reference standard. For each case, the HuskyBrain dataset contains de-identified MRI sequences, radiologist-authored reports stripped of protected health information, and tumor masks.

\subsection{Survival Analysis Dataset}
\label{subsec: SurvivalPredictionData}
From the BraTS'23 dataset \cite{brats23}, a smaller cohort of mpMRI cases (n=461) was used for survival analyses. Cases were selected based on the availability of five minimum metadata entries: (1) age at initial diagnosis, (2) biological sex, (3) confirmed methylation status of O6-methylguanine-DNA methyltransferase (\textit{MGMT}), (4) mutation status of isocitrate dehydrogenase 1 (\textit{IDH1}), and (5) overall survival or equivalent survival quantification representing the number of days between radiological diagnosis and known death. We collected these demographic and genomic features from multiple publicly available collections of GBM cases, including those from the University of California San Francisco (UCSF-PDGM) \cite{UCSF-PDGM}, University of Pennsylvania (UPenn-GBM) \cite{UPenn-GBM}, Clinical Proteomic Tumor Analysis Consortium (CPTAC-GBM) \cite{CPTAC-GBM}, and The Cancer Genome Atlas from the Cancer Imaging Archive (TCGA-LGG \& TCGA-GBM) \cite{TCGA-LGG, TCGA-GBM}.

\section{Methods}
\label{methods}

\subsection{BTReport Framework}

\label{sec: btreport}

BTReport is a two-stage approach for neuro-oncology RRG that first deterministically extracts interpretable and clinically relevant neuroimaging features from mpMRI, then uses an LLM for syntactically structured narrative construction and report generation. Since quantitative measurements are central to radiological evaluation, BTReport relies on validated open-sourced algorithms to derive descriptors of anatomy and tumor pathology, rather than performing end-to-end medical inference with a VLM. 

\begin{figure}[H]
    \centering
    \resizebox{0.99\linewidth}{!}{%
        \includegraphics{figures/btreport_v6.png}%
    }
    \caption{BTReport Overview: Interpretable, clinically meaningful variables are deterministically extracted for each case, including demographics, VASARI features, and 3D midline shift measurements. These features are utilized by context-guided LLMs for clinically grounded radiology report generation.}
    \label{fig: btreport}
\end{figure}

\noindent For a given case, the BTReport framework (Figure \ref{fig: btreport}) consists of four steps:
\\
\begin{enumerate}[topsep=0pt, partopsep=0pt, parsep=0pt, itemsep=2pt]

\item T1-weighted scans and tumor masks are used to generate tumor-robust anatomical segmentations, from which regional volumetric statistics are computed (Section~\ref{sec: seg stats}). %Additional metadata such as patient demographics can be included if provided.

\item Midline shift is quantified by propagating a hand-annotated MNI152 midline into subject space using the MNI152-to-subject deformation field (Section~\ref{sec: midline}).

\item Anatomical segmentations (Section~\ref{sec: seg stats}) are used to extract VASARI (Visually AcceSAble Rembrandt Images) features using a modified VASARI-auto pipeline \cite{vasari-auto}, yielding standardized tumor- and anatomy-specific descriptors (Section~\ref{sec: VASARI}).

\item  A general-purpose LLM is provided a context-specific prompt and tasked with generating structured radiology reports, with generation conditioned on deterministic BTReport-extracted features (Section~\ref{sec: llm reportgen}).
\end{enumerate}

\noindent

\subsection{Segmentation Statistics}
\label{sec: seg stats}
\paragraph{BTReport Inputs:} For a given case, inputs to the BTReport framework include their T1 sequence and corresponding tumor segmentation masks. Tumor segmentation masks are derived from manual annotation or automated segmentation, and tumor sub-region annotations (necrotic core, edema, enhancing tumor) follow BraTS convention. 
 
\paragraph{Robust Anatomical Segmentations \& Volumetric Statistics:} Anatomical segmentations provide tumor-ROI relational context by localizing tumors with respect to critical brain structures. However, many automated methods assume near-normal anatomy and can fail when large tumors deform tissue and obscure boundaries. Prior work improves tumor-robust segmentations either by training with synthetic tumor-mask augmentation \cite{autorg}, or by synthesizing pseudo-healthy structural images that suppress tumor appearance, increasing the reliability of established segmentation pipelines \cite{iglesias2023synthsr}. Our approach closely resembles the latter. 

First, we register the MNI152 atlas \cite{mnicollins1999animal, mnifonov2011unbiased, mnifonov2009unbiased} to subject space using SynthMorph \cite{synthmorph}. The resulting warped atlas, referred to as MNI152-to-subject, provides a pseudo-healthy anatomical representation of the brain. Next, we obtain anatomical segmentations by running SynthSeg \cite{billot2023synthseg} on the MNI152-to-subject volume. As SynthSeg operates on the tumor-free brain representations, anatomical segmentations remain robust when tumors deform native anatomy. Finally, anatomical labels are merged with the tumor and midline segmentations, to produce a unified mask that jointly represents normal neuroanatomy and pathological structures. With this joint segmentation, we can reliably extract quantitative features including tumor and ventricle volume, lesion count, tumor sub-region proportions, and tumor-ROI overlap.

\subsection{3D Midline Shift Measurement via Atlas-based Segmentation}
\label{sec: midline}

Midline shift (MLS) is an intracranial pathology characterized by the displacement of brain tissue across the skull's midsagittal axis. MLS arises as a result of traumatic brain injury or tumor mass effects and is an indirect indicator of elevated intracranial pressure. Estimation of MLS is done by identifying the axial slice with the largest deviation, as indicated by midline structures such as the septum pellucidum, the third and fourth ventricles, and the falx cerebri. However, this estimation is subject to high inter-rater variability as there is not a standard procedure for axial slice level selection.

Here, we propose a novel pipeline for MLS estimation based on clinical guidelines, using a deep learning atlas-based segmentation approach. Our approach, shown in Figure \ref{fig:mls}, leverages the robust registration capabilities of SynthMorph \cite{synthmorph} to register hand-annotated midline segmentations from a MNI152 atlas template onto patient T1 scans. These are compared to an \enquote{ideal} midline defined by connecting the anterior and posterior points of the falx cerebri for each axial slice. By calculating the distance between the ideal and subject midlines at each voxel, we obtain accurate 3D MLS estimations in seconds, giving a more complete picture in comparison to 2D automated or manual methods. This approach has strong zero-shot generalization and can be applied to MRI or CT scans.

\begin{figure}[H]
    \centering
    \resizebox{0.95\linewidth}{!}{%
        \includegraphics{figures/mls3.png}%
    }
    \caption{
Atlas-based 3D midline shift (MLS) estimation using SynthMorph \cite{synthmorph}, in which atlas midline annotations are registered to patient imaging and voxel-wise distances to an ideal midline axis are computed per axial slice.
    }
\label{fig:mls}
\end{figure}

\subsection{VASARI Feature Extraction}
\label{sec: VASARI}

To standardize neuroimaging-derived glioma characterization and improve repeatability, the VASARI (Visually AcceSAble Rembrandt Images) feature set was developed by The Cancer Imaging Archive. The VASARI feature set uses controlled vocabulary to quantitatively describe anatomical relationships between GBM tumors and clinically relevant brain structures established in the literature. Furthermore, these relationships are also routinely included in neuroradiology reports and used by neurosurgeons to assess whether patients are candidates for surgical intervention. VASARI features have been used to accurately predict tumor histological grade (WHO I-IV grade), disease progression, molecular mutation status (e.g., \textit{IDH1} WT/mutant, \textit{MGMT} un/methylated), risk of recurrence, and overall patient survival \cite{jain2014outcome, nicolasjilwan2015addition, peeken2019combining, setyawan2024beyond, wang2021preoperative, zhou2017mri}. Here, we employ a modified variant of VASARI-auto \cite{vasari-auto}, an automated labeling tool, which has been validated as non-inferior to radiologist VASARI annotations, and can be used to reduce inter-rater variance. The included variations use the subject-space anatomical and midline segmentations extracted in Sections \ref{sec: seg stats} and \ref{sec: midline} to improve feature accuracy.

\subsection{BTReport Feature Validation}
To determine whether extracted imaging features are clinically relevant, we assessed them based on two criteria: (1) whether features correspond to commonly reported radiological concepts (Section \ref{sec: embed_analysis}) and (2) whether they carry prognostic value with respect to patient survival (Section \ref{sec: survpred}). To validate the accuracy of extracted features, we compare the features reported in generated reports against those found in the associated radiologist-authored ground truth reports, assessing categorical and numerical feature precision and error.

\subsubsection{Semantic Clustering of Common Radiology Findings}
\label{sec: embed_analysis}
Here, we derived a ranked list of the most frequently included topics in real radiology reports. First, we extract all discrete factual claims across all reports in the HuskyBrain dataset using the method proposed in TBFact \cite{blondeel2025healthcare}. TBFact uses an LLM (in this case DeepSeek-R1 \cite{guo2025deepseek}) to divide each clinical report into independently verifiable factual claims. Next, we embedded each extracted claim using a lightweight sentence-transformer model (all-MiniLM-L6-v2 \cite{reimers-2019-sentence-bert}) to obtain dense semantic representations. All embedded claims were pooled across the dataset and clustered using hierarchical agglomerative clustering (cosine distance, average linkage) without specifying the number of clusters a priori, allowing groups of semantically related statements to emerge from the data in an unsupervised manner. 

We then use a pre-trained Gemma 3 27B LLM \cite{team2025gemma} model to summarize claims in each cluster into representative topic sentences. For example, a cluster containing claims such as: [\enquote{The lesion measures 4.0 x 3.5 cm}, \enquote{The lesion measures 3.9 x 2.0 x 2.1 cm.}, ...], is summarized by the topic sentence \enquote{Lesion size and measurements reported.} The number of claims assigned to each cluster serves as an estimate of the frequency of the corresponding topic. The 35 most frequent cluster descriptors, along with representative example claims for each cluster, are reported in Appendix \ref{appendix: clusters}. 

Collectively, the resulting set of cluster descriptors and frequencies provides an interpretable, data-driven view of radiological concepts based on real-world reporting. We use these concepts to validate the clinical relevance of deterministically extracted features found in BTReport and assess their alignment with findings in real radiology reports.

\subsubsection{Survival Outcome Modeling}
\label{sec: survpred}
To assess whether BTReport-extracted features were predictive of clinical outcomes, we evaluated their association with overall survival measured from diagnosis. This analysis used the dataset described in Section~\ref{subsec: SurvivalPredictionData}, which provides survival outcomes and patient metadata for a subset of BraTS cases. Survival analyses were performed using Kaplan–Meier estimators implemented with the \texttt{lifelines} library \cite{lifelines}, with curves reported in Appendix~\ref{sec: KM plots}. Differences between survival groups were assessed using the log-rank test, and relative risk was quantified using Cox proportional hazards models, reporting hazard ratios with 95\% confidence intervals. Statistical significance was determined from p-values derived from both the log-rank tests and Cox models.

\subsubsection{Clinically Relevant Features Included in BTReport}
\label{sec: clinical features}
Table~\ref{tab:btreport_features} summarizes the quantitative imaging features used by BTReport for automated report generation. Each feature was evaluated according to two clinically motivated criteria: 1) whether it corresponded to at least one of the top 35 radiological concepts most frequently documented in reference reports from the HuskyBrain dataset (Section~\ref{sec: embed_analysis}), and 2) whether it was a statistically significant predictor of patient survival time based on the survival analysis described in Section~\ref{sec: survpred}. We found that 21/22 of the BTReport features are commonly reported in clinical reports, and 11/22 are predictive of overall patient survival. This analysis highlights the clinical relevance of extracted features, motivating their use as inputs for downstream RRG.

\begin{table}[H]
\centering

\caption{
Summary of quantitative features used in BTReport. %
\enspace Feature groups are color-coded:%
\enspace \colorbox{gray!20}{\strut gray} = segmentation statistics,%
\enspace \colorbox{green!20}{\strut green} = VASARI features,%
\enspace \colorbox{orange!20}{\strut orange} = midline features.%
\enspace Key: {\color{conceptPurple}\Large$\bullet$} indicates the feature is among the top 35 most frequently reported concepts in real radiology reports, and
{\color{survivalGold}\Large$\bullet$} indicates the feature is a significant survival predictor. Acronyms: WM-white matter; MLS-midline shift; ET-enhancing tumor.
}

\label{tab:btreport_features}

\scriptsize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{0.95}

\begin{tabular}{
p{3.6cm}cc
p{3.6cm}cc
p{3.6cm}cc
}
\toprule

% ===============================
% SEGMENTATION (GRAY)
% ===============================
\cellcolor{gray!20}Total tumor volume (mL)& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
\cellcolor{green!20}Ventricular Invasion& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
\cellcolor{orange!20}Level of max MLS& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} \\

\cellcolor{gray!20}3D Lesion Sizes (cm)& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{green!20}Side of Tumor Epicenter& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{orange!20}Max MLS (mm) + L/R& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} \\

\cellcolor{gray!20}Proportion of Necrosis& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{green!20}Enhancement Quality& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{orange!20}Edema crosses midline& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} \\

\cellcolor{gray!20}Number of lesions& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{green!20}Enhancement thickness& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{orange!20}ET Crosses midline& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} \\

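% ===============================
% REMAINING ROWS: column 1 finishes SEGMENTATION (GRAY), then switches to
% VASARI (GREEN); column 2 is VASARI (GREEN); column 3 is MIDLINE (ORANGE) or empty
% ===============================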
\cellcolor{gray!20}Proportion of Enhancing& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
\cellcolor{green!20}Multiple satellites present& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{orange!20}Asymmetrical Ventricles& {\color{conceptPurple}\large$\bullet$} & \\

\cellcolor{gray!20}Proportion of Edema& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{green!20}Multifocal or Multicentric& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
\cellcolor{orange!20}Enlarged Ventricles& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} \\

\cellcolor{green!20}Cortical involvement& {\color{conceptPurple}\large$\bullet$} & &
\cellcolor{green!20}Deep WM invasion& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
        &  & \\

\cellcolor{green!20}Tumor Location& {\color{conceptPurple}\large$\bullet$} & {\color{survivalGold}\large$\bullet$} &
\cellcolor{green!20}Eloquent Brain Involved& & &
 & & \\

\bottomrule
\end{tabular}
\end{table}

\subsection{Structured Synthetic Report Generation Using LLMs}

\label{sec: llm reportgen}

The BTReport framework generates the \textit{Findings} sections of radiology reports in a style that mimics a target institution through an in-context learning prompt to provide stylistic guidance and emphasize the relevant radiological facts which should be included in generated reports (Appendix \ref{appendix: LLM prompt}). Some facts in reference reports (e.g., diffusion characteristics, vascular features) cannot be derived from standard structural mpMRI modalities used for tumor segmentation (T1w, T1c, T2w, T2-FLAIR). Therefore, the LLM is explicitly instructed to exclude these findings in generated reports to prevent unsupported claims. 

Report generation is conditioned on the BTReport feature set and prompt instructions that restrict the LLM to narrative synthesis and stylistic alignment, enabling tighter control over report content and structure without task-specific fine-tuning. This prompting strategy improves interpretability by allowing reported findings to be directly traced to deterministic features.

For RRG with BTReport, we experimented with two open-source pre-trained LLMs with reasoning capability: gpt-oss:120b \cite{gptoss} and Llama 3.1 70B Instruct \cite{grattafiori2024llama}. These models were selected based on their strong reasoning and instruction-following capabilities. To avoid cloud sharing of medical data, only local offline models were used. Example \textit{Findings} sections from synthetically generated reports across the different BTReport variants and other RRG frameworks can be found in Appendix~\ref{sec: example outputs}.

\section{Evaluation}
\label{sec: eval metrics}
We evaluated BTReport on 129 paired GBM image-report cases from the HuskyBrain dataset. Synthetic reports generated by BTReport and other neuro-oncology RRG frameworks described in Section~\ref{sec: related work} were compared against ground-truth radiology reports using automated metrics (Section~\ref{sec: automated metrics}), and the reliability of BTReport-extracted features was assessed (Section~\ref{sec: intermediate feature evaluation}). To test robustness to segmentation quality, we compared BTReport outputs generated using segmentations from DeepMedic \cite{deepmedic}, a dual-pathway CNN-based model, with outputs generated using segmentations from the BraTS~2023-winning submission \cite{ferreira2024we}, which is based on an nnU-Net \cite{isensee2021nnunet} ensemble. Prior work \cite{pemberton2023multi} has shown nnU-Net–based models to outperform DeepMedic for brain tumor segmentation, motivating this comparison.


\subsection{Automated Evaluation of Generated Radiology Reports}
\label{sec: automated metrics}

Generated radiology reports were evaluated using RadEval \cite{radeval}, a unified open-source framework that evaluates radiology text based on:

\begin{itemize}[topsep=0pt, partopsep=0pt, parsep=0pt, itemsep=0pt]
    \item n-gram-based lexical similarity: BLEU~\cite{bleu}, ROUGE~\cite{rouge}
    \item pre-trained contextualized embeddings: BERTScore~\cite{bertscore}
    \item clinically grounded scores: RaTEScore~\cite{zhao2024ratescore}
\end{itemize}

\noindent To assess the clinical correctness of content in generated reports, we used TBFact \cite{blondeel2025healthcare}, an LLM-based factuality metric that evaluates reports based on factual inclusion, distortion, and omission. These metrics are presented in Tables \ref{tab:rrg_bleu_rouge} and \ref{tab:rrg_other}.

\subsection{Reliability of Extracted Features}
\label{sec: intermediate feature evaluation}
We evaluated the reliability of BTReport-extracted clinical features by comparing them with the same features extracted from ground-truth radiology reports in the HuskyBrain dataset. Using LangExtract \cite{goel2025langextract}, we decompose radiology reports into class-attribute pairs, where each class represents a clinically defined radiology concept, and its associated attribute(s) specify the characteristic linked to the concept. For example, the text [\enquote{arising in the right parietal lobe...}] is decomposed into the \enquote{side of tumor epicenter} class and the \enquote{right} attribute (see Appendix~\ref{appendix: intermediate} for a complete example). We present the accuracy of categorical features and the error of the numerical features in Table~\ref{tab: intermediate_features_two_column}.

\section{Results}

\subsection{Quality of Extracted Features}

\begin{table}[H]
\centering
\caption{
Attribute-level reliability of BTReport-extracted features relative to ground-truth HuskyBrain radiology reports. Categorical features report accuracy, and numeric features report mean absolute error (MAE) $\pm$ standard deviation. Best-performing variant is highlighted in \textcolor{blue}{blue}.
}
\label{tab: intermediate_features_two_column}

{\scriptsize
\resizebox{\columnwidth}{!}{
\begin{tabular}{l c c c | c c c}
\toprule
& \multicolumn{3}{c|}{\textbf{Accuracy}} &
\multicolumn{3}{c}{\textbf{MAE}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{Method} &
Cortical &
Side of &
Ventricular &
Midline shift &
Number of &
Tumor volume \\
&
involvement &
tumor epicenter &
effacement &
(mm) &
lesions &
(cm$^3$) \\
\midrule

BTReport (DM) &
0.79 &
\cellcolor{blue!20}\textbf{1.00} &
\cellcolor{blue!20}\textbf{0.56} &
1.26 {\tiny$\pm$ 2.60} &
1.48 {\tiny$\pm$ 2.63} &
\cellcolor{blue!20}\textbf{1.80} {\tiny$\pm$ 3.95} \\

\midrule

BTReport (FI) &
\cellcolor{blue!20}\textbf{0.80} &
\cellcolor{blue!20}\textbf{1.00} &
0.52 &
\cellcolor{blue!20}\textbf{1.21} {\tiny$\pm$ 2.55} &
\cellcolor{blue!20}\textbf{0.41} {\tiny$\pm$ 1.69} &
2.00 {\tiny$\pm$ 5.16} \\

\bottomrule
\multicolumn{7}{c}{\scriptsize
DM: DeepMedic \cite{deepmedic} segmentations \quad
FI: Faking It \cite{ferreira2024we} segmentations}
\end{tabular}
}
}
\end{table}

Table~\ref{tab: intermediate_features_two_column} reports the reliability of BTReport-extracted clinical features under two segmentation inputs. Across both variants, categorical features showed strong agreement with LangExtract-derived ground truth, including perfect accuracy for identifying the side of the tumor epicenter and high accuracy for cortical involvement and tumor location. Ventricular effacement was less reliable, indicating that this attribute may be more challenging to describe consistently. For numeric features, BTReport achieved low mean absolute error for lesion count and midline shift, with consistently lower errors when using the improved segmentation model (FI). Tumor volume errors were comparable across variants.

\subsection{Lexical Similarity of Generated Reports}
\label{sec: semantic sim}

\begin{table}[H]
\centering
\caption{
Mean $\pm$ standard deviation BLEU and ROUGE metrics. Best values per metric are highlighted in \textcolor{blue}{blue}. 
\noindent\textsuperscript{$\dagger$}Following Approximate Randomization, synthetic reports generated with all BTReport variants were superior to 
those generated using other frameworks across all evaluation metrics ($p < 0.0001$).
}
\label{tab:rrg_bleu_rouge}

{\scriptsize
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcccc}
\toprule

\textbf{Framework} &
\textbf{BLEU-1} & \textbf{BLEU-2} &
\textbf{ROUGE-1} & \textbf{ROUGE-2} \\
\midrule

BTReport (gpt-oss:120B, DM)$^{\dagger}$ &
0.236 {\tiny$\pm$ 0.068} &
0.124 {\tiny$\pm$ 0.044} &
0.360 {\tiny$\pm$ 0.078} &
0.109 {\tiny$\pm$ 0.038} \\
\midrule

BTReport (gpt-oss:120B, FI)$^{\dagger}$ &
0.244 {\tiny$\pm$ 0.070} &
0.132 {\tiny$\pm$ 0.044} &
\cellcolor{blue!20}\textbf{0.371} {\tiny$\pm$ 0.078} &
\cellcolor{blue!20}\textbf{0.115} {\tiny$\pm$ 0.040} \\
\midrule

BTReport (LLaMA3:70B, DM)$^{\dagger}$ &
\cellcolor{blue!20}\textbf{0.248} {\tiny$\pm$ 0.078} &
\cellcolor{blue!20}\textbf{0.136} {\tiny$\pm$ 0.052} &
0.362 {\tiny$\pm$ 0.080} &
0.115 {\tiny$\pm$ 0.043} \\
\midrule

AutoRG-Brain \cite{autorg} &
0.158 {\tiny$\pm$ 0.072} &
0.080 {\tiny$\pm$ 0.042} &
0.268 {\tiny$\pm$ 0.060} &
0.070 {\tiny$\pm$ 0.032} \\
\midrule

Seg-to-Exp \cite{seg2exp} &
0.085 {\tiny$\pm$ 0.039} &
0.035 {\tiny$\pm$ 0.018} &
0.163 {\tiny$\pm$ 0.055} &
0.023 {\tiny$\pm$ 0.013} \\

\bottomrule
\multicolumn{5}{c}{\scriptsize
DM: DeepMedic \cite{deepmedic} segmentations \quad
FI: Faking It \cite{ferreira2024we} segmentations}
\end{tabular}
}
}
\end{table}

Table~\ref{tab:rrg_bleu_rouge} summarizes the lexical similarity between generated and reference reports using mean BLEU and ROUGE metrics calculated over the 129-subject test dataset. Across all metrics, all BTReport variants substantially outperformed \textit{AutoRG-Brain} and \textit{Seg-to-Exp}, indicating closer n-gram overlap with clinical ground-truth reports. Among the BTReport variants, BTReport with LLaMA3:70B achieved the highest BLEU-1 and BLEU-2 scores, indicating improved short-range lexical precision. In contrast, BTReport with gpt-oss:120B attained the highest ROUGE-1 and ROUGE-2 scores, suggesting improved recall of clinically relevant phrases and longer contextual spans. While the relative strengths marginally differed across metrics for BTReport variants, all demonstrated statistically significant improvements in lexical alignment with reference reports in comparison to baseline methods.

\subsection{Factual Accuracy of Generated Reports}
\label{sec: factual accuracy}

Table~\ref{tab:rrg_other} reports factual consistency and semantic alignment of generated reports using TBFact, BERTScore, and RaTEScore. Across all metrics, all BTReport variants outperformed \textit{AutoRG-Brain} and \textit{Seg-to-Exp}, indicating improved factual grounding and clinically coherent report generation. BTReport with gpt-oss:120B and LLaMA3:70B achieved the highest overall performance, with the gpt-oss:120B model showing consistently higher TBFact, BERTScore, and RaTEScore values under the higher-quality segmentation input (FI) compared to DM. This suggests that reports generated with BTReport contain a greater proportion of verifiable clinical statements that are consistent with the underlying imaging findings. All BTReport variants achieved substantially higher scores for the BERTScore and RaTEScore metrics, indicating closer semantic alignment with reference reports and improved clinical relevance.
In contrast, \textit{AutoRG-Brain} exhibited moderate performance, while \textit{Seg-to-Exp} showed limited factual consistency, reflected by low TBFact scores and reduced semantic similarity. All improvements using the BTReport framework were statistically significant ($p < 0.0001$) when evaluated with Approximate Randomization (AR).

\begin{table}[H]
\centering
\caption{
Mean $\pm$ standard deviation TBFact, BERTScore, and RaTEScore metrics. Best values per metric are highlighted in \textcolor{blue}{blue}.
\noindent\textsuperscript{$\dagger$}
Following Approximate Randomization, synthetic reports generated with all BTReport variants were superior to 
those generated using other frameworks across all evaluation metrics ($p < 0.0001$).
}
\label{tab:rrg_other}

\resizebox{\columnwidth}{!}{
\begin{tabular}{lcccccc}
\toprule
& \multicolumn{4}{c}{\textbf{TBFact (DeepSeek-R1)}} &
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{BERTScore}}} &
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{RaTEScore}}} \\
\cmidrule(lr){2-5}

\textbf{Framework} &
\textbf{Score} & \textbf{Prec.} & \textbf{Recall} & \textbf{F1} &
& \\
\midrule

BTReport (gpt-oss:120B, DM)$^{\dagger}$ &
0.313 {\scriptsize$\pm$ 0.145} &
0.345 {\scriptsize$\pm$ 0.155} &
0.325 {\scriptsize$\pm$ 0.163} &
0.313 {\scriptsize$\pm$ 0.145} &
0.447 {\scriptsize$\pm$ 0.060} &
0.568 {\scriptsize$\pm$ 0.057} \\
\midrule

BTReport (gpt-oss:120B, FI)$^{\dagger}$ &
\cellcolor{blue!20}\textbf{0.353} {\scriptsize$\pm$ 0.151} &
\cellcolor{blue!20}\textbf{0.412} {\scriptsize$\pm$ 0.146} &
\cellcolor{blue!20}\textbf{0.349} {\scriptsize$\pm$ 0.162} &
\cellcolor{blue!20}\textbf{0.359} {\scriptsize$\pm$ 0.145} &
\cellcolor{blue!20}\textbf{0.453} {\scriptsize$\pm$ 0.055} &
\cellcolor{blue!20}\textbf{0.577} {\scriptsize$\pm$ 0.054} \\
\midrule

BTReport (LLaMA3:70B, DM)$^{\dagger}$ &
0.295 {\scriptsize$\pm$ 0.130} &
0.377 {\scriptsize$\pm$ 0.168} &
0.274 {\scriptsize$\pm$ 0.136} &
0.295 {\scriptsize$\pm$ 0.130} &
0.433 {\scriptsize$\pm$ 0.062} &
0.567 {\scriptsize$\pm$ 0.061} \\
\midrule

AutoRG-Brain \cite{autorg} &
0.072 {\scriptsize$\pm$ 0.123} &
0.282 {\scriptsize$\pm$ 0.155} &
0.186 {\scriptsize$\pm$ 0.137} &
0.196 {\scriptsize$\pm$ 0.130} &
0.327 {\scriptsize$\pm$ 0.047} &
0.477 {\scriptsize$\pm$ 0.053} \\
\midrule

Seg-to-Exp \cite{seg2exp} &
0.014 {\scriptsize$\pm$ 0.047} &
0.131 {\scriptsize$\pm$ 0.121} &
0.147 {\scriptsize$\pm$ 0.108} &
0.098 {\scriptsize$\pm$ 0.089} &
0.156 {\scriptsize$\pm$ 0.042} &
0.409 {\scriptsize$\pm$ 0.039} \\

\bottomrule
\multicolumn{7}{c}{\small
DM: DeepMedic \cite{deepmedic} segmentations \quad
FI: Faking It \cite{ferreira2024we} segmentations}
\end{tabular}
}
\end{table}

\section{Discussion}

We present BTReport, a framework for brain tumor RRG grounded in clinically relevant quantitative imaging features. Overall, our findings support the two-stage report generation paradigm for neuro-oncology RRG, suggesting that in medical imaging domains with limited data, adding quantitative features to prompts is an efficient way to generate reports and improve factual consistency. BTReport enables accurate measurement-grounded reporting without requiring task-specific fine-tuning of vision encoders or VLMs, and generates interpretable reports by using modular, reliably-extracted features. 

We presented novel pipelines for midline shift measurement and tumor-robust anatomical segmentation for deterministic feature extraction, and showed that these features had strong agreement with expert annotation. Additionally, other quantitative features were selected based on clinical guidelines such as VASARI and validated to be significant predictors of survival time using Kaplan--Meier (KM) analysis. By clustering concepts frequently reported in real radiology reports, we validated that the included features were clinically relevant. When compared with existing neuro-oncology RRG approaches, BTReport-generated reports are superior in terms of lexical similarity and factual accuracy. We believe this improvement is due to the clinically relevant features used for generation and the in-context learning prompt, which allows reasoning LLMs to predict the most relevant features for RRG.

To facilitate further research in neuro-oncology RRG, we provide BTReport-BraTS, an open-source companion dataset containing anatomical descriptors, metadata, and BTReport-generated reports for mpMRI cases in the BraTS'23 dataset. To assess the clinical applicability of BTReport, we have developed BTReview, a survey tool for radiologist assessment of generated neuro-oncology radiology reports (Appendix~\ref{appendix: btreview}). Future work will obtain radiologist feedback with BTReview, incorporate additional features such as white matter hyperintensities and basal cistern status, handle additional mpMRI imaging modalities, and include descriptions of ischemic or hemorrhagic stroke findings.

\clearpage

\midlacknowledgments{The authors are grateful to Caitlin Neher, Leonardo Schettini, and Ankush Jindal for their helpful discussions. The authors are grateful to Dr.~Kambiz Nael for his clinical perspectives on improving the evaluation platform. The authors are also grateful to James K. Ruffle for making the VASARI-Auto implementation available.}

\section{Ethics}
This study’s activities were approved by the Institutional Review Board at the University of Washington
(STUDY00022466). This research is in accordance with the principles embodied in the Declaration of Helsinki.

\section{Funding Statement}
The work of Juampablo Heras Rivera was partially supported by the U.S. Department of Energy Computational Science Graduate Fellowship under Award Number DE-SC0024386. 
\bibliography{midl26_115}

\newpage
\appendix \section{Cluster Analysis of Common Topics in Reference Reports}
\label{appendix: clusters}
\begin{table}[H]
\centering
\scriptsize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2}
\resizebox{\textwidth}{!}{%
\begin{tabular}{p{6cm} c p{13cm}}
\toprule
\textbf{Cluster Description} & \textbf{Frequency} & \textbf{Two Example Sentences in this cluster} \\
\midrule
Lateral ventricle effacement/asymmetry; possible\dots & 50 & \enquote{The lateral ventricles are symmetric.}, \enquote{The lateral ventricles are symmetric} \\
Midline shift presence and magnitude.\dots & 27 & \enquote{There is a 2 mm left midline shift}, \enquote{There is a susceptibility millimeters rightward midline shift.} \\
Cisterns appear normal, no obstruction. & 19 & \enquote{The basal cisterns are patent.}, \enquote{The basal cisterns are patent} \\
Peritumoral/Vasogenic edema present. *(This\dots & 18 & \enquote{There is mild-to-moderate associated edema}, \enquote{The first mass has associated surrounding vasogenic edema} \\
Lesion size and measurements reported. & 16 & \enquote{The lesion measures 4.0 x 3.5 cm}, \enquote{The lesion measures 3.9 x 2.0 x 2.1 cm.} \\
No acute intracranial hemorrhage or infarct. & 15 & \enquote{There is no acute infarct}, \enquote{No acute infarct is seen} \\
Multiple intracranial masses present bilaterally.\dots & 15 & \enquote{There is a large irregular enhancing mass centered in the right frontal lobe}, \enquote{There is a right frontal lobe mass} \\
Multiple, enhancing intracranial lesions present. & 14 & \enquote{The lesion originates in the anterior paramedian left frontal lobe}, \enquote{There is a lesion in the right frontal lobe} \\
White matter disease, likely nonspecific etiology. & 14 & \enquote{There are scattered deep and periventricular white matter T2/FLAIR hyperintensities}, \enquote{There is mild subcortical and periventricular white matter T2 FLAIR abnormality} \\
No mass effect/midline shift. (Alternatively: No\dots & 13 & \enquote{There is no shift in the brain}, \enquote{There is no shift of the brain structures} \\
Ventricular system: normal or prominent.\dots & 13 & \enquote{The ventricles, sulci, and cisterns are normal}, \enquote{The remaining ventricles, sulci, and cisterns are normal} \\
Restricted diffusion within the lesion(s). & 9 & \enquote{The solid components of the lesion demonstrate moderate diffusion restriction}, \enquote{The lesion has peripheral areas of mild diffusion restriction} \\
Frontal/Temporal lobe FLAIR signal abnormality & 9 & \enquote{There is extensive T2/FLAIR signal abnormality in the right frontotemporal lobes}, \enquote{There is surrounding FLAIR signal hyperintensity inferiorly going into the temporal lobe and posteriorly.} \\
Herniation syndromes present on imaging. (or\dots & 8 & \enquote{There is suggestion of transtentorial herniation}, \enquote{There is leftward subfalcine herniation} \\
Mass dimensions and size measurements. & 8 & \enquote{The mass measures 2.4 x 3.4 cm in axial cross-section}, \enquote{The mass measures approximately 6.6 x 4.7 cm in transverse dimensions and 4.6 cm craniocaudally} \\
Uncal medialization, potentially impacting\dots & 8 & \enquote{There is right uncal medialization}, \enquote{There is medialization of the right uncus} \\
No acute intracranial hemorrhage present.\dots & 7 & \enquote{No parenchymal hemorrhage is present}, \enquote{There is no associated hemorrhage} \\
Mass effect on lateral ventricles. & 7 & \enquote{The mass extends along the ependymal surface of the right lateral ventricle}, \enquote{The mass extends into the posterior horn of the right lateral ventricle} \\
Edema predominantly affecting frontal \& temporal\dots & 6 & \enquote{There is perilesional edema along the predominantly anterior aspect of the medial frontal lobes}, \enquote{The vasogenic edema extends to the frontal lobe} \\
Mass size and measurements. (Alternatively: Lesion\dots & 6 & \enquote{The mass measures 40 x 53 mm}, \enquote{The mass measures approximately 58 x 44 x 44 mm} \\
Corpus callosum lesion, midline crossing/spread. & 6 & \enquote{The lesion extends into the splenium of the corpus callosum, crossing midline to the right}, \enquote{There is ependymal spread along the body of the corpus callosum} \\
Diffusion restriction presence/absence \&\dots & 6 & \enquote{There is no associated restricted diffusion}, \enquote{There are areas of internal diffusion restriction and susceptibility} \\
Frontal horn effacement \& ventricular asymmetry. & 6 & \enquote{There is partial effacement of the right frontal horn}, \enquote{There are areas of subtle ependymal enhancement in the bilateral frontal horns} \\
Hemorrhagic lesion with restricted diffusion. & 5 & \enquote{The lesion restricts diffusion and has intralesional hemorrhage}, \enquote{The mass is T2 hyperintense, contains multiple foci of internal hemorrhage, and demonstrates mottled diffusion restriction consistent with hypercellularity and/or necrosis.} \\
Cistern effacement suggests mass effect. (Or, more\dots & 5 & \enquote{There is effacement of the right crural cistern}, \enquote{There is partial effacement of the basal cisterns} \\
Sulcal effacement, widespread cortical\dots & 5 & \enquote{There is sulcal effacement involving the right parietal, posterior temporal, and occipital lobes}, \enquote{There is mild sulcal effacement of the left occipital lobe.} \\
Mass shows restricted diffusion on imaging. (Or,\dots & 5 & \enquote{There are patchy foci of restricted diffusion within the mass}, \enquote{The second mass has diffusion restriction} \\
Corpus callosum mass/involvement. (or simply:\dots & 5 & \enquote{The mass extends into the right-sided genu of the corpus callosum}, \enquote{The mass extends along the splenium of the corpus callosum} \\
Basal ganglia involvement with signal abnormality. & 5 & \enquote{The signal abnormality extends into the right basal ganglia, right thalamus, right cerebral peduncle, and right midbrain}, \enquote{The hyperintensity involves the bilateral basal ganglia, with greater involvement on the left} \\
Ventricular size and morphology assessment. & 4 & \enquote{The third ventricle is near slitlike}, \enquote{There is complete effacement of the third ventricle} \\
Midline shift at foramen of Monro & 4 & \enquote{There is a negative millimeters leftward midline shift at the level of the foramen of Monroe}, \enquote{There is some millimeters of rightward midline shift at the level of the foramen of Monro} \\
No acute hydrocephalus present. & 4 & \enquote{There is no evidence of acute hydrocephalus}, \enquote{There is no hydrocephalus} \\
Corpus callosum FLAIR edema/hyperintensity\dots & 4 & \enquote{The surrounding T2/FLAIR signal is similar and extends to the left splenium, septum pellucidum, and superior corpus callosum}, \enquote{There is subtle patchy T2 FLAIR hyperintensity along the right body of the corpus callosum} \\
Peripheral mass enhancement characteristics. (Or\dots & 4 & \enquote{The first mass has peripheral enhancement with a nodular solid enhancing component}, \enquote{The mass has irregular somewhat nodular peripheral enhancement} \\
Right cerebral peduncle lesion/mass effect. & 4 & \enquote{There is mass effect on the right cerebral peduncle}, \enquote{The lesion has questionable extension into the posterior right cerebral peduncle} \\
\bottomrule
\end{tabular}
}
\caption{Example sentences associated with the top 35 radiological concept clusters ranked by their prevalence in reference radiology reports.}
\label{tab:cluster_examples}
\end{table}

% \newpage
\section{Kaplan-Meier Survival Analysis of Extracted Features}
\label{sec: KM plots}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.97\linewidth]{figures/kmplot.png}
    \caption{Kaplan--Meier (KM) plots for demographic and radiogenomic features, and deterministic features extracted with the BTReport framework. Survival probabilities at specific time points are obtained by projecting vertically from the time of interest to the curve and horizontally to the y-axis. Example: for the Age feature, at 500 days post-diagnosis, the estimated survival probability for the older age group (High) is $\sim$20\%, compared to $\sim$50\% for the younger age group (Low). Kaplan--Meier analyses indicate that many of these features are predictive of overall survival, highlighting their clinical relevance and motivating their use as structured inputs for radiology report generation.}
    
    \label{fig:kmplots}
\end{figure}
\newpage
\section{LLM Prompt for Radiology Report Generation}
\label{appendix: LLM prompt}

\small
\begin{FullReportBox}
% ---- TOP BLACK HEADER ----
\begin{tcolorbox}[
    colback=headerblack,
    colframe=headerblack,
    boxrule=0pt,
    width=\linewidth,
    arc=10pt,
    outer arc=10pt,
    sharp corners,
    rounded corners=north,
    left=10pt, right=10pt,
    top=6pt, bottom=6pt,
    boxsep=0pt,
    enlarge left by=-0.8pt,   % <-- pushes header to the left frame
    enlarge right by=-0.8pt   % <-- pushes header to the right frame
]
    {\bfseries\normalsize\color{white} BTReport Prompt}
\end{tcolorbox}

% ---- BODY SECTION ----
\begin{tcolorbox}[
    colback=white,
    colframe=white,
    boxrule=0pt,
    arc=0pt,
    outer arc=0pt,
    left=10pt, right=10pt,
    top=6pt, bottom=6pt
]
\tinysmall

You are a radiologist generating a synthetic clinical MRI report.

Below are example FINDINGS sections taken from real brain tumor reports:

\vspace{6pt}

\textbf{EXAMPLE FINDINGS:}\\
\{\texttt{example\_findings}\}

\medskip
\hrule
\medskip

Your job is to generate a FINDINGS section in the same clinical style, but only using the METADATA provided below.

Please abide strictly to the following rules (follow them exactly).

\begin{enumerate}
\item Use only the metadata provided for quantitative statements. Do NOT hallucinate any information that is not directly inferable.

\item Include 10--20 clinically meaningful findings summarized in an anatomically descriptive manner. Prioritize describing abnormal or clinically significant observations.

\item Preserve the subsection structure from the example reports. Make sure to include the following subsections: MASS EFFECT \& VENTRICLES and BRAIN / ENHANCEMENT.

\item Never mention imaging sequences other than T1n, T2w, T2 FLAIR, or T1-Gd. Do not mention diffusion, perfusion, spectroscopy, MRA, or other modalities (unless stated explicitly in metadata).

\item Do not mention structures or measurements not present in the metadata.

\item Mandatory Considerations: Make sure to include the following findings below if present in metadata. Remember to follow the sentence structure in the example reports.

    \begin{enumerate}[label=\alph*)]
        \item Maximum midline shift represented in mm units.
            \begin{enumerate}[label=\roman*)]
                \item Make sure to describe the magnitude and the direction of the midline shift.
                \item Describe the anatomical level at which the (e.g., foramen of Monro, third and fourth ventricles, septum pellucidum).
                \item If shift is minimal (e.g., $< 5$ mm), explicitly state the measurement as no shift, but still provide the measurement.
            \end{enumerate}

        \item If tumor mass effect is present, describe the mass effect on ventricles or surrounding brain structures.
            \begin{enumerate}[label=\roman*)]
                \item Include a description of ventricular effacement (if present), including which horn (anterior/posterior horn), and on which hemisphere it is observed.
            \end{enumerate}

        \item Comment on ventricular status. If effacement is present, describe the extent of the effacement or asymmetry. If ventricles are normal, explicitly state so (mirroring example reports). Use the following metadata fields in your description: ``Asymmetrical Ventricles'', ``Enlarged Ventricles''.

        \item Describe the size of the primary lesion, as well as any smaller secondary lesions (if present) represented in cm units. Use the 3D measurements from the metadata. Make sure to include the following:
            \begin{enumerate}[label=\roman*)]
                \item If multiple lesions exist, summarize number, dominant lesion, and laterality. Use the following metadata fields in your description: ``Number of lesions'' and ``Multifocal or multicentric.''
                \item Anatomical location of lesion(s).
                \item Use the following metadata fields in your description: ``Tumor Location'', ``Side of Tumor Epicenter'', and ``Region Proportions.'' 
            \end{enumerate}

        \item Describe enhancing characteristics. Use the following metadata fields in your description: ``Enhancement quality'', ``Thickness of enhancing margin'', ``Proportion Enhancing''.
            \begin{enumerate}[label=\roman*)]
                \item Describe enhancement style (e.g., rim-enhancing, mildly enhancing, peripheral enhancing, multi-lobular enhancing) only if explicitly supported.
                \item Describe edematous tissue. Use the following metadata fields in your description: ``ED volume'', Whether edema crosses midline, ``Proportion of edema''.
                \item Describe vasogenic edema and its extent only if metadata supports it.
            \end{enumerate}

        \item Describe invasion and involvement. Use the following metadata fields in your description: ``Cortical involvement'', ``Deep WM invasion'', ``Ependymal invasion'', ``Eloquent Brain Involvement''.

        \item Describe the necrosis if present. Use the metadata field ``Proportion Necrosis'' to describe the central foci of necrosis.
    \end{enumerate}
\end{enumerate}

\medskip

\textbf{METADATA (for subject \{\texttt{subject\_id}\}):}\\
\{\texttt{metadata\_json}\}

\medskip
\hrule
\medskip

Write the \textbf{FINDINGS} section now, using clinical radiology language.

\end{tcolorbox}

\end{FullReportBox}

\newpage
\section{Short version of LLM Prompt for BTReport}

\begin{center}
\small

\label{tab:btreport_prompt_short}

\begin{FullReportBox}

% ---- TOP BLACK HEADER ----
\begin{tcolorbox}[
    colback=headerblack,
    colframe=headerblack,
    boxrule=0pt,
    width=\linewidth,
    arc=10pt,
    outer arc=10pt,
    sharp corners,
    rounded corners=north,
    left=10pt, right=10pt,
    top=6pt, bottom=6pt,
    boxsep=0pt,
    enlarge left by=-0.8pt,
    enlarge right by=-0.8pt
]
{\bfseries\normalsize\color{white} BTReport Prompt (Short)}
\end{tcolorbox}

% ---- BODY ----
\begin{tcolorbox}[
    colback=white,
    colframe=white,
    boxrule=0pt,
    arc=0pt,
    left=10pt, right=10pt,
    top=6pt, bottom=6pt,
    breakable
]
\tinysmall

You are a radiologist generating a synthetic clinical MRI report.

Below are example FINDINGS sections taken from real brain tumor reports:

\medskip
\textbf{EXAMPLE FINDINGS:}\\
\{\texttt{example\_findings}\}

\medskip
\hrule
\medskip

Now generate a similar FINDINGS section, but \textbf{only} using the metadata provided below.

\begin{itemize}
    \item Do not hallucinate any information that is not directly inferable.
    \item Preserve all subsections present in the example findings reports.
    \item Select the top 10 metadata-supported findings; real reports typically include 7--10 facts.
    \item Prioritize abnormal or clinically significant findings.
    \item Only T1n, T2w, T2 FLAIR, and T1-Gd sequences were obtained; do not comment on diffusion or other modalities.
    \item Comment on midline shift in the style of the example reports, including direction and magnitude.
    \item Describe mass effect and ventricular effacement if present, including laterality and anterior/posterior horn involvement.
    \item Report lesion dimensions in three orthogonal axes (cm) if supported by metadata.
    \item Do not mention structures, measurements, or features unless supported by the metadata.
\end{itemize}

\medskip

\textbf{METADATA (for subject \{\texttt{subject\_id}\}):}\\
\{\texttt{metadata\_json}\}

\medskip
\hrule
\medskip

Write the \textbf{FINDINGS} section now, using clinical radiology language.

\end{tcolorbox}
\end{FullReportBox}

\end{center}


\section{Radiology report feature extraction for validating BTReport features}
\label{appendix: intermediate}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.66\linewidth]{figures/clinical_extract.png}
    \caption{Visualization of features extracted from a free-text radiologist-authored report using LangExtract.}
    \label{fig:clinical_extract}
\end{figure}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.66\linewidth]{figures/predicted_extract.png}
    \caption{Visualization of features extracted from a free-text BTReport-generated report using LangExtract.}
    \label{fig:predicted_extract}
\end{figure}


\newpage
\section{Example Generated Reports}
\label{sec: example outputs}

\begin{FullReportBoxMP}

% ======================================================
% TOP BLACK HEADER (CONNECTED TO ROUNDED OUTER FRAME)
% ======================================================

\begin{tcolorbox}[
    colback=headerblack,
    colframe=headerblack,
    boxrule=0pt,
    arc=10pt,
    outer arc=10pt,
    sharp corners,
    rounded corners=north,
    left=10pt, right=10pt,
    top=6pt, bottom=6pt
]
{\bfseries\normalsize\color{white}
Model comparison of RRG output: Findings Section}
\end{tcolorbox}
\begin{tcolorbox}[
    colback=white,
    colframe=white,
    boxrule=0pt,
    left=10pt, right=10pt,
    top=1pt, bottom=1pt
]
\small
\noindent
\textbf{COLOR KEY:} \correct{Green} indicates correct statements;
\almost{yellow} partially correct;
\incorrect{red} incorrect;
\lightgray{gray} ground-truth statements referenced by generated reports;
\textbf{**} findings derived from unavailable MRI modalities.
\end{tcolorbox}
% ======================================================
% GREY BANNER — GROUND TRUTH
% ======================================================

\begin{tcolorbox}[
    colback=dividergray,
    colframe=dividergray,
    boxrule=0pt,
    arc=0pt,
    outer arc=0pt,
    left=0pt, right=0pt,
    top=3pt, bottom=3pt
]
\hspace*{10pt}{\bfseries\normalsize\color{white} Ground Truth Findings}
\end{tcolorbox}

% ======================================================
% GROUND TRUTH CONTENT
% ======================================================

\begin{tcolorbox}[
    colback=white,
    colframe=white,
    boxrule=0pt,
    left=10pt, right=10pt,
    top=1pt, bottom=6pt
]
\footnotesize
\textbf{MASS EFFECT \& VENTRICLES:}
Prominent \lightgray{leftward midline shift} by \lightgray{approximately 14 mm} (XXX/XXX). There is also \lightgray{medialization of right uncus.} The \lightgray{basal cisterns are partially effaced}. 

\vspace{6pt}

\textbf{BRAIN/ENHANCEMENT:}
 A \lightgray{large irregular enhancing lesion} centered within the \lightgray{right temporal lobe with significant mass effect.} Restricted diffusion\textbf{**} is noted within the enhancing portion of the lesion, which contains susceptibility artifact suggestive of microhemorrhages/angioinvasion.\textbf{**} The lesion measures approximately \lightgray{6.8 x 4.4 x 4.8 cm (AP, TV, CC).} There is probable small \lightgray{subependymal enhancement at the atria of right lateral ventricle} (XXX/XXX, XXX/XXX). No acute hematoma or infarct is seen.
\end{tcolorbox}




% ======================================================
% GREY BANNER — GENERATED
% ======================================================

\begin{tcolorbox}[
    colback=dividergray,
    colframe=dividergray,
    boxrule=0pt,
    arc=0pt,
    outer arc=0pt,
    left=0pt, right=0pt,
    top=3pt, bottom=3pt
]
\hspace*{10pt}{\bfseries\normalsize\color{white} Generated Findings}
\end{tcolorbox}

% ======================================================
% LONGTABLE — MULTIPAGE
% ======================================================

\footnotesize
\setlength{\tabcolsep}{4pt}

% \begin{longtable}{p{3.5cm} p{\dimexpr\linewidth-3.5cm-2\tabcolsep\relax}}
\begin{longtable}{%
    p{3.3cm}%
    p{\dimexpr\linewidth-3.3cm-2\tabcolsep-20pt\relax}%
}
\textbf{Framework} & \textbf{Prediction} \\
\midrule
\endfirsthead

\textbf{Framework} & \textbf{Prediction} \\
\midrule
\endhead

% ======================================================
% ROWS
% ======================================================

% \textbf{Structured template} &
% \metrics{0.42}{0.13}{0.77}{0.89}{0.65}

% Right parietal-cortex region tumor, mildly enhancing, solitary, no ependymal
% invasion, no deep white matter invasion, enhancing region does not cross midline,
% tumor-side ventricular asymmetry, ventricles not enlarged. 
% \\[10pt]

% \addlinespace[12pt]

\splitmodel{BTReport}{(gpt-oss:120B)}  &
\metrics{0.37}{0.24}{0.56}{0.47}{0.33}

\textbf{MASS EFFECT \& VENTRICLES:} 
Approximately \almost{12 mm} of \correct{right-to-left midline shift} is present, measured at the level of the falx cerebri above. The ventricles are asymmetrical, with compression of the right lateral ventricle secondary to \correct{tumor-related ependymal invasion}; there is no overall ventricular enlargement. No tonsillar herniation is identified.  

\vspace{2pt}

\textbf{BRAIN/ENHANCEMENT:}  A solitary \correct{right-sided lesion} involving the \correct{temporal}, cortical, and parietal lobes measures \almost{7.1 x 5.6 x 5.3 cm (AP x TV x CC)}. The mass shows marked \correct{heterogeneous enhancement} with a thick ($>3$\,mm) enhancing margin; roughly 28\% of the tumor volume enhances. A necrotic core comprises about 22\% of the lesion volume. Deep white-matter invasion and cortical involvement are evident, and there is \correct{direct ependymal invasion of the adjacent lateral ventricle.} The enhancing component does not cross the midline, but extensive vasogenic edema (84 mL, around 50\% of total lesion volume) crosses the midline and surrounds the lesion, extending into the contralateral hemisphere. Multiple small enhancing satellite nodules are present along the cortical surface. The edema involves visual-association cortex, indicating eloquent brain (vision) involvement.\\[10pt]

\addlinespace[12pt]

\splitmodel{BTReport}{(LLaMa3:70B)}  &
\metrics{0.42}{0.29}{0.57}{0.47}{0.34}

\textbf{MASS EFFECT \& VENTRICLES:} 
 There is a maximum of \almost{12 mm} \correct{leftward midline shift} at the level of the falx cerebri above. The ventricles are asymmetrical, with tumor-side ventricular asymmetry noted. There is no enlargement of the ventricles.

\vspace{6pt}

\textbf{BRAIN/ENHANCEMENT:}  Within the \correct{right temporal}, cortex, and parietal lobes, there is a solitary, predominantly enhancing mass measuring \almost{7.1 x 5.6 x 5.3 cm.} The lesion exhibits marked enhancement with a thick ($>3$\,mm) enhancing margin. There are multiple small satellite lesions present. The enhancing region does not cross the midline. The lesion invades the cortex and deep white matter, with \correct{ependymal invasion also present.} There is a large volume of surrounding vasogenic edema (83.6 mL), which crosses the midline. The proportion of edema is 49.74\%. A central focus of necrosis is present, comprising approximately 21.99\% of the lesion. \\[10pt]

\addlinespace[12pt]

\textbf{AutoRG-Brain} &
\metrics{0.32}{0.25}{0.53}{0.41}{0.25}

After contrast administration, the lesion shows \correct{significant enhancement} on T1-weighted images with unclear boundaries. The adjacent brain parenchyma shows significant edema in the brain pools and ventricles. After contrast administration, the \correct{lesion in the right temporal lobe} demonstrates significant ring-like enhancement. The lesion measures approximately \incorrect{82*111*82mm}, with indistinct borders and a large amount of edema in the adjacent brain tissue, causing \correct{compression of the right lateral ventricle} and \correct{deviation of midline structures to the left.}  \\[10pt]

\addlinespace[12pt]

\textbf{Seg-to-Exp} &
\metrics{0.21}{0.16}{0.41}{0.26}{0.0}

The brain tumor segmentation results indicate that the tumor is primarily located in the \correct{right temporal} and parietal regions, with a significant presence in the Temporal-to-Parietal (GapMap) region (40.03\% of the tumor). The tumor also affects several surrounding regions, including Area STS2 (STS), Area STS1 (STS), Frontal-to-Temporal-II (GapMap), and Area TE 2.2 (STG).

Semantic segmentation of the tumor reveals three distinct categories: \textbf{Tumor Core} (red), \textbf{Peritumoral Edema} (yellow), and \textbf{GD-Enhancing Tumor} (green). The Tumor Core represents the most aggressive part of the tumor, while Peritumoral Edema indicates the area surrounding the tumor that is affected by edema. The GD-Enhancing Tumor region shows enhancement on Gadolinium (GD) MRI sequences, suggesting a highly vascularized and active part of the tumor.\\[10pt]

\addlinespace[12pt]


\end{longtable}

% ======================================================
% CLOSE OUTER CONTAINER
% ======================================================

\end{FullReportBoxMP}




\section{BTReport-BraTS: A Companion Dataset for BraTS RRG}
\label{sec: btreportbrats}

Pre-operative mpMRI cases from the combined training and validation splits of the BraTS 2023 Adult Glioma (BraTS'23) dataset (n=1,470 cases) were used to develop the BTReport-BraTS dataset, an open-source companion dataset generated using the BTReport framework. For each case, corresponding midline segmentations, extracted metadata, structured summary reports, and radiology reports generated using BTReport are openly available on the project GitHub page.


\section{BTReview: A Tool for Radiologist Assessment of Generated Brain Tumor Radiology Reports}
\label{appendix: btreview}

\begin{figure}[H]
  \centering
  \begin{minipage}[t]{0.41\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/AppendixH1.png}
  \end{minipage}
  \begin{minipage}[t]{0.49\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/AppendixH2.png}
  \end{minipage}
  \caption{A clinical assessment tool to evaluate the clinical quality of synthetic radiology reports generated using various RRG frameworks.}
  \label{fig:two_side_by_side}
\end{figure}



Despite rapid advances in radiology report generation (RRG), there remains a lack of accessible, standardized tools that enable radiologists to efficiently review, compare, and score synthetically generated reports across different RRG frameworks. To address this gap, we developed BTReview, an open-source, web-based platform that supports de-identified case review and double-blinded clinical assessment of generated reports. This platform is customizable with any set of glioblastoma mpMRI cases and is designed to record structured, high-quality radiologist evaluations assessing the overall clinical quality of radiology reports generated with any RRG framework. The generation of these radiologist assessments creates a practical pathway for producing clinically grounded data and error annotations at scale, which is essential for generating high-quality data for VLM training. Furthermore, this validation approach critically evaluates the translatability of existing RRG frameworks to ensure alignment with real-world clinical scenarios. We denote key features of the BTReview platform with numerical labels and describe these features below. Feedback from a board-certified neuroradiologist was incorporated into BTReview during the design stage to ensure clinical relevance, appropriate terminology, and alignment with real-world neuroradiology reporting practices.


\noindent 1) \textbf{Interactive Multi-View:} Leveraging the capabilities of NiiVue~\cite{Rorden2024Niivue}, the first panel is an interactive multi-view of T1n, T1c, T2w, and T2f image sequences that can run on any device (phone, tablet, computer), enabling radiologists to review de-identified mpMRI cases in various clinical environments.

\noindent 2) \textbf{View Selector:}  Users can select and modify their multi-view from the drop down menu, enabling them to view image sequences for a given case in the axial, coronal, and sagittal planes, as well as in a 3D render view. An additional option allows all planes to be viewed simultaneously for a given case. Changes in view selection will propagate across all image sequences. 

\noindent 3) \textbf{Clinical Toolbar:} Within the clinical toolbar, radiologists can select blue buttons to overlay tumor masks, BTReport-generated midline, and the brain's ideal midline on image sequences in the interactive multi-view. Users can toggle the button again to remove the overlays. Additionally, a measurement tool enables clinicians to obtain quantitative measurements, such as midline shift, by dragging their cursor between two points on any image sequence. These measurements will be automatically updated in the multi-view pane, allowing radiologists to revisit measurements across previously annotated cases. Users can also clear their measurements and perform re-measurements if desired.

\noindent 4) \textbf{Midline Shift Measurement:} To validate midline shift measurements generated with BTReport, radiologists can overlay the tumor mask and the brain's ideal midline and scroll to a slice that demonstrates strong midline deviation (e.g., max midline shift). Users can then use the measurement tool to directly obtain a quantitative distance measurement from the boundaries of different sub-regions of the tumor to boundaries demarcated by midline brain structures (e.g., septum pellucidum) for midline shift measurement. Different sub-regions (necrotic core, edema, and enhancing tumor sub-regions) of the tumor mask are color coded and derived from BraTS tumor segmentation. Note: the appearance of tumor segmentation masks may differ (e.g., masks from some institutions may not provide granular parcellation of tumor sub-regions). Radiologists can then overlay the BTReport-generated midline and compare the magnitude of midline deviation through direct measurement and compare their midline shift measurement to the measurement described in synthetically generated radiology reports (see below).

\noindent 5) \textbf{RRG Comparison:} To ensure unbiased evaluation of the clinical quality of synthetically generated radiology reports, the \textit{Findings} descriptions of the reports generated by the BTReport (gpt-oss:120B), BTReport (Llama 3:70B), AutoRG-Brain, and Seg-to-Exp RRG pipelines are displayed, where each column represents the synthetic output generated by a different pipeline. Importantly, synthetically generated reports are described as Report A, B, C, etc., ensuring reviewers are blinded to the identity of the pipeline used to generate the displayed reports. To mitigate potential bias during the review stage, the order in which the radiology reports are shown (from left to right) is randomized for each case, ensuring that reviewers do not form associations between the generated reports and the order in which the reports are displayed.

\noindent 6) \textbf{Clinical Assessment:} After radiologists have had the chance to review image sequences, they will proceed with the clinical assessment, where they will review radiology reports generated through different RRG frameworks. After their review, radiologists will then address a series of questions designed to critique the clinical quality of each generated report based on the following: 

\noindent a) Generally, the first three questions aim to assess the real-world limitations of synthetic reports generated using automated RRG, and whether radiologists would consider reports clinically useful.

\noindent The first question will ask reviewers whether they identified hallucinations across the generated reports. If the radiologist selects \enquote{minor} or \enquote{major}, a conditional question will appear, prompting radiologists to select the types of hallucinations they observed. The next question asks whether key radiological features are missing in this report. Similar to the previous question, if the reviewer selects \enquote{some} or \enquote{many}, a follow up question will appear, prompting them to select from a list of missing elements that would be impactful for improving report quality. The final question asks reviewers to consider how they would personally use each generated report in clinical practice. Here, they will have the opportunity to select from the following responses from the drop down menu: \enquote{As a first draft}, \enquote{As a cross check / second reader}, \enquote{As a summary aid}, and \enquote{Would not use.}


\noindent Summarized below are the questions, their response options, and the conditional follow-up options.

\begin{table}[H]
\centering
\scriptsize

\renewcommand{\arraystretch}{1.15}
\setlength{\tabcolsep}{5pt}

\begin{tabular}{|
>{\raggedright\arraybackslash}p{0.18\linewidth}|
>{\raggedright\arraybackslash}p{0.32\linewidth}|
>{\raggedright\arraybackslash}p{0.40\linewidth}|}
\hline
\textbf{Question} & 
\textbf{Response Options (single-select)} & 
\textbf{Conditional Follow-Up (select all that apply)} \\
\hline

\textbf{Hallucinations} &
None; Minor; Major &
\textit{Shown if Minor/Major selected:} \\[2pt]
& & \textbf{Type of hallucination(s) observed?} \\[2pt]
& & 
\begin{itemize}[leftmargin=*, nosep]
  \item Incorrect anatomical location of tumor
  \item Incorrect tumor characteristics (e.g., size, laterality, enhancement)
  \item Incorrect clinical implication
  \item Fabricated finding
  \item Other (free text: \textit{Other hallucination details})
\end{itemize}
\\
\hline

\textbf{Missing Features} &
No; Some; Many &
\textit{Shown if Some/Many selected:} \\[2pt]
& & \textbf{Most impactful missing element(s)?} \\[2pt]
& & 
\begin{itemize}[leftmargin=*, nosep]
  \item Tumor size/extent
  \item Enhancement characteristics
  \item Edema/mass effect
  \item Midline shift
  \item Multifocality
  \item Invasion/eloquent cortex
  \item Other (\textit{free text})
\end{itemize}
\\
\hline

\textbf{Intended use} &
As a first draft; As a cross-check/second reader; As a summary aid only; Would not use &
--- \\
\hline

\end{tabular}

\caption{Clinical evaluation questions and response options, including conditional follow-up items triggered by reviewer selection.}
\label{tab:clinical_eval_questions}
\end{table}




\noindent b) Next, radiologists will complete a series of Likert-scale questions asking them to evaluate each generated report based on the following criteria: (a) decision support, (b) clinical accuracy, (c) omission coverage, and (d) clinical structure.

\noindent \textbf{Decision Support:} Reviewers will be asked to evaluate whether the \textit{Findings} narrative would positively or negatively affect downstream clinical decision making processes (e.g., treatment planning, surgical consult, oncology evaluation).

\noindent \textbf{Clinical Accuracy:} Reviewers will evaluate the factual accuracy of generated reports.

\noindent \textbf{Omission Coverage:} Reviewers will evaluate whether clinically relevant findings important for overall radiological interpretation of tumor effects on the surrounding brain environment are included in generated reports, or whether critical clinical findings are lacking or missing.

\noindent \textbf{Clinical Structure:} The last question asks reviewers to assess the extent to which the generated \textit{Findings} narrative differs from or aligns with the clinical organization and conventions typically used in reports. \\

\noindent Summarized below are the response options for each Likert-scale question.


\begin{table}[H]
\centering
\scriptsize
\renewcommand{\arraystretch}{1.15}
\setlength{\tabcolsep}{4pt}

\begin{tabular}{|
>{\raggedright\arraybackslash}m{0.28\textwidth}|
>{\centering\arraybackslash}m{0.14\textwidth}|
>{\centering\arraybackslash}m{0.115\textwidth}|
>{\centering\arraybackslash}m{0.115\textwidth}|
>{\centering\arraybackslash}m{0.115\textwidth}|
>{\centering\arraybackslash}m{0.115\textwidth}|
}
\hline
\textbf{Question} &
\textbf{Criteria} &
\textbf{1} &
\textbf{2} &
\textbf{3} &
\textbf{4} \\
\hline

Would this report support or hinder your clinical decision making (e.g., treatment planning, surgical consultation, oncology consultation)? 
& \textbf{Decision Support}
& Strongly hinders
& Slightly hinders
& Slightly supports
& Strongly supports \\
\hline

Assess the clinical accuracy of this report.
& \textbf{Clinical Accuracy}
& Clinically inaccurate
& Partially inaccurate
& Mostly accurate
& Fully accurate \\
\hline

To what extent does this report omit clinically relevant findings?
& \textbf{Omission Coverage}
& Major clinically relevant omissions
& Several clinically relevant omissions
& Minor clinically relevant omissions
& No clinically relevant omissions \\
\hline

To what extent does this report differ or align with the clinical structure of a standard neuroradiology report?
& \textbf{Clinical Structure}
& Substantially differs from typical clinical reporting
& Somewhat differs from typical clinical reporting
& Somewhat consistent with typical clinical reporting
& Highly consistent with typical clinical reporting \\
\hline

\end{tabular}

\caption{Four-point Likert-scale response options used for radiology report clinical assessment.}
\label{tab:likert_scale}
\end{table}





% 
\noindent 7) \textbf{Report Ranking:} As a final part of the clinical assessment, radiologists will be asked to rank generated reports based on clinical usefulness. In this question, they will be asked to order reports by \enquote{Most useful}, \enquote{Second most useful}, \enquote{Third most useful}, and \enquote{Least useful}.
% 
\noindent At the end of their clinical assessment, radiologists will be given the opportunity to provide optional comments and notes for the case they reviewed before saving their annotation and proceeding to the next case for review. The platform is designed with system memory, meaning that reviewers can revisit previously reviewed cases and make changes to their clinical assessments and annotations. All recorded assessments will be saved as a .json file when the browser closes.

\end{document}