\clearpage
\appendix

% CXReasonBench (p14) 
% Airway: Carina Angle, Trachea Deviation

% Cardiac: Cardiomegaly

% metadata -> absolute measurement, otherwise -> CXR struct expert criteria

\section{Pathology-aware Skill Set (Examples)} \label{appx:skillset-pathology}

\begin{itemize}%[leftmargin=*, nosep]
    \item \textbf{Lines, Tubes, and Devices.} \\
    \textit{Goal:} Identify device type, localize position, and verify termination points (e.g., ET tube depth). \\
    \textit{Workflow:} (1) \textbf{Detection:} Use BiomedParser with prompts (e.g., ``Endotracheal tube'', ``CVC'') to generate device masks. (2) \textbf{Anatomical Localization:} Overlay device masks with anatomical masks (Trachea, Carina, SVC) from CXAS. (3) \textbf{Verification:} Use Python scripts to measure distances (e.g., ET tip to Carina) or check containment (e.g., NG tube tip inside Stomach mask). Malposition is flagged based on geometric thresholds.

    \item \textbf{Pneumothorax.} \\
    \textit{Goal:} Confirm presence, laterality, and estimate size. \\
    \textit{Workflow:} (1) \textbf{Screening:} Use DenseNet-121 to filter negative cases based on probability thresholds. (2) \textbf{Localization:} Deploy MAIRA-2 with prompts like ``Pneumothorax'' or ``Pleural line'' to generate bounding boxes. (3) \textbf{Side Determination:} Map bounding box centroids to Left/Right Lung masks from CXAS. (4) \textbf{Characterization:} Query MedGemma (VQA) with cropped regions to estimate size (small/moderate/large) and check for tension physiology.

    \item \textbf{Pleural Effusion.} \\
    \textit{Goal:} Detect presence, laterality, and quantify volume. \\
    \textit{Workflow:} (1) \textbf{Segmentation:} Use CXAS to segment Lung fields and Costophrenic Angles. (2) \textbf{Detection:} Use BiomedParser to segment ``Fluid'' or ``Effusion'' regions. (3) \textbf{Geometric Measurement:} Calculate the vertical height of the fluid mask relative to the total lung height. (4) \textbf{Quantification:} Apply rule-based thresholds (e.g., $<1/4$ lung height = Small; $>1/2$ = Large) and check for costophrenic angle blunting.

    \item \textbf{Lung Opacity (Pneumonia, Mass, Atelectasis, etc.).} \\
    \textit{Goal:} Differentiate pathology type (texture) and precise anatomical location. \\
    \textit{Workflow:} (1) \textbf{Classification:} Use DenseNet-121 to obtain probability distributions for various opacity types. (2) \textbf{Localization:} Use MAIRA-2 to generate bounding boxes for high-probability findings. (3) \textbf{Anatomy Mapping:} Overlay bounding boxes with CXAS lung lobe masks to assign location (e.g., RUL, LLL). (4) \textbf{Texture Analysis:} Use MedGemma (VQA) to describe pixel patterns (e.g., ``patchy/fluffy'' for pneumonia vs.\ ``linear/plate-like'' for atelectasis) to confirm diagnosis.

    \item \textbf{Cardiac Silhouette.} \\
    \textit{Goal:} Assess heart size and calculate Cardiothoracic Ratio (CTR). \\
    \textit{Workflow:} (1) \textbf{Segmentation:} Use CXAS to segment the Heart and Thoracic Cavity/Lungs. (2) \textbf{Calculation:} Execute Python scripts to measure the maximum horizontal cardiac width and thoracic width. (3) \textbf{Thresholding:} Calculate CTR (Cardiac Width / Thoracic Width); if $>0.5$, classify as Cardiomegaly/Enlarged.

    \item \textbf{Mediastinum and Hilar Regions.} \\
    \textit{Goal:} Evaluate mediastinal width/contours and hilar enlargement. \\
    \textit{Workflow:} (1) \textbf{Segmentation:} Use CXAS to segment the Mediastinum and Aortic Knob. (2) \textbf{Measurement:} Measure superior mediastinal width via Python script (threshold $>8$~cm for widening). (3) \textbf{Hilar Assessment:} Use MAIRA-2 to detect ``Hilar enlargement'' or ``Mass''; if detected, use VQA to characterize potential lymphadenopathy or vascular prominence.

    \item \textbf{Skeletal Abnormalities.} \\
    \textit{Goal:} Detect and localize fractures or bone lesions. \\
    \textit{Workflow:} (1) \textbf{Detection:} Use BiomedParser to parse ``Fracture'' or ``Bone lesion'' into masks. (2) \textbf{Localization:} Intersect finding masks with CXAS skeletal masks (Ribs, Clavicles, Spine) to identify specific bones (e.g., ``Right 6th Rib''). (3) \textbf{Confirmation:} Use VQA on the ROI to verify cortical disruption and rule out false positives from vascular overlap.
\end{itemize}

\section{Anatomy-aware Skill Set (Examples)} \label{appx:skillset-anatomy}
\begin{itemize}%[leftmargin=*, nosep]
    \item \textbf{Cardiomegaly (Cardiothoracic Ratio, CTR).}
    Given heart and bilateral lung masks, we first identify the axial row where the heart mask attains its maximal width. On this row, we extract the leftmost and rightmost pixels of the heart mask to obtain the cardiac width, and the leftmost and rightmost pixels of the union of the left- and right-lung masks to obtain the thoracic width. The cardiothoracic ratio is then
    \[
        \mathrm{CTR} = \frac{\text{cardiac width}}{\text{thoracic width}}.
    \]
    Comparing CTR to a view-specific clinical threshold (e.g., $\mathrm{CTR} > 0.5$ on PA view) yields a binary cardiomegaly label.

    \item \textbf{Carina Angle.}
    From the trachea bifurcation mask we compute a concave hull and locate the lowest point where the trachea splits, defining the carina point. At $10\%$, $20\%$, and $30\%$ of the mask height below the carina, we find the leftmost and rightmost side points and average them to estimate the centerlines of the left and right main bronchi. The carina angle is defined as the interior angle between the two vectors from the carina point to the left and right bronchus centerline points, respectively. This angle is compared against a normal reference range to determine whether the carina angle is abnormal.

    \item \textbf{Tracheal Deviation.}
    Using trachea, trachea bifurcation, and vertebrae (T1--T7) masks, we first suppress the carina region within the trachea mask, retaining the upper trachea. From the vertebral masks, we extract posterior spinous-process points that lie within the vertical extent of the trachea mask. For each such vertebral level, we compare the horizontal position of the trachea to the corresponding spinous-process point and assign a local label (left, right, centered). The final tracheal deviation label is obtained via majority voting across levels: predominant left labels indicate left deviation, predominant right labels indicate right deviation, and balanced or centered labels indicate no deviation.

    \item \textbf{Mediastinal Widening.}
    From upper mediastinum and bilateral lung masks, we find the row where the mediastinum mask is widest. On this row, we extract the leftmost and rightmost mediastinum pixels to obtain the mediastinum width and the leftmost and rightmost pixels of the combined lung masks to obtain the thoracic width at the same level. The mediastinum-to-thoracic width ratio
    \[
        \text{Ratio} = \frac{\text{mediastinum width}}{\text{thoracic width}}
    \]
    is compared against an expert-defined threshold to determine the presence of mediastinal widening.

    \item \textbf{Aortic Knob Enlargement.}
    Using aortic arch, descending aorta, trachea, and trachea bifurcation masks, we first localize the aortic knob with the aortic arch mask. The trachea mask, restricted to the vertical range of the aortic arch and excluding the bifurcation, provides the starting point of the knob measurement:
    \emph{Point A} is the average $x$-coordinate of the left tracheal wall in this range. From the upper $30\%$ of the descending aorta mask, we extract the innermost $x$-coordinate (\emph{Point B}) and the outermost $x$-coordinate on the same row (\emph{Point C}). \emph{Point D} is the outermost $x$-coordinate of the aortic arch mask.
    The aortic knob width is defined as the horizontal distance from Point~A to the farther of Point~C and Point~D. The tracheal width is computed as the median horizontal width of the trachea mask (excluding the bifurcation). The diagnostic index is
    \[
        \text{Ratio} = \frac{\text{aortic knob width}}{\text{tracheal width}},
    \]
    and aortic knob enlargement is declared when this ratio exceeds an expert-defined threshold.

    \item \textbf{Ascending Aorta Enlargement.}
    With ascending aorta, heart, trachea, and trachea bifurcation masks, we first identify the most right-sided point of the heart mask (\emph{Point A}) and the most right-sided point of the trachea mask excluding the bifurcation (\emph{Point B}), which approximates the inner boundary of the right lung. We construct a reference line connecting Points~A and~B and then isolate the portion of the ascending aorta mask that extends beyond this line toward the right. Let $A_{\text{beyond}}$ be the area of the ascending aorta beyond the reference line and $A_{\text{total}}$ the total ascending aorta area. The diagnostic index is
    \[
        \text{Ratio} = \frac{A_{\text{beyond}}}{A_{\text{total}}},
    \]
    and if this ratio exceeds an expert-defined threshold, the ascending aorta is labeled enlarged.

    \item \textbf{Descending Aorta Enlargement.}
    From the descending aorta, trachea, and trachea bifurcation masks (with the heart mask used to define the thoracic segment), we retain only the descending aorta portion lying above the inferior border of the heart, corresponding to the thoracic aorta. Within this region, we identify the row where the aortic width is maximal and extract the leftmost and rightmost $x$-coordinates at this row, yielding the maximum thoracic descending aorta width. From the trachea mask, we compute the median tracheal width across rows and record the corresponding left/right $x$-coordinates at the median row. The diagnostic index is
    \[
        \text{Ratio} = \frac{\text{descending aorta width}}{\text{tracheal width}},
    \]
    and descending aorta enlargement is indicated when this ratio exceeds an expert-defined threshold.

    % \item \textbf{CXR Anatomy-Aware Workflow for Agentic Systems.}
    % We implement a unified anatomy-aware workflow that decomposes each thoracic radiographic sign into reusable, tool-callable skills operating on segmentation masks and simple geometric primitives. The workflow follows a common four-stage pattern:
    % \begin{enumerate}[leftmargin=*, nosep]
    %     \item \emph{Mask-conditioned anatomical landmark extraction.}
    %     Starting from a pre-processed CXR and organ masks (heart, lungs, trachea, tracheal bifurcation, vertebrae, mediastinum, aortic arch, ascending/descending aorta, etc.), the agent invokes geometry primitives to derive discrete anatomical landmarks such as extremal points (leftmost/rightmost, top/bottom), sampled points along centerlines, and special junctions (e.g., the carina point via concave hull analysis). Constraints across masks (e.g., using only the upper $30\%$ of a mask or rows overlapping another mask) enforce anatomical plausibility.
    %     \item \emph{Geometry-based diagnostic measurements.}
    %     Using these landmarks, the agent computes low-level geometric quantities: linear distances (e.g., cardiac width, mediastinal width, aortic knob width), angles (e.g., carina angle between bronchial vectors), relative positions (e.g., trachea vs.\ spinous processes), and areas or area fractions (e.g., ascending aorta beyond the heart--trachea line). These measurements are explicitly tied to named anatomical structures, enabling traceability and visual inspection.
    %     \item \emph{Construction of interpretable diagnostic indices.}
    %     Measurements are composed into clinically meaningful indices such as width ratios (cardiothoracic ratio, mediastinum-to-thoracic ratio, aortic knob-to-trachea and descending aorta-to-trachea ratios), area ratios (ascending aorta beyond boundary vs.\ total area), or categorical indices (e.g., tracheal deviation via majority voting over vertebral levels). These scalar indices form an intermediate, human-interpretable representation for downstream reasoning.
    %     \item \emph{Application of clinical rules and agentic decisions.}
    %     For each attribute, the agent applies standardized or expert-defined thresholds and normal ranges to the diagnostic index, producing structured labels (e.g., presence/absence of cardiomegaly, mediastinal widening, tracheal deviation, or aortic enlargement) together with the numerical values and landmarks used. Because the computation graph is explicit and modular, the same set of skills can be reused or extended to new findings, and the agent can provide explanations, perform quality control, and adapt thresholds to different populations or imaging protocols.
    % \end{enumerate}
\end{itemize}

\section{Vanilla-MedRAX-comparable experiments using GPT-4o}
\label{appx:radagents_gpt-4o}

\subsection{{Implementation Details of RadAgents}}
\label{appx:radagents_implementation}

RadAgents is implemented on top of the LangGraph agentic framework and is conceptually inspired by MedRAX~\citep{fallahpourmedrax}. The core engine of each agent can in principle be any LLM; in this work we focus on open-source, lightweight models, specifically the Qwen3-VL-Instruct series, which exhibits strong instruction-following and reasoning capabilities and supports the long-context input that is crucial for our agentic system. %deployed via Ollama
Because we explicitly encode clear, radiologist-style workflows into the agents, even the smaller models can reliably follow the prescribed procedures in most cases. To ensure transparency and enable debugging and analysis, we log the full execution trajectory and all intermediate outputs for each run.

For workflow extraction, we adopt hybrid retrieval over the textual workflow descriptions, combining BM25 with dense similarity based on the Snowflake-arctic-embed-m-v1.5 embeddings. Skills and tools are exposed as structured JSON APIs, and agents issue calls by constructing precise JSON objects that specify the target tool together with all required arguments (e.g., image paths and text prompts).

\subsection{Experimental Setup}
Here we use GPT-4o as the core LLM engine instead of Qwen3-VL, and, to make our system comparable to vanilla MedRAX, we adopt the same tool set. Considering the cost of API calls, we perform our evaluation on smaller-scale datasets that are less structurally complex than ChestAgentBench but exhibit increasing reasoning complexity: 181 MS-CXR \citep{boecking2022making} VQA cases for existence and attribute (E\&A) queries about abnormalities, 785 VQA cases for comparison and progression (C\&P) of abnormalities, and 181 MIMIC-CXR two-view frontal report-generation cases.

\subsection{Results}
\paragraph{Existence and attributes.}
% \label{subsec:vqa-eq}
The VQA questions cover seven common findings in CXR: atelectasis, cardiomegaly, consolidation, edema, lung opacity, pleural effusion, and pneumothorax, derived from the standard test split of MS-CXR. Each image receives the following prompt:
% (details are stated in the Appendix \ref{apd:prompting}):

\begin{infobox}\ttfamily\small
<image> Describe if [finding] is present; if present, describe [attributes].
\end{infobox}

\paragraph{Comparison and progression.}
We use MS-CXR-T to assess stability, improvement, or worsening of a specific positive finding (consolidation, edema, pleural effusion, or pneumothorax). We only retain cases where the metadata indicates a \textbf{consensus} among human reviewers. We pose a comparative question that explicitly references the prior study. The prompt template is:

\begin{infobox}\ttfamily\small
Given current image <image>, and previous image <image>, decide if [finding] is \\improving, stable, or worsening.
\end{infobox}

\paragraph{Report generation.}
We construct a MIMIC-CXR subset aligned with MS-CXR identities so that findings queried in VQA are represented in the corresponding reports. Prompts request generation of the \textit{Findings} section, and all agents are activated by default.
% The prompt combines the template from Section~\ref{subsec:vqa-eq} with a curated list of clinically significant findings, following prior work \citep{tu2024towards, peng2025scaling}; details appear in Appendix~\ref{apd:prompting}.

Table~\ref{tab:gpt-4o-radagents} summarizes performance across the three evaluation settings.
%: VQA (E\&A) on MS-CXR, VQA (C\&P) on MS-CXR-T, and report generation on MIMIC-CXR. 
RadAgents with V-RAG achieves the best results on all tasks. Relative to the ablated agent without V-RAG, it improves GREEN on VQA (E\&A) from 0.5841 to 0.6032 ({+}0.0191, $\sim$3.3\%), raises VQA (C\&P) accuracy from 48.0\% to 50.9\% ({+}2.9 points, $\sim$6.0\%), and boosts report-generation GREEN from 0.3821 to 0.4527 ({+}0.0706, $\sim$18.5\%). Augmenting GPT-4o with ReAct and our workflow also yields consistent gains over the base LLM; for example, GREEN on VQA (E\&A) increases from 0.2127 (GPT-4o) to 0.4619 (GPT-4o+ReAct) and 0.5351 (GPT-4o+Workflow), while VQA (C\&P) accuracy rises from 18.2\% to 41.9\% and 45.5\%, respectively. Nonetheless, even the strongest GPT-4o+Workflow baseline remains substantially behind RadAgents, trailing by 0.0681 GREEN on VQA (E\&A), 5.4 accuracy points on VQA (C\&P), and 0.0686 GREEN on report generation. Overall, the ranking of methods is largely consistent across tasks, suggesting that the benefits of structured tool use and V-RAG transfer from explanation-style VQA to longitudinal comparison questions and free-text report generation.


\begin{table}[ht]
    \centering
    % \small
    \caption{Performance comparison between different settings when using GPT-4o as the core LLM engine in RadAgents. GREEN scores are scaled by 100.}
    \label{tab:gpt-4o-radagents}
    \resizebox{1.0\linewidth}{!}{
    \begin{tabular}{lccc}
        \toprule
        \textbf{Method} 
        & \textbf{VQA (E\&A) (GREEN)} 
        & \textbf{VQA (C\&P) (Acc. \%)} 
        & \textbf{Report Generation (GREEN)} \\
        \midrule
        CheXagent           & 34.3 & 34.1 & 18.3 \\
        GPT-4o              & 21.3 & 18.2 & 31.4 \\
        GPT-4o w/ ReAct     & 46.2 & 41.9 & 33.3 \\
        GPT-4o w/ Workflow  & 53.5 & 45.5 & 38.4 \\
        RadAgents w/o V-RAG & 58.4 & 48.0 & 38.2 \\
        RadAgents           & \textbf{60.3} & \textbf{50.9} & \textbf{45.3} \\
        \bottomrule
    \end{tabular}
    }
\end{table}


\section{Details of V-RAG}\label{apd:vrag}

\paragraph{Multimodal retrieval.} We retrieve images and their textual descriptions that align with the features of target medical images, following \citet{chu2025reducing}. These references, rich in visual and textual medical details, guide response generation. To obtain embeddings, we use Rad-DINO, which provides robust representations across diverse CXR image types. For each image $X_{img}$, we extract its embedding $E_{img}\in\mathbf{R}^d$, with $d=768$, and store it in the embedding memory $\mathcal{M}$.

{\paragraph{Corpus.} The retrieval index consists exclusively of the MIMIC-CXR official training split. We strictly enforce patient-level exclusion (via Subject ID), ensuring test patients are completely disjoint from the retrieval corpus. For external benchmarks (e.g., ChestAgentBench), the data originates from distinct sources, ensuring zero overlap with the MIMIC-CXR index.}

\paragraph{Sensitivity of $k$.}  \label{apd:sens_k_vrag}
We study how retrieval quality changes with the number of retrieved studies $k$ on 100 cases sampled from the MS-CXR test set used in this study. For each setting, we compute two metrics: \textit{helpful-rate}, the percentage of retrieved studies that improve the answer, and \textit{harmful-rate}, the percentage that hurt the answer. As shown in Figure~\ref{fig:rag_ablation},
as $k$ increases, the harmful-rate grows proportionally faster than the helpful-rate: the helpful-rate increases from 0.48 at $k=1$ to 0.65 at $k=5$, while the harmful-rate more than doubles from 0.09 to 0.20.
We hypothesize this is due to longer contexts imposing a heavier reasoning burden and increasing hallucination, consistent with prior LLM findings.
This illustrates a trade-off: retrieving more studies provides greater chances of including helpful evidence but also increases the risk of introducing misleading content. To balance these effects, we choose $k=3$ by default, achieving a helpful-rate of 0.62 with a moderate harmful-rate of 0.14.

\paragraph{Augmented Inference.} In the inference stage, we encode the query image $X_q$ to obtain its embedding. We then retrieve the top-$k$ most similar images from $\mathcal{M}$, represented as $(I_1,\dots,I_k)$ with their corresponding reports $(R_1,\dots,R_k)$. These references are appended to the input of the multimodal LLM to guide generation. The prompt is structured as:

\begin{infobox}\ttfamily\small
This is the $i$-th similar image and its report for your reference. [Reference]$_i$ \\... According to the query image and the references, [Question] [Query Image].
\end{infobox}
where each reference is denoted as $(I_i, R_i)$.

For efficient retrieval during inference, we build $\mathcal{M}$ using FAISS\footnote{\url{https://github.com/facebookresearch/faiss}}, a GPU-accelerated vector search system. We employ approximate kNN with the Hierarchical Navigable Small World (HNSW) algorithm \citep{malkov2018efficient}, enabling retrieval of the top-$k$ most similar images in $\mathcal{M}$.


{\section{Additional Results and Analysis}}

\subsection{Entity-level Evaluation for Report Generation}

Evaluating the quality of generated radiology reports is non-trivial. Early works adopted general-domain natural language processing metrics such as ROUGE \citep{lin2004rouge} and BLEU \citep{papineni2002bleu}. While these metrics are widely used for text evaluation, they treat differences in wording the same as clinically significant errors, failing to reflect medical accuracy. To address this limitation, clinically informed evaluation metrics, such as CheXbert \citep{smit2020combining}, RadGraph \citep{jain1radgraph}, GREEN \citep{ostmeier2024green}, and RaTEScore \citep{zhao2024ratescore}, have been proposed to better assess clinical correctness and utility. % In our training, inspired by prior RL-based RRG studies, we primarily utilize RadGraph and CheXbert scores as clinical reward signals.
CheXbert is based on multi-label classification results for 5 or 13 diseases (along with one extra ``normal'' label).
RadGraph measures literal entity agreement, taking into account the positive or negative
context of each entity.  GREEN judges recall and precision errors by LLM
prompting.  RaTEScore is inspired by RadGraph but is less sensitive to phrasing,
using an F1-like computation that allows semantic matching between entities based on cosine similarity.
The metrics are computed using their official and standardized implementations: 
\textsc{RadGraph-F1}\footnote{\url{https://pypi.org/project/radgraph/0.1.2/}}, 
\textsc{CheXbert-F1}\footnote{\url{https://pypi.org/project/f1chexbert/}}, 
\textsc{RaTE Score}\footnote{\url{https://pypi.org/project/RaTEScore/0.5.0/}}, and 
\textsc{GREEN}\footnote{\url{https://pypi.org/project/green-score/0.0.8/}}.

\begin{table}[h]
    \centering
    \small
    \caption{Performance Comparison on Report Generation Metrics. The proposed RadAgents achieves the highest scores across all standard metrics.}
    \label{tab:performance_comparison}
    \renewcommand{\arraystretch}{1.2} % Increases row height for better readability
    \begin{tabular}{l c c c}
        \toprule
        \textbf{Settings / Metrics} & \textbf{CheXbert-macro-F1 (14)} & \textbf{RadGraph-F1} & \textbf{RaTE} \\
        \midrule
        CheXagent & 29.2 & 12.7 & 44.4 \\
        GPT-4o & 22.4 & 15.3 & 49.8 \\
        Qwen3-VL w/ ReAct & 44.9 & 17.1 & 53.6 \\
        Qwen3-VL w/ Workflow & 45.5 & 17.2 & 53.9 \\
        \midrule
        RadAgents w/o V-RAG & 49.1 & 18.1 & 55.8 \\
        %\rowcolor{gray!15} 
        \textbf{RadAgents (Ours)} & \textbf{53.2} & \textbf{19.4} & \textbf{58.5} \\
        \bottomrule
    \end{tabular}
\end{table}

\subsection{Computational Cost and Latency Analysis}

To assess the deployment feasibility of RadAgents, we conducted a detailed analysis of inference latency and computational cost.

\paragraph{Experimental Setup and Mechanism.}
A critical efficiency feature of RadAgents is its sparse activation design. As detailed in Section 2.3, the Orchestrator activates strictly those sub-agents required for a specific query (e.g., a query regarding ``bone fractures'' triggers only the relevant specialist, leaving the ``Lung Opacity'' agent idle). Consequently, the system rarely incurs the computational overhead of running all 7 agents simultaneously; idle agents consume zero active compute resources.

Performance was measured on an on-premise node equipped with $8 \times$ NVIDIA RTX A5000 (24GB) GPUs. To reflect a practical deployment environment, we utilized quantized models served via Ollama backends with LangGraph orchestration. We report two key metrics: (1) \textbf{Latency}: End-to-end wall-clock time per case (seconds), and (2) \textbf{Compute Cost}: Cumulative GPU usage per case (GPU-seconds).

\paragraph{Quantitative Results.}
We compared the proposed RadAgents (in both Sequential and Parallel execution modes) against single-agent baselines across three benchmarks. The results are summarized in Table~\ref{tab:latency_cost}.

\begin{table}[h]
    \centering
    \caption{Latency and Cost Analysis across benchmarks. Metrics are reported as \textbf{Wall-Clock Time (sec) / Compute Cost (GPU-sec)}. The Parallel configuration significantly reduces latency compared to sequential execution.}
    \label{tab:latency_cost}
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{l c c c}
        \toprule
        \textbf{Configuration} & \textbf{CheXbench} & \textbf{ChestAgentBench} & \textbf{Report Gen} \\
        & (sec / GPU-sec) & (sec / GPU-sec) & (sec / GPU-sec) \\
        \midrule
        Single-Agent ReAct & 16 / 30 & 25 / 50 & 50 / 110 \\
        Single-Agent Workflow & 18 / 36 & 29 / 64 & 58 / 140 \\
        \midrule
        RadAgents (Sequential) & 38 / 89 & 62 / 178 & 120 / 368 \\
        \textbf{RadAgents (Parallel)} & \textbf{26 / 83} & \textbf{41 / 164} & \textbf{85 / 340} \\
        \bottomrule
    \end{tabular}
\end{table}

\paragraph{Analysis.}

\textbf{Parallelism Efficiency:} Running sub-agents in parallel reduces wall-clock latency by approximately 30--35\% compared to the sequential variant (Table~\ref{tab:latency_cost}). For complex queries in ChestAgentBench, the system achieves an average latency of 41 seconds, rendering it responsive enough for asynchronous clinical workflows.

\textbf{Cost-Benefit Trade-off:} While the multi-agent architecture incurs roughly $2$--$3\times$ the compute cost of a simple Single-Agent baseline, this is a necessary trade-off to facilitate the complex reasoning that drives the observed 10\%+ performance gains. In the context of high-stakes medical diagnosis, where accuracy and traceability are paramount, this increased computational investment is justified.

\subsection{Systematic Error Analysis}

To evaluate the robustness of the Orchestrator/Sub-agents and the stability of the agentic workflows, we analyzed the frequency of workflow deviations across three benchmarks. Specifically, we tracked two key metrics:
\begin{enumerate}
    \item \textbf{Workflow-free (ReAct) Fallback Rate}: The percentage of queries where the Orchestrator could not match a pre-defined clinical workflow and defaulted to a generalist ReAct loop.
    \item \textbf{SkillMismatchError / Re-dispatch Rate}: The frequency with which an assigned sub-agent rejected a task (due to scope mismatch) or failed, necessitating a re-dispatch by the Orchestrator.
\end{enumerate}

\paragraph{Quantitative Results.}
The average error frequencies are summarized in Table~\ref{tab:error_freq}.

\begin{table}[h]
    \centering
    \caption{Systematic Error Frequency Analysis. \textbf{ChestAgentBench} exhibits higher fallback rates due to its open-ended query nature, whereas \textbf{MIMIC-CXR} report generation follows a highly structured routine, resulting in minimal deviations.}
    \label{tab:error_freq}
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{l c c}
        \toprule
        \textbf{Datasets} & \textbf{Workflow-free (ReAct)} & \textbf{SkillMismatchError /} \\
        & \textbf{Fallback Rate} & \textbf{Re-dispatch Rate} \\
        \midrule
        CheXbench & 10.4\% & 5.3\% \\
        ChestAgentBench & \textbf{18.1\%} & \textbf{9.3\%} \\
        MIMIC-CXR Report Gen & 2.1\% & 4.0\% \\
        \bottomrule
    \end{tabular}
\end{table}

\paragraph{Interpretation.}
The variance in error rates reflects the distinct nature of the evaluation datasets:
\begin{itemize}
    \item \textbf{ChestAgentBench} contains a higher proportion of open-ended, mixed-intent queries. This complexity forces the Orchestrator to utilize the fallback ReAct mechanism more frequently (18.1\%) and results in higher rates of internal re-dispatching (9.3\%) to resolve ambiguities.
    \item \textbf{MIMIC-CXR Report Generation} typically follows a fixed, routine clinical workflow (e.g., standard frontal/lateral review). Consequently, it exhibits the highest stability with a minimal workflow-free fallback rate (2.1\%).
    \item \textbf{CheXbench} falls between these two extremes, representing a balanced mix of structured classification tasks and moderately complex reasoning queries.
\end{itemize}

{\section{Demonstration}}

{We present an end-to-end execution trace for the query ``Is the trachea midline?''. (a) Dispatch: The Orchestrator routes the query to the Airway Agent based on intent analysis. (b) Tool Execution: The sub-agent invokes the segmentation tool; note the incorrect ``midline'' classification in the raw tool metadata despite successful segmentation. (c) Conflict Detection: The Synthesizer performs a cross-modal check using a VQA model, detecting a discrepancy between the tool's heuristic and the visual assessment. (d) Resolution: The Synthesizer triggers V-RAG to retrieve Top-3 similar cases. By synthesizing the retrieved evidence with its own reasoning, the system corrects the tool error to output the final accurate diagnosis of ``deviated to the right.''}

% \begin{center}
%     \textbf{\Large Figure R2. Complete Execution Pipeline showing Conflict Resolution}
% \end{center}
\newpage
\begin{tracebox}[Execution Trace: Query ``Is the trachea midline?'']
    \ttfamily\small

    % =========== STEP 1: ORCHESTRATOR ===========
    \textcolor{cmdblue}{\textbf{\faNetworkWired \ STEP 1: ORCHESTRATION \& DISPATCH}}\vspace{-0.3em}
    \par\noindent\hrulefill\vspace{0.1em}
    
    \textbf{[ORCHESTRATOR]} Analyzing Query Context... \\
    \textbf{Query:} ``Is the trachea midline?'' \ \ \textbf{Intent:} Anatomic Localization (Airway) \\
    \textbf{Strategy:} Activate [\textcolor{cmdblue}{Airway\_Agent}] \ \ \textbf{Dispatch:} \textgreater \ Executing Airway\_Agent...
    
    \vspace{0.15cm} % reduced spacing

    % =========== STEP 2: TOOL EXECUTION ===========
    \textcolor{cmdblue}{\textbf{\faTools \ STEP 2: SUB-AGENT REASONING \& TOOL EXECUTION}}\vspace{-0.3em}
    \par\noindent\hrulefill\vspace{0.1em}
    
    \begin{minipage}[t]{0.65\textwidth}
        \textbf{[AIRWAY\_AGENT]} Action: Calling tool `check\_tracheal\_position` \\
        \textbf{Params:} \{ ``organ'': ``Trachea'' \} \vspace{0.1em}
        
        \textbf{[TOOL\_OUTPUT]} Status: \textcolor{successgreen}{SUCCESS} \\
        Metrics: \{ ``area\_pixels'': 12490, ``centroid'': [157.8, 470.0], \\
        \ \ \textbf{``heuristic\_classification'': ``midline''} \textcolor{loggray}{\textit{<-- (Incorrect)}} \}
        \vspace{0.1em}
        
        \textbf{[AIRWAY\_AGENT]} Tool execution complete. \textgreater \ Handoff to Synthesizer.
    \end{minipage}%
    \hfill
    \begin{minipage}[t]{0.3\textwidth}
        \vspace{0pt} 
        \raggedleft
        % REPLACE WITH YOUR IMAGE FILENAME
        \includegraphics[width=0.95\linewidth]{fig/radagents_trace.png}\vspace{0.05em}
        \begin{center} \scriptsize \textit{Tool Segmentation Visualization} \end{center}
    \end{minipage}

    \vspace{0.15cm}

    % =========== STEP 3: CONFLICT DETECTION ===========
    \textcolor{alertorange}{\textbf{\faExclamationTriangle \ STEP 3: CONFLICT DETECTION (SYNTHESIZER)}}\vspace{-0.3em}
    \par\noindent\hrulefill\vspace{0.1em}
    
    \textbf{[SYNTHESIZER]} Phase 1: Verification \\
    \textgreater \ Tool Claim: ``Trachea is midline'' (based on centroid heuristic). \\
    \textgreater \ Action: Cross-check with VQA (Global View). \ \textgreater \ VQA: ``The trachea appears deviated to the right.''
    
    \vspace{0.1cm}
    {\setlength{\fboxsep}{3pt} 
    \fcolorbox{alertorange}{orange!5}{
        \begin{minipage}{0.95\textwidth}
            \textcolor{alertorange}{\textbf{! ALERT: Tool output contradicts visual assessment.}} \\
            \textcolor{alertorange}{\textbf{! Status: UNCERTAIN. Triggering V-RAG Resolution...}}
        \end{minipage}
    }}

    \vspace{0.15cm}

    % =========== STEP 4: RESOLUTION ===========
    \textcolor{errred}{\textbf{\faCheckCircle \ STEP 4: RESOLUTION (V-RAG + SYNTHESIS)}}\vspace{-0.3em}
    \par\noindent\hrulefill\vspace{0.1em}
    
    \textbf{[SYNTHESIZER]} Phase 2: Resolution via V-RAG \\
    \textgreater \ Action: Retrieving Top-3 clinically similar cases... \ \textbf{[RETRIEVAL]} Found 3 references (Sim $>$ 0.85): 
    \vspace{-0.3em} % tighten space before list
    \begin{itemize}
        \setlength\itemsep{-0.15em} % slightly tighten list spacing
        \item 1. Case\_782: ``Trachea deviated to right'' (Visual match: \textcolor{successgreen}{High})
        \item 2. Case\_104: ``Rightward deviation due to effusion'' (Visual match: \textcolor{successgreen}{High})
        \item 3. Case\_993: ``Midline trachea'' (Visual match: \textcolor{errred}{Low})
    \end{itemize}
    \vspace{-0.3em} % tighten space after list

    \textbf{[FINAL REASONING]} \\
    ``Although segmentation suggests midline, VQA and 2/3 retrieved cases confirm rightward deviation. The tool's centroid heuristic is likely misled by patient rotation.''

    \vspace{0.1cm}
    {\setlength{\fboxsep}{3pt}
    \fcolorbox{errred}{white}{
        \begin{minipage}{0.95\linewidth}
            \textbf{[FINAL ANSWER]} \textcolor{errred}{\textbf{``No, the trachea is not midline; it is deviated to the right.''}} \\
            \textbf{Confidence:} High (Supported by V-RAG Evidence)
        \end{minipage}
    }}

\end{tracebox}