\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{adjustbox}
\usepackage{array}
\usepackage[table]{xcolor} 

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- Under Review}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Under Review for MIDL 2026}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 176}
\editors{Accepted for publication at MIDL 2026}

\title[CardAIc-Agents for Cardiac Care Support]{CardAIc-Agents: A Multimodal Framework with Hierarchical Adaptation for Cardiac Care Support}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Yuting Zhang\nametag{$^{1}$}} \Email{ytz300@student.bham.ac.uk}\\
\addr $^{1}$ School of Computer Science, University of Birmingham, UK \\
\Name{Karina V. Bunting\nametag{$^{2}$}} \Email{k.v.bunting@bham.ac.uk}\\
\Name{Asgher Champsi\nametag{$^{2}$}} \Email{a.champsi@bham.ac.uk}\\
\addr $^{2}$ Department of Cardiovascular Sciences, University of Birmingham, UK \\
\Name{Xiaoxia Wang\nametag{$^{2,3}$}} \Email{xiaoxia.wang@uhb.nhs.uk}\\
\addr $^{3}$ NIHR Birmingham Biomedical Research Centre and West Midlands NHS Secure Data Environment, University Hospitals Birmingham NHS Foundation Trust, UK \\
\Name{Wenqi Lu\nametag{$^{4}$}} \Email{W.Lu@mmu.ac.uk}\\
\addr $^{4}$ Department of Computing and Mathematics, Manchester Metropolitan University, UK \\
\Name{Alexander Thorley\nametag{$^{1}$}} \Email{ajt973@student.bham.ac.uk}\\
\Name{Sandeep S Hothi\nametag{$^{5}$}} \Email{s.hothi@nhs.net}\\
\addr $^{5}$ Department of Cardiology, Heart and Lung Centre, Royal Wolverhampton NHS Trust, UK \\
\Name{Zhaowen Qiu\nametag{$^{6}$}} \Email{249600398@qq.com}\\
\addr $^{6}$ College of Computer and Control Engineering, Northeast Forestry University, China \\
\Name{Baturalp Buyukates\midlotherjointauthor\nametag{$^{1}$}} \Email{b.buyukates@bham.ac.uk}\\
\Name{Dipak Kotecha\midlotherjointauthor\nametag{$^{2,3,7}$}} \Email{d.kotecha@bham.ac.uk}\\
\addr $^{7}$ Julius Center, University Medical Center Utrecht, the Netherlands\\
\Name{Jinming Duan\midlotherjointauthor\nametag{$^{1,8}$}} \Email{j.duan@bham.ac.uk}\\
\addr $^{8}$ Division of Informatics, Imaging and Data Sciences, University of Manchester, UK
}
\begin{document}

\maketitle

\begin{abstract}
Cardiovascular diseases (CVDs) remain the foremost cause of mortality worldwide, a burden worsened by a severe deficit of healthcare workers. Artificial intelligence (AI) agents have shown potential to alleviate this gap through automated detection and proactive screening, yet their clinical application remains limited by: 1) rigid sequential workflows, whereas clinical care often requires adaptive reasoning that selects specific tests and, based on their results, guides personalised next steps; 2) reliance solely on intrinsic model capabilities to perform role assignment without domain-specific tool support; 3) general and static knowledge bases without continuous learning capability; and 4) fixed unimodal or bimodal inputs and lack of on-demand visual outputs when clinicians require visual clarification. In response, a multimodal framework, CardAIc-Agents, is proposed to augment models with external tools and adaptively support diverse cardiac tasks. First, a CardiacRAG agent generates task-aware plans from updatable cardiac knowledge, while the Chief agent integrates  tools to autonomously execute these plans and deliver decisions. Second, to enable adaptive and case-specific customization, a stepwise update strategy is developed to dynamically refine plans based on preceding execution results, once the task is assessed as complex. Third, a multidisciplinary discussion team is proposed which is automatically invoked to interpret challenging cases, thereby supporting further adaptation. In addition, visual review panels are provided to assist validation when clinicians raise concerns. Experiments across three datasets showed the efficiency of CardAIc-Agents compared to mainstream Vision–Language Models (VLMs) and state-of-the-art agentic systems. Code will be publicly available at \url{https://github.com/ytz300/CardAIc-Agents}.
\end{abstract}


\begin{keywords}
Multimodal framework, medical AI agents, workflow optimization, cardiac applications, foundation models, echocardiographic imaging
\end{keywords}

\section{Introduction}
Cardiovascular diseases (CVDs) are the leading cause of mortality worldwide, accounting for 17.9 million deaths each year \cite{almeida2024cardiovascular}. Notably, up to 80\% of these deaths occur in low- and middle-income countries, where specialised care is limited \cite{bulto2024burden}, which, combined with a global shortage of over 4 million healthcare workers \cite{vedanthan2011urgent}, drives the demand for scalable and accessible cardiovascular care solutions. Recent advancements in large language models (LLMs) have led to human-level performance on challenging tasks; for instance, Med-PaLM has outperformed clinicians on the United States Medical Licensing Examination \cite{singhal2025toward}. Despite these achievements, however, clinical practice, particularly for complex chronic conditions (e.g., heart failure (HF)), often relies on multimodal data for diagnosis, prognosis, and treatment \cite{weintraub2019role}. This gap underscores the need for multimodal strategies that extend beyond language-only models to more effectively support clinical practice.

While vision-language models (VLMs) such as LLaVA-Med \cite{li2023llavamed} and MedGemma \cite{sellergren2025medgemma} have fueled anticipation for medical multimodal artificial intelligence (AI), several challenges remain. For example, they are restricted to static images, whereas dynamic inputs such as echocardiograms are vital for cardiac function assessment. In addition, such generalist models retain static knowledge, which hinders their ability to assimilate evolving medical evidence. While Retrieval-Augmented Generation (RAG) mitigates this challenge to some extent, traditional retrieval methods still present notable limitations. For example, Term Frequency--Inverse Document Frequency (TF-IDF) relies on lexical matching but is limited in semantic comprehension, while Dense Passage Retrieval (DPR) encodes queries and documents into embeddings for similarity-based retrieval yet often lacks semantic relevance \cite{karpukhin2020dense, mallen2022not}.

Crucially, complex cardiovascular management often requires multi-step reasoning and a coordinated sequence of clinical actions, rather than a single-step response \cite{mcdonagh20212021}. Although prompt engineering techniques such as Chain-of-Thought (CoT) \cite{wei2022chain} partially mitigate this limitation by decomposing problems into substeps, model performance remains constrained by their intrinsic capabilities. The recent introduction of function calling and the Model Context Protocol (MCP) \cite{hou2025model} provides a complementary pathway, enabling models to integrate external tools automatically and access standardized functions. These advances drive the development of AI agents capable of reasoning, planning, memory utilization, and action execution \cite{chang2024agentboard}. However, most existing VLM-based agents in medicine still rely on assigning roles to models with static and generic knowledge, and often lack cross-turn memory, limiting their suitability for real-world cases that require multidisciplinary deliberation.  

Another limitation of these existing VLM-based agents lies in their rigid and sequential workflows \cite{kim2024mdagents}. Although recent advances have enabled ReAct-based frameworks \cite{yao2023react} to perform stepwise reasoning with intermediate outcomes and external tools, these frameworks still lack global planning capability. In contrast, MedAgent-Pro \cite{wang2025medagent} offers disease-level planning, but its plans are generated before receiving patient-specific input, which may be effective for some routine tasks yet risks misalignment with individual clinical contexts. For example, echocardiogram view identification typically follows a fixed workflow (e.g., commercial software, manual view selection), whereas complex HF diagnosis involves diverse patient presentations that require tailored test orders (e.g., ECG, echocardiography) and subsequent personalised management based on results. These observations highlight the need for flexible frameworks capable of both task-level and case-level adaptation across diverse clinical contexts. In addition, current VLM agents lack intermediate visual outputs, such as  the left ventricular contour delineations, which are critical for clinical verification in complex or uncertain cases.


Motivated by the above, in this study, an adaptive framework, CardAIc-Agents (comprising CardiacRAG and a Chief agent), is introduced to augment models with external tools, enabling autonomous execution of cardiac tasks (e.g., diagnosis, echocardiogram view extraction, segmentation, detection of P, QRS, and T waves) across diverse modalities (e.g., textual, signal, image, and video). Specifically, the CardiacRAG agent is developed to formulate general plans based on the latest domain knowledge and the proposed hybrid retrieval technique, whereas the Chief agent enhances its own capabilities through the integration and orchestration of external tools for plan execution and definitive decision-making. To support adaptive planning across tasks and patient-specific cases, the system initially assesses task complexity, executes the plan, and dynamically refines it as new evidence emerges. For more challenging cases, a multidisciplinary discussion team (MDT), augmented with external tools and cross-turn memory, is proposed to support further interpretation. Finally, when clinicians raise concerns, visual review panels are provided for validation. In summary, the key contributions can be articulated as follows:

\begin{itemize}    
    \item A domain-specific framework, CardAIc-Agents, is developed to enhance the capabilities of large models through specialized tool integration, enabling autonomous execution of diverse cardiac tasks on multimodal data.
    \item Adaptive strategies are proposed to stratify task complexity, refine plans iteratively as new evidence emerges, initiate team discussions, and provide visual validation, enabling hierarchical adaptation tailored to specific tasks and individual patients.
    \item A CardiacRAG agent is introduced to derive plans based on an updated cardiac knowledge base, while employing a hybrid retrieval mechanism to optimize semantic comprehension and relevance. 
    \item A multidisciplinary discussion team is designed to integrate external tools that extend the static and general capabilities of foundation models and to incorporate cross-turn memory that preserves context across reasoning steps.
\end{itemize}  

\section{Method}
CardAIc-Agents consists of two components: the CardiacRAG agent and the Chief agent (Figure~\ref{fig:workflow}). The former, based on a dedicated cardiac knowledge base, generates and updates plans as new evidence emerges. The Chief agent serves as the primary decision-maker, responsible for complexity assessment, task assignment, plan execution, and tool invocation. Together, these components support \textit{hierarchical adaptation} in CardAIc-Agents, consistent with clinical workflows. Specifically, \textit{hierarchical adaptation} denotes 1) task-complexity stratification (Basic vs.\ Advanced), 2) iterative plan refinement as new evidence is incorporated, 3) automatic on-demand initiation of the multidisciplinary discussion team (MDT), and 4) optional visual outputs for further validation.

%CardAIc-Agents consist of two components: the CardiacRAG and the Chief agent (Figure \ref{fig:workflow}). 
As shown in Figure~\ref{fig:workflow}, upon receiving the query with associated multimodal data such as ECGs (a), the Chief agent performs a complexity assessment (b) and assigns the task to the CardiacRAG agent (c), which retrieves domain-specific evidence from a curated cardiac knowledge base and constructs a general plan for the case (d). For low-complexity cases, the Chief agent executes this plan by invoking the required analytical tools (e), integrates their results (f), and generates the final clinical response (i). For high-complexity cases, the plan is adaptively revised (g, h) as new evidence is incorporated, enabling iterative refinement of subsequent reasoning and tool selection before the final decision is produced. For more challenging cases, the Chief agent may autonomously initiate the MDT to provide enhanced interpretation. When clinicians raise concerns, visual review panels can be requested as an optional aid for human validation.


\begin{figure}[t!]
%\floatconts
\centering
\includegraphics[width=0.95\linewidth]{test.pdf}
\caption{Overview of the CardAIc-Agents framework. The workflow proceeds from (a) to (i). For tasks identified as basic, steps (g) and (h) are skipped; for advanced tasks, the full pipeline is executed.}
\label{fig:workflow}
\vspace{-5mm}
\end{figure}


\subsection{CardiacRAG Agent}
The CardiacRAG agent is developed as current LLMs and VLMs encode static knowledge and their general-purpose design may lack cardiovascular domain specificity. This agent emulates the clinician reasoning process through information retrieval from authoritative medical sources and it is structured into three key stages (see Figure \ref{fig:kb}).

\smallskip
\noindent\textit{Knowledge base construction.} 
To reduce the complexity of information retrieval and improve accuracy, the knowledge base construction process focuses exclusively on cardiac content. This domain-specific approach selectively aggregates data \(\{D_i\}_{i=1}^M=\{ D_1, D_2, \ldots, D_M \}\) from authoritative medical sources, including major US academic medical centers (e.g., Mayo Clinic \citeyear{mfmmer2025}), UK National Health Service (\citeyear{nhs2025}), health information platforms (e.g., MedlinePlus \citeyear{miller2000medlineplus}), and recently published official guidelines (see Appendix \ref{app:kb} for details).

Then, the raw documents \(\{D_i\}_{i=1}^M\) are preprocessed through a transformation function \(T\) that extracts and normalizes textual content, producing clean text:
\begin{equation}
S_i = T(D_i), \quad i = 1, 2, \ldots, M,
\end{equation}
where \(T\) refers to BeautifulSoup  \cite{abodayeh2023web} for HTML files and Docling  \cite{livathinos2025docling} for PDFs, and \(M\) refers to the total number of collected documents. 

Finally, cleaned texts $S_i$ are split into chunks \( s_i^j \) to preserve contextual continuity:
\begin{equation}
s_i^j = \text{chunk}_j(S_i; d_s, d_o), \quad j = 1, 2, \ldots, L_i,
\end{equation}
where \(L_i\) is the chunk count of document \(i\); \(j\) is the chunk index; and $d_s$ and $d_o$ denote the chunk and overlap size, respectively.

\smallskip
\noindent\textit{Hybrid retrieval.} 
To reduce the irrelevant results returned by vector similarity retrieval alone, a hybrid retrieval method is applied: vector similarity retrieval is combined with a TF-IDF variant that further filters the results using domain-specific keywords to ensure clinical specificity (see Figure~\ref{fig:kb}).

{\scriptsize{$\bullet$}} Vector similarity retrieval. To preserve semantic relevance, both the chunks \( s_i^j \) and the query \(Q\) are embedded via Bio\_ClinicalBERT \cite{alsentzer-etal-2019-publicly} as \(v_i^j=\phi(s_i^j)\) and \( q=\phi(Q) \), where \(\phi\) denotes the encoder. The set of document vectors $\mathcal{V}=\bigcup_{i=1}^{M} \{ v_i^j \}_{j=1}^{L_i}$ is ranked by cosine similarity:
\begin{equation}
\mathrm{sim}(q, v_i^j) = \frac{q \cdot v_i^j}{\|q\| \, \|v_i^j\|},
\end{equation}
and the top \(3n\) vectors (\(\mathcal{V}_{(1)},\ldots,\mathcal{V}_{(3n)}\)) are returned, where \( n \) is the final number of results. Document vectors are indexed and stored in the FAISS vector database for efficient retrieval and reused without recalculation in subsequent queries.

\begin{figure}[t!]
%\floatconts
\centering
\includegraphics[width=0.75\linewidth]{kb.pdf}
\caption{Illustration of the CardiacRAG agent. $D_i$ denotes the \(i\)-th source, \(S_i\) is the cleaned text, \(s_i^j\) is the \(j\)-th chunk from source \(i\), \(v_i^j\) is its corresponding vector, \(T\) represents the transformation method, \(K\) denotes keyword-based filtering, \(n\) is the number of chunks retrieved, \(c\) is the final retrieved content, and \textit{Cite} indicates optional return of original chunks for transparency and reference.}
\label{fig:kb}
\vspace{-5mm}
\end{figure}

{\scriptsize{$\bullet$}} Keyword-based filtering. To improve clinical relevance, the top \(3n\) chunks are
further filtered based on domain-specific weights:
\begin{equation}
MW(k) = 
\begin{cases} 
\omega_{\text{medical}}[k], & k \in \mathcal{B}_{\text{medical}} \\
1, & \text{otherwise}
\end{cases},
\end{equation}
where \(k\) is the keyword from the query \(Q\), \(\mathcal{B}_{\text{medical}}\) is the clinical vocabulary, and \(\omega_{\text{medical}}\) is the term importance defined based on the clinical context. To exploit structural cues, a position-based bonus \cite{hofstatter2021mitigating} is introduced: 

\begin{equation}
PB(k,s_i^j) = 
\begin{cases} 
1.2, & \text{if } \text{pos}(k,s_i^j) < 0.3 \times d_s \\
1, & \text{otherwise}
\end{cases},
\end{equation}
where \(\text{pos}(k,s_i^j)\) is the index of the first occurrence of \(k\) in chunk \(s_i^j\). The final retrieval score is then:
\begin{equation}
\text{Score}_{(s_i^j,Q)} = \frac{1}{|Q|} \sum_{k \in Q} TF(k,s_i^j) \cdot MW(k) \cdot PB(k,s_i^j),
\end{equation}
where chunks with scores above a threshold \(\theta\) are retained, and the term frequency is defined as:
\begin{equation}
TF(k,s_i^j) = \frac{\text{count}(k,s_i^j)}{|\text{words}(s_i^j)|}.
\end{equation}

\smallskip
\noindent\textit{Guideline generation.} 
Based on the retrieved chunks \( C=\{c_1, c_2, \dots, c_n\} \) and the query $Q$, the general plan is generated by the CardiacRAG agent, which employs DeepSeek-R1-Distill-Qwen-32B \cite{deepseekai2025deepseekr1incentivizingreasoningcapability} as its core model. This general plan could be continuously updated as new results become available during subsequent steps, thereby reflecting clinical practices. 



{\scriptsize{$\bullet$}} General plan. Given the query \(Q\), the model does not reconstruct the knowledge base; instead, it retrieves relevant evidence from the prebuilt cardiac knowledge base (FAISS vector database described earlier) using the hybrid retrieval mechanism. The retrieved chunks \(C\), together with the query, provide the contextual input from which the model generates an initial stepwise plan (see Appendix~\ref{app:prompt} for the prompt):
\begin{equation}
P = \texttt{DeepSeek-R1}(\texttt{PlanPrompt}(Q, C)),
\end{equation}
where the plan \(P=(p_1,\ldots,p_s) \) contains \( s \) steps, which may vary by case. 


{\scriptsize{$\bullet$}} Stepwise update. At each step, the CardiacRAG agent evaluates the execution state and determines whether the next action should be revised based on intermediate evidence. Formally, at step $i$, the model receives the execution history $\log_i$ and the proposed next step $p_{i+1}$, and produces an updated decision:

\begin{equation}
(S_i, A_i, p_{i+1}')
= \texttt{DeepSeek-R1}\big(\texttt{UpdatePrompt}(\log_i, p_{i+1})\big),
\qquad i = 1,\ldots,s-1.
\end{equation}
Here, $S_i$ summarizes the evidence, $A_i \in \{\text{stop},\text{continue}\}$ specifies whether execution terminates, and $p_{i+1}'$ is the updated next step; if no revision is required, $p_{i+1}' = p_{i+1}$. The procedure halts at the first index $k$ such that $A_k = \text{stop}$, or proceeds through all $s$ steps otherwise. A detailed case study is provided in Appendix \ref{app:cs}.

\subsection{Chief Agent}
The Chief agent leverages the advanced reasoning capabilities of DeepSeek-R1 to coordinate specialized tools and apply adaptive strategies that adjust behaviour at both the task and patient levels, reflecting clinical workflows and supporting real-world application.

\smallskip
\noindent\textit{Adaptive strategies.}
Given the query $Q$, the Chief agent first assesses the task complexity:

\begin{equation}
\ell = \texttt{DeepSeek-R1}(\texttt{Prompt}(Q)),
\qquad 
\ell \in \{\text{basic}, \text{advanced}\}.
\end{equation}

Based on the predicted complexity level \( \ell \), the agent executes either the general plan or a stepwise refinement procedure. Execution in both modes follows:
\begin{equation}
p_{i+1}^{*} =
\begin{cases}
p_{i+1}, & \ell=\text{basic},\\[3pt]
p_{i+1}', & \ell=\text{advanced},
\end{cases}
\qquad i = 1,\ldots,k.
\end{equation}


At each step, the Chief automatically executes the actual action 
\(p_{i+1}^{*}\) by invoking tool \(T_{i+1} \in \mathcal{T}\) (see Appendix \ref{app:tool} for the used tools), yielding the output
\(t_{i+1} = T_{i+1}(p_{i+1}^{*})\). For complex cases, the Chief may additionally invoke the MDT tool to simulate clinical case conferences. The evidence is then appended to the log via \(\log_{i+1}=\texttt{Append}(\log_i, t_{i+1})\). After all steps, the Chief synthesizes the log to generate the final summary and answer:
\begin{equation}
(\text{Summary}, \text{Answer}) = \texttt{DeepSeek-R1}(\log).
\end{equation}

Further, the overall system can provide visual validation when required for disputed or ambiguous cases (see Figure \ref{fig:panel}), allowing clinicians to perform manual verification:
\begin{equation}
V = \texttt{CardAIc-Agents}(Q),
\end{equation}
which encapsulates the above workflow before producing the final visual output \(V\).

\begin{figure}
    %\floatconts
    \centering
    \includegraphics[width=1\linewidth]{MDT.pdf}

    \caption{(a) Multidisciplinary Discussion Team (MDT) workflow: an iterative loop of (i) Assign Roles, (ii) Analyses, (iii) Synthesizes Analyses, and (iv) Review, terminated by a decision from the Chief Agent. (b) Iterative inference (corresponding to (a)): DeepSeek-R1 performs (i) and (iii) and outputs the decision; Qwen and MedGemma handle (ii) and (iv) as experts. \( Q \) denotes the original input, \( Z \) the intermediate tool outputs, \( E_{*}, P_{*}, D_{*} \) the model responses, \( T \) the total number of steps, and \( t \) the step index.}
    \label{fig:team}
    \vspace{-5mm}
\end{figure}


\smallskip
\noindent\textit{Multidisciplinary discussion team (MDT).} As shown in Figure \ref{fig:team}, this team reviews inputs and intermediate outputs from tools to support comprehensive decisions. First, the Chief designates two relevant domain-expert roles based on the inputs. Each expert independently analyzes the inputs, and their respective results are synthesized by the Chief. The two experts then review this synthesis together with outputs produced by tools, enabling the integration of information sources that extend beyond the large model review paradigm typically adopted in existing medical agents. During this review, each expert provides an explicit binary agreement signal (agree/disagree) to indicate whether their current judgment is concordant with the synthesized conclusion from the preceding step. The Chief subsequently re-synthesizes the updated information, completing one discussion round. All intermediate results generated during each round are stored as memory for downstream use.

If consensus is achieved by both experts, or the maximum number of predefined discussion rounds is exhausted, the Chief issues the final decision. Otherwise, the synthesized output from the current round is combined with the original inputs and passed to the next round, ensuring that subsequent iterations build upon accumulated evidence rather than relying solely on the initial inputs or on the stochastic behavior of responses from LLMs or VLMs. In this study, the two experts are implemented using MedGemma \cite{sellergren2025medgemma}, specialized for medical image analysis, and Qwen2.5-VL \cite{qwen2.5-VL}, specialized for video processing, as noted in Figure~\ref{fig:team}(b).

\section{Experiments}
\subsection{Experimental Settings} 
\noindent\textit{Datasets.} CardAIc-Agents is evaluated on three datasets: (i) MIMIC-IV \cite{johnson2023mimic}, for HF diagnosis, which includes data from 1,524 patients with three modalities (laboratory test results, 12-lead ECGs, and echocardiograms (ECHOs)); (ii) PTB-XL \cite{wagner2020ptb}, for myocardial infarction (MI) diagnosis \cite{strodthoff2023ptb}, which includes data from 10,147 patients with structured patient information, ECG-derived variables, and 12-lead ECGs; and (iii) the PTB Diagnostic ECG Database, for HF prediction, which includes patient information and 12-lead ECGs from 268 cases (see Appendix~\ref{app:dataset} for details).

\smallskip
\noindent\textit{Metrics and baseline.}
The diagnostic performance is evaluated using the area under the receiver operating characteristic curve (AUC) and accuracy \cite{yu2021evaluation}, with 95\% confidence intervals. For the visual outputs, two cardiologists independently assess and score the results. The proposed agent is compared with medical VLMs, including LLaVA-Med \cite{li2023llavamed} and MedGemma \cite{sellergren2025medgemma}, with MedGemma combined with CoT \cite{wei2022chain} and ReAct \cite{yao2023react} to evaluate step-by-step reasoning and tool-augmented strategies. Comparisons are also made with medical agent frameworks such as MedAgents \cite{tang2024medagents}, ReConcile \cite{chen-etal-2024-reconcile}, and MDAgents \cite{kim2024mdagents}. Further implementation details are provided in Appendices \ref{app:idus} for the proposed method and \ref{app:idbase} for the baseline agent, respectively.


\subsection{Results and Comparative Analysis}
\begin{table*}[t]
\centering
\caption{Performance Comparison Across Methods and Datasets}
\label{tab:cp}
\vspace{1pt}
\setlength{\tabcolsep}{1pt}
\renewcommand{\arraystretch}{1.12}
\begin{adjustbox}{width=\textwidth}

% make the whole Latency column (5th col) red
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{\textbf{Category}} &
\multirow{2}{*}{\textbf{Method}} &
\multicolumn{3}{c}{\textbf{MIMIC-IV}} &
\multicolumn{2}{c}{\textbf{PTB-XL}} &
\multicolumn{2}{c}{\textbf{PTB Diagnostic}} \\
\cmidrule(lr){3-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
& & \textbf{ACC} & \textbf{AUC} & \textbf{Latency} & \textbf{ACC} & \textbf{AUC} & \textbf{ACC} & \textbf{AUC} \\
\midrule

\multirow{6}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}VLMs \\ \&\\Variants\end{tabular}}}
& \begin{tabular}[c]{@{}c@{}}LLaVA-Med\\[-3pt](\citeauthor{li2023llavamed})\end{tabular} & 0.35(0.30,0.41) & 0.34(0.28,0.40) & 22.515 & 0.39(0.37,0.42) & 0.51(0.47,0.55) & 0.12(0.08,0.16) & 0.44(0.34,0.59) \\
& \begin{tabular}[c]{@{}c@{}}MedGemma\\[-3pt](\citeauthor{sellergren2025medgemma})\end{tabular} & 0.76(0.71,0.80) & 0.82(0.77,0.86) & 15.046 & 0.56(0.53,0.59) & 0.55(0.52,0.59) & 0.76(0.71,0.81) & 0.88(0.83,0.93) \\
& \begin{tabular}[c]{@{}c@{}}MedGemma\\[-3pt](CoT,\citeauthor{wei2022chain})\end{tabular} & 0.65(0.60,0.70) & 0.81(0.76,0.86) & 359.038 & 0.53(0.50,0.56) & 0.54(0.50,0.58) & 0.58(0.52,0.64) & 0.75(0.64,0.88) \\
& \begin{tabular}[c]{@{}c@{}}MedGemma\\[-3pt](ReAct,\citeauthor{yao2023react})\end{tabular} & 0.67(0.62,0.72) & 0.71(0.66,0.76) & 139.422 & 0.83(0.81,0.85) & 0.83(0.80,0.85) & 0.69(0.64,0.75) & 0.72(0.41,0.99) \\
\midrule

\multirow{5}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}Medical\\Agents\end{tabular}}}
& \begin{tabular}[c]{@{}c@{}}MedAgents\\[-3pt](\citeauthor{tang2024medagents})\end{tabular} & 0.74(0.70,0.79) & 0.82(0.78,0.87) & 156.870 & 0.65(0.62,0.68) & 0.62(0.58,0.66) & 0.75(0.70,0.79) & 0.84(0.74,0.92) \\
& \begin{tabular}[c]{@{}c@{}}ReConcile\\[-3pt](\citeauthor{chen-etal-2024-reconcile})\end{tabular} & 0.49(0.44,0.55) & 0.75(0.69,0.80) & 103.206 & 0.43(0.40,0.46) & 0.57(0.53,0.61) & 0.55(0.49,0.61) & 0.76(0.58,0.93) \\
& \begin{tabular}[c]{@{}c@{}}MDAgents\\[-3pt](\citeauthor{kim2024mdagents})\end{tabular} & 0.52(0.47,0.58) & 0.61(0.55,0.68) & 64.563 & 0.56(0.52,0.58) & 0.60(0.56,0.64) & 0.74(0.68,0.79) & 0.74(0.53,0.96) \\
\midrule

\multirow{3}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}Fine tuned\\VLMs\end{tabular}}}
& {\begin{tabular}[c]{@{}c@{}}Qwen2.5-VL\\[-3pt](\citeauthor{qwen2.5-VL})\end{tabular}}
& 0.78(0.73,0.82)
& 0.85(0.81,0.90)
& {\textbf{1.173}}
& 0.93(0.91,0.94)
& 0.96(0.94,0.97)
& 0.72(0.66,0.77)
& 0.80(0.61,0.99) \\
& \begin{tabular}[c]{@{}c@{}}Janus-Pro\\[-3pt](\citeauthor{chen2025janus})\end{tabular}
& 0.84(0.79,0.88)
& \textbf{0.91(0.88,0.94)}
& 1.247
& \textbf{0.96(0.95,0.97)}
& \textbf{0.99(0.98,0.99)}
& 0.75(0.70,0.80)
& 0.83(0.57,0.99) \\
\midrule

\textbf{Proposed} & CardAIc-Agents & \textbf{0.87(0.82,0.90)} & 0.89(0.85,0.93) & 79.137 & \textbf{0.96(0.95,0.97)} & 0.96(0.94,0.98) & \textbf{0.77(0.72,0.82)} & \textbf{0.88(0.65,1.00)} \\
\bottomrule

\multicolumn{9}{l}{\normalsize{Note: Boldface values indicate best performance within each dataset and metric; values in parentheses represent 95\% confidence intervals.}} \\[-5pt]
\multicolumn{9}{l}{\normalsize{\phantom{Note: }Latency is reported per sample; ACC = accuracy; AUC = Area Under the Curve; CoT = Chain of Thought; ReAct = Reasoning and Acting.}} \\
\end{tabular}
\end{adjustbox}
\end{table*}


\noindent\textit{Comparison with VLMs and variants.} As shown in Table~\ref{tab:cp}, CardAIc-Agents outperformed all baseline VLMs across the three cardiac datasets. The largest gap was observed on MIMIC-IV, where CardAIc-Agents achieved an accuracy of 0.87 compared to only 0.35 by LLaVA-Med ($p<0.05$). This was partly due to the limited token input length, which constrained the performance of this medical yet general VLM. Secondly, MedGemma performed the best among the VLMs, while enabling CoT reasoning did not improve performance on any dataset (see Appendix~\ref{app:cm} for confusion matrix analysis). Finally, its ReAct system, built on LangChain for tool use, improved PTB-XL performance but not MIMIC-IV, and was still outperformed by the proposed method.

\smallskip
\noindent\textit{Comparison with medical agents.}
CardAIc-Agents also outperformed state-of-the-art medical agents as shown in Table \ref{tab:cp}. Among these, ReConcile showed the largest gap, with accuracies of 0.49 (vs.\ 0.87) on MIMIC-IV, 0.43 (vs.\ 0.96) on PTB-XL, and 0.55 (vs.\ 0.77) on PTB Diagnostic ($p<0.05$). A key limitation of these agents lay in their reliance on the intrinsic capabilities of models; inference remained constrained despite guidance from expert-role prompts. In addition, their static knowledge bases and rigid reasoning pipelines limited adaptation to diverse cases. By contrast, the proposed agent leveraged an updatable CardiacRAG agent, integrated external tools to augment model capabilities, and incorporated an adaptive strategy that enabled refinement and optimization across diverse cases.

\smallskip
\noindent\textit{Comparison with fine-tuned VLMs.}
The proposed agent achieved performance comparable to that of fine-tuned VLMs specifically optimized for their respective tasks. The results showed that it outperformed Qwen2.5-VL and achieved performance comparable to Janus-Pro on the MIMIC-IV and PTB-XL datasets. To assess generalization, all fine-tuned models were directly evaluated on the PTB Diagnostic Database for HF diagnosis without any task-specific adaptation (Table~\ref{tab:cp}). The results showed that Qwen2.5-VL and Janus-Pro achieved lower accuracies of 0.72 and 0.75, respectively, compared with 0.77 for the proposed agent ($p<0.05$).

Another finding was that the fine-tuned Janus-Pro achieved a higher AUC than the proposed agent on the first two datasets, but not higher accuracy. This difference might have stemmed from the tendency of LLM-based agents to assign elevated baseline probabilities even in the absence of explicit diagnoses, whereas fine-tuned models typically generated more precise probability estimates, often benefiting from higher numerical precision (e.g., 16- or 32-bit). Even so, results in Table~\ref{tab:cp} demonstrated that the proposed CardAIc-Agents achieved comparable, if not better, performance than VLMs specifically fine-tuned for a given task. Note that beyond generalization and task specificity, flexibility and interpretability are also concerns for fine-tuned models, whereas CardAIc-Agents does not require task-specific fine-tuning and serves as a general-purpose framework.

\smallskip
\noindent\textit{Assessment of intermediate visual outputs.}
CardAIc-Agents could provide on-demand support to clinicians for the validation of complex or uncertain cases, a capability enabled by the review panel introduced to facilitate this process (see Figure \ref{fig:panel}). This function was evaluated by two cardiologists. For echocardiography, the agent automatically identified 11 standard views from raw DICOM, achieving 100\% accuracy in key views (e.g., A3C, A4C, PLAX, PSAX, SC) and over 80\% accuracy in others (a random sample of 10 cases); left ventricle segmentation on A4C views had been reported in prior studies with a Dice Coefficient of 0.922 on the EchoNet-Dynamic dataset. Detection of P, QRS, and T waves from 12-lead ECGs was rated suboptimal by experts, mainly due to stringent criteria requiring precise identification of every heartbeat, indicating an area for further improvement.

\begin{figure}[t!]
    \centering
    \includegraphics[width=0.75\linewidth]{panel.pdf}
    \caption{Visual panel generated by CardAIc-Agents: a) patient profile display b) ECG waveform with labeled P and T waves c) echocardiographic view identification with A4C segmentation video frames. Additional details are in the Appendices.}
    \label{fig:panel}
    \vspace{-5mm}
\end{figure}  


\begin{table*}[t]
\centering
\caption{Ablation Study of Adaptive Workflow, CardiacRAG, and Tools.}
\label{tab:ablation}
\vspace{1pt}
\setlength{\tabcolsep}{2.5pt}
\renewcommand{\arraystretch}{1.10}

\begin{adjustbox}{width=\textwidth}
\begin{tabular}{c|c|cc|>{}c|>{}c|>{}c|cc|cc}
\toprule
\multirow{2}{*}{\textbf{Level}} &
\multirow{2}{*}{\textbf{Workflow}} &
\multicolumn{2}{c|}{\multirow{2}{*}{\textbf{CardiacRAG}}} &
\multicolumn{5}{c|}{\textbf{Tools}} &
\multirow{2}{*}{\textbf{ACC}} &
\multirow{2}{*}{\textbf{AUC}} \\
\cmidrule(lr){5-9}
& & \multicolumn{2}{c|}{} &
\textbf{EPhys} &
\textbf{EchoSeg} &
\textbf{CardioF} &
\multicolumn{2}{c|}{\textbf{MDT}} &
& \\
\midrule

\multirow{7}{*}{Module}
&  &\multicolumn{2}{c|}{\checkmark} & \checkmark & \checkmark & \checkmark & \multicolumn{2}{c|}{\checkmark} & 0.80(0.75,0.85) & 0.87(0.83,0.91) \\
& \checkmark &  &  & \checkmark & \checkmark & \checkmark & \multicolumn{2}{c|}{\checkmark} & 0.77(0.72,0.82) & 0.81(0.76,0.86) \\
\cmidrule(lr){2-11}
& \checkmark & \multicolumn{2}{c|}{\checkmark} & & \checkmark & \checkmark & \multicolumn{2}{c|}{\checkmark} & 0.84(0.80,0.88) & 0.88(0.83,0.92)\\
& \checkmark & \multicolumn{2}{c|}{\checkmark} &  \checkmark& & \checkmark & \multicolumn{2}{c|}{\checkmark} & 0.83(0.78,0.87)& 0.88(0.84,0.92)\\
& \checkmark & \multicolumn{2}{c|}{\checkmark} & \checkmark &  \checkmark& & \multicolumn{2}{c|}{\checkmark} & 0.77(0.72,0.81)& 0.84(0.79,0.88)\\
& \checkmark & \multicolumn{2}{c|}{\checkmark} & \checkmark & \checkmark &  \checkmark& \multicolumn{2}{c|}{} & 0.84(0.80,0.88)& 0.88(0.84,0.92)\\
\midrule

\multirow{5}{*}{\raisebox{-2.5\height}{\begin{tabular}{c}Intra\\module\end{tabular}}}
&  & Similarity & Filter &  &  &  & Tool & Memory & ACC & AUC \\
\midrule
& \checkmark &  & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & 0.83(0.79,0.87) & 0.87(0.83,0.91) \\
& \checkmark & \checkmark &  & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & 0.83(0.78,0.87) & 0.86(0.81,0.90) \\
& \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark &  & \checkmark & 0.75(0.70,0.80) & 0.83(0.78,0.87) \\
& \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark &  & 0.82(0.77,0.86) & 0.86(0.81,0.90) \\
\midrule

Proposed
& \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark
& \textbf{0.87(0.82,0.90)} & \textbf{0.89(0.85,0.93)} \\
\bottomrule

\multicolumn{11}{l}{\normalsize{\checkmark\ = enabled; empty = disabled; Boldface = best performance; MDT = multidisciplinary discussion team.}} \\[-3pt]
\multicolumn{11}{l}{\normalsize{EPhys = electrophysiologists; EchoSeg = echo segmenter; CardioF = cardiology fellow.}} \\
\end{tabular}
\end{adjustbox}
\vspace{-3mm}
\end{table*}

\subsection{Ablation Studies}
\noindent\textit{Adaptive workflow.} 
The ablation study was conducted on the MIMIC-IV dataset to evaluate the contribution of the adaptive workflow, where accuracy improved from 0.80 to 0.87 ($p<0.05$, Table \ref{tab:ablation}). This result confirmed the effectiveness of reasoning in an incremental and feedback-aware manner. Specifically, the model performed step-by-step evaluation and summarization, allowing it to re-assess the current state at each stage and adjust the plan accordingly before proceeding. This process emphasized a global plan followed by stepwise adjustments, distinguishing it from a purely ReAct-based approach that prioritizes stepwise changes, as well as from strategies that rely solely on general planning. More detailed sensitivity analysis of task complexity assessment is provided in Appendix \ref{app:sensitivity}.


\smallskip
\noindent\textit{CardiacRAG agent.} The contribution of the CardiacRAG agent is shown in Table \ref{tab:ablation}. The results indicated a clear improvement when a dedicated and independent agent was assigned to generate and refine plans based on domain knowledge, raising accuracy by 10 percentage points (from 0.77 to 0.87). This highlighted the effectiveness of the proposed module in precisely retrieving relevant information, as well as the valuable contribution of its curated domain-specific knowledge base. Furthermore, the intra-module ablation of the hybrid retrieval mechanism confirmed that using only vector similarity retrieval or only keyword-based filtering led to decreased performance (see Appendices \ref{app:pa} and \ref{app:retrieval} for parameter analysis and retrieved knowledge quality, respectively).

\smallskip
\noindent\textit{Multidisciplinary discussion team.} Table \ref{tab:ablation} also reports an increase in accuracy from 0.84 to 0.87 attributed to the proposed team ($p<0.05$). This gain reflected two drivers: first, the effectiveness of this team in incorporating diverse multimodal information and assigning distinct roles to specialized models for collaborative discussion; second, the capability of the agent to dynamically activate the tool upon detecting uncertainty or insufficient evidence in earlier reasoning stages, selectively engaging the team as needed. Such improvement also confirmed the benefits of the proposed adaptation strategy. In addition, intra-module ablation showed that both the tool and its persistent memory contributed to performance, highlighting their importance for stable multi-agent coordination.

{\noindent\hspace*{\parindent}%
Notably, MDT was triggered for only 6\% of samples, yet the mean latency increased from 50.44\,s to 79.14\,s, indicating a latency overhead associated with its invocation (see Table~\ref{tab:cost_vs_perf}). Further analysis on the triggered subset showed that the per-case latency was 194.31\,s, substantially higher than without MDT (52.72\,s), while larger accuracy gains were achieved (ACC 0.84 vs.\ 0.74). This highlighted that on-demand MDT yielded performance improvements at the cost of latency overhead on a small subset of samples, which may constrain deployment. However, this trade-off parallels clinical escalation workflows, in which multidisciplinary discussion is reserved for a minority of complex cases and incurs substantial time overhead.
}

\begin{table}[t]
\centering
\caption{Inference Cost and Performance with On-demand MDT.}
\vspace{3pt}
\label{tab:cost_vs_perf}
\setlength{\tabcolsep}{2pt}
\resizebox{\columnwidth}{!}{%
{%
\begin{tabular}{c|cc|cc|c|cc|cc}
\toprule
\multirow{3}{*}{\textbf{Method}} &
\multicolumn{4}{c|}{\textbf{Overall (Full Set)}} &
\multicolumn{5}{c}{\textbf{Subset (Triggered Only)}} \\
\cmidrule(lr){2-5}\cmidrule(lr){6-10}
& \multicolumn{2}{c|}{\textbf{Latency}} &
\multicolumn{2}{c|}{\textbf{Performance}} &
\multicolumn{1}{c|}{} &
\multicolumn{2}{c|}{\textbf{Latency}} &
\multicolumn{2}{c}{\textbf{Performance}} \\
\cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-6}\cmidrule(lr){7-8}\cmidrule(lr){9-10}
& \textbf{Mean(s)} & \textbf{P95(s)} & \textbf{ACC} & \textbf{AUC} &
\textbf{Trig\_Rate(\%)} & \textbf{Mean(s)} & \textbf{P95(s)} & \textbf{ACC} & \textbf{AUC} \\
\midrule
No MDT         & 50.44 & 62.98  & 0.84(0.80,0.88) & 0.88(0.84,0.92) & 0   & 52.72  & 59.81  & 0.74(0.53,0.89) & 0.55(0.18,0.89) \\
On-demand MDT  & 79.14 & 161.01 & 0.87(0.82,0.90) & 0.89(0.85,0.93) & 6 & 194.31 & 285.25 & 0.84(0.68,1.00) & 0.61(0.23,1.00) \\
\bottomrule
\multicolumn{10}{l}{\normalsize{Note: Latency is reported in seconds (s) per sample; Mean and P95 denote the average and 95th percentile latency, respectively;}} \\[-3pt]
\multicolumn{10}{l}{\normalsize{\phantom{Note: }Trig\_Rate = trigger rate; ACC = accuracy; AUC = area under the curve; Values in parentheses indicate 95\% confidence intervals.}} \\
\end{tabular}%
}%
}
\end{table}


\smallskip
\noindent{\textit{Domain-specific tools.} Table~\ref{tab:ablation} further quantifies the effect of each domain-specific tool. The results indicate that removing any single tool yields a consistent performance degradation, with the largest decrement observed when the Cardiology Fellow tool is ablated (ACC 0.87 vs.\ 0.77, $p<0.05$). Note that the remaining tools (Laboratory Technician, ECG Technician, and Echocardiographer) are not ablated, as they are required for preprocessing the laboratory test results, 12-lead ECGs, and echocardiograms, respectively. In addition, the adaptive test-time scaling (TTS) mechanism is designed for procedural robustness (see Appendix~\ref{app:tts}), while the final decision relies on multiple cross-modal evidence sources rather than any single external tool output, which limits the influence of isolated tool errors.}

\section{Conclusion}
This study introduces CardAIc-Agents, a multimodal framework with adaptive capabilities for cardiac-related tasks. Experiments on three public datasets showed that it outperformed general medical VLMs and state-of-the-art medical agents. In summary, by combining external tools with a cardiac knowledge base, this study presents a hierarchical adaptive framework spanning: complexity assessment; iterative plan refinement as new evidence emerges; dynamic activation of specialized team discussions for complex cases; and provision of visual outputs to support clinician verification. With this adaptive design, CardAIc-Agents delivers scalable multimodal decision support and shows potential for deployment, particularly in resource-limited clinical settings.


\clearpage
\section*{Acknowledgements}
The research was conducted using the Baskerville Tier 2 HPC service, which was funded by the EPSRC and UKRI through the World Class Labs
scheme (EP/T022221/1) and the Digital Research Infrastructure programme (EP/W032244/1). Baskerville is operated by Advanced Research Computing at the University of Birmingham.
\bibliography{main}

\appendix
\renewcommand{\thefigure}{S\arabic{figure}}
\setcounter{figure}{0}
\renewcommand{\thetable}{S\arabic{table}}
\setcounter{table}{0}
\section{Knowledge Base Construction}
\label{app:kb}
To streamline the information retrieval and improve clinical accuracy, the knowledge base is curated with a targeted focus on cardiac-related content. It selectively integrates information from authoritative medical sources and the most recent official clinical guidelines to ensure domain relevance and content reliability. In addition, this base is intended to mitigate the limitations of static large language models by facilitating the timely integration of updated medical knowledge without requiring repeated fine-tuning of the core model.

\subsection{Authoritative Medical Sources (Web-based Resources).} Medical content is collected from leading academic medical centers, national health services, and reputable online health information platforms. These sources include:
\begin{itemize}
\item Mayo Clinic (\citeyear{mfmmer2025}). A widely recognized medical information platform developed by Mayo Clinic and its affiliates, offering clinically validated content for public health education. In our knowledge base, we primarily focus on its cardiovascular section, which provides structured disease overviews, diagnostic pathways, and treatment guidelines aligned with clinical best practices.

\item Cleveland Clinic (\citeyear{Cleveland2025}). As a leading academic medical center with a strong emphasis on innovation and translational research, Cleveland Clinic provides comprehensive information on advanced diagnostic techniques, treatment strategies, and access to cutting-edge clinical trials, which are often not available through general platforms. Given these strengths, its heart disease–related content is incorporated into our cardiac knowledge base.

\item UK National Health Service (\citeyear{nhs2025}). The NHS offers comprehensive and authoritative information on symptoms, conditions, treatments, risk factors, and self-management strategies. It also provides structured guidance for navigating healthcare services, making informed decisions, and responding to public health events. To enrich our knowledge base with UK-specific clinical pathways and patient-facing recommendations, we incorporate its cardiac-related web content.

\item MedlinePlus (\citeyear{miller2000medlineplus}). MedlinePlus is a comprehensive online health information resource designed for patients and the general public. It is managed by the U.S. National Library of Medicine (NLM), which is part of the National Institutes of Health (NIH). As a leading American platform, it provides evidence-based, patient-centered information covering a wide range of health topics. To incorporate authoritative U.S.-focused cardiovascular content such as disease conditions, treatment options, and preventive care, materials from MedlinePlus are integrated into our knowledge base.
\end{itemize}

Note that other authoritative organizations, such as Healthline \cite{healthline2025} and the National Heart, Lung, and Blood Institute (NHLBI) \cite{NHLBI2025}, are also included. Institutions like the American Heart Association (AHA) \cite{AHA2025} can provide valuable information; however, due to copyright and usage restrictions, their content is not directly incorporated into our knowledge base.

\subsection{Official Clinical Guidelines (Local Knowledge Repository).} The most recent authoritative guidelines from leading cardiology organizations are incorporated, including the latest versions of the European Society of Cardiology (ESC) guidelines \cite{mcdonagh20212021}, American Heart Association (AHA) guidelines \cite{perman2024aha}, Canadian Cardiovascular Society (CCS) guidelines \cite{guerra2024ccs}, Chinese Society of Cardiology guidelines \cite{shuyang2025chinese}, among others. This local repository offers two key advantages: first, it enables direct compilation of the most relevant and well-controlled up-to-date content, providing the model with authoritative references to support clinical decision-making; second, it circumvents issues related to automated data extraction from certain websites, such as the AHA, which restricts or limits web crawling. It is worth mentioning that all online data retrieval and access in this study are conducted strictly for research purposes only.

\section{Tools}
\label{app:tool}
CardAIc-Agents leverages a variety of tools that are not driven by large models, but are instead toolbox-style modules designed to perform specific domain functions. Each tool acts as an expert in its respective domain, collaboratively supporting the execution of cardiac tasks. The following are descriptions of these tools:

\smallskip
\noindent\textit{Laboratory technician.} This tool preprocesses laboratory test results (Labs, tabular data) for downstream analysis by extracting clinical information such as demographics, laboratory values, and medication history from structured or semi-structured text, producing both natural language outputs and their tokenized representations:
\[
(\text{Lab}_{\text{text}}, \text{Lab}_{\text{token}}) = \text{LabProcessor}(\text{Labs}).
\]

\smallskip
\noindent\textit{ECG technician.} This tool preprocesses raw 12-lead ECGs through bandpass filtering, noise removal, and baseline drift correction to support downstream analysis, and also extracts quantitative parameters such as mean amplitude and standard deviation:
\[
(\text{ECG}_{\text{text}}, \text{ECG}_{\text{signal}}) = \text{ECGProcessor}(\text{ECGs}).
\]

\smallskip
\noindent\textit{Electrophysiologists.} This functionality is implemented with NeuroKit2, a Python toolbox, to obtain 12-lead ECG measurements, including signal quality scores, heart rate variability (HRV) features, and wave durations (e.g., QRS, PR, QT intervals), and to extract heartbeat images from representative leads (e.g., \texttt{II}, \texttt{I}, and \texttt{V5}):
\[
(\hat{E}, \hat{B}) = \text{NeuroKit2}(\text{ECGs}),
\]
where \( \hat{E}=\{ \hat{e}_1,\dots, \hat{e}_m \} \) represents the extracted ECG measurements, and \( \hat{B}=\{ \hat{b}_I, \hat{b}_{II}, \hat{b}_{V5} \} \) denotes the extracted heartbeat images from the respective leads.

\smallskip
\noindent\textit{Echocardiography technician.} This tool functions as a view classifier \cite{vukadinovic2024echoprime} to extract standard cardiac views, including apical two-chamber (A2C), apical three-chamber (A3C), apical four-chamber (A4C), apical five-chamber (A5C), apical Doppler (AD), colour Doppler parasternal long-axis (DPL), colour Doppler parasternal short-axis (DPS), parasternal long-axis (PSL), parasternal short-axis (PSS), suprasternal short-axis (SSN), and subcostal view (Sub), from raw DICOM data:
\[
\text{View} = \text{ViewClassifier}(\text{DICOM}).
\]

\smallskip
\noindent\textit{Echocardiography segmenter.} This tool performs segmentation for echocardiograms (ECHOs), which is essential for tracking cardiac function in clinical practice. Here, a segmentation network \cite{zhang2024development} is employed to generate pixel-wise masks (\( \text{Mask} \)) to delineate cardiac structures in apical four-chamber videos (\( \text{EchoVideo}_{\text{A4C}} \)):
\[
\text{Mask} = \text{SegNetwork}(\text{EchoVideo}_{\text{A4C}}).
\]

\smallskip
\noindent\textit{Cardiology fellow.} A fine-tuned multimodal model is employed for preliminary disease diagnosis (\( Y \)) based on diverse data modalities:
\[
Y = \text{TGMM}({\text{Labs}}, {\text{ECGs}}, {\text{ECHOs}}).
\]



\section{Dataset}
\label{app:dataset}
\subsection{MIMIC-IV Data}
In this study, the ICU module is excluded, and focus is placed on hospital stay records from 223,452 patients to ensure data stability and relevance \cite{johnson2023mimic}. After applying stringent exclusion criteria, such as removing hospital stays without diagnostic outcomes, a refined subset of 1,524 samples is used. This subset includes 12-lead ECGs, echocardiograms, and laboratory test results, comprising 708 patients with prevalent heart failure (HF) and 816 without prevalent HF, as determined by ICD-9/10 diagnostic codes \cite{hong2023icd}. The dataset is then split using iterative stratification at a ratio of 5:1:1 \cite{sechidis2011stratification}. The fine-tuned models are trained and validated on the training and validation sets, while all other methods are evaluated on the test set comprising 305 samples.

This dataset comprises laboratory test results, ECGs, and ECHOs. Laboratory measurements include key biomarkers such as Anion Gap, Bicarbonate, Creatinine, Potassium, and Sodium, supplemented by patient metadata such as age, ethnicity, gender, medical history, medication history, BMI, height, and weight. Missing values are left as missing (i.e., no imputation is performed). All ECGs are 12-lead, 10 seconds in duration, and sampled at 500 Hz, while echocardiogram data are stored in their raw Digital Imaging and Communications in Medicine (DICOM) format.

\subsection{PTB-XL and PTB-XL+ Data}
The PTB-XL dataset contains 21,837 12-lead ECG recordings from 18,885 patients, each lasting 10 seconds and sampled at 500 Hz \cite{wagner2020ptb}. Complementarily, the PTB-XL+ dataset provides extracted ECG features along with key patient metadata such as gender and age \cite{strodthoff2023ptb}. These datasets are merged using patient identifiers and are used for myocardial infarction (MI) diagnosis. Each recording is independently annotated by two cardiologists, who assign probabilistic diagnostic labels, resulting in 9,514 Normal and 5,469 MI cases. For this study, only recordings with a 100\% diagnostic probability for MI are included, yielding a final dataset of 7,172 Normal and 2,975 MI samples, totaling 10,147 recordings. Following official dataset guidelines, the data are split into training (8,167 samples), validation (991 samples), and test (989 samples) sets. The fine-tuned model is trained and validated on the training and validation sets, while all comparative methods are evaluated on the independent test set.

\subsection{PTB Diagnostic ECG Data}
This dataset contains 549 ECG records collected from 290 subjects, aged between 17 and 87 years (mean age 57.2) \cite{goldberger2000physionet}. Each record comprises 15 simultaneously measured signals, including the standard 12-lead ECG (I, II, III, aVR, aVL, aVF, V1--V6) and 3 Frank leads (Vx, Vy, Vz), all sampled at 1000 Hz with 16-bit resolution over a $\pm$16.384\,mV range. Diagnostic classes are available for 268 subjects. Since no fine-tuning is performed on this dataset, it is directly employed to evaluate the generalization capability of the fine-tuned model, with all samples used across all methods. Note that only the 12-lead ECG signals are utilized, not the 3 Frank leads.

\section{Implementation Details}
To uphold stringent data security and ensure full compliance with clinical governance frameworks, all models are deployed and executed entirely within on-premise infrastructure, with no reliance on external APIs. This ensures that patient data remains strictly within institutional boundaries, enabling secure inference in a fully controlled and auditable environment.

\subsection{CardAIc-Agents}
% \label{app:CardAIc-Agents}
\label{app:idus}
All experiments involving CardAIc-Agents, such as main experiments, ablation studies, and parameter analyses, are conducted on a system equipped with three NVIDIA A100-SXM4 GPUs, each with 80GB of memory. 

Both the guideline generation for the CardiacRAG Agent and the Chief Cardiologist rely on the DeepSeek-R1-Distill-Qwen-32B model, a distilled variant of the DeepSeek-R1 architecture designed to deliver efficient yet robust language generation. Following official guidelines, the model is configured with a temperature of 0.6 to balance creative variability with output consistency, and the maximum generation length is limited to 1024 tokens to ensure sufficiently detailed responses. Additional settings include: top-k is set to 40; top-p and typical-p are both set to 1.0; and a repetition penalty of 1.1 is applied to reduce output redundancy and enhance generation diversity.

For the multidisciplinary discussion team, two expert models are implemented. The first, MedGemma, is based on Google's MedGemma-27b-it model \cite{sellergren2025medgemma} and specializes in medical image analysis. The second expert, Qwen2.5-VL from Alibaba \cite{qwen2.5-VL}, is optimized for video processing tasks. Both models are configured with reasoning capabilities enabled and employ similar generation settings: a very low temperature of 0.01 to ensure highly focused and deterministic outputs, a maximum token generation limit of 256, and sampling parameters including a top-k of 40, top-p and typical-p set at 1.0, along with repetition penalties of 1.1 to reduce redundancy. Device allocation is set to automatic to facilitate efficient resource management. Note that, since MedGemma could only process images, frames from each echocardiogram view are concatenated into a single image to capture both spatial and temporal information across frames.

\subsection{Prevent Inference Failures}
\label{app:tts}
Test-time scaling (TTS) encompasses strategies that allocate additional computational budget during inference to improve the reliability and accuracy of model predictions. Recent work shows that increasing test-time compute, such as sampling multiple reasoning trajectories, deepening intermediate deliberation, or ensembling candidate outputs, systematically improves performance on reasoning-intensive tasks \cite{muennighoff2025s1, zhang2025survey}. At its core, TTS allows inference-time computation to be adaptively increased to correct model failures or explore alternative reasoning pathways without modifying model parameters.

Within our workflow, a practical source of instability stems from intermittent generation of ill-formed intermediate outputs, including missing mandatory fields, malformed JSON structures, or incomplete action specifications. Such structural defects interrupt downstream execution and may propagate through the multi-agent pipeline, ultimately compromising the validity of the final decision. To mitigate these failure modes, we introduce a lightweight adaptive TTS mechanism in which the agent automatically regenerates its output whenever structural or parsing constraints are violated (e.g., missing keys or JSON decoding errors). This procedure dynamically allocates additional inference compute only when needed, thereby instantiating the central principle of TTS: compute is used adaptively to ensure output correctness and procedural stability.

\smallskip

\subsection{Prompt}
\label{app:prompt}
\noindent\textit{Prompt Details.} Below, we present the prompt details for constructing our CardAIc-Agents workflow. \\
% -----------------------------
% PlanPrompt
% ------------------------------
\smallskip
\noindent\rule{\textwidth}{0.4pt}
\noindent\textit{PlanPrompt (System)}\\
You are a cardiac task planner. Analyze the given cardiac related task and produce a clear stepwise plan. Each step must include a brief “step” description and “tools\_names” chosen only from the provided resource list. The plan must be returned as a single valid JSON object describing all steps, without repeated steps or tool calls, and without nested JSON, arrays, or markdown.

\smallskip
\noindent\textit{PlanPrompt (User)}\\
Task: \{["task\_name"]\}. Available data modalities: \{dataset\_map[["dataset"]]\}. Available Resources/Tools: \{tools\_info\}. Please provide a complete stepwise plan for this task and indicate which tool to use at each step, using only tools in \{tools\_all\}.\\
\noindent\rule{\textwidth}{0.4pt}

% ------------------------------
% UpdatePrompt
% ------------------------------
\smallskip
\noindent\textit{UpdatePrompt (System)}\\
You are the most authoritative cardiology expert. After each step, review all previous steps and all tool outputs to decide whether the current evidence is sufficient to give a final diagnosis for the task. If it is sufficient, give a concise conclusion and probability; if not, summarize what has been obtained so far and explain what additional information is needed. Always return one JSON object with fields: “conclusion”, “answer” (a value between 0 and 1 for the probability of \{task\_name\}), “action” (“stop” or “continue”), and “next\_step” (null if you will follow the original next planned step \{next\_planned\_step\}, or a JSON object describing a new step using tools from \{tools\_all\}). Use “stop” only when the diagnosis is clearly definitive.

\smallskip
\noindent\textit{UpdatePrompt (User)}\\
Please review the following information and decide whether the current evidence is sufficient to provide a final decision for \{["task\_name"]\}: \{logs\}. Return a single JSON object with keys “conclusion”, “answer”, “action”, and “next\_step”. If you decide to continue with the original planned next step \{next\_planned\_step\}, set “next\_step” to null; if you propose a different step, provide it as a JSON object. Do not assume or invent any parameters or test results that are not explicitly mentioned.\\
\noindent\rule{\textwidth}{0.4pt}

% ------------------------------
% Complexity Prompt
% ------------------------------
\smallskip
\noindent\textit{Complexity Level Prompt (System)}\\
You are a cardiology expert whose role is to determine the ComplexityLevel of a cardiac related task. Treat the case as a typical presentation unless otherwise stated. A task is “basic” if it has a clear diagnostic pipeline, and “advanced” if it is case dependent or requires expert judgment beyond standard diagnostic criteria. You must respond with a JSON object containing a single field “complexity” set to “basic” or “advanced”.

\smallskip
\noindent\textit{Complexity Level Prompt (User)}\\
Given the following cardiac related medical task and the complexity guidelines, determine whether the task is basic or advanced under a typical presentation assumption: \{task\_description\}. Respond with a JSON object that contains only the field “complexity” with value “basic” or “advanced”.\\
\noindent\rule{\textwidth}{0.4pt}
\vspace{-0.6em}
{\scriptsize\noindent\textit{Note.} The prompts listed here summarize the logic of each component. The complete prompt templates, with exact formatting and instructions, are provided in the code implementation.}\\


\subsection{Baseline Models}
\smallskip
\label{app:idbase}
\noindent\textit{VLMs and variants.} All VLMs and their variants are configured with consistent generation settings to ensure a fair comparison. A low temperature of 0.01 is applied to encourage deterministic outputs, together with top-k fixed at 40, a repetition penalty of 1.1, and a maximum generation length of 4086 tokens. Sampling is enabled in all cases. LLaVA-Med is based on llava-med-v1.5-mistral-7b (set to 1024 tokens), while MedGemma is built upon medgemma-27b-it. The variants, referred to as MedGemma (CoT and ReAct), employ the same configurations but incorporate Chain-of-Thought \cite{wei2022chain} and ReAct \cite{yao2023react} prompting strategies, respectively, to facilitate multi-step clinical reasoning and tool usage. Note that the ReAct prompting is implemented via LangChain, utilizing the same set of tools and tool descriptions as those adopted in the proposed CardAIc-Agents.

\smallskip
\noindent\textit{Medical agents.} ReConcile \cite{chen-etal-2024-reconcile} and MDAgents \cite{kim2024mdagents} are implemented using the same base models as CardAIc-Agents, including DeepSeek-R1-Distill-Qwen-32B, MedGemma-27b-it, and Qwen2.5-VL, and are configured with generation parameters identical to those of CardAIc-Agents to ensure consistency. In contrast, MedAgents \cite{tang2024medagents} relies on a single large model performing three distinct roles, for which MedGemma-27b-it is employed. To guarantee a fair comparison, ECG data are preprocessed using the tool of Electrophysiologists from CardAIc-Agents to convert the signals into textual representations, which are subsequently fed into these agents. All experiments are conducted on the same hardware setup, using two or three NVIDIA A100-SXM4 GPUs, each with 80GB of memory.

\smallskip
\noindent\textit{Fine-tuned VLMs.} Fine-tuning of larger models, such as Qwen2.5-VL-3B-Instruct \cite{qwen2.5-VL} and Janus-Pro-7B \cite{chen2025janus}, is performed using four NVIDIA A100-SXM4 GPUs, each equipped with 40GB of memory. Training leverages the Adafactor optimizer in conjunction with a ReduceLROnPlateau learning rate scheduler to adaptively adjust the learning rate based on validation performance. Mixed precision training is utilized to enhance computational efficiency. Low-Rank Adaptation (LoRA) is applied to reduce trainable parameters and improve training efficiency without sacrificing model performance.

\section{Additional Experiments and Analysis}
\subsection{Parameter Analysis}\label{app:pa}
The analysis was further extended to assess the sensitivity of two key parameters: the number of retrieved chunks and the maximum number of rounds allowed for the multidisciplinary discussion (see Figure \ref{fig:hyper}). MIMIC-IV was used as a typical example for this analysis. The results indicated that increasing the number of retrieved chunks initially improved performance, peaking at three chunks. Beyond this point, performance declined. This trend could be attributed to two factors: when too few knowledge chunks were available, the model lacked sufficient context to make informed decisions; however, when the number exceeded a certain threshold, the input might have surpassed the effective token processing capacity of the model, which degraded performance. Similarly, the analysis revealed that the best accuracy was achieved with two rounds of discussion. Both fewer and more rounds resulted in performance below the baseline (i.e., without the discussion tool). This highlighted the importance of carefully tuning the number of discussion rounds, as suboptimal settings could lead to unreliable intermediate reasoning and potentially misguide the final decision made by the chief cardiologist.
\begin{figure}
%\floatconts
\centering
\includegraphics[width=0.9\linewidth]{hyper.pdf}
\caption{Hyperparameter selection on the MIMIC-IV dataset.}
\label{fig:hyper}
\vspace{-5mm}
\end{figure}

\subsection{Confusion Matrices}
\label{app:cm}
The confusion matrix analysis in MIMIC-IV showed that the CardAIc-Agents delivered more balanced and reliable classification outcomes compared with the best-performing VLM (MedGemma) and the best-performing agent (MedAgents) baselines (Figure \ref{fig:cm}). Specifically, the CardAIc-Agents achieved 161 true negatives and 103 true positives, whereas the MedGemma exhibited substantially higher error counts, including 27 false positives and 47 false negatives. The MedAgents similarly demonstrated poorer performance, producing 47 false positives and 32 false negatives. As shown in the figure, the marked reduction in both false positives and false negatives highlighted the superior discriminative capability of the CardAIc-Agents, particularly in mitigating missed positive cases, thereby improving its potential clinical utility.

\begin{figure}[htbp]
%\floatconts
\centering
\includegraphics[width=1\linewidth]{cm.png}
\caption{Confusion Matrices: (a) CardAIc-Agents; (b) MedGemma  (best-performing VLM baseline); (c) MedAgents (best-performing agent baseline).}
\label{fig:cm}
%\vspace{-5mm}
\end{figure}

\subsection{Case Study}
\label{app:cs}
Figure \ref{fig:case1} presented a case study that illustrated the operational workflow of CardAIc-Agents, emphasizing its adaptive capability to dynamically adjust its approach based on the input data. Depending on the analysis outcomes, the system could either continue with the current plan, modify its strategy, or halt further processing. This case study effectively showed the flexibility of CardAIc-Agents in managing complex and evolving cardiac tasks. Note that this case was processed in 56.93 seconds using three NVIDIA A100-SXM4 GPUs.

\begin{figure}[htbp]
%\floatconts
\centering
\includegraphics[width=1\linewidth]{case1.pdf}
\caption{
Illustration of the case study that shows the end-to-end operational workflow of CardAIc-Agents, spanning user query intake, internal multimodal reasoning, and final decision generation. 
The internal reasoning trace exemplifies the stepwise update mechanism, in which the agent may (i) continue executing the current plan, (ii) revise the plan, or (iii) terminate execution prior to completing the originally proposed steps. 
For clarity, only representative reasoning steps are shown. The symbol $\blacktriangleright$ denotes a prompt (system or user); xx denotes the content of each reasoning step, and \{\} specifies the tool invoked at that step.
}

\label{fig:case1}
%\vspace{-5mm}
\end{figure}
\subsection{Retrieved Knowledge Quality}
\label{app:retrieval}
The proposed hybrid retrieval method was further evaluated by comparing its retrieval quality against TF-IDF and DPR baselines. Retrieval performance was assessed using accuracy and the average redundancy score (AVG), where lower redundancy indicates less duplicated or irrelevant retrieved content. As shown in Table~\ref{tab:retrieval_quality}, TF-IDF achieved an accuracy of 0.83 with an average redundancy of 0.91, while DPR achieved 0.83 with an average redundancy of 0.96, indicating substantial redundancy in retrieved passages. In contrast, our method obtained the highest accuracy of 0.87 and produced the lowest redundancy (0.83), demonstrating that the hybrid retrieval design retrieved more relevant and diverse evidence. 


\begin{table}[ht]
\centering
\caption{Retrieval quality comparison using accuracy and average redundancy (AVG).}
\vspace{3pt}
\label{tab:retrieval_quality}
\begin{tabular}{lcc}
\toprule
\textbf{Method} & \textbf{ACC} & \textbf{AVG} \\
\midrule
TF-IDF & 0.83 (0.79, 0.87) & 0.91 (0.91, 0.92) \\
DPR & 0.83 (0.78, 0.87) & 0.96 (0.96, 0.96) \\
Ours & 0.87 (0.83, 0.90) & 0.83 (0.81, 0.84) \\
\bottomrule
\end{tabular}
\end{table}


\subsection{Sensitivity Analysis of Task Complexity Assessment}
\label{app:sensitivity}
To examine the robustness of CardAIc-Agents in complexity assessment, a prompt sensitivity study was conducted. Specifically, four additional participants were invited to design alternative complexity assessment prompts for HF and AF detection (details below). Across all prompts (including the original prompt), the complexity labels were consistent, classifying HF as advanced and AF as basic, aligning with commonly reported clinical viewpoints \cite{heidenreich20222022, uk2018chronic, van2025ambulatory}.

Here, predefined complexity criteria are included in the prompt to reduce sensitivity: 1) basic: the task has a clear diagnostic pipeline. 2) advanced: the task is complex and case-dependent or requires expert judgment beyond standard diagnostic criteria. In addition, the impact of misclassification was further evaluated under a worst-case setting, where an advanced case was incorrectly routed to the basic workflow. Results on the MIMIC-IV dataset showed that accuracy was 0.86 when the case was routed to the basic workflow and 0.89 when routed to the advanced workflow; even under misrouting, accuracy remained higher than that of all baseline methods.

\smallskip
\noindent\textit{Case 1: Complexity Level Prompt (User)}\\
Given the following cardiac related medical task and the complexity guidelines, determine whether the task is basic or advanced under a typical presentation assumption: \{task description\}. Complexity Guidelines: 1) basic: the task has a clear diagnostic pipeline. 2) advanced: the task is complex and case-dependent or requires expert judgment beyond standard diagnostic criteria. Respond with a JSON object that contains only the field ``complexity'' with value ``basic'' or ``advanced''.

\smallskip
\noindent\textit{Case 2: Complexity Level Prompt (User)}\\
You will be given a cardiac related medical task plus the complexity guidelines. Under a typical presentation assumption, decide whether this task should be labeled as basic or advanced. Task: \{task description\}. Complexity Guidelines: 1) basic: the task has a clear diagnostic pipeline. 2) advanced: the task is complex and case-dependent or requires expert judgment beyond standard diagnostic criteria. Output: Return only one JSON object with exactly one field: \{``complexity'': ``basic''\} or \{``complexity'': ``advanced''\}.

\smallskip
\noindent\textit{Case 3: Complexity Level Prompt (User)}\\
Based on the complexity guidelines, classify the following cardiac related medical task as either basic or advanced, assuming a typical clinical presentation: \{task description\}. Complexity Guidelines: 1) basic: the task has a clear diagnostic pipeline. 2) advanced: the task is complex and case-dependent or requires expert judgment beyond standard diagnostic criteria. Respond with a single JSON object containing only the key ``complexity'' and the value ``basic'' or ``advanced''.

\smallskip
\noindent\textit{Case 4: Complexity Level Prompt (User)}\\
Determine the complexity category for the cardiac related task below using the provided guidelines. Assume a typical patient presentation. Task description: \{task description\}. Complexity Guidelines: 1) basic: the task has a clear diagnostic pipeline. 2) advanced: the task is complex and case-dependent or requires expert judgment beyond standard diagnostic criteria. Return ONLY valid JSON with one field named ``complexity'' and a value of ``basic'' or ``advanced''. 

\end{document}