\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{amsmath}
\usepackage[table]{xcolor}
\usepackage[most]{tcolorbox}
\usepackage{threeparttable}
\usepackage{rotating}
% tcolorbox is already loaded above with the [most] option.

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[ReaCT: Multimodal Reasoning for CTV Segmentation]{Guideline-Informed MLLM Reasoning for Pathology-Aware Postoperative Prostate CTV Segmentation}
% Multimodal
 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Yinhao Wu\nametag{$^{1}$}} \orcid{0009-0004-2982-7770} \Email{yxw2120@mavs.uta.edu}\\
\Name{Hengrui Zhao\nametag{$^{2}$}} \orcid{0000-0002-6712-5823} \Email{Hengrui.Zhao@UTSouthwestern.edu} \\
\Name{Haiqing Li\nametag{$^{1}$}} \orcid{0009-0004-0698-3594} \Email{hxl9110@mavs.uta.edu}\\
\Name{Wenliang Zhong\nametag{$^{1}$}} \Email{wxz9204@mavs.uta.edu}\\
\Name{Hehuan Ma\nametag{$^{1}$}} \Email{hehuan.ma@mavs.uta.edu}\\
\Name{Yuzhi Guo\nametag{$^{1}$}} \Email{yuzhi.guo@mavs.uta.edu}\\
\Name{Dan Nguyen\nametag{$^{2}$}} \Email{Dan.Nguyen@UTSouthwestern.edu} \\
\Name{Daniel Yang\nametag{$^{2}$}} \Email{Daniel.Yang@UTSouthwestern.edu} \\ 
\Name{Steve Jiang\nametag{$^{2}$}} \orcid{0000-0002-3083-6752} \Email{Steve.Jiang@UTSouthwestern.edu} \\ 
\Name{Junzhou Huang\nametag{$^{1}$}} \orcid{0000-0002-9548-1227} \Email{jzhuang@exchange.uta.edu} \\
\addr $^{1}$ The University of Texas at Arlington, Arlington, TX, USA. \\
\addr $^{2}$ UT Southwestern Medical Center, Dallas, TX, USA.
}

\begin{document}

\maketitle

\begin{abstract}
Accurate segmentation of the Clinical Target Volume (CTV) is a critical prerequisite for precise radiotherapy planning, pursuing complete irradiation of microscopic disease while minimizing toxicity to surrounding healthy organs. However, achieving automated CTV segmentation remains highly challenging due to the invisible microscopic disease on planning CT and the necessity of incorporating clinical context into delineation decisions. Unlike previous methods that rely solely on visual features or coarse global text reasoning,  we propose \textbf{ReaCT}, a unified framework that reformulates CTV segmentation as a multimodal reasoning task by explicitly integrating pathological information with visual context.
Specifically, we introduce a Guideline-Informed Attribute Extractor that follows the information-retrieval workflow of radiation oncologists. By distilling knowledge from clinical guidelines, this module filters and structures lengthy pathology reports into a concise set of clinically determinative pathological attributes, effectively bridging the semantic gap between unstructured clinical records and segmentation networks. Furthermore, we develop an Attribute-Specific MLLM Reasoner built upon a 3D residual U-Net that performs fine-grained spatial reasoning. By leveraging a sequence of attribute-specific query tokens, the model disentangles the distinct target implications of individual pathological attributes, enabling fine-grained anatomical alignment via multi-scale fusion using Two-Way Transformers. Experiments on a postoperative prostate cancer dataset demonstrate that ReaCT achieves state-of-the-art segmentation performance and exhibits strong robustness, with pronounced improvements under limited-annotation settings.
\end{abstract}

\begin{keywords}
Clinical Target Volume, Radiotherapy Planning, 3D Image Segmentation.
\end{keywords}

\section{Introduction}
\label{sec:formatting}
Radiotherapy is one of the most common treatments for cancer, delivering radiation doses to the target volume while sparing surrounding healthy tissues~\cite{bi2019deep,liu2021adversarial}. Achieving the optimal therapeutic effect relies on precise treatment planning, including the delineation of the Clinical Target Volume (CTV) for microscopic tumor extensions~\cite{lee2018international,balagopal2021deep}. Unlike the Gross Tumor Volume (GTV), which usually has a distinct contrast on CT images, the CTV can be invisible on planning CT images, and its segmentation presents a formidable challenge. This difficulty is exacerbated in postoperative radical prostatectomy, where the surgical removal of the prostate and nearby tissues leaves a void in the target area, and the CTV boundaries are usually invisible. This necessitates complex reasoning from clinical context in addition to visual perception.

 
To facilitate consistent and reliable delineation, radiation oncologists typically need to integrate patient-specific pathological attributes derived from pathology reports alongside planning CT images in their decision-making process. These attributes are essential because they must be interpreted together with consensus clinical guidelines to determine the appropriate boundaries of the CTV~\cite{jansen2000target,chang2007evaluation}. For instance, different pathological stages determine whether only the proximal base of the seminal vesicles should be included, whereas confirmed seminal vesicle invasion mandates the inclusion of the entire seminal vesicle bed~\cite{dal2023estro}. Consequently, distinct from conventional segmentation tasks, CTV segmentation inherently requires the incorporation of multimodal knowledge to reason about regions susceptible to microscopic metastases that are indistinguishable based on visual features alone.

Recently, Multimodal Large Language Models (MLLMs) have demonstrated exceptional reasoning capabilities when handling implicit or abstract text instructions~\cite{lai2024lisa,zou2025uncertainty}. Drawing inspiration from these advancements and considering the intrinsic reasoning demands of CTV segmentation, we posit that empowering MLLMs to perform multimodal reasoning is essential for resolving the ambiguity of invisible target boundaries. However, seamlessly integrating such specialized knowledge into MLLM architectures presents significant challenges. First, patient records such as pathology reports and clinical notes are typically lengthy, unstructured, and rich in domain-specific terminology, making it difficult to effectively encode and align them with segmentation networks. Second, existing MLLM-based approaches typically rely on a single, coarse reasoning token, which limits their ability to capture the fine-grained correspondence between individual pathological attributes and their distinct spatial implications for CTV segmentation. A more comprehensive review of related work is provided in Appendix~\ref{RW}.

To address the above limitations, we propose \textbf{ReaCT}, a unified multimodal framework that reformulates CTV segmentation as a reasoning-driven task by integrating patient-specific pathological attributes with visual context. As illustrated in Figure~\ref{fig:framework}, ReaCT incorporates a Guideline-Informed Attribute Extractor to follow the information-retrieval workflow commonly used by radiation oncologists. This module first distills relevant radiotherapy consensus guidelines to derive a principled set of pathological attributes that govern the spatial extent of the CTV. Leveraging this guideline-grounded schema, it then processes raw pathology reports through a multi-stage pipeline involving keyword-based context retrieval, semantic verification, and value standardization, transforming unstructured clinical documentation into a concise and structured attribute set for downstream multimodal reasoning.
Subsequently, ReaCT introduces a multimodal CTV segmentation network built upon a 3D residual U-Net backbone and a custom MLLM Reasoner. The MLLM Reasoner jointly processes visual and textual tokens together with a sequence of attribute-specific \texttt{<SEG>} query tokens. This granular design enables fine-grained reasoning, where each query token independently encapsulates the distinct target implications of a specific pathological attribute by aggregating relevant multimodal context. The hidden embeddings from the last layer corresponding to these \texttt{<SEG>} tokens are aggregated and then fused with multi-scale visual features through bi-directional transformer modules at each decoder stage to ensure precise anatomical alignment. Benefiting from this fine-grained multimodal fusion, ReaCT achieves state-of-the-art performance on a postoperative prostate cancer dataset and exhibits pronounced robustness even in limited-annotation regimes. We highlight the following contributions:

\begin{itemize} 
\item We propose \textbf{ReaCT}, a unified framework that reformulates CTV segmentation as a multimodal reasoning task by explicitly integrating patient-specific pathological attributes with visual context. This formulation addresses the inherent clinical need for multimodal integration and provides a principled mechanism toward anatomically and clinically coherent CTV segmentation.

\item We design a Guideline-Informed Attribute Extractor that follows the information-retrieval workflow used by radiation oncologists. By distilling knowledge from consensus guidelines, the extractor transforms lengthy pathology reports into a concise set of clinically determinative attributes, bridging the semantic gap between unstructured clinical records and downstream segmentation networks.

\item We develop an Attribute-Specific MLLM Reasoner that performs fine-grained spatial reasoning through a sequence of query tokens, enabling the model to disentangle the distinct target implications of individual pathological attributes and ensuring precise anatomical alignment even in limited-annotation regimes. 
\end{itemize}



\section{Methodology}
\label{sec:methodology}
As illustrated in Figure~\ref{fig:framework}, ReaCT comprises two parts. First, a Guideline-Informed Attribute Extractor emulates the expert information-retrieval workflow, distilling consensus guidelines to transform unstructured pathology reports into a concise set of determinative attributes. Second, a multimodal CTV segmentation network predicts the CTV mask by integrating the 3D CT volume with patient-specific attributes. Built upon a 3D U-Net and an Attribute-Specific MLLM Reasoner, ReaCT utilizes a sequence of query tokens to generate fine-grained spatial reasoning embeddings, which are fused into the decoder via multi-scale Two-Way Transformers to ensure precise anatomical alignment.

\begin{figure}[!t]
\floatconts
  {fig:framework}
  % \label{framework}
  {\caption{The overall framework of ReaCT. (a) A Guideline-Informed Attribute Extractor emulates clinical information-retrieval workflow by distilling knowledge from consensus guidelines to obtain structured determinative attributes. (b) A Multimodal CTV Segmentation Network integrates a 3D U-Net with an Attribute-Specific MLLM Reasoner, fusing fine-grained reasoning embeddings derived from distinct query tokens into the decoder via multi-scale Two-Way Transformers to ensure precise anatomical alignment.}}
  {\includegraphics[width=\linewidth]{MIDLLatexTemplate-master/framework.pdf}}
\end{figure}

\subsection{Guideline-Informed Attribute Extractor}
\label{sec:rule_extraction}
To bridge the semantic gap between unstructured clinical records and the downstream segmentation network, we construct a Guideline-Informed Attribute Extractor based on GPT-4o~\cite{hurst2024gpt}. This module is designed to reflect the information-retrieval process used by radiation oncologists through a two-step procedure: (1) distilling relevant knowledge from consensus radiotherapy guidelines into a compact attribute schema, and (2) extracting patient-specific attribute values from raw pathology reports. 

\vspace{-0.8em}
\subsubsection{Guideline-Based Schema Construction}
To establish a guideline-grounded reasoning framework for attribute extraction, the proposed extractor first constructs a comprehensive guideline corpus $\mathcal{G} = \{ g_m \}_{m=1}^{M}$ by retrieving and aggregating consensus radiotherapy guidelines, such as ESTRO~\cite{niyazi2016estro}, NCCN~\cite{carroll2016nccn}, and RTOG~\cite{kruser2019nrg}. Subsequently, under a fixed prompting policy $\mathcal{P}_{\text{schema}}(\cdot)$ with deterministic decoding, the extractor employs the LLM $\Psi_{\text{LLM}}$ to analyze the retrieved documents and distill pathological factors that explicitly influence the CTV definition. Formally, this distillation process yields a principled attribute schema
\[
\mathcal{S} = \Psi_{\text{LLM}}\big(\mathcal{P}_{\text{schema}}(\mathcal{G})\big) = \{ k_i \}_{i=1}^{N},
\]
where each $k_i$ corresponds to a clinically determinative pathological attribute.

\subsubsection{Multi-Stage Attribute Extraction}
Guided by the schema $\mathcal{S}$, the extractor processes the raw patient-specific pathology report $\mathbf{D}_{\text{raw}}$ to derive a structured set of attribute--value pairs. To mitigate the noise and redundancy inherent in unstructured medical text, a coarse-to-fine filtering pipeline is applied.
First, a keyword matching operator $\mathcal{F}_{\text{key}}(\cdot)$ is used to retrieve a subset of relevant text segments $\mathbf{T}_{\text{rel}} \subset \mathbf{D}_{\text{raw}}$ that potentially contain information linked to $\mathcal{S}$, thereby efficiently narrowing the search space.
Subsequently, the extractor functions as a semantic verifier $\Phi_{\text{verify}}(\cdot)$ on $\mathbf{T}_{\text{rel}}$, filtering out irrelevant narratives (e.g., unrelated medical history) and eliminating redundant statements, while retaining only textual evidence that directly informs the attributes in $\mathcal{S}$.
Finally, the verified context is aggregated to assign a standardized value $v_i$ to each attribute $k_i \in \mathcal{S}$.
This process produces a structured clinical attribute set
\[
\mathcal{A} = \{(k_i, v_i) \mid k_i \in \mathcal{S},\ v_i \neq \varnothing\}.
\]
The curated attribute set $\mathcal{A}$ serves as textual input for the downstream multimodal reasoning module, enabling more accurate and context-aware CTV segmentation.
Detailed implementation is provided in Appendix~\ref{agent_details}.

\subsection{Multimodal CTV Segmentation}
\label{sec:guide_segmentation}
The segmentation network consists of a 3D U-Net-based segmentation path that encodes and decodes multi-scale spatial features, and an MLLM reasoner that generates fine-grained reasoning embeddings from the clinical attributes and corresponding image context.
In this work, we use the term \textit{reasoning} to denote guideline-driven conditional inference over spatial decisions, rather than static multimodal conditioning via feature fusion. Unlike conventional multimodal approaches~\cite{liu2023clip,zhao2025foundation,zhao2025large}
that model correlations of the form $P(\text{Mask} \mid \text{Image}, \text{Attributes})$ through concatenation or cross-attention, ReaCT performs sequential,
attribute-conditioned inference within a unified semantic space. Specifically, ReaCT interleaves visual tokens and pathological attribute tokens into a single autoregressive multimodal sequence $X = \{x_t\}_{t=1}^T$, where each token corresponds to either a visual embedding, an attribute embedding, or an attribute-specific reasoning query. Through deep self-attention across transformer layers, textual tokens representing clinical rules repeatedly interact with visual tokens, enabling the model to learn conditional dependencies of the form $P(x_t \mid x_{<t})$, where each token attends jointly to prior visual context and attribute-specific cues that guide spatial
delineation decisions.

\subsubsection{Segmentation Path}
\label{sec:seg_path}
We adopt a 3D residual U-Net~\cite{cciccek20163d} as the visual backbone to facilitate the hierarchical interaction between anatomical features and clinical reasoning. The encoder $\mathcal{F}_{\phi}$ first processes the input CT volume $\mathbf{x}_{\text{img}} \in \mathbb{R}^{B \times 1 \times H \times W \times D}$ to extract a pyramid of multi-scale spatial features $\{\mathbf{f}_l\}_{l=1}^{L}$, where $\mathbf{f}_l \in \mathbb{R}^{B \times C_l \times H_l \times W_l \times D_l}$ denotes the feature map at the $l$-th resolution scale. 
Meanwhile, the proposed MLLM Reasoner jointly encodes the visual tokens and the extracted pathological attributes to generate a set of fine-grained reasoning embeddings $\mathbf{H}_{\text{reason}}
= \{\mathbf{h}_i\}_{i=1}^{N}
\in \mathbb{R}^{B \times N \times d'},$
where each $\mathbf{h}_i$ captures the inferred spatial implication of the $i$-th pathological attribute.
To integrate the fine-grained reasoning embeddings $\mathbf{H}_{\text{reason}}$ generated by the MLLM into the decoding path, we employ the Two-Way Transformer fusion mechanism from the Segment Anything Model (SAM)~\cite{kirillov2023segment}. This module facilitates bidirectional interaction between the spatial features $\mathbf{f}_l$ and attribute queries $\mathbf{H}_{\text{reason}}$ at each upsampling stage, yielding a clinically modulated representation
$\tilde{\mathbf{f}}_l = \text{TwoWayBlock}(\mathbf{f}_l, \mathbf{H}_{\text{reason}}).$ Finally, the decoder $\mathcal{D}_{\phi}$ progressively upsamples and aggregates these fused features $\{\tilde{\mathbf{f}}_l\}_{l=1}^{L}$ to reconstruct the segmentation mask, outputting the final CTV probability map $\hat{\mathbf{y}} \in [0,1]^{B \times 1 \times H \times W \times D}$.

\subsubsection{Attribute-Specific MLLM Reasoner}
\label{sec:mllm_reasoner}
We design the MLLM reasoner to jointly encode 3D image features and the extracted pathological attributes, enabling reasoning-guided CTV segmentation. Specifically, we build upon M3D-LaMed~\cite{bai2024m3d}, a specialized multimodal large language model for 3D medical imaging, which consists of a 3D ViT image encoder (M3D-ViT), a spatial pooling projector, and a LLaMA-2-7B~\cite{touvron2023llama} language backbone.

\paragraph{Multimodal LLM  for Fine-Grained Anatomical Reasoning.}
Given the 3D CT volume $\mathbf{x}_{\text{img}}$, the pretrained M3D-ViT encoder $\Phi_{\text{ViT}}$ first extracts patch-level embeddings $\mathbf{Z}_{\text{img}} = \Phi_{\text{ViT}}(\mathbf{x}_{\text{img}}) \in \mathbb{R}^{M_0 \times d_v}$, where $M_0$ denotes the number of 3D patches and $d_v$ represents the vision hidden dimension. To align with the LLM latent space, these embeddings are processed by a spatial pooling projector $\mathcal{P}_{\psi}(\cdot)$, which applies 3D average pooling followed by a series of Multi-Layer Perceptrons (MLPs). This projection yields the compressed visual embedding $\mathbf{F}_{\text{img}} = \mathcal{P}_{\psi}(\mathbf{Z}_{\text{img}}) \in \mathbb{R}^{M \times d'}$, where $M = 128$ is the reduced token length and $d'$ aligns with the hidden dimension of the LLM.  

To achieve fine-grained anatomical reasoning, we augment the tokenizer's vocabulary with a set of learnable attribute-specific tokens $\{\texttt{<SEG>}_i\}_{i=1}^N$. Each token $\texttt{<SEG>}_i$ is designed to act as a dedicated reasoning query for the $i$-th attribute in $\mathcal{A}$.
Formally, let $\mathbf{A}_i$ denote the tokenized embedding of the attribute pair $(k_i, v_i)$ (e.g., [SVI, Positive]). We construct the joint multimodal input sequence $\mathbf{X}$ by concatenating the visual context with a series of attribute-reasoning blocks. Specifically, for the $i$-th attribute, the input sequence $\mathbf{X}_i$ is:
\begin{equation}
\mathbf{X}_i = \big[ \mathbf{F}_{\text{img}}; \mathbf{A}_i; \texttt{<SEG>}_i \big].
\end{equation}
Each input sequence functions as an independent reasoning unit, prompting the model to synthesize the shared visual context $\mathbf{F}_{\text{img}}$ with the specific pathological attribute $\mathbf{A}_i$ to encode the spatial intent into the corresponding $\texttt{<SEG>}_i$ token.

Subsequently, the input sequence $\mathbf{X}$ is processed through the LLM backbone $\mathcal{M}_{\theta}$ with $L$ transformer layers to model the conditional dependency $P(x_t \mid x_{<t})$, where $t$ denotes the token index. The hidden states are recursively transformed as:
\begin{equation}
\mathcal{H}^{(\ell)} \;=\; \mathcal{M}^{(\ell)}_\theta\!\left(\mathcal{H}^{(\ell-1)}\right) = \big\{\,\mathbf{h}_1^{(\ell)},\dots,\mathbf{h}_T^{(\ell)}\,\big\}, \quad \ell=1,\dots,L.
\end{equation}
To distill the fine-grained reasoning result, let $t^*_i$ denote the position index of the $i$-th attribute-specific query token \texttt{<SEG>}$_i$ within the sequence. We extract the last-layer hidden state at this specific position to obtain the final reasoned embedding $\mathbf{h}_{t^*_i}^{(L)}$. 
Aggregating these embeddings over all $N$ attributes yields the reasoning set $\mathbf{H}_{\text{reason}} = \{ \mathbf{h}_{t^*_i}^{(L)} \}_{i=1}^{N}$, which provides disentangled, spatially-aware guidance for the segmentation decoder.

\paragraph{LoRA-based Adaptation.}
To efficiently adapt the pretrained $\mathcal{M}_\theta$ to CTV segmentation, we employ Low-Rank Adaptation (LoRA)~\cite{hu2022lora} on the query and value projections. Instead of full-parameter updates, the weight transformation is parameterized as $W' = W + BA$, where $A \in \mathbb{R}^{r \times d'}$ and $B \in \mathbb{R}^{d' \times r}$ represent learnable low-rank matrices ($r \ll d'$). This strategy minimizes computational overhead while enabling the model to effectively capture the attribute correspondences essential for CTV segmentation.

% \subsection{Two-way Transformer Block}

\subsection{Training Objective}
We adopt a weighted combination of Dice loss and binary cross-entropy (BCE) to optimize the network, formulated as:
\begin{equation}
\mathcal{L}_{\text{total}}
= \lambda_0\, \mathcal{L}_{\text{Dice}}(\hat{\mathbf{y}}_i, \mathbf{y}_i)
+ \lambda_1\, \mathcal{L}_{\text{BCE}}(\hat{\mathbf{y}}_i, \mathbf{y}_i),
\end{equation}
where $\hat{\mathbf{y}}_i$ and $\mathbf{y}_i$ denote the predicted and ground-truth CTV masks, respectively, and $\lambda_0$, $\lambda_1$ are weighting coefficients.
The Dice and BCE losses are formulated as
$\mathcal{L}_{\text{Dice}} = 1 - \dfrac{2\sum_{j \in \Omega} \hat{\mathbf{y}}_j \mathbf{y}_j + \epsilon}{\sum_{j \in \Omega} \hat{\mathbf{y}}_j + \sum_{j \in \Omega} \mathbf{y}_j + \epsilon}$
and
$\mathcal{L}_{\text{BCE}} = -\dfrac{1}{|\Omega|}\sum_{j \in \Omega}\left[\mathbf{y}_j\log\hat{\mathbf{y}}_j + (1-\mathbf{y}_j)\log(1-\hat{\mathbf{y}}_j)\right],$
where $j$ indexes the voxels of a given mask, $\Omega$ is the set of voxels, $|\Omega|$ is the number of voxels, and $\epsilon$ is a small constant for numerical stability.


\section{Experiments}
\subsection{Datasets and Implementation Details}
We conduct experiments on a large-scale multimodal in-house dataset comprising 688 postoperative prostate cancer patients, collected from the Department of Radiation Oncology at UT Southwestern Medical Center. This cohort represents a clinically demanding scenario where the prostate has been surgically removed, requiring the CTV to be inferred by synthesizing surrounding anatomical landmarks (e.g., bladder, rectal wall) with patient-specific pathological attributes. Ground-truth CTV masks were manually delineated by six experienced radiation oncologists following consensus guidelines. In addition, the operative pathology report associated with each patient includes critical pathological attributes, such as pathological T-stage, Gleason Score, and Seminal Vesicle Invasion (SVI) status. To ensure rigorous evaluation, 
we perform five randomized training-validation splits. Specifically, in each split, approximately 90\% ($N_{\text{train}} = 496$) of the 550 cases not held out for testing are allocated for training and the remaining 10\% ($N_{\text{val}} = 54$) for validation. Furthermore, the other 138 cases are reserved as a fixed hold-out test set to assess the final performance. All data splitting is performed at the patient level to prevent data leakage.

All CT volumes are preprocessed following the nnU-Net~\cite{isensee2021nnu} pipeline, including isotropic resampling to $1 \times 1 \times 1\ \text{mm}^3$, intensity normalization, and foreground cropping. The model is implemented in MONAI~\cite{cardoso2022monai} with a patch size of $320 \times 320 \times 64$ and batch size 1. The M3D-ViT input resolution is $32 \times 256 \times 256$, consistent with its pre-training setup, and the maximum LLM context length is 512 with 128 visual tokens. LoRA fine-tuning is applied to the query and value projections (\texttt{q\_proj}, \texttt{v\_proj}) using rank $r=16$, scaling factor $\alpha=16$, and dropout 0.05. Standard data augmentation (e.g., rotation, scaling, flipping) is used during training. The network is optimized with AdamW~\cite{loshchilov2018decoupled} (learning rate $1\times10^{-5}$, weight decay $1\times10^{-8}$) for up to 50 epochs on a cluster equipped with six NVIDIA H100 GPUs. Performance is evaluated using
Dice Similarity Coefficient (Dice), 95th Percentile Hausdorff Distance (HD95), and Average Symmetric Surface Distance (ASSD).


\subsection{Comparative Results} To evaluate the segmentation accuracy of ReaCT, we compare it with both vision-only and multimodal segmentation baselines. As summarized in Table~\ref{tab:comparison_main}, ReaCT achieves state-of-the-art performance, with a Dice score of 0.8185, HD95 of 4.15 mm, and ASSD of 1.38 mm.
First, vision-only baselines (e.g., 3D U-Net, nnU-Net) achieve relatively stable Dice results; however, the HD95 and ASSD metrics still indicate a deficiency in identifying precise CTV boundaries. This highlights the importance of pathological information to provide critical guidance for regions that are radiographically indistinguishable based on imaging modalities alone.
%

Second, to justify the computational cost of employing a large LLM backbone, we compare ReaCT against two simple and resource-efficient metadata fusion baselines:
(1) a 3D U-Net with one-hot encoded clinical attributes, where each attribute is represented as an orthogonal binary vector and
(2) a 3D U-Net with concatenated text embeddings, where pathological attributes are encoded using a pretrained biomedical text encoder, PubMedBERT~\cite{gu2021domain}.
For both baselines, we follow the fusion strategy of the CLIP-Driven Universal Model~\cite{liu2023clip}, concatenating the attribute representations with global visual features obtained via global average pooling applied to the final encoder layer.
As shown in Table~\ref{tab:comparison_main}, simple text embedding concatenation leads to a performance drop compared to the vision-only baseline, as directly fusing high-dimensional textual representations with visual features introduces a substantial modality gap. While one-hot encoding yields only a marginal improvement (+0.34\% Dice), this representation treats clinical attributes as mathematically independent symbols with zero similarity, making it difficult to capture the structured and correlated dependencies implied by clinical guidelines (e.g., Stage T3b inherently implies seminal vesicle invasion). In contrast, ReaCT substantially outperforms both one-hot encoding (+3.04\% Dice) and text embedding concatenation (+5.10\% Dice), demonstrating that the performance gains stem from explicit multimodal reasoning rather than simple feature augmentation.
%
Third, text-prompted segmentation methods, including BiomedParse~\cite{zhao2025foundation}, SAT~\cite{zhao2025large}, and Medformer~\cite{rajendran2025autodelineation}, show worse performance than most vision-only baselines. We attribute this to their reliance on text encoders with encoder-only architectures. While these text encoders excel at processing explicit, content-descriptive prompts, they lack the reasoning capacity required to translate abstract and implicit pathological attributes into effective segmentation embeddings. Consequently, while their fixed prompt templates allow for flexible text-conditioned segmentation, they fall short in the deep clinical reasoning essential for CTV segmentation.
Furthermore, compared to LLMSeg~\cite{oh2024llm}, which similarly employs LLaMA-2 to combine electronic medical records with images, ReaCT incorporates a more comprehensive set of attributes to enable fine-grained reasoning in postoperative scenarios where the GTV has been surgically removed. Moreover, ReaCT adopts an MLLM reasoner that jointly processes visual and textual tokens, which promotes more effective reasoning than relying on textual information only. Qualitative visualizations provided in Appendix~\ref{qualitative_vis} further demonstrate that ReaCT produces contours with better anatomical consistency, especially in challenging regions where boundaries are ambiguous.


\begin{table}[t]
\caption{Quantitative results of CTV segmentation on the in-house dataset (Mean $\pm$ SD). $\uparrow$: higher is better; $\downarrow$: lower is better.}
\label{tab:comparison_main}
\centering
\small
\setlength{\tabcolsep}{3.5mm}
\renewcommand{\arraystretch}{1.1}
\resizebox{\columnwidth}{!}{
\begin{tabular}{lccc}
\toprule
\textbf{Methods} & \textbf{Dice} $\uparrow$ & \textbf{HD95 (mm)} $\downarrow$ & \textbf{ASSD (mm)} $\downarrow$ \\
\midrule
3D U-Net~\cite{cciccek20163d} & 0.7847$_{\pm0.01}$ & 6.97$_{\pm2.33}$ & 2.13$_{\pm0.54}$ \\
nnU-Net~\cite{isensee2021nnu} & 0.7822$_{\pm0.01}$ & 11.69$_{\pm4.31}$ & 3.70$_{\pm0.85}$ \\
UNETR~\cite{hatamizadeh2022unetr} & 0.7843$_{\pm0.01}$ & 7.02$_{\pm2.20}$ & 2.15$_{\pm0.55}$ \\
Swin-UNETR~\cite{hatamizadeh2021swin} & 0.7965$_{\pm0.01}$ & 5.51$_{\pm1.25}$ & 1.88$_{\pm0.44}$ \\
U-Mamba~\cite{ma2024u} & 0.7715$_{\pm0.02}$ & 7.50$_{\pm1.80}$ & 2.25$_{\pm0.55}$ \\
BiomedParse~\cite{zhao2025foundation} & 0.7680$_{\pm0.04}$ & 8.85$_{\pm1.50}$ & 2.45$_{\pm0.45}$ \\
SAT~\cite{zhao2025large} & 0.7560$_{\pm0.03}$ & 9.20$_{\pm2.20}$ & 2.68$_{\pm0.70}$ \\
Medformer~\cite{rajendran2025autodelineation} & 0.7750$_{\pm0.06}$ & 9.80$_{\pm3.10}$ & 2.80$_{\pm0.85}$ \\
LLMSeg~\cite{oh2024llm} & 0.7857$_{\pm0.01}$ & 5.20$_{\pm1.02}$ & 1.75$_{\pm0.44}$ \\
Text-Concat & 0.7675$_{\pm0.02}$ & 7.85$_{\pm2.10}$ & 2.45$_{\pm0.65}$ \\
One-Hot & 0.7881$_{\pm0.02}$ & 6.84$_{\pm1.95}$ & 2.03$_{\pm0.45}$ \\


\midrule
w/o Textual Tokens & 0.8015$_{\pm0.05}$ & 4.52$_{\pm0.75}$ & 1.51$_{\pm0.18}$ \\
w/o Visual Tokens & 0.7942$_{\pm0.04}$ & 5.05$_{\pm0.85}$ & 1.64$_{\pm0.20}$ \\
w/o MLLM Reasoner & 0.7885$_{\pm0.02}$ & 5.85$_{\pm1.10}$ & 1.92$_{\pm0.40}$ \\
Generic Text Prompt & 0.7989$_{\pm0.04}$ & 4.98$_{\pm0.80}$ & 1.63$_{\pm0.19}$ \\
Concat Attributes & 0.8017$_{\pm0.05}$ & 4.65$_{\pm0.70}$ & 1.56$_{\pm0.15}$ \\
\textbf{ReaCT (Ours)} & \textbf{0.8185$_{\pm0.05}^{**}$} & \textbf{4.15$_{\pm1.66}^{***}$} & \textbf{1.38$_{\pm0.48}^{***}$} \\
\bottomrule
\end{tabular}
}
\begin{tablenotes}
\footnotesize
\item 
\parbox{0.95\linewidth}{
$^{**}$ and $^{***}$ indicate statistically significant improvements over the strongest ablation baseline (\emph{w/o Textual Tokens}) based on the Wilcoxon signed-rank test.
Specifically, $^{**}$ denotes $p < 0.01$ and $^{***}$ denotes $p < 0.001$.
}
\end{tablenotes}
\end{table}

\subsection{Ablation Results}
% \subsubsection{Impact of Multimodal Reasoning Components.}
% We conduct ablation studies to validate the contributions of the MLLM-based reasoner and the specific prompt design strategies. 
% First, to examine the role of each modality, we evaluate variants using only image tokens (\textit{w/o Textual Tokens}) or only attribute text (\textit{w/o Visual Tokens}) as input to the MLLM. As shown in Table~\ref{tab:comparison_main}, both variants exhibit clear performance drops compared to ReaCT, indicating that neither modality alone is sufficient for accurate CTV segmentation. Notably, the \textit{w/o Textual Tokens} variant (Dice: 0.8015) still outperforms standard vision-only baselines, suggesting that the MLLM backbone enhances the expressiveness of visual representations even without explicit textual guidance. Conversely, the \textit{w/o Visual Tokens} variant performs significantly worse (Dice: 0.7942), confirming that pathological attributes alone cannot resolve anatomical details without visual context.
% Furthermore, replacing the MLLM reasoner with a standard biomedical text encoder (PubMedBERT~\cite{gu2021domain}) followed by Two-Way Transformer fusion (\textit{w/o MLLM Reasoner}) leads to a substantial degradation (Dice: 0.7885). This demonstrates that encoder-only text encoders are insufficient for capturing the conditional reasoning required for this task, whereas the MLLM provides essential joint reasoning capabilities.
\subsubsection{Impact of Multimodal Reasoning Components.}
We conduct ablation studies to validate the contributions of the MLLM-based reasoner and the specific prompt design strategies. 
First, to examine the role of each modality, we evaluate variants using only image tokens (\textit{w/o Textual Tokens}) or only attribute text (\textit{w/o Visual Tokens}) as input to the MLLM. In the \textit{w/o Textual Tokens} setting, all attribute tokens are removed from the input sequence. Following the design of M3D-LaMed~\mbox{\cite{bai2024m3d}}, the input to the LLaMA backbone consists solely of visual tokens extracted by the M3D-ViT image encoder, followed by the 3D spatial pooling projector. The LLaMA backbone processes these visual tokens using causal self-attention, modeling global semantic relationships among visual features without relying on any textual queries.
As shown in Table~\ref{tab:comparison_main}, both variants exhibit clear performance drops compared to ReaCT, indicating that neither modality alone is sufficient for accurate CTV segmentation. Notably, the \textit{w/o Textual Tokens} variant (Dice: 0.8015) outperforms standard vision-only baselines such as nnU-Net. We attribute this improvement not to parameter capacity alone, but to semantic priors transferred from large-scale language pretraining. As demonstrated in recent research~\mbox{\cite{tang2025pre}}, frozen LLM layers effectively function as semantic-aware visual boosters, where linguistic priors significantly enhance global visual representations even when processing visual tokens only.
However, textual attributes remain essential for attribute-conditioned spatial reasoning. The \textit{w/o Visual Tokens} variant (Dice: 0.7942) still outperforms standard text-conditioned baselines such as BiomedParse~\mbox{\cite{zhao2025foundation}} and SAT~\mbox{\cite{zhao2025large}}, indicating that the autoregressive MLLM architecture is more effective at modeling abstract and implicit pathological attributes compared to conventional encoder-only text embeddings (e.g., CLIP or PubMedBERT). Furthermore, as shown in Appendix~\mbox{\ref{sec:robustness_analysis}}, corrupting individual pathological attributes causes systematic, attribute-dependent performance degradation, confirming explicit reliance on attribute-conditioned reasoning rather than generic model capacity.
Finally, replacing the MLLM reasoner with a standard biomedical text encoder (PubMedBERT~\cite{gu2021domain}) followed by Two-Way Transformer fusion (\textit{w/o MLLM Reasoner}) leads to a substantial degradation (Dice: 0.7885). This demonstrates that encoder-only text models are insufficient for capturing the conditional reasoning required for this task, whereas the MLLM provides essential joint reasoning capabilities.


\begin{figure}[t]
    \centering
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=0.85\textwidth]{pie_chart_simple.pdf}
        % \caption{(a) Distribution of patients showing segmentation improvement ($\Delta \text{Dice} \ge 0.5\%$) grouped by the number of contributing attributes. The majority (87.8\%) benefit from combinations of multiple attributes.}
        % \label{fig:pie_chart}
    \end{minipage}
    \hfill
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \label{2_b}
        \includegraphics[width=0.85\textwidth]{attribute_overlap_matrix.pdf}
        % \caption{(b) Overlap matrix of patients showing improvement across attribute pairs. High values along the diagonal and off-diagonal elements indicate strong inter-dependency among risk factors.}
        % \label{fig:overlap_matrix}
    \end{minipage}
    \caption{Analysis of Clinical Attribute Efficacy. (a) shows that most cases benefit from  multi-attribute reasoning, while (b) reveals the biological correlations and complementary nature of these pathological attributes.}
    \label{fig:attribute_analysis}
\end{figure}

% \vspace{-1em}
\subsubsection{Effectiveness of Fine-Grained Attribute Reasoning}
To verify the necessity of our fine-grained, multi-token design, we compare ReaCT against two alternative prompting strategies: 
(1) Generic Text Prompt, which uses a static instruction (i.e., \textit{Segment the postoperative Clinical Target Volume for prostate cancer based on the CT image}); and 
(2) Concat Attributes, which concatenates all extracted attributes into a single sequence followed by a single unified \texttt{<SEG>} query token.
The Generic Text Prompt yields a Dice score of 0.7989, which indicates that without patient-specific context, generic instructions fail to provide effective guidance for CTV segmentation. The Concat Attributes strategy improves performance to 0.8017, yet it still lags significantly behind ReaCT. This result supports our hypothesis that compressing diverse pathological factors into a single global representation creates a semantic bottleneck, preventing the model from disentangling their distinct spatial implications. In contrast, ReaCT's use of a sequence of attribute-specific query tokens enables the model to explicitly reason about how each attribute dictates local boundaries, leading to improved segmentation accuracy. Appendix~\ref{sec:robustness_analysis} further confirms this reliance on active semantic reasoning, where deliberately corrupting attribute values leads to significant performance degradation.

\subsubsection{Impact of Individual and Combinatorial Attributes.}
\label{3.3.3}
To further validate the necessity of each extracted attribute, we analyze the patient cohort where ReaCT yields significant segmentation improvements (i.e., $\Delta \text{Dice} \ge 0.5\%$). As shown in Figure~\ref{fig:attribute_analysis}(a), the majority of these patients (87.8\%) benefit from the integration of multiple attributes, rather than a single one, among the six pathological attributes used in this study. This highlights the complementary nature of clinical information and the heterogeneity of patient-specific treatment responses.
Furthermore, the attribute overlap matrix in Figure~\ref{fig:attribute_analysis}(b) reveals strong inter-attribute correlations, suggesting that different pathological factors capture overlapping yet distinct aspects of tumor characteristics. For instance, high Gleason scores frequently co-occur with positive lymph node status, reflecting the known biological propensity for high-grade tumors to metastasize. Notably, Surgical Margin, Pathological Stage, and Extraprostatic Extension emerge as the most influential factors, showing the highest diagonal densities. This is consistent with clinical practice, where these parameters serve as key determinants in defining CTV expansion boundaries.


\begin{figure}[!t]
\floatconts
  {limited_compare}
  % \label{framework}
  {\caption{Performance comparison of ReaCT, 3D U-Net, and LLMSeg across varying training data ratios (5\%--100\%). ReaCT consistently demonstrates superior performance even in severe low-data regimes.}}
  {\includegraphics[width=\linewidth]{MIDLLatexTemplate-master/performance_comparison.png}}
\end{figure}
% \vspace{-1em}
\subsubsection{Data Efficiency and Robustness Analysis}
To investigate sample efficiency, we evaluate the performance of ReaCT against a representative vision-only baseline (3D U-Net) and a multimodal baseline (LLMSeg) under varying training data proportions ranging from 5\% to 100\%. As illustrated in Figure~\ref{limited_compare}, ReaCT consistently outperforms both baselines across all data regimes. 
Notably, the 3D U-Net suffers severe degradation in extreme low-data settings (e.g., 5\%--20\%), as indicated by a sharp drop in metrics. This confirms that without semantic guidance, visual features alone are insufficient to generalize from sparse supervision. 
While LLMSeg exhibits better stability than the vision-only model, it still lags behind ReaCT. These results highlight that explicitly modeling pathological reasoning substantially improves label efficiency and robustness, even when pixel-level supervision is scarce. 

\subsection{Discussion}

While ReaCT achieves strong performance by reformulating postoperative CTV segmentation as a multimodal reasoning task, we acknowledge that this study is conducted on a single-center in-house dataset, reflecting the limited availability of large-scale public benchmarks for postoperative prostate CTV segmentation that include paired pathology reports. Importantly, ReaCT is designed to reduce institution-specific bias by conditioning segmentation on explicit pathological attributes and consensus guideline-driven rules, rather than implicitly learning local contouring styles from image appearance alone. Unlike vision-only methods that may overfit site-dependent practices, ReaCT focuses on pathology-dependent spatial decisions (e.g., margin status) that are defined by widely adopted clinical guidelines and shared across institutions. As a result, although the current evaluation is single-center, the learned reasoning is conceptually decoupled from local annotation habits and is intended to generalize once corresponding multimodal inputs are available.
Moreover, clinical guidelines for postoperative radiotherapy evolve over time and may vary across institutions. ReaCT explicitly separates guideline interpretation from the segmentation network by encapsulating clinical knowledge within the LLM-based attribute extraction module, while the segmentation model operates on structured attributes and visual evidence. This modular design enables adaptation to updated or institution-specific guidelines by revising attribute definitions or prompt schemas without retraining the segmentation model itself. Such separation mirrors real-world radiotherapy workflows and enhances the transparency, controllability, and long-term clinical applicability of the proposed framework.

\section{Conclusion}
In this work, we present \textbf{ReaCT}, a unified framework reformulating CTV segmentation as a multimodal reasoning task. By designing a Guideline-Informed Attribute Extractor to distill determinative attributes and an Attribute-Specific MLLM Reasoner for fine-grained spatial inference, ReaCT effectively bridges the gap between abstract clinical logic and invisible anatomical boundaries. Experiments on a large-scale prostate dataset demonstrate state-of-the-art performance and remarkable robustness in limited-annotation regimes. Future work will focus on adaptively integrating evolving clinical guidelines to enhance generalization across diverse disease sites and institutional standards.

\midlacknowledgments{This work was partially supported by US National Science Foundation IIS-2412195, CCF-2400785, the Cancer Prevention and Research Institute of Texas (CPRIT) award (RP230363), the National Institutes of Health (NIH) R01 award (1R01AI190103-01) and Microsoft Accelerate Foundation Models Research (2024).}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version


% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl26_80}


\clearpage 
\appendix

\section{Related Work}
\label{RW}

\subsection{CTV Segmentation Methods}
Compared to conventional segmentation tasks driven by visual contrast,  Clinical Target Volume
(CTV) segmentation requires identifying microscopic spread that is invisible on standard imaging. To bridge this visibility gap, previous methods have attempted to infer target boundaries using auxiliary geometric cues or handcrafted anatomical heuristics.
For example, Cardenas \textit{et al.}~\cite{Cardenas2018AutodelineationOO} proposed a dual-channel 3D U-Net that ingests both CT scans and Gross Tumor Volume (GTV) masks to infer target boundaries based on spatial proximity. Similarly, Jin \textit{et al.}~\cite{jin2021deeptarget} introduced a framework incorporating signed distance maps from the GTV and adjacent organs to provide explicit geometric constraints. Specific to postoperative prostate cancer, Wang \textit{et al.}~\cite{wang2022dynamic} modeled the prostate bed as a virtual target to guide segmentation in the absence of the primary organ. While these approaches improve consistency, their reliance on fixed geometric heuristics or spatial expansions limits their adaptability to anatomical variations, particularly in postoperative scenarios where the tumor has been surgically removed. In contrast, ReaCT addresses the intrinsic need for multimodal integration in CTV segmentation, enabling clinically informed reasoning about target extent that aligns with the decision-making process of radiation oncologists.

\subsection{LLM-based CTV Segmentation Methods}
The integration of Large Language Models (LLMs) into radiotherapy workflows marks a significant shift towards utilizing clinical data as auxiliary information for target volume segmentation. For instance, LLMSeg~\cite{oh2024llm} demonstrated the capability of LLMs to enhance CTV segmentation by encoding clinical texts, such as tumor stage and surgery type, for breast and prostate cancer. Building on this, RO-LMM~\cite{kim2025end} proposed a comprehensive agent covering tasks from report summarization to plan-guided segmentation, while Medformer~\cite{rajendran2024large,rajendran2025autodelineation} leveraged hierarchical vision transformers fused with LLM-extracted text features to improve target delineation.
However, these existing methods largely treat LLMs as static text encoders that offer only coarse global conditioning, without exploiting their reasoning capacity to model how individual pathological factors influence local anatomical boundaries. Consequently, they fail to deliver the fine-grained, attribute-specific reasoning required for accurate CTV segmentation. In contrast, ReaCT introduces a guideline-informed attribute schema and an attribute-specific multimodal LLM that performs fine-grained reasoning over visual and textual cues, enabling clinically coherent and anatomically precise boundary prediction.


\section{Details of the LLM-based Attribute Extractor}
\label{agent_details}
In this section, we provide detailed prompt designs and workflow specifications for the LLM-based Attribute Extractor.
The module is implemented using GPT-4o~\mbox{\cite{hurst2024gpt}} and follows a multi-stage, schema-constrained pipeline designed to extract structured pathological attributes from free-text clinical reports in accordance with clinical consensus guidelines. The overall procedure is summarized in Algorithm~\ref{alg:pipeline}, which provides an explicit description of the full extraction workflow.

\begin{algorithm}[htbp]
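% ==========================================
% Key change: the following three lines shrink the whole algorithm by roughly 10-15%
\small                       % 1. Reduce the font size and line spacing of the algorithm body
\SetAlCapFnt{\small}         % 2. Reduce the font size of the caption text
\SetAlCapNameFnt{\small}     % 3. Reduce the font size of the "Algorithm X" label
% ==========================================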

\caption{LLM-based Attribute Extractor Pipeline}
\label{alg:pipeline}

% Define the input, output, and hyperparameter keywords
\SetKwInOut{Input}{Input}
\SetKwInOut{Output}{Output}
\SetKwInOut{Hyperparams}{Hyperparameters}

\Input{Raw Pathology Report $\mathbf{D}_{\text{raw}}$, Consensus Clinical Guidelines $\mathcal{G}$}
\Output{Structured Clinical Attribute Set $\mathcal{A}$}
\Hyperparams{Schema Prompt $\mathcal{P}_{\text{schema}}$, Retrieval Prompt $\mathcal{P}_{\text{retrieval}}$, Extraction Prompt $\mathcal{P}_{\text{extract}}$}

\BlankLine
\tcp{Stage 1: Guideline Knowledge Distillation (Appendix B.1)}
$\mathcal{S} \leftarrow \text{LLM}(\mathcal{P}_{\text{schema}}, \mathcal{G})$ \tcp*{Distill determinative attribute schema}
\textbf{Freeze} schema $\mathcal{S}$ for inference\;

\BlankLine
\tcp{Stage 2: Patient-Specific Inference (Appendix B.2)}
$\mathcal{A} \leftarrow \emptyset$\;
\For{each patient report $\mathbf{D}_{\text{raw}}$}{
    \tcp{Step 2.1: Relevant Context Retrieval}
    $\mathbf{T}_{\text{rel}} \leftarrow \text{LLM}(\mathcal{P}_{\text{retrieval}}, \mathcal{S}, \mathbf{D}_{\text{raw}})$ \tcp*{Filter irrelevant history}
    
    \tcp{Step 2.2: Semantic Verification \& Standardization}
    $V_{\text{raw}} \leftarrow \text{LLM}(\mathcal{P}_{\text{extract}}, \mathbf{T}_{\text{rel}})$\;
    \For{each attribute $k \in \mathcal{S}$}{
        $v_k \leftarrow \text{Standardize}(V_{\text{raw}}[k])$ \tcp*{Map to standard values}
        $\mathcal{A}.\text{append}((k, v_k))$\;
    }
}
\Return{$\mathcal{A}$}
\end{algorithm}


\subsection{Guideline-Based Schema Construction}
The objective of this stage is to distill a fixed, principled schema of determinative attributes from authoritative sources. We first aggregate relevant clinical guidelines (e.g., ESTRO ACROP, NCCN, RTOG) retrieved from medical databases such as PubMed. Based on this compiled corpus, we employ a Knowledge Distillation Prompt $\mathcal{P}_{\text{schema}}(\cdot)$ that instructs the LLM to act as a domain expert to synthesize a standardized attribute list. The prompt is specifically designed to identify pathological factors that dictate boundary modifications for CTV segmentation, consolidating diverse guideline terminologies into a unified schema.

\begin{tcolorbox}[title=Prompt 1: Guideline Knowledge Distillation, colback=gray!5!white, colframe=gray!75!black, fontupper=\small, boxsep=1mm, left=1.5mm, right=1.5mm, top=1mm, bottom=1mm]
\textbf{System Role:} You are a board-certified radiation oncologist and expert in prostate cancer radiotherapy planning.

\textbf{Context:} Accurate delineation of the Clinical Target Volume (CTV) for postoperative prostate cancer relies on specific pathological risk factors defined in consensus guidelines.

\textbf{Task:} Read the aggregated guideline documents provided below. Identify and summarize the specific pathological attributes that explicitly govern the anatomical boundaries of the CTV. For each attribute, explain how it influences the target volume (e.g., ``inclusion of seminal vesicle bed'').

\textbf{Input Guidelines:}
[Insert full text of compiled ESTRO / NCCN / RTOG guidelines here...]

\textbf{Requirements:}
\begin{enumerate} \setlength{\itemsep}{0pt} \setlength{\parskip}{0pt}
    \item Output a structured list of determinative attributes (e.g., T-Stage, Gleason Score).
    \item Focus strictly on factors influencing anatomical target boundaries.
    \item Consolidate synonymous terms into a standardized schema key.
\end{enumerate}

\textbf{Output Format:} JSON list of keys.
\end{tcolorbox}

\noindent Based on the output, we finalized the attribute schema $\mathcal{S}$ by retaining factors with explicit spatial implications for CTV delineation. This selection was further verified by senior radiation oncologists to ensure alignment with clinical consensus. The six determinative attributes are: \textbf{Pathological T-Stage}, \textbf{Gleason Score}, \textbf{Seminal Vesicle Invasion}, \textbf{Extraprostatic Extension}, \textbf{Surgical Margin Status}, and \textbf{Lymph Node Status}.

\subsection{Multi-Stage Attribute Extraction}
This stage transforms lengthy, unstructured pathology reports into the structured attribute profile $\mathcal{A}$. The process involves a context retrieval step followed by semantic verification and value standardization.

\paragraph{1. Relevant Context Retrieval:}
To efficiently narrow the search space within lengthy patient records, we employ a Context Retrieval Prompt that functions as the operator $\mathcal{F}_{\text{key}}(\cdot)$. This step filters the raw document $\mathbf{D}_{\text{raw}}$ to identify candidate text spans related to the schema $\mathcal{S}$, strictly excluding irrelevant medical history. The output list constitutes the \textbf{relevant text set} $\mathbf{T}_{\text{rel}}$, which serves as the input for the subsequent verification step.

\begin{tcolorbox}[title=Prompt 2: Relevant Context Retrieval, colback=gray!5!white, colframe=gray!75!black, fontupper=\small, boxsep=1mm, left=1.5mm, right=1.5mm, top=1mm, bottom=1mm]
\textbf{System Role:} You are assisting in extracting pathological attributes for postoperative prostate cancer.

\textbf{Task:} Given the schema below, identify all sentences or short text spans from the pathology report that may contain information relevant to any attribute in the schema.

\textbf{Schema:}
[Insert Attribute Schema $\mathcal{S}$ derived from Prompt 1]

\textbf{Pathology Report:}
[Insert Raw Pathology Report $\mathbf{D}_{\text{raw}}$]

\textbf{Output:}
\begin{enumerate} \setlength{\itemsep}{0pt} \setlength{\parskip}{0pt}
    \item A list of relevant text spans (verbatim from the report).
    \item Do NOT infer values yet; only retrieve candidate segments.
\end{enumerate}
\end{tcolorbox}
\paragraph{2. Semantic Verification \& Standardization:}
We design a clinical extraction prompt to process the retrieved context $\mathbf{T}_{\text{rel}}$. This prompt is designed to perform semantic verification (e.g., distinguishing ``margins are negative'' from ``margins were not assessed'') and to standardize attribute values into a structured form suitable for downstream multimodal reasoning.

\begin{tcolorbox}[title=Prompt 3: Attribute Verification and Standardization, colback=gray!5!white, colframe=gray!75!black, fontupper=\small, boxsep=1mm, left=1.5mm, right=1.5mm, top=1mm, bottom=1mm]
\textbf{System Role:} You are an expert pathologist. Your task is to extract structured clinical variables from the provided text segments of a radical prostatectomy pathology report.

\textbf{Input Text:} [Insert filtered text segments $\mathbf{T}_{\text{rel}}$ from Prompt 2]

\textbf{Target Schema:} Extract values for the following attributes:
\begin{enumerate} \setlength{\itemsep}{0pt} \setlength{\parskip}{0pt}
    \item Pathological T-Stage
    \item Gleason Score (e.g., 7(3+4))
    \item Seminal Vesicle Invasion (SVI)
    \item Extraprostatic Extension (EPE)
    \item Surgical Margin Status
    \item Lymph Node Status
\end{enumerate}

\textbf{Instructions:}
\begin{enumerate} \setlength{\itemsep}{0pt} \setlength{\parskip}{0pt}
    \item \textbf{Semantic Verification:} Ignore text related to previous biopsy history or other irrelevant procedures. Focus only on the final surgical pathology.
    \item \textbf{Redundancy Removal:} If multiple mentions exist, prioritize the ``Final Diagnosis'' section.
    \item \textbf{Standardization:} Map the extracted values to the following standard formats:
    \begin{itemize} \setlength{\itemsep}{0pt} \setlength{\parskip}{0pt}
        \item SVI/EPE/Margins/Nodes: ``Positive'' or ``Negative''.
        \item T-Stage: e.g., ``pT2'', ``pT3a'', ``pT3b''.
        \item If an attribute is not mentioned or cannot be determined, output ``Unknown''.
    \end{itemize}
\end{enumerate}

\textbf{Output Format:} Provide the result as a JSON object: \texttt{\{"Attribute": "Standardized Value"\}}.
\end{tcolorbox}

\subsection{Additional Experiments}
\subsubsection{Validation of LLM-based Attribute Extraction}
\label{sec:appendix_llm_validation}
To assess the reliability of GPT-4o for extracting clinical attributes from pathology reports, we conducted a quantitative validation study. We randomly sampled 200 cases from our dataset and asked two radiation oncologists with experience in postoperative prostate radiotherapy to independently verify the extracted attributes against the
original pathology reports. In cases of disagreement, a consensus annotation was reached through joint discussion, and the final consensus labels were used as ground
truth for evaluation. We report both accuracy and F1-score for each attribute, as well as overall performance
across all attributes. Table~\ref{tab:gpt4o_validation} summarizes the extraction performance for the six clinical attributes. The overall extraction accuracy was 97.1\% with an F1-score of 0.96, demonstrating the high reliability of the automated extraction pipeline. These results indicate that the guideline-informed extraction
provides a stable foundation for downstream multimodal segmentation.


\begin{table}[!t]
\centering
\caption{GPT-4o attribute extraction validation results on 200 randomly sampled cases.}
\label{tab:gpt4o_validation}
\begin{tabular}{lcc}
\toprule
Attribute & Accuracy (\%) & F1-Score \\
\midrule
Stage & 98.5 & 0.98 \\
Gleason Score & 97.5 & 0.97 \\
Extraprostatic Extension & 96.0 & 0.95 \\
Lymph Node Status & 98.0 & 0.97 \\
Surgical Margin & 96.5 & 0.96 \\
Seminal Vesicle Invasion & 96.0 & 0.95 \\
\midrule
\textbf{Overall} & \textbf{97.1} & \textbf{0.96} \\
\bottomrule
\end{tabular}
\end{table}




\subsubsection{Qualitative Comparison}
\label{qualitative_vis}

Figure~\ref{fig:qualitative_vis} presents a qualitative comparison of segmentation results across five representative patients from the test set. To rigorously assess clinical plausibility and boundary behavior, we deliberately selected cases spanning diverse pathological conditions, including varying Gleason scores
(ranging from 3+4 to 4+5) and different extents and locations of positive surgical margins (e.g., left apex, right postero-lateral, and multifocal invasive carcinoma).
To facilitate detailed inspection of boundary differences, synchronized region-of-interest (ROI) zoom-in views are provided for all methods.
As observed in these zoomed regions, conventional vision-only models (e.g., 3D U-Net~\cite{cciccek20163d}, nnU-Net~\cite{isensee2021nnu}, and UNETR~\cite{hatamizadeh2022unetr}) generally capture the coarse CTV extent but consistently exhibit noticeable deviations at anatomically intricate boundaries,
often producing overly smoothed or imprecise contours that fail to strictly adhere to clinical guidelines.
Similarly, representative text-conditioned segmentation approaches
(e.g., BiomedParse~\cite{zhao2025foundation} and SAT~\cite{zhao2025large}) do not demonstrate consistent improvements in these boundary-critical regions, indicating that direct image--text fusion via feature concatenation or cross-attention is insufficient to bridge the modality gap
associated with highly implicit clinical text.
In contrast, ReaCT consistently adapts its boundary behavior across diverse pathological scenarios while maintaining stable and clinically plausible contours. This qualitative evidence demonstrates that ReaCT can accommodate heterogeneous
pathological presentations without sacrificing boundary accuracy, supporting its practical applicability for postoperative CTV delineation. 




\begin{sidewaysfigure}
\centering
\rotatebox{180}{
    \begin{minipage}{\textwidth}
        \centering
        \includegraphics[height=0.5\textwidth]{MIDLLatexTemplate-master/top5_slices_all_methods.png}
        \caption{Qualitative comparison on five representative cases. The Ground Truth is outlined in \textcolor{magenta}{pink}, and model predictions are in \textcolor{green}{green}. Columns: \textbf{ReaCT (Ours)}, 3D U-Net, nnU-Net, BiomedParse, and LLMSeg. ReaCT consistently demonstrates superior alignment with the ground truth in radiographically ambiguous regions compared to baseline methods.}
        \label{fig:qualitative_vis}
    \end{minipage}
}
\end{sidewaysfigure}


\subsubsection{Robustness and Sensitivity Analysis}
\label{sec:robustness_analysis}

To verify that ReaCT actively leverages clinical attributes for decision-making rather than treating text as a passive feature, we conducted a robustness analysis by deliberately corrupting individual attributes during inference. Specifically, for each experiment, we flipped the value of a single determinative attribute to its opposite clinical status (e.g., changing \textit{Surgical Margin Status} from ``Positive'' to ``Negative'' or vice versa) while keeping all other attributes and the image input unchanged. This setup isolates the causal impact of each specific attribute on the segmentation outcome. As shown in Table~\ref{tab:robustness}, incorrect pathological inputs consistently degrade performance. Notably, corrupting the Surgical Margin  status causes the most significant drop (Dice $-2.61\%$, HD95 $+0.67$ mm). This aligns with clinical guidelines where positive margins mandate aggressive CTV expansion, significantly altering target geometry. Similarly, incorrect Extraprostatic Extension and Pathological Stage inputs also lead to marked performance losses, confirming the model's dependency on accurate determinative factors.

\begin{table}[!t]
\centering
\caption{Robustness Analysis. Performance degradation when individual clinical attributes are corrupted. Each row shows results when a single attribute is deliberately flipped to its opposite clinical value (e.g., Positive $\leftrightarrow$ Negative) while keeping all other attributes correct.}
\label{tab:robustness}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
\textbf{Model Variant} & \textbf{Dice} $\uparrow$ & \textbf{HD95 (mm)} $\downarrow$ & \textbf{ASSD (mm)} $\downarrow$ \\
\midrule
\textbf{ReaCT (Original)} & \textbf{0.8185$_{\pm0.05}$} & \textbf{4.15$_{\pm1.66}$} & \textbf{1.38$_{\pm0.48}$} \\
\midrule
ReaCT w/ Wrong SM & 0.7924$_{\pm0.06}$ (\textcolor{red}{$-$2.61\%}) & 4.82$_{\pm1.89}$ (\textcolor{red}{$+$0.67}) & 1.56$_{\pm0.52}$ (\textcolor{red}{$+$0.18}) \\
ReaCT w/ Wrong EPE & 0.7979$_{\pm0.05}$ (\textcolor{red}{$-$2.06\%}) & 4.68$_{\pm1.78}$ (\textcolor{red}{$+$0.53}) & 1.52$_{\pm0.51}$ (\textcolor{red}{$+$0.14}) \\
ReaCT w/ Wrong Stage & 0.7995$_{\pm0.05}$ (\textcolor{red}{$-$1.90\%}) & 4.59$_{\pm1.74}$ (\textcolor{red}{$+$0.44}) & 1.49$_{\pm0.50}$ (\textcolor{red}{$+$0.11}) \\
ReaCT w/ Wrong GS & 0.8036$_{\pm0.05}$ (\textcolor{red}{$-$1.49\%}) & 4.42$_{\pm1.71}$ (\textcolor{red}{$+$0.27}) & 1.45$_{\pm0.49}$ (\textcolor{red}{$+$0.07}) \\
ReaCT w/ Wrong LNS & 0.8064$_{\pm0.05}$ (\textcolor{red}{$-$1.21\%}) & 4.35$_{\pm1.69}$ (\textcolor{red}{$+$0.20}) & 1.43$_{\pm0.49}$ (\textcolor{red}{$+$0.05}) \\
ReaCT w/ Wrong SVI & 0.8077$_{\pm0.05}$ (\textcolor{red}{$-$1.08\%}) & 4.31$_{\pm1.68}$ (\textcolor{red}{$+$0.16}) & 1.42$_{\pm0.49}$ (\textcolor{red}{$+$0.04}) \\
\bottomrule
\end{tabular}
}
\end{table}

\subsubsection{Leave-One-Out Attribute Ablation}
\label{appendix:loo_ablation}
To further examine whether the selected pathological attributes are redundant or jointly necessary, we conduct a leave-one-out attribute ablation study on the full ReaCT model. In this analysis, each clinical attribute is removed in turn while all remaining attributes and network components are kept unchanged. As shown in Table~\ref{tab:loo_ablation}, removing any single attribute consistently leads to performance degradation across all three metrics, indicating that no attribute can be omitted without measurable loss in segmentation quality. Notably, attributes that directly determine boundary expansion decisions in clinical guidelines, such as surgical margin status, extraprostatic extension, and seminal vesicle invasion, exhibit the largest performance drops when removed, reflecting their critical role in postoperative CTV delineation.
Overall, this leave-one-out analysis demonstrates that the six pathological attributes are not redundant but instead provide complementary clinical information. The results support the conclusion that all selected attributes are jointly necessary for robust and guideline-consistent postoperative CTV segmentation.


\begin{table}[!t]
\centering
\caption{Leave-one-out attribute ablation on ReaCT. Each row removes one clinical attribute while keeping all others unchanged. Reported values show absolute performance and, in parentheses, the change with respect to the full model (Dice in percentage points; HD95 and ASSD in mm).}
\label{tab:loo_ablation}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
\textbf{Model Variant} 
& \textbf{Dice} $\uparrow$ 
& \textbf{HD95 (mm)} $\downarrow$ 
& \textbf{ASSD (mm)} $\downarrow$ \\
\midrule
\textbf{ReaCT (6 attrs)} 
& \textbf{0.8185$_{\pm0.05}$} 
& \textbf{4.15$_{\pm1.66}$} 
& \textbf{1.38$_{\pm0.48}$} \\
\midrule
w/o Stage 
& 0.8092$_{\pm0.05}$ ($\textcolor{red}{-0.93\%}$) 
& 4.48$_{\pm1.71}$ ($\textcolor{red}{+0.33}$) 
& 1.47$_{\pm0.50}$ ($\textcolor{red}{+0.09}$) \\
w/o Gleason Score 
& 0.8101$_{\pm0.05}$ ($\textcolor{red}{-0.84\%}$) 
& 4.44$_{\pm1.70}$ ($\textcolor{red}{+0.29}$) 
& 1.46$_{\pm0.49}$ ($\textcolor{red}{+0.08}$) \\
w/o Extraprostatic Extension 
& 0.8048$_{\pm0.05}$ ($\textcolor{red}{-1.37\%}$) 
& 4.62$_{\pm1.76}$ ($\textcolor{red}{+0.47}$) 
& 1.51$_{\pm0.51}$ ($\textcolor{red}{+0.13}$) \\
w/o Seminal Vesicle Invasion 
& 0.8062$_{\pm0.05}$ ($\textcolor{red}{-1.23\%}$) 
& 4.55$_{\pm1.74}$ ($\textcolor{red}{+0.40}$) 
& 1.49$_{\pm0.50}$ ($\textcolor{red}{+0.11}$) \\
w/o Surgical Margin 
& 0.8019$_{\pm0.05}$ ($\textcolor{red}{-1.66\%}$) 
& 4.82$_{\pm1.89}$ ($\textcolor{red}{+0.67}$) 
& 1.56$_{\pm0.52}$ ($\textcolor{red}{+0.18}$) \\
w/o Lymph Node Status 
& 0.8085$_{\pm0.05}$ ($\textcolor{red}{-1.00\%}$) 
& 4.41$_{\pm1.69}$ ($\textcolor{red}{+0.26}$) 
& 1.45$_{\pm0.49}$ ($\textcolor{red}{+0.07}$) \\
\bottomrule
\end{tabular}
}
\end{table}

\subsubsection{Sensitivity Analysis of Dice Threshold Selection}
\label{appendix:sensitivity_threshold}

In Section~\ref{3.3.3}, a Dice improvement threshold of 0.5\% was used to identify patients for whom ReaCT yields meaningful segmentation improvements over the vision-only baseline. This threshold was chosen to distinguish clinically relevant improvements from minor variations attributable to measurement noise introduced by sliding-window aggregation and boundary discretization during volumetric inference.

To assess the robustness of our conclusions with respect to this choice and to rule out potential selection bias, we conduct a sensitivity analysis across a range of Dice thresholds, including 0.00\%, 0.25\%, 0.50\%, 0.75\%, and 1.00\%. For each threshold, we recompute the subset of patients for whom ReaCT achieves Dice improvements exceeding the threshold relative to the baseline, and analyze how many clinical attributes contribute to these gains. Table~\ref{tab:threshold_sensitivity} summarizes the results. Although stricter thresholds naturally reduce the number of included patients, the key finding remains consistent across all settings: the majority of patients (81--90\%) benefit from the integration of multiple clinical attributes (i.e., two or more attributes), rather than any single attribute alone. This observation confirms that the complementary nature of clinical information is not an artifact of threshold selection. Furthermore, the attribute overlap matrix shown in Figure~\ref{fig:attribute_analysis}(b) demonstrates that different attributes benefit partially distinct patient subgroups, reinforcing the necessity of incorporating all six pathological attributes. Finally, we emphasize that the main quantitative results reported in Table~\ref{tab:comparison_main} are computed on the full test set ($N=139$) without any threshold filtering, ensuring that the overall performance conclusions are not subject to selection bias.

\begin{table}[!t]
\centering
\caption{Sensitivity analysis of the Dice improvement threshold used to identify patients where integrating clinical attributes achieves meaningful segmentation gains over the baseline. For each threshold, we report the number of patients meeting the criterion and the distribution of beneficial attributes.}

\label{tab:threshold_sensitivity}
\begin{tabular}{cccccc}
\toprule
\textbf{Threshold} & \textbf{Patients} & \textbf{1-Attr. (\%)} & \textbf{2--3 Attrs. (\%)} & \textbf{4--5 Attrs. (\%)} & \textbf{All 6 (\%)}\\
\midrule
0.00\% & 125$_{\pm5}$ & 9.6$_{\pm1.2}$ & 16.0$_{\pm1.5}$ & 28.0$_{\pm2.0}$ & 46.4$_{\pm2.5}$ \\
0.25\% & 120$_{\pm4}$ & 11.7$_{\pm1.3}$ & 17.5$_{\pm1.6}$ & 27.5$_{\pm1.8}$ & 43.3$_{\pm2.3}$ \\
0.50\% & 115$_{\pm4}$ & 12.2$_{\pm1.4}$ & 19.1$_{\pm1.7}$ & 27.0$_{\pm1.9}$ & 41.7$_{\pm2.2}$ \\
0.75\% & 108$_{\pm5}$ & 15.7$_{\pm1.6}$ & 21.3$_{\pm1.8}$ & 25.9$_{\pm2.0}$ & 37.1$_{\pm2.4}$ \\
1.00\% & 100$_{\pm5}$ & 19.0$_{\pm1.8}$ & 24.0$_{\pm2.0}$ & 25.0$_{\pm2.1}$ & 32.0$_{\pm2.5}$ \\
\bottomrule
\end{tabular}
\end{table}





\subsection{Computational Cost and Clinical Deployment}
\label{appendix:computational_cost}

We benchmark the computational requirements of ReaCT on a single NVIDIA H100 GPU. The model contains 7.02B total parameters, of which only 38.5M (0.55\%) are trainable (the LoRA adapters and the 3D U-Net). Training converges in approximately 2.75 hours (26 epochs), and inference takes $\sim$85 ms per case with $\sim$27 GB VRAM in FP32 precision. For clinical deployment, half-precision (FP16) inference reduces memory requirements to $\sim$14 GB, compatible with standard workstation GPUs (e.g., RTX 3090/4090). The sub-second inference latency is negligible compared to the typical 20--30 minute manual CTV delineation time, enabling seamless integration into existing treatment planning workflows.
Table~\ref{tab:computational_cost} summarizes the computational specifications of ReaCT.

While the peak VRAM usage of $\sim$27 GB reflects our FP32 research configuration, clinical deployment is feasible on standard workstations. In practice, half-precision (FP16) inference reduces memory requirements to approximately 13--14 GB, fitting within widely available GPUs such as the RTX 3090/4090 with 24 GB VRAM. For hardware with stricter memory constraints, established quantization techniques (INT8/INT4)~\cite{lin2024awq} can further reduce requirements to below 10 GB with minimal impact on segmentation accuracy. Furthermore, postoperative CTV delineation is an offline treatment planning task that typically requires 20--30 minutes of clinician time~\cite{cha2021clinical}, making the $\sim$85 ms inference latency negligible in comparison and fully compatible with existing PACS/TPS workflows.

\begin{table}[!t]
\centering
\caption{Computational specifications of ReaCT.}
\label{tab:computational_cost}
\begin{tabular}{lll}
\toprule
\textbf{Metric} & \textbf{Value} & \textbf{Note} \\
\midrule
Total Parameters & 7.02B & Frozen M3D backbone + LoRA + 3D U-Net \\
Trainable Parameters & 38.5M (0.55\%) & LoRA + 3D U-Net \\
Training Time & $\sim$2.75 hours & 26 epochs on single H100 GPU \\
Inference Latency & $\sim$85 ms/case & Single forward pass \\
Inference VRAM & $\sim$27 GB & FP32 precision \\
\bottomrule
\end{tabular}
\end{table}


\end{document}
