\documentclass{midl} % Include author names
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\usepackage{soul,xcolor}

\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{caption} % DO NOT 
\usepackage{graphicx} % DO NOT 
\usepackage{mwe} 
\editors{Accepted for publication at MIDL 2026}

\title[RandP]{RandP: Effective and Efficient Medical Visual In-Context Learning via a Retrieve-and-Propagate Module for Prompt-Query Fusion}

\midlauthor{\Name{Rongge Mao\nametag{$^{1,2}$}}\Email{ronggemao@mail.ustc.edu.cn}\\
\addr $^{1}$ School of Biomedical Engineering, Division of Life Sciences and Medicine, University of Science and Technology of China (USTC), Hefei, 230026, China \\
\addr $^{2}$ Center for Medical Imaging, Robotics, Analytic Computing \& Learning (MIRACLE), Suzhou Institute for Advanced Research, USTC, Suzhou, 215123, China \\
\Name{Han Li\nametag{$^{\ast3,4}$}}\Email{tum\_han.li@tum.de}\\
\addr $^{3}$ Computer Aided Medical Procedures (CAMP), Technische Universitaet Muenchen (TUM). \\
\addr $^{4}$ Munich Center for Machine Learning (MCML), Munich, Germany. \\
\Name{Chengqi Dong\nametag{$^{1,2}$}} \Email{dongcq@mail.ustc.edu.cn}\\
\Name{Nassir Navab\nametag{$^{3,4}$}} \Email{nassir.navab@tum.de}\\
\Name{S Kevin Zhou\midljointauthortext{Corresponding Authors}\nametag{$^{1,2,5,6,7}$}} \Email{skevinzhou@ustc.edu.cn}\\
\addr $^{5}$ Key Laboratory of Intelligent Information Processing of Chinese Academy of Sciences (CAS), Institute of Computing Technology, CAS, Beijing, 100190, China \\
\addr $^{6}$ Jiangsu Provincial Key Laboratory of Multimodal Digital Twin Technology, Suzhou, 215123, China \\
\addr $^{7}$ State Key Laboratory of Precision \& Intelligent Chemistry, USTC, Hefei, China ORCID (S.Kevin Zhou): https://orcid.org/0000-0002-6881-4444\\ 
}
\begin{document}
\maketitle

\begin{abstract}
Visual In-Context Learning (ICL) has emerged as a promising paradigm for constructing vision generalists by conditioning on prompt pairs. Existing visual ICL methods typically adopt a grid-like prompt-query construction combined with Masked Image Modeling (MIM) as the training strategy. However, directly applying these frameworks to medical imaging tasks often leads to suboptimal performance. Moreover, the reliance on MIM restricts the backbone to Vision Transformer (ViT) and introduces unnecessary computational overhead due to the need to reconstruct the prompt label.
In this work, we revisit previous visual ICL paradigms for medical imaging and propose a training-inference aligned masking strategy to replace MIM. We further introduce a Retrieve-and-Propagate (RandP) module to enhance prompt-query fusion under this masking scheme. Experimental results show that our RandP visual ICL framework not only doubles the inference speed compared to prior visual ICL baselines but also achieves superior performance across multiple medical imaging tasks. Furthermore, unlike previous approaches constrained to vanilla ViT, our framework is compatible with U-Net-style architectures, enabling broader applicability and improved effectiveness in the medical imaging domain. Our code will be available.
\end{abstract}

\begin{keywords}
Medical imaging, Visual In-Context Learning.
\end{keywords}

\section{Introduction}

Accurate medical image analysis is crucial for diagnosing various diseases~\cite{zhou2021review}. With the advancement of deep learning techniques, many tasks in medical image analysis, such as classification~\cite{yue2024medmamba}, detection~\cite{SATr}, segmentation~\cite{UNet}, restoration~\cite{AMIR}, reconstruction~\cite{reconstruction}, registration~\cite{voxelmorph} and report generation~\cite{deltanet}, have made substantial progress. However, most studies concentrate on specific visual tasks, anatomical regions, and image modalities, developing specialized network architectures, training methods, and techniques tailored to these tasks. Consequently, these models often lack generalizability across different medical imaging tasks.


\begin{figure}[H]
\centering

  \parbox{\linewidth}{%         % span the full single-column width
    \centering
    \includegraphics[width=\linewidth]{section/img/ICL8.pdf}
    \caption{A concise schematic diagram of in-context learning in language models and visual in-context learning.}
    \label{fig:ICL}
  }%

\end{figure}


The impressive performance of large language models (LLMs) in natural language processing (NLP) has demonstrated the potential of In-Context Learning (ICL)~\cite{GPT-3, ICL}, a paradigm in which models perform tasks by conditioning on examples and instructions embedded in the input prompt, without any parameter updates. This framework offers several key advantages~\cite{zeroshot,Survey-ICL}:

\begin{itemize}
    \item \textbf{Unified Task Format:} Diverse NLP tasks can be handled under a single prompting framework, which supports new tasks through a few-shot approach and eliminates the need for task-specific fine-tuning.
    \item \textbf{Context-Aware Reasoning:} Task-relevant exemplars or instructions embedded in prompts guide model behavior without updating parameters.
\end{itemize}


Recent studies have demonstrated the strong effectiveness of in-context learning (ICL) for vision–language tasks, including a wide range of applications in the medical domain. In particular, studies~\cite{ICL4VL, context-medical} on GPT-4V~\cite{gpt4} show that large vision–language models can achieve competitive or even superior performance compared with expert-designed convolutional neural networks on sparse prediction pathology tasks such as colorectal tissue typing, polyp subtyping, and lymph node metastasis detection, using as few as one to ten exemplars. Similar gains from in-context learning have also been reported for COVID-19 chest X-ray classification and tauopathy recognition tasks~\cite{medical4V}.

Consistent improvements from in-context learning have been observed across multiple medical vision–language benchmarks. Models such as Gemini Pro Vision~\cite{gemini} and LLaVA-Med~\cite{llavamed} report gains of approximately 3–9\% AUC over zero-shot baselines on radiology and pathology tasks when provided with only four to sixteen in-context examples. Moreover, medical foundation models incorporating domain-specific vocabulary and training, including PathChat~\cite{PathChat}, RadFM~\cite{radfm}, and MUSK~\cite{MUSK}, achieve 5--10-shot in-context learning performance comparable to that of their fully fine-tuned counterparts for medical image classification and visual question answering.

Collectively, these studies highlight the potential of in-context learning as a powerful paradigm for few-shot generalization in biomedical imaging, offering improved interpretability and a substantially reduced annotation burden.

However, despite these promising results, existing medical in-context learning research has largely focused on sparse prediction tasks (e.g., classification, visual question answering), rather than dense prediction tasks (e.g., segmentation, denoising).

Given the demonstrated effectiveness of ICL in both natural language processing (NLP) and vision-language domains for sparse prediction tasks, a natural and compelling question arises: \textbf{Can the in-context learning paradigm be effectively extended to dense prediction tasks in computer vision, particularly in the field of medical imaging?}

In the field of natural images, the answer is yes. Early visual ICL (vICL) frameworks~\cite{MAEVQGAN, Painter,PromptGIP} for dense prediction tasks predominantly rely on Masked Image Modeling (MIM)~\cite{SimMIM,MAE} as the core training mechanism and have achieved encouraging results. Fig.~\ref{fig:ICL} schematically illustrates ICL in language models and visual ICL. For instance, MAE-VQGAN~\cite{MAEVQGAN} formulates vICL as an image inpainting task by concatenating prompt-query pairs and employing a random masking strategy~\cite{MAE} to predict the discrete visual tokens~\cite{VQGAN} corresponding to the masked patches. This framework enables the model to perform various visual tasks by conditioning on different prompts. Similarly, Painter~\cite{Painter} adopts a simpler Masked Image Modeling approach~\cite{SimMIM}, directly regressing pixel values in the image space. MVG~\cite{MVG} extends Painter to the medical imaging domain, adopting a hybrid training scheme that combines autoregressive training with MIM. %Building upon Painter, 
However, as documented by~\cite{zhou2021review}, medical images are characterized by reduced inter-subject variability, elevated spatial resolution, and contrast patterns that are unique to the clinical domain.

Echoing the findings of MVG~\cite{MVG}, our experiments corroborate that directly deploying vICL pipelines designed for natural images yields sub-optimal performance on medical data, underscoring an urgent yet largely unexplored demand for domain-specific frameworks.

\begin{figure}[htbp]
\floatconts
  {fig:train}
  {\caption{Illustration of different visual-ICL masking strategies during training (gray patches denote masked regions). Notably, all existing visual-ICL frameworks adopt the masking pattern shown in panel (d) at inference, whereas only our strategy preserves training–inference consistency.}}
  {
    \centering
    % --- subfigure 1 ---
    \begin{minipage}[t]{0.24\textwidth}
    
      \centering
      \includegraphics[width=\linewidth]{section/img/grid3.pdf}
      \caption*{(a) Grid-like Image}
      \hypertarget{fig:train_a}{}
    \end{minipage}
    \hfill
    % --- subfigure 2 ---
    \begin{minipage}[t]{0.24\textwidth}
    
      \centering
      \includegraphics[width=\linewidth]{section/img/mae.pdf}
      \caption*{(b) MAE-VQGAN}
      \hypertarget{fig:train_b}{}
    \end{minipage}
    \hfill
    % --- subfigure 3 ---
    \begin{minipage}[t]{0.24\textwidth}
    
      \centering
      \includegraphics[width=\linewidth]{section/img/painter.pdf}
      \caption*{(c) Painter}
      \hypertarget{fig:train_c}{}
    \end{minipage}
    \hfill
    % --- subfigure 4 ---
    \begin{minipage}[t]{0.24\textwidth}
    
      \centering
      \includegraphics[width=\linewidth]{section/img/ours2.pdf}
      \caption*{(d) Ours}
      \hypertarget{fig:train_d}{}
    \end{minipage}
  }
\end{figure}

In this study, we identify key limitations of existing visual in-context learning models in medical imaging, such as \textbf{ineffective masking strategies, rigid backbones, and high computational overhead.} To address these challenges, we adopt a novel training-inference aligned masking strategy along with a Retrieve-and-Propagate (RandP) module, which together enhance prompt-query interaction and reduce the number of visual tokens, thereby improving model efficiency and performance.
Unlike prior MIM-based vICL methods that are tied to a ViT backbone, our framework generalizes to convolutional networks and other non-transformer designs,
%Unlike prior MIM-based vICL methods that rely heavily on ViT backbone, our framework generalizes to diverse encoder-decoder architectures (e.g., U-Net and its variants), 
enabling broader applicability to dense-prediction vision generalists in medical imaging. \textbf{Our main contributions are as follows}:
\begin{itemize}
\item We present a comprehensive analysis of the limitations of current vICL frameworks in medical imaging, regarding masking design, backbone flexibility and efficiency.
\item We propose a medical vICL framework adopting a training-inference aligned masking strategy and a Retrieve-and-Propagate (RandP) module to improve performance and computational efficiency.
\item Our experiments demonstrate that our framework can be effectively extended to U-Net-style~\cite{UNet} models, which was not feasible for prior vICL frameworks.
% \item Our approach generalizes well across multiple benchmarks and architectures, achieving high effectiveness and efficiency compared to prior vICL frameworks.
\end{itemize}


\section{Limitations of Previous Visual ICL Methods in Medical Imaging}
% \noindent\textbf{Limitations of Previous Visual ICL Methods in Medical Imaging}
Recent advances~\cite{MAEVQGAN,Painter,PromptGIP,lvm} in vICL have demonstrated strong performance across a range of vision tasks. 
Despite these innovations, directly applying such approaches to medical imaging remains suboptimal due to the following key limitations:

\textbf{(i) Ineffectiveness of MIM in Medical Settings. } 
Naive masked image modeling typically involves randomly masking a large portion (e.g., 75\%) of the input image. However, in medical imaging scenarios, where images often contain large homogeneous background regions, this approach fails to preserve sufficient task-relevant information for effective prompt-query reasoning. 
The masking strategies shown in Fig.~\hyperlink{fig:train_b}{2.(b)} and Fig.~\hyperlink{fig:train_c}{2.(c)} are those adopted during the training phase of MIM-based visual ICL models.%Furthermore, as illustrated in Fig.\hyperlink{fig:train_a}{2.(b) } and Fig.\hyperlink{fig:train_c}{2.(c)}, the masking strategies shown are those adopted during the training phase of MIM-based visual ICL models. 
In contrast, the inference stage requires the use of a different masking strategy, as depicted in Fig.\hyperlink{fig:train_d}{2.(d)}. 
This inconsistency results in a discrepancy between the training and inference procedures.


To further investigate this issue, we carefully design three distinct inference strategies for the trained  vICL models MAE-VQGAN, Painter, and PromptGIP~\cite{PromptGIP}:
\begin{enumerate} 
    \item \textbf{Training-like Inference:} Matches the training phase masking strategy, providing partial visibility to the query label during inference.  
    \item \textbf{Modified Inference:} Replaces the partially visible query label patches with black patches before applying Training-like Inference. %effectively removing partial GT.
    \item \textbf{Standard Inference:} The typical vICL setup where the prompt image, prompt label, and query image are fully visible, and only the query label is fully masked. 
\end{enumerate}




\begin{table}[t]
\caption{Comparison of the segmentation performance, measured by the Dice Similarity Coefficient (DSC, in percentage), using different inference strategies.}
\centering
\fontsize{9}{10}\selectfont
\begin{tabular}{lccc}
\hline
 & \textbf{MAE-VQGAN} & \textbf{PromptGIP} & \textbf{Painter} \\
\hline

\textbf{Training-like Inference} & \textcolor{gray}{83.20} & \textcolor{gray}{88.64} & \textcolor{gray}{86.43} \\
\textbf{Modified Inference} & 1.88 & 49.09 & 23.09 \\
\textbf{Standard Inference} & \textbf{44.14} & \textbf{69.92} & \textbf{78.99} \\
\hline
\end{tabular}

\label{tab:inference_comparison}
\end{table}




Under the \textbf{Training-like Inference} setting, a small portion of the ground truth (GT) is exposed to the model. Therefore, to ensure a fair assessment, evaluation metrics are computed only on the masked patches, excluding the visible GT regions. From Table~\ref{tab:inference_comparison}, we observe that all models achieve their highest performance under the \textbf{Training-like Inference} setting. However, this setting grants the model partial access to the GT, which is unrealistic in practical scenarios. When even this limited GT is removed, as in the \textbf{Modified Inference} setting, the performance drops sharply (e.g., MAE-VQGAN: \textcolor{gray}{83.20\%} $\rightarrow$ 1.88\%). These results indicate that visual ICL models trained via MIM are primarily effective at inpainting, meaning they infer missing content based on visible patches, rather than truly understanding and interpreting the query image. However, this capability is misaligned with the requirements of medical visual tasks, which demand a comprehensive understanding of the query image itself.

Moreover, the main difference between \textbf{Standard Inference} and \textbf{Modified Inference} lies in whether the prompt labels are provided in full or only partially. When the prompt labels are reduced from complete to partial, model performance also declines significantly (e.g., Painter: 78.99\% $\rightarrow$ 23.09\%). This further underscores that prompts play a crucial role in guiding the model's processing of the query image within the vICL framework.

\textbf{(ii) Unnecessarily High Computational Overhead. } 
Because MIM-based vICL models must also reconstruct the prompt label, prompt tokens are carried through the network together with the query tokens.
Although models like Painter and MVG attempt to mitigate this overhead by adding image and label patches at shallow layers, 
\textbf{they still devote nearly half of their computational budget to prompt processing}. 
%This allocation limits the model’s capacity to focus on the query task itself, reducing overall efficiency.

\textbf{(iii) Limited Backbone Flexibility. }
MIM-based methods are tightly coupled with transformer-based architectures~\cite{Attention} and often perform poorly on convolutional networks~\cite{spark}, limiting their compatibility with widely used backbones in medical image analysis, such as U-Net~\cite{UNet} and its variants, which are more suitable for image-to-image tasks in medical imaging.
% including UX-Net~\cite{uxnet}, SwinUNETR~\cite{SwinUNETR}, and Restormer~\cite{restormer}, 
\begin{figure}[t]  % single-column figure
    \centering
    \includegraphics[width=0.9\linewidth]{section/img/RandP-3.pdf}
    \caption{The grid-like image is first processed by the Retrieve-and-Propagate (RandP) Module (1) to produce the \textit{X-fused} feature, which can be fed into either a ViT backbone (2) with a pixel decoder or a U-Net-style architecture (2') to obtain the dense output. The query label is replaced with [MASK] tokens at the input, while the original query label serves as the ground truth for computing the loss against the output. The italicized pi, pl, qi, and ql denote the prompt image, prompt label, query image, and query label, respectively. \textit{H} denotes Hungarian matching~\cite{Hungarian}, and \textit{Pos.} stands for positional encoding~\cite{Attention}.}
    \label{fig:RandP}
\end{figure}
%\input{section/related_work}
\section{Method}
\noindent\textbf{RandP Framework. } 
Our RandP medical vICL framework, as illustrated in Fig.~\ref{fig:RandP}, consists of three main components: a Retrieve-and-Propagate module for prompt-query fusion, a backbone for image feature extraction, and a pixel decoder for dense prediction.
%\noindent\textbf{Task Selection.} %\subsection{Task Selection}

\noindent\textbf{Unifying Input and Output Spaces. }
For each \textit{query image} in the training set, we randomly select another image with the same task and its corresponding label as the \textit{prompt image} and \textit{prompt label}. All four components—\textit{prompt image}, \textit{prompt label}, \textit{query image}, and \textit{query label}—are in \(\mathbb{R}^{3 \times H \times W}\). These are arranged into a grid: top-left, top-right, bottom-left, and bottom-right, respectively, forming a grid-like image \(X \in \mathbb{R}^{3 \times 2H \times 2W}\), as shown in Fig.\hyperlink{fig:train_a}{2.(a)}.
For segmentation tasks, which need to predict discrete one-hot labels, we follow~\cite{Painter} by assigning each semantic category a unique RGB color. During inference, the predicted label is obtained by mapping each output pixel to the nearest category via \(L_2\) distance, effectively turning segmentation into an image-to-image translation task. For low-level tasks, where outputs are already continuous in RGB space, no such transformation is needed.
After training, a given prompt image--label pair tells the model which task to perform on the query image.




\noindent\textbf{Training-Inference Aligned Masking Strategy. }
Unlike prior vICL approaches that predominantly rely on masked image modeling during training, we adopt a fully training-inference aligned masking strategy, as depicted in Fig.\hyperlink{fig:train_d}{2.(d)}. In our setting, the \textit{prompt image}, \textit{prompt label}, and \textit{query image} are never masked, while the \textit{query label} is always fully masked. This design offers several advantages:

\begin{enumerate}
    \item \textbf{Consistency between training and inference:} By avoiding partial masking of the query image, the model is encouraged to truly understand the query image rather than simply reconstructing grid-like masked patterns, thus improving generalization to real ICL scenarios.

    \item \textbf{Broader backbone compatibility:} This strategy allows the use of backbones beyond ViT. Prior work~\cite{spark} has shown that the effectiveness of naive MIM degrades on CNNs due to the sparsity of masked inputs being diluted through stacked convolutional layers. By eliminating MIM-style masking, our framework can more effectively utilize convolutional architectures.

    \item \textbf{Reduced computational cost:} Since the prompt label is never masked, there is no need to reconstruct it. Consequently, the model only needs to predict the query label, enabling us to discard prompt-related tokens after the prompt information has been integrated. This leads to significant computational savings without compromising performance.
\end{enumerate}

\noindent\textbf{Retrieve-and-Propagate Module for Prompt-Query Fusion. }
In previous vICL methods, prompt tokens and query tokens are concatenated into a single sequence and interact via self-attention. However, due to the quadratic complexity $\mathcal{O}(N^2)$ of self-attention with respect to sequence length, this design introduces considerable computational overhead compared to conventional image-to-image models that do not use prompts.

While the redundancy of visual tokens has been extensively validated in MLLMs~\cite{FastV}, we argue that a similar level of redundancy may exist even when visual tokens are used purely as context. Inspired by recent efforts on visual token pruning or merging in MLLMs~\cite{ToMe,FasterVLM,DART}, we propose to fuse prompt and query tokens at the early stage of the network to reduce computation and enhance efficiency.

We introduce a Retrieve-and-Propagate token fusion strategy, motivated by the observation that medical images from different patients tend to exhibit strong visual similarities within corresponding anatomical regions—often more pronounced than those observed in natural images. Specifically, let \(\mathbf{I}_{\text{pi}}\), \(\mathbf{I}_{\text{pl}}\), \(\mathbf{I}_{\text{qi}}\), and \(\mathbf{I}_{\text{ql}}\) denote the prompt image, prompt label, query image, and query label, respectively. The three visible inputs are first embedded with a patch embedding layer and summed with learnable positional encodings:
\begin{align}
\mathbf{Z}_{\text{pi}},\ \mathbf{Z}_{\text{pl}},\ \mathbf{Z}_{\text{qi}}
= \text{PatchEmbed}(\mathbf{I}_{\text{pi}},\ \mathbf{I}_{\text{pl}},\ \mathbf{I}_{\text{qi}}) + \mathbf{P}
\end{align}

The resulting latent representations are then fed into a shallow ViT encoder (e.g., 2 layers) to extract  features:
\begin{align}
\mathbf{X}_{\text{pi}},\ \mathbf{X}_{\text{pl}},\ \mathbf{X}_{\text{qi}},\ \mathbf{X}_{\text{ql}} 
= \text{ViT}([\mathbf{Z}_{\text{pi}},\ \mathbf{Z}_{\text{pl}},\ \mathbf{Z}_{\text{qi}},\ \text{[MASK]}])
\end{align}

Here, \(\mathbf{Z}_*\) and \(\mathbf{X}_*\) represent the latent representations before and after the encoder, respectively; \(\mathbf{P}\) denotes the positional encoding, and [MASK] is the learnable \textit{mask token} used to replace the masked query label.



For each patch in the query image, we compute its cosine similarity with all patches in the prompt image, effectively allowing each query token to \textbf{retrieve} a similar token from the prompt.
To preserve the full information of the prompt pairs, we employ Hungarian matching~\cite{Hungarian},
a classical algorithm for solving optimal bipartite assignment problems, 
rather than greedy matching, as it enforces globally optimal one-to-one correspondences 
between query and prompt tokens based on the similarity matrix:
\begin{equation}
\mathcal{M} = \text{Hungarian}\left(-\text{Norm}(\mathbf{Z}_{\text{pi}}) \cdot \text{Norm}(\mathbf{Z}_{\text{qi}})^\top\right)
\end{equation}
This design avoids information loss and ensures that each prompt token is effectively utilized in the fusion process.
Although Hungarian matching has a cubic complexity with respect to the number of tokens, 
the token set is small in our setting, and a detailed analysis of computational complexity, 
runtime, and scalability is provided in the Appendix.
The matched prompt and query tokens are concatenated along the feature dimension and subsequently fused via a linear layer. If a highly similar prompt token is retrieved, the model can directly reuse the associated prompt label token, effectively \textbf{propagating} it to the output.
\begin{equation}
\mathbf{X} = \text{Linear}\left(\text{Concat}\left[\mathcal{M}(\mathbf{X}_{\text{pi}}),\ \mathcal{M}(\mathbf{X}_{\text{pl}}),\ \mathbf{X}_{\text{qi}},\ \mathbf{X}_{\text{ql}}\right]\right)
\end{equation}


\noindent\textbf{Backbone and Pixel Decoder. }
Similar to previous visual ICL frameworks, we use ViT as the backbone. The pixel decoder is a simple prediction head with two convolutional layers, taking the concatenated feature maps from four different layers of ViT as input~\cite{ViTDet}.

\noindent\textbf{Loss Function. } 
The decoder outputs an image in $\mathbb{R}^{3 \times H \times W}$, and we compute the smooth L1 loss~\cite{fastrcnn} pixel-wise against the \textit{query label}.
Additionally, we use cross-entropy (CE) loss to optimize task prediction. 
The total loss function is as follows:
\begin{equation*}
\mathcal{L} = \mathcal{L}_{\text{smoothL1}}(y_{\text{query\_label}}, \hat{y}_{\text{query\_label}}) + 0.1\cdot \mathcal{L}_{\text{CE}}(y_{\text{task}}, \hat{y}_{\text{task}})
\end{equation*}
\noindent\textbf{Extending Visual ICL via RandP + U-Net Variants. }
 Our masking strategy and RandP module enable the extension of visual ICL methods to non-transformer architectures, including U-Net and its variants, which are commonly used in medical imaging. Specifically, the activations output from the RandP Module, with a stride of 16, match the spatial dimensions of the feature maps after downsampling four times in U-Net. We adjust the channel dimensions of these activations using a $1 \times 1$ convolutional layer and add them element-wise to the feature maps in the U-Net bottleneck.

\section{Experiment}
% \subsection{Comparative Experiment}
\noindent\textbf{Dataset and Implementation. } 
Following~\cite{AMIR}, we select the IXI~\cite{IXI} 
 MRI dataset for the super-resolution task, the 2016 NIH AAPM-Mayo Clinic Low-Dose CT Grand Challenge~\cite{low-dose} dataset for the denoising task, and the PET synthesis dataset provided by~\cite{AMIR} for the PET synthesis task. Our segmentation dataset covers both CT and MRI modalities: PROMISE12~\cite{promise12}, Prostate\_MRI\_Dataset~\cite{samed2d20mdataset}, AMOS~\cite{amos}, and BTCV~\cite{BTCV}. Although we have chosen only these four tasks, our framework can be applied to any image-to-image task.
\begin{table}[t!]
\caption{Performance comparison across different visual ICL frameworks. Inference metrics include FLOPs, runtime (per image on an RTX 3090), and memory usage (batch size = 16).}
\centering
\setlength{\tabcolsep}{4.5pt} % reduce column spacing
\fontsize{9}{11}\selectfont
\begin{tabular}{l|c|cc|cc|cc|c|c|c}
\hline
  & \textbf{Seg.} & \multicolumn{2}{c|}{\textbf{Denoising}} & \multicolumn{2}{c|}{\textbf{Super-Res.}} & \multicolumn{2}{c|}{\textbf{PET synthesis}} & \textbf{FLOPs} & \textbf{Time} & \textbf{Mem.} \\
 \textbf{Models} & Dice & PSNR & SSIM & PSNR & SSIM & PSNR & SSIM  & (G)$\downarrow$ &  (ms)$\downarrow$ &  (GB)$\downarrow$ \\ \hline
\textbf{copy} & 0.90 & 9.17 & 21.92 & 16.04 & 48.65 & 19.67 & 68.65 & - & - & - \\
\textbf{MAE-VQGAN} & 44.14 & 21.16 & 74.14 & 23.23 & 75.89 & 27.65 & 83.66 & 664 & 33.2 & 20.5 \\
\textbf{PromptGIP} & 69.92 & 31.1 & 90.18 & 28.36 & 88.29 & 27.77 & 86.26 & 670 & 33.1 & 18.8 \\
\textbf{Painter} & 78.99 & 32.83 & 91.94 & 30.11 & \textbf{91.35} & 31.16 & 89.75 & 429 & 30.6 & 15.5 \\
\textbf{MVG} & 82.56 & 32.97 & 92.08 & 30.11 & 91.34 & 31.09 & 89.85 & 429 & 30.5 & 15.6 \\
\textbf{RandP} & \textbf{84.95} & \textbf{33.01} & \textbf{92.14} & \textbf{30.14} & 91.33 & \textbf{31.46} & \textbf{90.17} & \textbf{258} & \textbf{14.4} & \textbf{10.7} \\
\hline
\end{tabular}

\label{tab:comparison}
\end{table}

\begin{table}[b!]
\caption{Performance comparison under the Single-Task Separate Training  Setting.}
\centering
\setlength{\tabcolsep}{5.5pt} % reduce column spacing
\fontsize{9}{11}\selectfont
\begin{tabular}{l|c|cc|cc|cc}
\hline
 & \textbf{Seg.} & \multicolumn{2}{c|}{\textbf{Denoising}} & \multicolumn{2}{c|}{\textbf{Super-Resolution}} & \multicolumn{2}{c}{\textbf{PET synthesis}} \\ 
 \textbf{Models} & Dice & PSNR & SSIM & PSNR & SSIM & PSNR & SSIM  \\ \hline
%\multicolumn{8}{l}{Router based} \\
\textbf{AMIR~\cite{AMIR}} & 84.45 & 33.71 & 92.47 & 30.52 & 92.11 & \textbf{31.56} & \textbf{90.48} \\
%\multicolumn{8}{l}{Visual ICL based} \\
\textbf{MVG~\cite{MVG}} & 77.91 & 32.54 & 91.65 & 29.94 & 91.08 & 31.19 & 89.96 \\
\textbf{RandP} & 79.32 & 32.80 & 91.88 & 29.93 & 91.17 & 31.18 & 89.97 \\
\textbf{RandP-UX-Net} & 83.27 & 32.77 & 89.82 & 29.44 & 91.02 & 30.40 & 88.72 \\
\textbf{RandP-SwinUNETR} & 83.63 & 32.44 & 89.33 & 29.16 & 90.25 & 30.67 & 89.13 \\ 
\textbf{RandP-Restormer} & \textbf{84.89} & \textbf{33.73} & \textbf{92.58} & \textbf{30.54} & \textbf{92.27} & 31.21 & 90.14 \\ \hline
\end{tabular}

\label{tab:single}
\end{table}

\begin{table}[b!]
\caption{Performance comparison under the Multi-Task Joint Training Setting. Values in parentheses indicate the performance gain compared to single-task training. UX., Sw., and Res. refer to UX-Net, SwinUNETR, and Restormer, respectively.}
\centering
\setlength{\tabcolsep}{4.5pt} % reduce column spacing
\fontsize{9}{11}\selectfont
\begin{tabular}{l|c|cc|cc|cc}
\hline
 & \textbf{Segment.} & \multicolumn{2}{c|}{\textbf{Denoising}} & \multicolumn{2}{c|}{\textbf{Super-Resolution}} & \multicolumn{2}{c}{\textbf{PET synthesis}} \\
 \textbf{Models} & Dice & PSNR & SSIM & PSNR & SSIM & PSNR & SSIM  \\ \hline
%\multicolumn{8}{l}{Router based} \\
\textbf{AMIR} & 83.5(-0.9) & 33.9(+0.2) & 92.8(+0.3) & 30.8(+0.3) & 92.3(+0.2) & 31.6(+0.1) & 90.6(+0.1) \\
%\multicolumn{8}{l}{Visual ICL based} \\
\textbf{RandP} & 85.0(+5.6) & 33.0(+0.2) & 92.1(+0.2) & 30.1(+0.2) & 91.3(+0.2) & 31.5(+0.3) & 90.2(+0.2) \\
\textbf{RandP-UX.} & 84.1(+0.8) & 33.8(+1.0) & 92.7(+2.9) & 30.5(+1.0) & 92.0(+1.0) & 31.6(+1.2) & 90.4(+1.6) \\
\textbf{RandP-Sw.} & 84.2(+0.6) & 33.7(+1.3) & 92.6(+3.3) & 30.3(+1.1) & 91.6(+1.4) & 31.6(+0.9) & 90.3(+1.1) \\
\textbf{RandP-Res.} & 85.7(+0.8) & 34.0(+0.3) & 92.9(+0.3) & 30.8(+0.3) & 92.5(+0.2) & 32.1(+0.9) & 91.0(+0.9) \\
\hline
\end{tabular}

\label{tab:multi}
\end{table}
For all models, the shallow ViT encoder in the RandP Module uses two layers. Following previous vICL frameworks~\cite{Painter, MVG, PromptGIP}, we adopt the identical experimental protocol, wherein the prompt–query fusion module, the backbone network, and the pixel decoder are all trained in an end-to-end manner. We train for 100 epochs with a maximum learning rate of $1\mathrm{e}{-3}$, a batch size of 64, and a Grid-like Image resolution of $512\times512$. In this paper, the reported Dice~\cite{dice} and SSIM~\cite{ssim} values are presented in percentage form.

\noindent\textbf{Comparison of Different Visual ICL Frameworks. } 
We re-implement multiple prior vICL methods on our medical datasets, including MAE-VQGAN, PromptGIP, Painter, and MVG, to construct strong baselines for comparison. Additionally, we include AMIR~\cite{AMIR}, a router-based multi-task model in medical imaging. We also introduce a \textbf{copy} baseline, which simply replicates the prompt label as the final output. For MAE-VQGAN, a pretrained VQGAN~\cite{VQGAN} is required to serve as the tokenizer. We initialize the VQGAN with weights pretrained on ImageNet~\cite{imagenet} and further fine-tune it on our datasets. In contrast, all other models perform regression directly in the pixel space. All frameworks adopt ViT as the backbone. As shown in Table~\ref{tab:comparison}, our proposed \textbf{RandP} framework consistently outperforms previous vICL frameworks across various tasks, while achieving \textbf{inference speed 2$\times$ faster} than prior visual ICL frameworks and \textbf{lower memory consumption}.
\begin{figure*}[t!]
    \centering
    \includegraphics[width=1.0\textwidth]{section/img/visual-midl.pdf}
    \caption{Qualitative evaluation of our RandP models.}
    \label{fig:visual}
\end{figure*}




\noindent\textbf{Extending RandP to U-Net Variants.}
We extend the RandP framework to several widely-used U-Net variants in medical imaging. Specifically, we adopt UX-Net~\cite{uxnet}, an advanced purely convolutional model, and SwinUNETR~\cite{SwinUNETR}, a window-attention-based~\cite{Swintransformer} model. Both were originally designed for 3D tasks, so we adapt them to 2D settings. Additionally, we incorporate Restormer~\cite{restormer}, a modified-transformer-based model commonly used for low-level vision tasks. These combinations result in three RandP-based medical vICL models: RandP-UX-Net, RandP-SwinUNETR, and RandP-Restormer. We first train these models independently for each task. As shown in Table~\ref{tab:single}, in the single-task separate training setting, models with the same backbone generally show similar performance—for example, RandP and MVG (with ViT), and RandP-Restormer and AMIR (with Restormer) perform comparably. However, ViT-based models still lag behind task-specific U-Net-style architectures, particularly for the segmentation task.

We further perform joint multi-task training using the RandP framework across different backbones. As shown in Table~\ref{tab:multi}, compared to single-task training, all backbones consistently benefit from joint optimization, demonstrating RandP’s capability to mitigate inter-task conflicts during learning. However, the router-based AMIR framework shows a performance drop on the segmentation task under multi-task training compared to its single-task counterpart.

Both RandP-Restormer and AMIR use Restormer as the backbone. While AMIR introduces a complex task routing mechanism and incurs additional computational cost to reduce task interference, RandP-Restormer achieves clear and consistent improvements across all tasks, highlighting the effectiveness and efficiency of the RandP framework.

Fig.~\ref{fig:visual} shows the qualitative evaluation of our RandP models. Its last row indicates that when we provide a denoising prompt for a segmentation query image, the model executes the task as instructed by the prompt, rather than simply memorizing the dataset.




\begin{table*}[h!]
        % light red background
  \parbox{\linewidth}{%      % span the full width of the current (two-column) layout
    \centering
    \caption{Performance difference between random prompts, learned prompts, and selected prompts.}
    \fontsize{9}{11}\selectfont
    \begin{tabular}{l|c|cc|cc|cc}
    \hline
     & \textbf{Segmentation} & \multicolumn{2}{c|}{\textbf{Denoising}} & \multicolumn{2}{c|}{\textbf{Super-Resolution}} & \multicolumn{2}{c}{\textbf{PET synthesis}} \\ 
     & Dice & PSNR& SSIM & PSNR & SSIM & PSNR & SSIM  \\ \hline
     \textbf{Random Prompt} & 84.95 & 33.01 & 92.14 & 30.14 & 91.33  & 31.46 & 90.17 \\
    \textbf{Learned Prompt} & 85.11 & 33.04 & 92.16 & 30.12 & 91.32 & 31.39 & 90.16 \\ 
    \textbf{Selected Prompt} & \textbf{85.56} & \textbf{33.29} & \textbf{92.50} & \textbf{30.21} & \textbf{91.49} & \textbf{32.31} & \textbf{91.32} \\ \hline
    \end{tabular}
    \label{tab:learnable_prompt}
  }% end of parbox

\end{table*}
\noindent\textbf{Random Prompt vs. Learned Prompt vs. Selected Prompt. }
In addition, we froze the parameters of the trained visual ICL model and treated the prompt image and prompt label as learnable embeddings. Each task was associated with a distinct set of prompt embeddings, enabling task-specific adaptation without modifying the backbone. These learnable embeddings are optimized using a learning rate of $1\mathrm{e}{-4}$ for 10 epochs without any warm-up schedule. During inference, we used the corresponding learned prompt embeddings for each task. As shown in Table~\ref{tab:learnable_prompt}, the performance gap between learned and random prompts is marginal. The main benefit of learned prompts is therefore stability: because a fixed learned prompt is used for each task, the output of the medical visual ICL model no longer depends on which prompt pair happens to be sampled.
Upon visualizing the learned prompts, we find that they do not resemble semantically meaningful images; rather, they appear as structured noise patterns with no obvious visual interpretation. Building on the prompt-selection protocol proposed by~\cite{context-medical, SuPR}, we retained the most relevant prompt via an external visual encoder; the resulting ``Selected Prompt'' row in Table~\ref{tab:learnable_prompt} demonstrates a clear performance gain. Sophisticated prompt-selection strategies~\cite{prompt-self, SCS} have recently been introduced in natural-image vICL, and we consider their adaptation to medical imaging a promising direction for future research.

\begin{table*}[t!]
\caption{Ablation Study Results. Dice and SSIM are reported in percentage.}
\centering
\setlength{\tabcolsep}{5.5pt} % reduce the column spacing
\fontsize{9}{11}\selectfont
\begin{tabular}{l|c|cc|cc|cc}
\hline
 & \textbf{Seg.} & \multicolumn{2}{c|}{\textbf{Denoising}} & \multicolumn{2}{c|}{\textbf{Super-Resolution}} & \multicolumn{2}{c}{\textbf{PET synthesis}} \\ 
 \textbf{Models} & Dice & PSNR & SSIM & PSNR & SSIM & PSNR & SSIM  \\ \hline
\multicolumn{8}{l}{\textbf{prompt-query token fusion strategies in ViT backbone}} \\
Patch Merge & 75.91 & 32.94 & 92.09 & 29.99 & 90.95 & 31.25 & 90.09 \\ 
Greedy Matching & 78.83 & 32.99 & 92.09 & 29.38 & 90.11 & 31.03 & 89.53 \\
Hungarian Matching & \textbf{84.95} & \textbf{33.01} & \textbf{92.14} & \textbf{30.14} & \textbf{91.33}  & \textbf{31.46} & \textbf{90.17} \\
\multicolumn{8}{l}{\textbf{Different Fusion Strategies when extending RandP to Restormer backbone}} \\
First-Stage     & 84.71 & 33.97 & 92.89 & 30.74 & 92.31 & 31.44 & 90.38 \\
Multi-Stage      & 83.62 & 33.97 & 92.78 & \textbf{30.86} & 92.47 & 31.13 & 90.12 \\
Bottleneck  & \textbf{85.70} & \textbf{33.99} & \textbf{92.92} & 30.83 & \textbf{92.50} & \textbf{32.08} & \textbf{91.04} \\  
\hline
\end{tabular}

\label{tab:ablation_match}
\end{table*}

% \subsection{\textbf{Ablation Study}}
\noindent\textbf{Ablation Study 1: Query-Prompt Interaction strategy. }
Painter and MVG adopt a patch merging mechanism to fuse images and labels. However, as shown in Table~\ref{tab:ablation_match}, under the training-inference aligned masking strategy and the setting where the prompt label is not reconstructed, the performance of patch merging (i.e., spatially aligned visual tokens from the prompt image, prompt label, query image, and query label are summed at the merge layer) is significantly inferior to that of our proposed RandP Module.
Another important aspect concerns the matching strategy between prompt tokens and query tokens during the fusion process. We experimented with both greedy matching and Hungarian matching. The primary difference lies in whether a single prompt token is allowed to match multiple query tokens. Experimental results show that Hungarian matching significantly outperforms greedy matching. We hypothesize that many-to-one matching leads to the neglect of numerous prompt tokens, thereby causing substantial prompt information loss. 

\noindent\textbf{Ablation Study 2: Fusion Strategy for Extending RandP to U-Net Variants.}
The mixed features from the RandP module are $16\times$ downsampled relative to the input image. We explore three fusion strategies to integrate them into U-Net-style architectures:

\begin{itemize}
\item \textbf{First-stage fusion:} Upsample the mixed features via pixel shuffle and concatenate with the input image at the first encoder stage.

\item \textbf{Multi-stage fusion:} Upsample the mixed features to multiple scales and add them to encoder features at corresponding stages.

\item \textbf{Bottleneck fusion:} Inject the mixed features directly into the bottleneck layer of the encoder.
\end{itemize}

Experimental results show that bottleneck fusion achieves the best performance. We hypothesize that this is because early stages of U-Net primarily focus on low-level features such as textures and edges~\cite{understandunet}, where introducing prompt-related information provides limited benefit. In contrast, the role of prompts is more aligned with guiding high-level semantic understanding, for example by indicating the task being performed~\cite{findingvisualtaskvectors} and providing task-specific insights~\cite{effectiveness-taskvectors}. Injecting prompt-related information at fine spatial resolutions may therefore be suboptimal, as the prompt inevitably differs from the query image in low-level details. Since the model output is solely optimized for understanding the query image, introducing fine-grained prompt features at early stages may even be detrimental to query interpretation.

\section{Conclusion}
In this paper, we propose a medical vICL framework called RandP, which enables the execution of multiple different medical imaging tasks via visual prompt pairs. Our experiments demonstrate that RandP has superior performance while maintaining low computational cost. Furthermore, RandP can be extended to other architectures beyond ViT.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by Natural Science Foundation of China under Grant 62271465, Suzhou Basic Research Program under Grant SYG202338, and Open Fund Project of Guangdong Academy of Medical Sciences, China (No. YKY-KF202206).}


\bibliography{midl26_72}

\newpage
\appendix

\section{Hungarian Matching for Prompt--Query Fusion}
\label{app:hungarian}

\subsection{Problem Formulation}

Hungarian matching~\cite{Hungarian} is a classical algorithm for solving the
minimum-cost bipartite assignment problem.
Given a cost matrix $\mathbf{C} \in \mathbb{R}^{N \times N}$, it computes a globally
optimal one-to-one assignment between two sets by minimizing the total matching cost.
In our framework, Hungarian matching is applied to align a small set of prompt tokens
with query tokens, enforcing explicit and globally consistent correspondences.

\subsection{Computational Complexity and Runtime Analysis}

Due to the $\mathcal{O}(N^3)$ complexity of Hungarian matching, it is generally unsuitable
for large-scale dense token matching.
However, in our design, the algorithm is intentionally restricted to a small set of
prompt--query tokens rather than dense patch tokens, ensuring that the computational
overhead remains manageable in practice.

We measured the execution times for the Hungarian matching step using both the SciPy implementation and the GPU-accelerated Hungarian matching algorithm from HA4DeTR~\cite{ha4detr} for varying token sequence lengths.
Table~\ref{tab:hungarian_runtime} reports the per-matrix runtime measured on an
NVIDIA RTX~3090 GPU.

\begin{table}[b]
\centering
\caption{Runtime of Hungarian matching for different token sizes.}
\label{tab:hungarian_runtime}
\begin{tabular}{c|cc}
\hline
Number of tokens & GPU (ms) & SciPy (ms) \\
\hline
256  & 0.125 & 0.57  \\
512  & 0.427 & 3.05  \\
1024 & 1.607 & 12.93 \\
2048 & 8.082 & 54.38 \\
\hline
\end{tabular}
\end{table}

When $N \leq 256$, which is the setting used throughout all experiments in this work,
the GPU overhead of Hungarian matching is below $0.2$~ms and remains negligible
compared to the forward pass of a ViT backbone (approximately $13.8$~ms).
As expected from the cubic worst-case complexity, the runtime grows rapidly (super-linearly) with $N$, and Hungarian matching
can become the computational bottleneck for larger token sets (e.g., $N \geq 1024$).


\subsection{Scalability Considerations}

The above results indicate that Hungarian matching does not scale favorably to large
numbers of tokens.
This limitation is intrinsic to the algorithm's cubic complexity and motivates our
design choice to restrict its usage to small, semantically meaningful prompt--query sets.

For scenarios involving higher resolution, several extensions are possible:
\begin{itemize}
    \item \textbf{Windowed matching}, where Hungarian matching is applied only within
    local spatial windows to exploit locality and reduce computational cost.
    \item \textbf{Approximate assignment algorithms}, such as the Sinkhorn algorithm~\cite{sinkhorn}, as an alternative to Hungarian matching.

\end{itemize}


Our goal in this work is not to scale Hungarian matching to large token sets, but to study
the effectiveness of globally optimal one-to-one alignment for structured prompt--query
fusion.
Within this scope, Hungarian matching provides a principled and reliable mechanism for
enforcing explicit correspondence, while its computational cost remains well controlled.
We leave the exploration of these extensions to future work.


\section{Prompt Analysis}


Numerous studies~\cite{MAEVQGAN, prompt-self, SuPR} have highlighted the sensitivity of vICL models to the choice of prompts. Some works have explored heuristic strategies~\cite{prompt-self, SuPR, SCS} for selecting optimal visual prompts, while others have investigated learning-based approaches~\cite{VPT, InMeMo, E-InMeMo} to prompt construction.
Given the high demand for controllability in medical image analysis, vICL models are designed to take both the query image and a prompt pair—consisting of an image and its corresponding label—as input. Therefore, we conduct a comprehensive analysis of the prompt component in trained visual ICL models.

\begin{table}[h]
\caption{Average standard deviation of the performance of trained medical vICL models across 20 different prompts.}
\centering
\fontsize{9}{11}\selectfont
\begin{tabular}{l|c|ccc}
\hline
 & \textbf{Segmentation} & \textbf{Denoising} & \textbf{Super-Resolution} & \textbf{PET synthesis} \\
 & Dice-std & \multicolumn{3}{c}{PSNR-std} \\ \hline
\textbf{MVG} & \textbf{0.00455} & 0.00880 & \textbf{0.0057} & 0.0353 \\
%\multicolumn{5}{l}{\textbf{RandP}} \\
\textbf{RandP} & \underline{0.00704} & 0.00695 & \underline{0.0069} & 0.0416 \\
\textbf{RandP-UX-NET} & 0.00855 & \underline{0.00526} & 0.0195 & \underline{0.0327} \\
\textbf{RandP-SwinUNETR} & 0.00968 & 0.00815 & 0.0272 & \textbf{0.0284} \\
\textbf{RandP-Restormer} & 0.00722 & \textbf{0.00475} & 0.0239 & 0.0336 \\ \hline
\end{tabular}

\label{tab:std}
\end{table}



\noindent\textbf{Standard Deviations Across Different Prompt Pairs. }
First, for each query image in the test set, 20 prompt pairs from the same task were randomly selected for inference. We then calculated the standard deviation of performance metrics across these inferences. Finally, we averaged these standard deviations over the entire test set. The results are summarized in Table~\ref{tab:std}.

When using ViT as the backbone, our RandP model exhibits higher standard deviations in segmentation, super-resolution, and PET synthesis compared to MVG. We attribute this to our more aggressive merging strategy between prompt tokens and query tokens. When extending the RandP framework to other U-Net variants, the standard deviation increases further for segmentation and super-resolution relative to the ViT-based version, whereas it decreases for PET synthesis and shows no consistent trend for denoising.

However, a lower standard deviation is not always preferable. We argue that a moderately low standard deviation is desirable—it ensures model stability across prompts while allowing prompts to exert meaningful influence on the model’s interpretation of the query. The results suggest that our trained medical visual ICL models under the RandP framework maintain appropriately low standard deviations, ensuring stable performance under different prompts. Meanwhile, the variability in prompt effectiveness implies that some prompts are better than others. Identifying optimal prompts for medical visual ICL models will be an important direction for our future work.%



\section{Pseudocode of RandP}
\begin{algorithm}[ht]
\caption{Retrieve and Propagate for Prompt-Query Interaction}
\label{alg:retrieval_paste}
\KwIn{Prompt image $\mathbf{I}_{pi}$, Prompt label $\mathbf{I}_{pl}$, Query image $\mathbf{I}_{qi}$}
\KwOut{Prompt-Query Fusion Feature $\mathbf{X}$}

\textbf{Step 1: Patch Embedding} \\
$\mathbf{Z}_{pi},\ \mathbf{Z}_{pl},\ \mathbf{Z}_{qi} \gets \text{PatchEmbed}(\mathbf{I}_{pi},\ \mathbf{I}_{pl},\ \mathbf{I}_{qi}) + \mathbf{P}$

\textbf{Step 2: Shallow ViT for Feature Extraction} \\
$\mathbf{X}_{pi},\ \mathbf{X}_{pl},\ \mathbf{X}_{qi},\ \mathbf{X}_{ql} \gets \text{ViT}([\mathbf{Z}_{pi},\ \mathbf{Z}_{pl},\ \mathbf{Z}_{qi},\ \text{[MASK]}])$

\textbf{Step 3: Cosine Similarity Matching} \\
$\mathbf{S} \gets \text{Norm}(\mathbf{X}_{pi}) \cdot \text{Norm}(\mathbf{X}_{qi})^\top$ \\
$\mathcal{M} \gets \text{Hungarian}(\mathbf{-S})$

\textbf{Step 4: Reorder Prompt Tokens} \\
$\mathcal{M}(\mathbf{X}_{pi}) \gets \text{Reorder}(\mathbf{X}_{pi}, \mathcal{M})$ \\
$\mathcal{M}(\mathbf{X}_{pl}) \gets \text{Reorder}(\mathbf{X}_{pl}, \mathcal{M})$

\textbf{Step 5: Prompt-Query Token Fusion} \\
$\mathbf{X} \gets \text{Linear}(\text{Concat}[\mathcal{M}(\mathbf{X}_{pi}),\ \mathcal{M}(\mathbf{X}_{pl}),\ \mathbf{X}_{qi},\ \mathbf{X}_{ql}])$

\textbf{Return} $\mathbf{X}$
\end{algorithm}

\end{document}
