\section{ReX-MLE}
\label{sec:method}



\subsection{Benchmark Design}

To systematically evaluate AI agent capabilities on medical imaging tasks, we construct \textbf{ReX-MLE}, a curated benchmark of medical imaging competitions adapted from Grand Challenge\footnote{\url{https://grand-challenge.org/challenges/}}. Our benchmark is designed to cover diverse modalities, task types, and clinical applications while maintaining rigorous standards for data quality and evaluation protocols, enabling systematic comparison of agent performance against expert-built solutions.

\paragraph{Challenge Selection Criteria.}
We select Grand Challenge tasks that meet standard requirements for data quality and reproducibility. 
Eligible challenges must have substantial community participation (typically over 200 registrants or extensive submissions), open licensing that permits research use, and publicly released evaluation metrics and scripts. We include only challenges with complete competition materials, including task descriptions and sample submissions. We also ensure broad modality coverage and task diversity, spanning classification, detection, segmentation, regression, and image enhancement.

% We select challenges from Grand Challenge based on the following criteria to ensure data quality, clinical utility, and evaluation rigor:

% \begin{itemize}
%     \item \textbf{High community participation:} At least 200 registered participants or substantial submissions, ensuring well-established baselines and robust leaderboard.
%     \item \textbf{Open licensing}: Datasets available under open licenses permitting research use.
%     \item \textbf{Reproducible evaluation}: Clear metrics with publicly available evaluation scripts.
%     \item \textbf{Complete challenge materials}: Full competition package including task description, data introduction, and sample submission formats.
%     \item \textbf{Modality coverage:} Inclusion of CT, MRI, X-ray, ultrasound, and digital pathology.
%     \item \textbf{Task diversity}: Inclusion of classification, detection, segmentation, regression, and enhancement tasks.
% \end{itemize}

% \begin{table*}[!t]
% \footnotesize
% \centering  
% \caption{Overview of the 20 ReX-MLE challenges across 10 competitions.}\vspace{3pt}
% \label{tab:medical_mlebench_overview}
% \setlength{\tabcolsep}{3pt}
% \begin{tabular}{llll}
% \toprule
% \textbf{Challenge} & \textbf{Modality} & \textbf{Anatomy} & \textbf{Task Type} \\
% \midrule
% % \multicolumn{4}{l}{\textit{USenhance Challenge (2023)}} \\
% USenhance ~\cite{usenhance} & Ultrasound & Various Organs & Image Generation \\
% % \midrule
% % \multicolumn{4}{l}{\textit{LDCT-IQA Challenge (2023)}} \\
% LDCTIQA2023~\cite{ldct_iqa} & CT & Abdominal & Quality Assessment  \\
% % \midrule
% % ---------------------- PANTHER TASKS ----------------------
% % \multicolumn{4}{l}{\textit{PANTHER Challenge (2025)}} \\
% PANTHER-T1 ~\cite{panther}& MRI & Pancreas & Tumor Segmentation \\
% PANTHER-T2 ~\cite{panther}& MRI & Pancreas & Tumor Segmentation \\
% % \midrule
% % \multicolumn{4}{l}{\textit{SEG.A Challenge (2023)}} \\
% SEG.A ~\cite{seg_a} & CTA & Aorta / Heart & Segmentation \\
% % \midrule
% % \multicolumn{4}{l}{\textit{DENTEX Challenge (2023)}} \\
% DENTEX~\cite{hamamci2023dentex} & X-ray & Dental & Detection \\
% % \midrule
% % \multicolumn{4}{l}{\textit{TopCoW Challenge (2024)}} \\
% TopCoW-CTA-Seg~\cite{topcow} & CTA & Brain & Segmentation \\
% TopCoW-CTA-Det~\cite{topcow} & CTA & Brain & Detection \\
% TopCoW-CTA-Cls~\cite{topcow} & CTA & Brain & Classification \\
% TopCoW-MRA-Seg~\cite{topcow} & MRA & Brain & Segmentation \\
% TopCoW-MRA-Det~\cite{topcow} & MRA & Brain & Detection \\
% TopCoW-MRA-Cls~\cite{topcow} & MRA & Brain & Classification \\
% % \midrule
% % ---------------------- PUMA TASKS ----------------------
% % \multicolumn{4}{l}{\textit{PUMA Challenge (2025)}} \\
% PUMA-Track1-TissueSeg~\cite{puma} & Pathology & Skin & Segmentation \\ % Semantic Tissue Segmentation \\
% PUMA-Track1-NucleiDet~\cite{puma} & Pathology & Skin & Nuclei Detection \\ % (3 classes) \\
% PUMA-Track2-TissueSeg~\cite{puma} & Pathology & Skin & Segmentation \\ % Semantic Tissue Segmentation \\ %  (5 classes) \\
% PUMA-Track2-NucleiDet~\cite{puma} & Pathology & Skin & Nuclei Detection  \\ %  (10 classes) \\
% % \midrule
% % \multicolumn{4}{l}{\textit{ISLES'22 Challenge (2022)}} \\
% ISLES'22~\cite{hernandez2022isles} & MRI & Brain & Segmentation \\
% % \midrule
% % \multicolumn{4}{l}{\textit{NeurIPS Cell Segmentation Challenge (2022)}} \\
% CellSeg \cite{neurips_cellseg} & Microscopy & Cells & Segmentation \\
% % \midrule
% % \multicolumn{4}{l}{\textit{TopBrain Challenge (2025)}} \\
% TopBrain-CTA-Seg ~\cite{topcow} & CTA & Brain & Segmentation \\
% TopBrain-MRA-Seg ~\cite{topcow} & MRA & Brain & Segmentation \\
% \bottomrule
% \end{tabular}
% \end{table*}

\begin{table*}[!t]
\footnotesize
\centering  
\caption{Overview of the 20 ReX-MLE challenges across 10 competitions.}\vspace{3pt}
\label{tab:medical_mlebench_overview}
\setlength{\tabcolsep}{10pt}
\begin{tabular}{lll}
\toprule
\textbf{Challenge} & \textbf{Modality} & \textbf{Task Type} \\
\midrule
USenhance~\cite{usenhance} & Ultrasound & Image Generation \\
LDCTIQA2023~\cite{ldct_iqa} & CT & Image Quality Assessment \\
PANTHER-T1~\cite{panther} & MRI & Tumor Segmentation \\
PANTHER-T2~\cite{panther} & MRI & Tumor Segmentation \\
SEG.A~\cite{seg_a} & CTA & Segmentation \\
DENTEX~\cite{hamamci2023dentex} & X-ray & Detection \\
TopCoW-CTA-Seg~\cite{topcow} & CTA & Segmentation \\
TopCoW-CTA-Det~\cite{topcow} & CTA & Detection \\
TopCoW-CTA-Cls~\cite{topcow} & CTA & Classification \\
TopCoW-MRA-Seg~\cite{topcow} & MRA & Segmentation \\
TopCoW-MRA-Det~\cite{topcow} & MRA & Detection \\
TopCoW-MRA-Cls~\cite{topcow} & MRA & Classification \\
PUMA-Track1-TissueSeg~\cite{puma} & Pathology & Segmentation \\
PUMA-Track1-NucleiDet~\cite{puma} & Pathology & Nuclei Detection \\
PUMA-Track2-TissueSeg~\cite{puma} & Pathology & Segmentation \\
PUMA-Track2-NucleiDet~\cite{puma} & Pathology & Nuclei Detection \\
ISLES'22~\cite{hernandez2022isles} & MRI & Segmentation \\
CellSeg~\cite{neurips_cellseg} & Microscopy & Segmentation \\
TopBrain-CTA-Seg~\cite{topcow} & CTA & Segmentation \\
TopBrain-MRA-Seg~\cite{topcow} & MRA & Segmentation \\
\bottomrule
\end{tabular}
\end{table*}


\paragraph{Benchmark Composition.}
ReX-MLE comprises 20 challenges across 10 major medical imaging competitions, covering diverse modalities, task types, and anatomical regions. Table~\ref{tab:medical_mlebench_overview} summarizes the benchmark composition.
The dataset distribution requires agents to operate across highly heterogeneous data formats and clinical contexts, covering vascular and neuroimaging (TopCoW, SEG.A, ISLES'22, TopBrain), oncology (PANTHER, PUMA), microscopy (CellSeg), dentistry (DENTEX), and image quality assessment and enhancement (LDCTIQA2023, USenhance). This diversity enforces generalization across anatomical structures and clinical objectives, providing a realistic testbed for autonomous medical model development.

% The benchmark incorporates 8 imaging modalities, including X-ray, CT, CTA, MRI, MRA, ultrasound, digital pathology, and microscopy, requiring agents to operate across heterogeneous data formats. These tasks encompass segmentation, detection, classification, image quality assessment, and generative enhancement, thereby testing both foundational computer vision abilities and domain-specific reasoning skills essential for medical AI systems. 
% Clinically, the benchmark covers high-impact application areas ranging from vascular and neuroimaging (TopCoW, SEG.A, ISLES'22, TopBrain) to oncology (PANTHER, PUMA), multi-modal microscopy (NeurIPS-CellSeg), dentistry (DENTEX), and diagnostic imaging quality and reconstruction (LDCT-IQA, USenhance). 
% This breadth ensures that agents must generalize across diverse anatomical regions, clinical objectives, and data acquisition processes, offering a comprehensive and realistic evaluation of autonomous medical model-building capabilities.

\paragraph{Challenge Adaptation.}
For each challenge, we prepare standardized materials following the MLE-bench framework~\citep{chan2024mle}. Provided materials include a concise competition description, automated dataset-preparation scripts, sample submissions, and Python implementations of official evaluation metrics for fully local validation. Each task also includes a YAML metadata file defining challenge identifiers, task types, data paths, and grading parameters. All preparation and evaluation code is publicly released to support reproducibility and future extensions.

% This includes a comprehensive competition description detailing the task objectives, clinical context, dataset characteristics, and evaluation metrics; automated dataset-preparation scripts for downloading data from Zenodo, Figshare, or Google Drive, converting formats, and generating consistent train–validation–test splits; and sample submission files that illustrate the required output structure. We also recreate the official evaluation metrics in Python to enable fully local validation without access to test labels, and provide YAML metadata files specifying the challenge ID, task type, data paths, and grading parameters. All dataset preparation and evaluation code is publicly released to support reproducibility and extendability of the benchmark.

% \begin{itemize} 
%     \item \textbf{Competition description}: Comprehensive markdown document detailing task objectives, clinical context, dataset characteristics, and evaluation metrics
%     \item \textbf{Dataset preparation}: Automated scripts to download data from Zenodo, Figshare, or Google Drive; convert formats; and establish consistent train/validation/test splits
%     \item \textbf{Sample submission}: Template files demonstrating expected submission format for each challenge
%     \item \textbf{Local evaluation}: Recreated Python implementation of official evaluation metrics enabling agents to validate solutions locally without accessing test labels
%     \item \textbf{Metadata}: YAML configuration files specifying challenge ID, task type, data paths, and grading parameters
% \end{itemize}

% All code for dataset preparation and evaluation is made publicly available to facilitate reproducibility and future extensions of the benchmark.


\begin{figure}[!t]
    \centering
    \includegraphics[width=0.85\columnwidth]{figures/figure3.pdf}
    \caption{Overview of ReX-MLE Task Categories. This figure illustrates the four distinct task types included in the benchmark: Segmentation, Detection, Classification, and Image Generation.}
    \label{fig:task_categories}
\end{figure}





\subsection{Baseline Agents}

\vspace{3pt} \noindent \textbf{AIDE}~\citep{schmidt2024aide} frames machine learning engineering as a code-space optimization problem and performs greedy tree search with iterative refinement. The agent proposes candidate solutions, executes them, and uses automated feedback to diagnose errors and incrementally correct code. 
% Prioritizing rapid trial-and-error over exhaustive search, AIDE achieved a 16.9\% medal rate on MLE-Bench, demonstrating the effectiveness of fast, locally guided improvements for ML pipeline construction.

\vspace{3pt} \noindent \textbf{ML-Master}~\citep{liu2025ml} integrates large-scale exploration and reflective reasoning through a Monte Carlo Tree Search (MCTS) framework. By maintaining multiple solution trajectories and adaptively balancing exploration of new strategies with exploitation of promising ones, ML-Master efficiently navigates complex search spaces. 
% It achieved a 47.7\% medal rate on MLE-Bench, nearly tripling AIDE’s performance, highlighting the advantage of structured, multi-trajectory search reinforced by selective memory mechanisms.

\vspace{3pt} \noindent \textbf{R\&D-Agent}~\citep{yang2025rdagentllmagentframeworkautonomous} employs a dual-agent design that separates conceptual exploration from implementation refinement. A Researcher module proposes strategic directions, while a Developer module resolves execution errors and iteratively improves the code, enabling diverse and resilient multi-trace exploration.
% A Researcher agent proposes higher-level methodological directions based on performance signals, while a Developer agent focuses on resolving execution errors and improving code quality. This division of labor allows the system to run multiple interacting exploration traces in parallel, strengthening robustness and search diversity. 
% On MLE-Bench, R$\&$D-Agent ranks among the strongest open-source baselines, underscoring the benefits of coordinated, multi-trace iterative exploration.

\subsection{Experimental Setup}


\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\columnwidth]{figures/figure2.pdf}
    \caption{Autonomous Agent Interaction Workflow.  This diagram depicts the workflow between the Medical Image Challenge Environment and the AI Agent. The environment provides task instructions, data, and grading feedback, while the agent iteratively performs strategy generation, error analysis, coding, debugging, and model training to produce a final submission.}
    \label{fig:workflows}
\end{figure}


\vspace{3pt} \noindent \textbf{Computational Resources.}  
All agents are executed on standardized hardware consisting of NVIDIA H100 GPUs (80GB memory), 64 CPU cores, and 128GB RAM. This configuration reflects realistic research computing environments while remaining broadly accessible to the academic community. We select GPT-5 as the primary model for all experiments (unless otherwise noted) due to its widespread adoption and API accessibility.

\vspace{3pt} \noindent \textbf{Time Budget.}  
Each agent is allocated a strict 24-hour wall-clock budget per challenge to develop a complete solution. This constraint evaluates the agent’s ability to efficiently explore solution spaces, manage computational resources, and prioritize promising methodological directions under realistic time limitations.

\vspace{3pt} \noindent \textbf{Agent Inputs.}  
For every challenge, agents receive five standardized inputs:  
(1) a competition description document outlining task objectives and clinical context;  
(2) the full training dataset with accompanying annotations;
(3) the test data without ground truth annotations;
(4) a sample submission illustrating the expected output format; and
(5) a local evaluation script enabling iterative validation.  
The description follows a standardized format that includes: overview and clinical context, detailed task specifications (modality, organs, dataset characteristics), dataset organization and splits, evaluation metrics with mathematical formulations, and submission requirements. Without any human intervention, the agents must use these materials to autonomously perform the entire workflow, including data exploration, preprocessing, model design, training, validation, and final submission generation.

\vspace{3pt} \noindent \textbf{Agent Outputs.} Each agent is instructed to produce a \texttt{submission.csv} file following the format of the provided sample submission. Unlike MLE-bench, however, for tasks whose predictions must be saved as files (e.g., segmentation masks), the submission is a folder containing both the CSV and the corresponding prediction files, rather than a single CSV encoding all outputs. This setup reflects a realistic workflow in which model outputs need to be generated and used directly, rather than converted into easily represented text. We extract these submission folders for evaluation.


\vspace{3pt} \noindent \textbf{Evaluation Metrics.} For each challenge, we collect the top 10 human competitor scores from the public test leaderboard, or all available scores if fewer than 10 exist. For each metric within a challenge (Appendix~\ref{appendix:tables}), we reconstruct the competitors' positional rankings and then average these positions to obtain a mean ranking, which serves as the basis for each competitor's overall standing in the challenge. Although some competitors may have evaluated their submissions on private test sets, we recreate the evaluation conditions as closely as possible. We then assess agent performance comprehensively, using both absolute metrics and leaderboard-relative metrics tailored to the Grand Challenge environment:

\begin{itemize}[leftmargin=*, nosep]
\item \textbf{Challenge-specific scores}: Raw performance on each challenge’s evaluation metrics, as defined by their respective leaderboards. These scores allow for direct, like-for-like comparison across agents.

\item \textbf{Competition rank}: For each challenge, we compute positional rankings for every metric relative to the human leaderboard, then average these positions to obtain a mean ranking ($\overline{\text{rank}}$). From this, we derive the agent's percentile as $100 \times \left(1 - \frac{\overline{\text{rank}} - 1}{N}\right)$, where $N$ is the number of human competitors. For example, in a competition with $N = 10$ human competitors, an agent with a mean ranking of 1 (best) is placed in the 100\textsuperscript{th} percentile, while a mean ranking of 11 (worst) corresponds to the 0\textsuperscript{th} percentile.

\item \textbf{ReX-MLE Rank}: The overall benchmark ranking, computed as the mean of all competition percentiles across challenges (with failures assigned 0\%).
\end{itemize}

\subsection{Capability Evaluation Methodology}

Beyond quantitative performance metrics, we conduct systematic capability analysis to understand \textit{why} agents fail.
For each agent and challenge, we evaluate the full execution logs, including reasoning traces, planning decisions, code generation, debugging attempts, and intermediate outputs, using the 13 ``Winning Strategies'' identified by \citet{eisenmann2023winner} (Figure~\ref{fig:capabilities}). This analysis is conducted through an automated LLM-as-a-judge pipeline, which applies a standardized rubric to determine whether there is explicit, verifiable evidence that an agent exhibited each strategy. A binary indicator is assigned accordingly, with detailed procedures and the exact prompts provided in Appendix~\ref{appendix:capability-analysis}. Aggregating these scores across all 20 challenges yields capability profiles that reveal which core scientific and engineering practices current autonomous ML systems consistently fail to demonstrate.

% we analyze the complete logs of agent reasoning, planning decisions, code generation, and debugging attempts; all attempted solutions, including failed experiments and debugging iterations; and systematic categorization of failures (implementation bugs, memory issues, format errors, metric misunderstandings). 
% We also evaluate each agent against the 13 crucial winning strategies identified by Eisenmann et al.~\citep{eisenmann2023winner} from comprehensive analysis of biomedical competition winners (Figure~\ref{fig:capabilities}).
% Following a standardized protocol, we assign a binary indicator for whether a strategy is demonstrated in the textual reasoning trace, with detailed procedures and prompts provided in the Appendix. Averaging these scores across all 20 challenges yields a capability profile that reveals which core scientific and engineering practices are missing in current autonomous ML systems.

% To standardize and simplify this process between each agent, we extract the textual analysis excluding code produced by the agent during its challenge run. For each agent-challenge pair, we assign a binary score: 1 if the agent demonstrates the strategy (attempts or implements it, even partially) or 0 if the strategy is absent (not attempted or considered). These binary scores are averaged across all 20 challenges to produce final capability scores.
% This systematic evaluation reveals which fundamental capabilities are missing from current autonomous ML systems, providing actionable insights for future agent development targeted at medical imaging tasks.





% \begin{table*}[h]
% \centering
% \small
% \caption{Comparison of ML agent benchmarks. \textbf{Real Grade:} Uses real competition leaderboards. \textbf{Output Eval:} Evaluates actual model outputs/submissions. \textbf{Domain Expert:} Requires domain expertise. \textbf{Strategy Analysis:} Compares against documented winning strategies.} \vspace{3pt}
% \begin{tabular}{lcccc}
% \toprule
% & \multicolumn{4}{c}{\textbf{Evaluation}} \\
% \cmidrule{2-5}
% & Real Grade & Output Eval & Domain Expert & Strategy Analysis\\
% % & Grade & Eval & Expert & Analysis \\
% % \midrule
% MLE-Bench \cite{chan2024mle} & \textcolor{green}{\cmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} \\
% SWE-Bench \cite{jimenez2023swe} & \textcolor{green}{\cmark} & \textcolor{green}{\cmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} \\
% TimeSeriesGym \cite{cai2025timeseriesgym} & \textcolor{red}{\xmark} & \textcolor{green}{\cmark} & \textcolor{red}{\xmark} & \textcolor{green}{\cmark} \\
% ML-Bench \cite{tang2023ml} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} \\
% ML-Dev-Bench \cite{padigela2025ml} & \textcolor{red}{\xmark} & \textcolor{green}{\cmark} & \textcolor{red}{\xmark} & \textcolor{red}{\xmark} \\
% ReX-MLE & \textcolor{green}{\cmark} & \textcolor{green}{\cmark} & \textcolor{green}{\cmark} & \textcolor{green}{\cmark} \\
% \bottomrule
% \end{tabular}
% \label{tab:benchmark_comparison}
% \end{table*}





% \begin{figure*}[t]
% \centering
% \begin{tikzpicture}[
%     node distance=0.3cm,
%     box/.style={rectangle, rounded corners, draw, thick, minimum width=4.2cm, minimum height=2.8cm, align=left, font=\footnotesize},
%     yellowbox/.style={box, fill=yellow!30},
%     bluebox/.style={box, fill=blue!20},
%     greenbox/.style={box, fill=green!20},
%     redbox/.style={box, fill=red!20},
%     purplebox/.style={box, fill=purple!20},
%     orangebox/.style={box, fill=orange!20},
% ]
% % Top row - Data Preparation
% \node[purplebox] (rawdata) at (-5.5,2) {
%     \textbf{Raw Medical Data}\\[0.2cm]
%     Figshare Dataset\\
%     • 56 CTA scans (NRRD)\\
%     • 3 institutions\\
%     • Expert annotations\\
%     • $\sim$15GB medical images
% };

% \node[orangebox] (prepare) at (0,2) {
%     \textbf{Domain Preparation}\\[0.2cm]
%     \texttt{prepare.py}\\
%     • Parse NRRD format\\
%     • Match image-mask pairs\\
%     • 80/20 train/test split\\
%     • Preserve spacing info\\
%     • Institution metadata
% };

% \node[bluebox] (structure) at (5.5,2) {
%     \textbf{Agent Data View}\\[0.2cm]
%     \texttt{prepared/public/}\\
%     ├─ train/images/ (45)\\
%     ├─ train/labels/ (45)\\
%     ├─ test/images/ (11)\\
%     ├─ description.md\\
%     └─ sample\_submission.csv
% };

% % Bottom row - Task specification
% \node[yellowbox] (instructions) at (-5.5,-1.5) {
%     \textbf{Task Instructions}\\[0.2cm]
%     Binary segmentation of\\
%     aortic vessel tree\\[0.2cm]
%     \textit{Requires understanding:}\\
%     • 3D medical imaging\\
%     • Cross-institutional\\
%     \hspace{0.3cm}generalization\\
%     • Clinical variability
% };

% \node[greenbox] (submission) at (0,-1.5) {
%     \textbf{Submission Format}\\[0.2cm]
%     \texttt{submission.csv}:\\
%     image\_id, predicted\_mask\_path\\[0.2cm]
%     \texttt{predictions/}:\\
%     D1.seg.nrrd (3D binary)\\
%     K5.seg.nrrd\\
%     ...
% };

% \node[redbox] (grading) at (5.5,-1.5) {
%     \textbf{Domain-Expert Grading}\\[0.2cm]
%     \texttt{grade.py}\\[0.1cm]
%     \textit{Per-case evaluation:}\\
%     • Load 3D NRRD masks\\
%     • Dice coefficient\\
%     • Hausdorff distance (mm)\\[0.1cm]
%     \textit{Aggregate metrics:}\\
%     • Mean/Median DSC
% };

% % Arrows showing flow
% \draw[->, very thick] (rawdata) -- (prepare);
% \draw[->, very thick] (prepare) -- (structure);
% \draw[->, very thick] (structure) -- (grading);
% \draw[->, very thick] (instructions) -- (submission);
% \draw[->, very thick] (submission) -- (grading);


% \end{tikzpicture}
% \caption{Task preparation pipeline for medical imaging benchmarks (SEG.A example). Medical imaging tasks require domain-specific data preparation handling 3D medical formats, complex submission formats with spatial predictions, and real pixel-level evaluation using clinical metrics.}
% \label{fig:medical_task_preparation}
% \end{figure*}
