
\documentclass{article}
% Uses the official conference style: place agents4science_2025.sty next to
% this file (Overleaf or local) so that the \usepackage line below can find it.
\usepackage{agents4science_2025}

% Common packages
\usepackage{graphicx}
\usepackage{amsmath, amssymb}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{subcaption}
\usepackage{siunitx}
\usepackage{xcolor}

% Anonymized title
\title{Uncertainty-Guided Agents for Rare-Disease Hypothesis Discovery on Knowledge Graphs}
\author{Anonymous Submission \\ 1st Open Conference on AI Agents for Science (Agents4Science 2025)}
\date{}

\begin{document}
\maketitle

\begin{abstract}
Rare disease discovery is hampered by data sparsity, fragmented evidence, and expensive validation. We present an uncertainty-guided multi-agent system that closes the loop between hypothesis generation, experiment selection, and self-audit on a biomedical knowledge graph (KG). A lightweight link scorer with Monte Carlo-style uncertainty feeds a planner that prioritizes experiments under a fixed budget; an auditor reports calibration at high-confidence thresholds. On a synthetic rare-disease KG benchmark, our agent improves precision--recall and budgeted discovery over heuristic and static baselines (e.g., +\num{0.10} AUPRC and +\num{0.9} Hit@10 on average) while maintaining reasonable calibration. Ablations confirm that uncertainty-driven selection is critical to early-budget gains; robustness sweeps show graceful degradation under increased sparsity and noise. The framework is fully reproducible with code that regenerates all figures, providing a tractable template for evaluating AI agents for scientific discovery.
\end{abstract}

\section{Introduction}
Rare diseases affect millions yet face limited data availability and costly experimentation. Knowledge graphs (KGs) integrate heterogeneous biomedical evidence, enabling hypothesis generation (e.g., drug--disease links). However, static ranking alone does not answer \emph{which} experiment to run next under tight budgets. We propose a closed-loop \emph{agent} that combines (i) a calibrated scorer, (ii) an uncertainty-driven planner, and (iii) an auditor for safety signals.

\paragraph{Contributions.} (1) A reproducible multi-agent pipeline for rare-disease KG discovery with uncertainty-guided selection. (2) A synthetic benchmark capturing sparsity and noise typical in rare-disease settings. (3) Empirical gains over heuristic and static baselines on AUPRC, AUROC, Hit@10, and regret; ablations and robustness analyses. (4) Practical guidance for safe deployment via calibration-aware auditing.

\section{Related Work}
\textbf{Link prediction on KGs.} Embedding and scoring methods include TransE~\cite{bordes2013translating} and ComplEx~\cite{trouillon2016complex}. Biomedical KGs such as Hetionet~\cite{himmelstein2017hetionet} demonstrate repurposing potential. \textbf{Active learning and experiment planning.} Classical surveys of active learning~\cite{settles2009active} and Bayesian optimization~\cite{snoek2012practical} motivate budgeted selection; recent work on active learning for graphs explores structure-aware acquisition~\cite{huang2018active,ma2021active}. \textbf{Uncertainty and calibration.} MC dropout~\cite{gal2016dropout} and deep ensembles~\cite{lakshminarayanan2017simple} provide practical uncertainty; conformal prediction~\cite{shafer2008conformal} offers coverage guarantees.

\section{Method}
We define a heterogeneous KG with drugs, targets, and diseases. Candidate drug--disease pairs are scored using normalized meta-path features (path counts, degrees, Jaccard). A logistic scorer $f(x)=\operatorname{sigmoid}(w^\top x + b)$ provides probabilities; we induce stochasticity via dropout-style masking and estimate the predictive standard deviation $\sigma(x)$ across the resulting Monte Carlo samples.
% Note: We use standard deviation scaling (UCB-style) rather than variance, and sweep \lambda for calibration.

\paragraph{Planner.} At each step, the agent selects a batch $\mathcal{B}$ maximizing an acquisition that trades off exploitation and exploration:
\begin{equation}
a(x) = \hat{p}(x) + \lambda \cdot \sigma(x),
\end{equation}
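where $\hat{p}(x)$ is the mean predicted probability, $\sigma(x)$ is the MC standard deviation (the square root of the MC variance), and $\lambda$ weights exploration against exploitation. We measure efficiency by cumulative discoveries and regret vs.\ an oracle.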

\paragraph{Auditor.} We report Expected Calibration Error (ECE), Maximum Calibration Error (MCE), and high-confidence coverage (precision among predictions with $p \ge 0.9$).

\section{Experiments}
\subsection{Setup and Metrics}
We generate synthetic KGs with controllable sparsity/noise, using disease-wise splits to prevent leakage. Metrics: AUPRC (primary), AUROC, Hit@10, and regret under fixed budgets.

\subsection{Baselines}
(1) Heuristic path-count ranking; (2) Static logistic without uncertainty; (3) Our uncertainty-guided agent.

\subsection{Main Results}
Figure~\ref{fig:pr} shows precision--recall performance; Figure~\ref{fig:curves} compares regret across selection strategies under a fixed budget. Our method achieves higher early retrieval and lower regret. A summary table appears in the Supplementary Material.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.98\columnwidth]{figures/pr_abl_high_noise.pdf}
  \caption{Precision--Recall on test split (AUPRC reported in legend).}
  \label{fig:pr}
\end{figure}

\begin{figure}[t]
  \centering
  \includegraphics[width=0.98\columnwidth]{figures/bar_regret_main.pdf}
  \caption{Agent selection strategies: higher $-$regret is better. Uncertainty-based selection outperforms greedy and random under tight budgets.}
  \label{fig:curves}
\end{figure}

\subsection{Ablations and Robustness}
We remove uncertainty (dropout$=0$), increase sparsity, and increase noise. Uncertainty is critical to early-budget gains; robustness degrades gracefully with sparsity/noise.

\subsection{Safety and Calibration}
The high-confidence band ($p\ge 0.9$) exhibits reasonable coverage; we recommend a reject option below threshold and logging rationales for audit.

\section{Discussion}
\textbf{Strengths.} Sample efficiency, calibrated decisions, and full reproducibility. \textbf{Limitations.} Synthetic data cannot capture all biological confounders; uncertainty via dropout is a proxy. \textbf{Future work.} Heterogeneous GNN scorers, conformal prediction, diversity-aware acquisition, and public KG evaluation.

\section{Conclusion}
Uncertainty-guided agents offer a practical path to trustworthy, efficient discovery on rare-disease KGs. Our pipeline and code provide a compact, extensible testbed for Agents4Science.

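% BibTeX requires a bibliography style to be declared before it can build refs.bib.
% NOTE(review): remove the next line if the conference style file already sets one.
\bibliographystyle{plainnat}
\bibliography{refs}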


\clearpage
% --- Required Disclosures & Checklists (do not count toward page limit) ---
\section*{AI Contribution Disclosure}
\input{statements/ai_contrib_disclosure}

\section*{Responsible AI Statement}
\input{statements/responsible_ai}

\section*{Agents4Science AI Checklist}
\input{checklists/ai_checklist}

\section*{Agents4Science Paper Checklist}
\input{checklists/paper_checklist}

% (Optional) Reproducibility Statement
\section*{Reproducibility Statement}
We release a minimal, deterministic pipeline with fixed seeds and a synthetic, licensed dataset snapshot. The repository includes commands to reproduce all results, data provenance, and scripts that emit metrics and tables to the results/ directory. Hardware budget and dependencies are documented in the README.

\end{document}
