%%%%%%%% ICML 2025 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2025} with \usepackage[nohyperref]{icml2025} above.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
% \usepackage{icml2025}

% If accepted, instead use the following line for the camera-ready submission:
\usepackage[accepted]{icml2025}
\input{math_commands.tex}
%\input{supplement.tex}

\def\UrlFont{\rm}
\newcommand{\NC}[1]{{\color{red}nicolo: #1}}
\newcommand{\RL}[1]{{\color{blue}lorry: #1}}
\newcommand{\rev}[1]{{\color{red}#1}}
\newcommand{\std}[1]{^{\scriptstyle{\pm#1}}}
\newcommand{\BibTeX}{\textsc{Bib}\TeX}
\newcommand{\etal}{et al.}
\usepackage{hyperref}
\usepackage{url}

\renewcommand{\arraystretch}{0.4} 
\def \iidtext {\textrm{i.i.d.}} 
\def \score {\textrm{conformity score}}
\def \calib {\mathrm{calib}}
\def \train {\mathrm{train}} 
\def \test {\mathrm{test}}
\def \nei {\mathcal{N}}
\def \cD {\mathcal{D}}
\def \cV {\mathcal{V}}
\def \cE {\mathcal{E}}
\def \cX {\mathcal{X}}
\def \Atrain {A^{\textrm{train}}}
\def \Aval {A^{\textrm{val}}}
\def \Act {A^{\textrm{ct}}}
\def \Atest {A^{\textrm{test}}}
\def \Acalib {A^{\textrm{calib}}}
\def \Wtc {W^{\textrm{train-calib}}}
\def \Wtrain {W^{\textrm{train}}}
\def \Wval {W^{\textrm{val}}}
\def \Wct {W^{\textrm{ct}}}
\def \Wtest {W^{\textrm{test}}}
\def \Wcalib {W^{\textrm{calib}}}
\def \Whold {W^{\textrm{holdout}}}
\def \Etrain {E^{\textrm{train}}}
\def \Eval {E^{\textrm{val}}}
\def \Ect {E^{\textrm{ct}}}
\def \Etest {E^{\textrm{test}}}
\def \Ecalib {E^{\textrm{calib}}}
\def \Ehold {E^{\textrm{holdout}}}
\def \Etc {E^{\textrm{train-calib}}}
\def \Rtrain {R^{\textrm{train}}}
\def \loss {\mathcal{L}}
\newcommand{\revise}[1]{{\color{black} #1}} 

\usepackage{url} 
\usepackage{graphicx} 
\usepackage{amsmath}
\usepackage{amsfonts}

\usepackage{adjustbox}
\usepackage{booktabs} 
\usepackage{algorithm, algpseudocode}
\usepackage{mathtools}
\usepackage{soul}
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{adjustbox}
\usepackage{booktabs}
\usepackage{ragged2e}
\usepackage{amsmath}
\usepackage{svg}

\usepackage{adjustbox}
\usepackage{booktabs}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{question}[theorem]{Question}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
%\newtheorem{theorem}{Theorem}[section]
%\newtheorem{proposition}[theorem]{Proposition}
%\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
%\newtheorem{assumption}[theorem]{Assumption}
%\theoremstyle{remark}
%\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
\icmltitlerunning{Residual Reweighted Conformal Prediction for Graph Neural Networks}
\usepackage{xcolor}
\usepackage{lipsum} % used to generate placeholder text
% Set the background and font colors
%\pagecolor{black} % set the background to black
%\color{white} % set the font color to white

\begin{document}

\twocolumn[
\icmltitle{Residual Reweighted Conformal Prediction With Graph Neural Network}

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2025
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
\icmlauthor{Firstname1 Lastname1}{equal,yyy}
\icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
\icmlauthor{Firstname3 Lastname3}{comp}
\icmlauthor{Firstname4 Lastname4}{sch}
\icmlauthor{Firstname5 Lastname5}{yyy}
\icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
\icmlauthor{Firstname7 Lastname7}{comp}
%\icmlauthor{}{sch}
\icmlauthor{Firstname8 Lastname8}{sch}
\icmlauthor{Firstname8 Lastname8}{yyy,comp}
%\icmlauthor{}{sch}
%\icmlauthor{}{sch}
\end{icmlauthorlist}

\icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
\icmlaffiliation{comp}{Company Name, Location, Country}
\icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

\icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
\icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
\printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.

\begin{abstract}
Graph Neural Networks (GNNs) are widely adopted for modeling graph-structured data, yet their deployment in high-stakes decision-making (e.g., healthcare, autonomous systems) remains limited due to the lack of rigorous uncertainty quantification. While Conformal Prediction (CP) provides statistically valid prediction sets with user-defined coverage guarantees, existing CP methods for GNNs often produce overly conservative intervals, ignoring intrinsic graph structure and heteroscedasticity in prediction difficulty. To address this, we propose Residual Reweighted Conformal Prediction GNN (RR-GNN), a framework to generate minimal prediction sets with guaranteed marginal coverage. 
RR-GNN uses a Graph-Structured Mondrian CP that leverages community structure to partition calibration data, ensuring coverage validity while exploiting topological dependencies. We define a non-conformity measure that adjusts prediction intervals based on residuals estimated by a separate GNN. We validate RR-GNN on 15 real-world graphs (transportation, citation, social networks) across node classification, regression, and edge weight prediction. Compared to CP baselines, RR-GNN achieves 6.15\% to 28.87\% improved efficiency, with no loss of coverage.
\end{abstract}


\section{Introduction}

Graph Neural Networks (GNNs) have achieved state-of-the-art performance on graph-structured data across various applications like recommendation systems, knowledge graphs, and molecular modeling \cite{lam2022graphcast, li2022graph, wu2022graph}. As GNNs are increasingly applied in high-stakes areas such as healthcare and autonomous systems, accurately assessing prediction uncertainty becomes paramount. A common approach to predicting uncertainty is to construct prediction intervals that capture the probability of true outcomes. While several methods of predicting uncertainty have been explored \cite{hsu2022makes, zhang2020mix, lakshminarayanan2017simple}, they typically lack rigorous theoretical guarantees on interval validity \cite{wang2021confident}. Improving uncertainty quantification for GNNs with probabilistic guarantees is critical to ensuring their safe, trusted application in real-world settings. 

Conformal Prediction (CP) is a machine learning framework for uncertainty quantification that constructs prediction intervals for any underlying point predictor in a theoretically valid manner \cite{vovk2005algorithmic}. Due to its principled formulation, rigorous guarantees, and distribution-free nature, CP has enabled uncertainty estimation across diverse applications, including computer vision \cite{angelopoulos2020uncertainty, bates2021distribution}, causal inference \cite{lei2021conformal, jin2023sensitivity, yin2024conformal}, time series \cite{gibbs2021adaptive, zaffran2022adaptive}, and drug discovery \cite{jin2023selection}. CP leverages a ``calibration'' dataset to output prediction sets for new test samples that provably cover the true outcome with probability at least $1-\alpha$, where $\alpha$ is a user-specified error tolerance. CP is based on a nonconformity measure/score that measures the dissimilarity between a data point and others, reflecting disagreements according to the algorithm's feature-relationship assumptions. Crucially, each nonconformity score can represent a single algorithm, by defining a distinct CP predictor \cite{papadopoulos2008normalized}. Additionally, adding a Residual-Reweighting (RR) factor can refine prediction intervals \cite{papadopoulos2011regression, lei2018distribution}. By assigning weights to errors covariately, RR helps mitigate heteroscedasticity impacts on accuracy and reliability. Overall, carefully constructing the nonconformity measure and incorporating RR are pivotal to prediction performance. 

Here, we propose a new residual-reweighted nonconformity measure for the conformalized graph neural network (RR-GNN) by predicting the expected accuracy independently. We adopt the concept of a normalized nonconformity function \cite{johansson2014regression,kath2021conformal} and local reweighted conformal method \cite{papadopoulos2008normalized}, which involves training a separate model specifically to predict the error of the underlying model, as shown in Figure~\ref{fig: modelpipeline}. There are two GNNs in RR-GNN: the Conformal GNN and the Residual GNN. The Conformal GNN model generates the prediction based on the input and the true label. Using the residuals between this prediction and the true label as the target, a Residual GNN model is trained in tandem with the Conformal GNN. The Residual GNN calibrates the GNN outputs and produces valid and narrow prediction intervals. We apply RR-GNN to edge weight prediction, node classification, and node regression tasks on graph-structured data. 
The RR-GNN helps capture the heteroscedastic nature of the graph data by learning the error structure of the GNN predictions. 
   

In summary, the contributions of this paper are:
\begin{itemize}

\item{We propose a novel framework that integrates conformal prediction with graph neural networks (GNNs) to enhance uncertainty quantification in graph-structured data.}
\item{To address the heteroscedasticity of graph data, we design a novel nonconformity score that reweights residuals using a separately trained GNN.}
\item{We propose a graph-based Mondrian CP, where nodes are clustered based on the graph structure, resulting in more fine-grained, context-aware prediction intervals. }
\item{To prevent information leakage between the primary GNN and residual-prediction GNN, we develop a cross-training strategy where models iteratively update each other. This ensures independence between calibration and training data—a critical requirement for CP validity—while maintaining model performance.}
\end{itemize}


\section{Related Work}
Several GNN-based models have been proposed to handle prediction tasks \cite{kipf2016semi,cai2021line,kollias2022directed}. Kipf et al. \cite{kipf2016semi} introduced an unsupervised representation learning approach utilizing graph neural networks, enabling the extraction of low-dimensional embeddings from graph-structured data. Cai et al. \cite{cai2021line} proposed the line graph neural network (LGNN) for link prediction, which applies line graph theory to convert each edge in the original graph into a node in the line graph. This transformation shifts the link prediction challenge into a node classification task within the line graph. Kollias et al. \cite{kollias2022directed} presented a novel class of auto-encoders tailored for directed graphs (DiGAE). By employing parameterized message-passing modules, this directed graph auto-encoder effectively learns latent representations that capture complex relationships in directed graph data, surpassing existing methods in node classification and link prediction tasks. Additionally, current GNN architectures can be readily adapted for regression tasks by modifying the output layer and selecting a suitable loss function, such as squared error, for the predicted values.

Another notable advancement in GNNs for multi-relational heterogeneous graphs is the Heterogeneous Graph Attention Network (HAN) proposed by Wang et al.~\cite{wang2019heterogeneous}. HAN introduces a hierarchical attention mechanism with node-level and semantic-level attentions, enabling the model to capture the importance of both meta-path-based neighbors and different meta-paths. This approach has demonstrated state-of-the-art performance on various heterogeneous graph analytics tasks. Similarly, Iyer et al.~\cite{iyer2021bi} proposed the Bi-Level Attention Graph Neural Network (BAGNN), which employs a novel bi-level attention mechanism to simultaneously capture node-level and graph-level patterns, enhancing the model's capability in learning complex relationships in heterogeneous data.

Existing GNN methods have primarily focused on achieving highly accurate vote values, which has led to limitations in flexibility, real-time adaptability, and generalization capability. Rather than requiring precise predictions, readers often benefit more from a general range as a reference. Conformal prediction offers prediction intervals instead of a single point estimate, allowing for easier adaptation to changes, real-time updates, better capture of uncertainty in the data, and enhanced generalization. Huang et al. introduced the conformalized graph neural network (CF-GNN), which extends conformal prediction to GNNs~\cite{huang2024uncertainty}.

The use of importance weighting schemes has emerged as a promising approach to enhancing the robustness of conformal prediction GNNs. Recent works have explored several novel reweighting approaches to CP frameworks. Guo et al.\ proposed a causal inference-driven importance weighting technique \cite{guo2017calibration}. Another study by Volpi et al.\ introduced a distribution matching-based reweighting strategy \cite{volpi2018generalizing}. The key idea is to align the training and test data distributions by minimizing the discrepancy between the two distributions, reducing the impact of distribution shifts on GNN performance. Despite the success of GNNs in prediction tasks, existing techniques for uncertainty quantification do not provide rigorous guarantees on the coverage or reliability of their estimated prediction uncertainties \cite{bhagat2011node}. 


\begin{figure}[h]
\centerline{\includegraphics[height=8cm,width=0.5\textwidth,clip=]{figures/figure1_f2.pdf}}
\small
\caption{The overall pipeline of the RR-GNN model. The whole pipeline includes three parts: 1) A conformal graph neural network (GNN) is trained on graph-structured data to perform edge weight prediction, node regression, and node classification, which generates prediction intervals for each task;
2) A separate Residual GNN model is trained on validation data to predict residuals/errors between the true values and Conformal GNN's predictions;
3) A residual reweighting approach integrates the prediction intervals from the Conformal GNN and the residual weights predicted by the Residual GNN. Note that the Conformal GNN model is trained using a cross-training process with the Residual GNN model, and the final prediction of Conformal GNN is corrected by the prediction of Residual GNN.}
\label{fig: modelpipeline}
\end{figure}


\section{Methodology}\label{sec: m}
RR-GNN combines conformal prediction with a GNN-based framework to effectively address graph-structured data tasks. The model accepts a graph as input, either undirected or directed, and yields predictions for edge weights or node features as output. RR-GNN
provides prediction intervals instead of a single point
value, which can better capture the uncertainty in the
data. For new test samples, the predicted interval covers the true outcome with probability at least $1-\alpha$, with rigorous theoretical guarantees. 
%Compared previous methods, which assume homophily of the nodes and assign
%higher weights of the linked nodes, RR-GNN does not require the homophily
%assumption. Additionally, the assumption of homophily may not
%hold in traffic networks, as traffic conditions can vary; for
%example, a small road adjacent to a busy street might experience less traffic.

 RR-GNN performs graph-based Mondrian CP, in which the input graph is clustered to subnetworks to capture the community structure of the graph. It begins by training a predictive model on a training dataset named Conformal GNN, which gives the main prediction according to the tasks. Next, RR-GNN trains a Residual GNN model based on the validation set to predict the residual of Conformal GNN, which is next used to calculate prediction errors used as a reweight factor to establish nonconformity scores, measuring
how unusual the data point looks relative to previous examples. After determining a significance level based on the desired confidence, RR-GNN generates a cluster-specific fixed prediction interval based on the distribution of nonconformity scores in the calibration set of each cluster. The predicted interval of test data is given by the combination of predicted point estimation, residual prediction and the cluster-specific fixed interval. 
Notably, throughout this process, each dataset is divided into four distinct parts: training data, validation data, calibration data, and test data, with only the test data lacking labels. Algorithm~\ref{closs_train} and Figure~\ref{fig: modelpipeline} show the implementation details of training the main and residual models crossly simultaneously~\cite{cowell2006alternative,peste2021ac}. The Conformal GNN model is
trained using a cross-training process with the Residual GNN model. %The efficiency of cross-training of two residual model can be shown in \ref{prop: crosstraining2}.



We clarify the notation here. The input is a graph,  $\displaystyle \gG=(\displaystyle \sV,\displaystyle \mE)$, with node set $\displaystyle \sV$ and edge set $\displaystyle \mE \subseteq \displaystyle \sV \times \displaystyle \sV$.
Assume the graph has $n$ nodes with $m$ features.
Let $\displaystyle \mX \in \displaystyle \R^{n\times m}$ be the node feature matrix, and $\mX_{i,:} \in \displaystyle \R^{m}$ be the feature vector of the $i^{th}$ node. 
The binary adjacency matrix of $\displaystyle \gG$, $ \displaystyle \mA \in \{0, 1\}^{n\times n}$, encodes the binary (unweighted) connecting structure of the graph. For the weighted graph, we use $ \displaystyle \mW \in \displaystyle \R^{n\times n}$ to represent the weighted adjacency matrix.


In RR-GNN, taking the weighted graph as an example, we partition the edge set $\mE$ into four disjoint subsets, $\mE^{train}$, $\mE^{val}$, $\mE^{calib}$, and $\mE^{test}$, such that $\mE = \mE^{train} \cup \mE^{val} \cup \mE^{calib} \cup \mE^{test}$. 
We define
\begin{equation}\label{eq: weighted adj train}
W_{ij}^{\text{train}} =
\begin{cases}
W_{ij}, & \textrm{if } (i, j) \in \mE^{train}; \\
\delta, & \textrm{if } (i, j) \in \mE^{val} \cup \mE^{calib} \cup \mE^{test}; \\
0, & \textrm{otherwise},
\end{cases}
\end{equation}
where $\delta > 0$ is a small positive constant to represent a minimal edge weight.
The Conformal GNN is represented by:
\begin{equation}
\label{eq: CGNN}
    [\hat{\vy},\hat{\vy}^{\alpha/2},\hat{\vy}^{1-\alpha/2}] = g_{\vtheta_{1}}\left( \displaystyle \mW^{train}, \displaystyle \mX \right) ,
\end{equation}
where $\vy$ is the label for a specific task; $g_{\vtheta_{1}}$ represents the GNN-based model mapping the input to the label; and $\hat{\vy}^{\alpha/2}$ and $\hat{\vy}^{1-\alpha/2}$ are the predicted $\alpha/2$ and $1-\alpha/2$ quantiles of the label. The Residual GNN is represented by the following equation:
\begin{equation}
\label{eq: RGAE0}
    \hat{\mR} = g_{\vtheta_{2}}\left( \displaystyle \mW^{val}, \displaystyle \mX \right) ,
\end{equation}
where $\hat{\mR}$ is the predicted residual of Conformal GNN prediction; $g_{\vtheta_{2}}$ represents the GNN-based model mapping the input to the residual. 
Then, we will predict residuals for the calibration set and calculate the nonconformity score
\begin{equation}
\label{eq: ncs}
    \mV = s\left(g_{\vtheta_{1}}\left( \displaystyle \mW^{calib}, \displaystyle \mX \right),g_{\vtheta_{2}}\left( \displaystyle \mW^{calib}, \displaystyle \mX \right) \right),
\end{equation}
where $s(\cdot)$ is the nonconformity function. We then perform a graph-based partition: the nodes of the graph $\gG=(\sV, \mE)$ are clustered into $K$ groups $\mathcal{C}_1, \mathcal{C}_2, \ldots, \mathcal{C}_K$ using Louvain clustering, where $\gG_{(m)}=(\sV_{(m)}, \mE_{(m)})$ is the subgraph for cluster $m$, $m=1,\ldots,K$.
Next, we obtain an interval factor $d_{(m)}$ for each cluster, which is a quantile of the nonconformity scores determined by the significance level.
The predicted interval for a test data point in cluster $m$ is 
\begin{multline}
\label{eq: itv}
    \mC_{(m)} = [l\left(g_{\vtheta_{1}}\left( \displaystyle \mW^{test}, \displaystyle \mX \right),g_{\vtheta_{2}}\left( \displaystyle \mW^{test}, \displaystyle \mX \right) , d_{(m)}\right),\\
    u\left(g_{\vtheta_{1}}\left( \displaystyle \mW^{test}, \displaystyle \mX \right),g_{\vtheta_{2}}\left( \displaystyle \mW^{test}, \displaystyle \mX \right) , d_{(m)} \right) ],
\end{multline}
where $l(.)$ and $u(.)$ represent the lower and upper bound of the prediction interval.

\begin{algorithm}[tb]
   \caption{Cross-Training Algorithm for RR-GNN}
   \label{closs_train}
\begin{algorithmic}
   \STATE {\bfseries Input:} ${Model}_{C0}$ (Conformal GNN) weights $\vtheta_1 \in \displaystyle \R^N$, Residual ${Model}_{C1}$ (Residual GNN) weights $\vtheta_2 \in \displaystyle \R^N$, loop limit $n$
   \FOR{$i=1$ {\bfseries to} $n$ {\bfseries with step} $1$}
      \IF{$i$ is odd}
         \STATE Train ${Model}_{C0}$ with gradients and update $\vtheta_1$ using the training data.
      \ELSE
         \STATE Get the residual from ${Model}_{C0}$ as the label based on validation data.
         \STATE Train Residual ${Model}_{C1}$ with gradients and update $\vtheta_2$ based on validation data.
      \ENDIF
   \ENDFOR
\end{algorithmic}
\end{algorithm}



\subsection{RR-GNN on Edge Weight Prediction}
\subsubsection{Conformal GAE for Edge Weight Prediction } \label{subsec: GAE}
 The GAE \cite{kipf2016variational,ahn2021variational} is used for edge weight prediction tasks by learning node embeddings across various types of graphs, including directed graphs \cite{kollias2022directed}, weighted graphs \cite{zulaika2022lwp}, and graphs with different edge types \cite{samanta2020nevae}. The edge weight is then given by the similarity of node embeddings. There are two kinds of problem settings in link prediction shown in Figure 1 in supplementary material including transductive setting and inductive setting. We focus on the first one. We can see the details in the first part in supplementary material. We integrate CP in GAE's framework by making the encoder produce a triple output.
 We use $\displaystyle \mZ, \mZ^{\alpha/2}$, and $\mZ^{1-\alpha/2}\in \displaystyle \R^{n\times d}$ to represent the mean, $\alpha$/2 quantile, and $(1-\alpha / 2)$ quantile of node embedding matrix obtained from a Conformal GAE model. This differs from having three single-output GAE encoders because most network parameters are shared across the three embeddings.
The resulting embedding is 
\begin{equation}
[\mZ,\mZ^{\alpha/2},\mZ^{1-\alpha/2}] = f_{\vtheta}(\displaystyle \mX, \displaystyle \mA),
\label{eq:gae encoder}
\end{equation}
where $f_{\vtheta}$ is the structure of the encoder, and $\vtheta$ is a learnable parameter. Note that the traditional GNN model is applicable because it can generate a $d$-dimensional output for each node, which represents the node embedding. Directed GAE designed for the directed graph \cite{kollias2022directed} is more flexible, using separate source and target embeddings, $\mZ=[\displaystyle \mZ^S,\mZ^T]$. As for undirected GAE, $\mZ^S=\mZ^T$. It is similar for $\mZ^{\alpha/2}$ and $\mZ^{1-\alpha/2}$.

We take the directed graph as an example for the following description. We next reconstruct the weighted adjacency matrix from the inner product between node embeddings, which is the Conformal GNN-based model. 
\begin{multline}
     g_{\vtheta_1}(\mX,\mA)=[\hat{\displaystyle \mW}, \hat{ \mW}^{\alpha/2}, \hat{\mW}^{1-\alpha/2}]=  [\mZ^S {( \mZ^T)}^\top,\\
     \mZ^{S,\alpha/2} {( \mZ^{T,\alpha/2})}^\top,\mZ^{S,1-\alpha/2} {( \mZ^{T,1-\alpha/2})}^\top].
\end{multline}
where $\hat{\mW}$, $\hat{\mW}^{\alpha / 2}$, and $\hat{\mW}^{1-\alpha / 2}$ are the mean, $\alpha / 2$ quantile, and $(1-\alpha/2)$ quantile of the edge weights.

The loss function $\mathcal{L}_{\mathrm{Conformal}-\mathrm{GNN}}$ is given by:
\begin{multline}
\mathcal{L}_{\mathrm{GAE}} +
   \sum_{(i, j) \in E^{\text{train}}} \Bigl[ \rho_{\alpha / 2}\left(W_{i j}^{\text{train}}, \hat{W}_{i j}^{\alpha / 2}\right) \\
   + \rho_{1-\alpha / 2}\left(W_{i j}^{\text{train}}, \hat{W}_{i j}^{1-\alpha / 2}\right) \Bigr],
\end{multline}
where $\mathcal{L}_{\mathrm{GAE}}$ is the squared error loss defined in~\eqref{eq:Train_GAE}. The second term is the pinball loss~\cite{romano2019conformalized,steinwart2011estimating}, defined as
\[
\rho_\alpha(y, \hat{y}) := \begin{cases}\alpha(y-\hat{y}) & \text{if } y>\hat{y}, \\ (1-\alpha)(\hat{y}-y) & \text{otherwise.}\end{cases}
\]

The first term is added to train the mean estimator, $\hat{\mW}$. 
%Algorithm $\underline{2}$ describes how to obtain the prediction intervals in this setup. Contrary to the CP conformity score (13), the CQR conformity score (18) considers both undercoverage and overcoverage scenarios. 
\begin{equation} \label{eq:Train_GAE}
    \loss_{\textrm{GAE}} = \| \displaystyle \mA^{train} \odot \hat{\displaystyle \mW} -\displaystyle \mW^{train} \|_F.
\end{equation}
%We train the model until convergence and then select the tuning parameters that minimize $\loss_{\textrm{GAE}}$ on the validation set, $\mW^{val}$.

\subsubsection{RR-GNN on Edge Weight Prediction}\label{sec: conformal}
%The RR-GNN framework fundamentally integrates conformal prediction with a graph autoencoder structure. Unlike methods such as  Neighborhood Adaptive Prediction Sets (NAPS) \cite{clarkson2023distribution} and Diffusion Adaptive Prediction Sets (DAPS) \cite{zargarbashi23conformal}, which rely on the homophily assumption of nodes and assign more weight to connected nodes from a test node, RR-GNN doesn't rely on this assumption. The homophily assumption may not hold in numerous scenarios. For instance, in traffic networks \cite{xiao2023spatial}, a minor road adjacent to a busy street might experience lighter traffic, meaning that the nearby nodes do not necessarily carry more weight.

We train a separate Residual GAE model to predict the error of the edge weight prediction of $g_{\vtheta_1}$, 
\begin{equation}
\label{eq: RGAE}
    \hat{R}_{ij}^{val} = g_{\vtheta_{2}}\left((i,j); \displaystyle \mA, \displaystyle \mX \right) ,
\end{equation}
where the label is $R^{val}_{ij} = \Wval_{ij} - \hat{W}_{ij}^{val}$, $\hat{W}_{ij}^{val}=g_{\vtheta_{1}}\left((i,j); \displaystyle \mA^{val},\displaystyle \mX \right)$ is the output of conformal GAE, and the RR-GAE is trained by minimizing
\begin{equation} \label{eq: train RGAE}
    \loss_{\textrm{Residual GNN}} = \| \displaystyle \mA^{val} \odot \hat{\displaystyle \mR}^{val} - \displaystyle \mR^{val} \|_F.  
\end{equation}
\hfill

We use the standard deviation of these predictions as a proxy of the residual. More concretely, we propose a new nonconformity score function, which is the interval of predicted edge weight reweighted according to the absolute value of the residual as predicted by the RR-GNN model~\eqref{eq: RGAE}.
\begin{equation}\label{eq: Vij}
V^{\textrm{RR}}_{ij} = \max\left \{ \frac{\hat{W}^{\alpha/2}_{ij} - \Wcalib_{ij}}{\big|\hat{R}_{ij}\big|}, \frac{\Wcalib_{ij} - \hat{W}^{1-\alpha/2}_{ij}}{\big|\hat{R}_{ij}\big|}  \right\}, \; 
\end{equation}
\begin{equation}
(i, j) \in \displaystyle \mE^{calib},  
\end{equation}
where $\hat{\mW}^{\alpha/2}$ and $\hat{\mW}^{1-\alpha/2}$ are the predicted edge weight quantiles of the Conformal GAE on the calibration set.  
% NOTE(review): k below uses (n/2 + 1), whereas Algorithm 2 uses (|E^calib_(m)| + 1); confirm which is intended.
Let $d^{\textrm{RR}}_{(m)} $ be the $k$-th smallest value in $\{V^{\textrm{RR}}_{ij}|(i,j)\in \mE^{calib}_{(m)}\}$ for cluster $m$, where $k=\lceil(n/2 +1)(1-\alpha)\rceil$ and $n$ is the size of $\displaystyle \mE^{calib}_{(m)}$. 
The RR prediction intervals for cluster m are:
\begin{equation}\label{eq: interval RR R-GAE}
   C_{ab}^{(m)} =  \Big[ \hat{W}^{\alpha/2}_{ab} - d^{\textrm{RR}}_{(m)} \big|\hat{R}_{ab}\big|,  \;
    \hat{W}^{1-\alpha/2}_{ab} + d^{\textrm{RR}}_{(m)}
    \big|\hat{R}_{ab}\big| \Big], \;
\end{equation}
\begin{equation}    
    (a, b) \in \displaystyle \mE^{test}_{(m)}.
\end{equation}
The theoretical guarantees on interval validity can be referenced to~\cite{luo2024conformal}.



\begin{algorithm}[h]
   \caption{Residual Reweighted Conformalized Graph Neural Network for Edge Weight Prediction}
   \label{alg: CQR}
\begin{algorithmic}[1]
   \STATE {\bfseries Input:} The binary adjacency matrix $\displaystyle \mA \in \{0, 1\}^{n\times n}$, edge weight matrix $\displaystyle \mW \in \displaystyle \R^{n\times n}$, node features $\displaystyle \mX\in \displaystyle \R^{n\times m}$, training edges and weights $\displaystyle \mE^{train}$ and $\displaystyle \mW^{train}$, validation edges and weights $\displaystyle \mE^{val}$ and $\displaystyle \mW^{val}$ (used for training Residual GNN), calibration edges and weights $\displaystyle \mE^{calib}$ and $\displaystyle \mW^{calib}$, and test edges $\displaystyle \mE^{test}$, user-specified error rate $\alpha \in (0,1)$, two GNN models $g_{\vtheta_1}$ and $g_{\vtheta_2}$ with trainable parameters $\vtheta_1$ and $\vtheta_2$.
   \STATE Cluster the whole graph $\mE = \mE^{train} \cup \mE^{val} \cup \mE^{calib} \cup \mE^{test}$ into $K$ clusters using Louvain clustering.
   \STATE Train the models $g_{\vtheta_1}$ and $g_{\vtheta_2}$ with $\mA$, $\mX$, $\displaystyle \mW^{train}$ and $\displaystyle \mW^{val}$ according to Algorithm \ref{closs_train}.
   \STATE Predict the interval $[\hat{W}^{\alpha/2}_{ij},\hat{W}^{1-\alpha/2}_{ij}]$ as the output of $g_{\vtheta_1}$ and the residual $\hat{R}_{ij}$ as the output of $g_{\vtheta_2}$ using the calibration data as input.
   \STATE Compute the nonconformity score $V^{\textrm{RR}}_{ij}$ for the calibration data according to equation~\ref{eq: Vij}.
   \STATE Compute $d^{\textrm{RR}}_{(m)}=$ the $k$-th smallest value in $\{V^{\textrm{RR}}_{ij} \mid (i,j)\in \mE^{calib}_{(m)}\}$, where $k=\lceil(|\displaystyle \mE^{calib}_{(m)}| +1)(1-\alpha)\rceil$.
   \STATE Construct a prediction interval for test edges according to equation~\ref{eq: interval RR R-GAE}.
   \STATE {\bfseries Output:} Prediction of confidence intervals for the test edges $(a, b) \in \displaystyle \mE^{test}$ with the coverage guarantee according to equation~\ref{eq: interval RR R-GAE}.
\end{algorithmic}
\end{algorithm}





\begin{table*}[h!]
\centering
\caption{Inefficiency reduction comparison between Ours (RR without Mondrian CP) and baseline methods}
\label{table: IneffReduce}
\begin{tabular}{@{}llccccc@{}}
\toprule
\textbf{Comparison} & \textbf{Model} & \textbf{GraphConv} & \textbf{SAGEConv} & \textbf{GCN} & \textbf{GAT} & \textbf{Average} \\ \midrule
\multirow{3}{*}{\textbf{Ours vs CP}}
    & \textbf{GAE}   & 10.66\% & 10.77\% & 9.49\%  & 8.64\%  & 9.89\%  \\
% NOTE(review): the DiGAE/GCN entry (18.08%) duplicates the SAGEConv entry and is inconsistent with
% the row average (13.62%) and the GCN column average (24.88%), which both imply roughly 11.97--11.98%.
% Please verify against the raw results.
    & \textbf{DiGAE} & 12.26\%  & 18.08\%  & 18.08\%  & 12.17\%  & 13.62\%  \\
    & \textbf{LGNN}  & 48.60\%  & 51.64\%  & 53.17\%  & 41.72\%  & 48.78\%  \\ 
\cline{2-7}
\textbf{Average}    &              & 23.84\%  & 26.83\%  & 24.88\%  & 20.84\%  & -       \\ \midrule
\multirow{3}{*}{\textbf{Ours vs CQR}}
    & \textbf{GAE}   & 5.67\%   & 9.60\%   & 18.21\%  & 8.07\%   & 10.39\%  \\
    & \textbf{DiGAE} & 7.41\%   & 10.15\%  & 8.84\%   & 11.71\%  & 9.53\%   \\
    & \textbf{LGNN}  & 5.38\%   & 8.60\%   & 16.81\%  & 13.34\%  & 11.03\%  \\ 
\cline{2-7}
\textbf{Average}    &              & 6.15\%   & 9.45\%   & 14.62\%  & 11.04\%  & -       \\ \bottomrule
\end{tabular}
\end{table*}

\begin{table*}[h!]
\centering
\caption{Inefficiency reduction comparison between Ours (RR with Mondrian CP) and baseline methods}
\label{table: IneffReduce2}
\begin{tabular}{@{}llccccc@{}}
\toprule
\textbf{Comparison} & \textbf{Model} & \textbf{GraphConv} & \textbf{SAGEConv} & \textbf{GCN} & \textbf{GAT} & \textbf{Average} \\ \midrule
\multirow{3}{*}{\textbf{Ours vs CP}}
    & \textbf{GAE}   & 14.64\% & 14.73\% & 14.37\%  & 13.64\%  & 14.35\%  \\
    & \textbf{DiGAE} & 14.26\%  & 19.08\%  & 18.06\%  & 16.56\%  & 16.99\%  \\
    & \textbf{LGNN}  & 49.98\%  & 52.31\%  & 54.19\%  & 41.97\%  & 49.61\%  \\ 
\cline{2-7}
\textbf{Average}    &              & 26.29\%  & 28.71\%  & 28.87\%  & 24.06\%  & -       \\ \midrule
\multirow{3}{*}{\textbf{Ours vs CQR}}
    & \textbf{GAE}   & 10.63\%   & 14.11\%   & 19.24\%  & 14.26\%   & 14.56\%  \\
    & \textbf{DiGAE} & 11.45\%   & 15.75\%  & 15.89\%   & 16.89\%  & 15.00\%   \\
    & \textbf{LGNN}  & 9.68\%   & 13.62\%   & 21.21\%  & 19.32\%  & 15.96\%  \\ 
\cline{2-7}
\textbf{Average}    &              & 10.59\%   & 14.49\%   & 18.78\%  & 16.82\%  & -       \\ \bottomrule
\end{tabular}
\end{table*}




\subsection{RR-GNN on Node Regression}\label{sec: problem2}
We apply RR-GNN for the node regression task to predict a continuous target variable $y_i$ associated with each node $i$ in a graph. 
First, we train a traditional GNN model (Conformal GNN) for the node regression task using the training set. The GNN model learns a function $f: \gG \rightarrow \displaystyle \R^n$, where $\gG$ is the input graph and $f(\gG)_i$ represents the predicted target variable for node $i$. Node regression minimizes the distance between the model output and the labels:
\begin{align}
\hat{\vy}  &= f_{\text{GNN}}(\mathbf{X}, \mathbf{A}), \label{enbedding_equation} \\
\mathcal{L} &= \lVert \vy - \hat{\vy} \rVert_2^2.
\end{align}
Here, we use the GNN-based model GAE, which generates node embeddings using an encoder function and produces the node regression output using a decoder function.
In the following steps, a Residual GNN predicts the residual of the node labels, which serves as the weight for the non-conformity measure when computing prediction sets. The details of the algorithm are shown in Appendix Algorithm 1. %~\ref{alg: nodd regression}.

\begin{table*}[t]
\caption{Performance comparison of the proposed models}
\label{tab:eff_all_models1gsmg}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\toprule
%GNN Model on Anaheim Data& \multicolumn{2}{c|}{GraphConv} & \multicolumn{2}{c|}{SAGEConv}  & \multicolumn{2}{c|}{GCNConv} & \multicolumn{2}{c|}{GATConv} \\ \cmidrule{1-9}
%Score Method-CP  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
%GAE&$0.9156\std{0.0569} $ 
%&${5.4093}\std{0.6783}$
%&$0.9161\std{0.0617}$
%&$6.2633\std{0.6041}$
%&$0.9273\std{0.0556} $ 
%&$6.3644\std{0.7133}$
%&$0.9264\std{0.0702}$
%&$6.4278\std{0.6963}$\\
%DiGAE&$0.9163\std{0.0590} $ 
%&${5.6076}\std{0.6497}$
%&$0.9143\std{0.0662}$
%&$6.3111\std{0.6624}$
%&$0.9206\std{0.07034} $ 
%&$6.4915\std{0.6903}$
%&$0.9291\std{0.0539}$
%&$6.3954\std{0.0416}$\\
%LGNN&$0.9452\std{0.0287}$ 
%&$6.9076\std{0.2908}$
%&$0.9373\std{0.0360}$
%&$6.4227\std{0.0455}$
%&$0.9365\std{0.0388}$ 
%&${6.3655}\std{0.5026}$
%&$0.9391\std{0.0333}$
%&$6.6962\std{0.3638}$\\ 
%\midrule
%Average & 0.9257 & 5.9748 & 0.9226 & 6.3324 & 0.9281 & 6.4071 & 0.9315 & 6.4398\\
%\midrule
%Score Method-CQR  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
%GAE&$0.9548\std{0.0206}$ 
%&${5.2680}\std{0.3499}$
%&$0.9535\std{0.0285}$
%&$5.8272\std{0.2352}$
%&$0.9576\std{0.0419}$
%&$4.2310\std{1.4752}$
%&$0.9578\std{0.0346}$
%&$4.1396\std{1.3386}$\\
%DiGAE&$0.8984\std{0.0926} $ 
%&${5.0580}\std{0.2792}$
%&$0.8975\std{0.0982}$
%&$5.6483\std{0.2399}$
%&$0.9040\std{0.0873} $ 
%&$5.7600\std{0.2960}$
%&$0.9115\std{0.0691}$  
%&$5.7889\std{0.2722}$\\
%LGNN&$0.9010\std{0.0555}$ 
%&${5.4381}\std{0.1453}$
%&$0.9167\std{0.0480}$
%&$5.9004\std{0.2302}$
%&$0.9333\std{0.0430}$ 
%&$6.1160\std{0.1818}$
%&$0.9080\std{0.0607}$
%&$6.0694\std{0.1861}$ \\ 
%\midrule
%Average & 0.9180 & 5.2547 & 0.9226 & 5.7920 & 0.9316 & 5.3690 & 0.9258 & 5.3326\\
%\midrule  
%Score Method-CQR-RR(Ours)  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
%GAE&$0.9545\std{0.0223} $ 
%&${5.2184}\std{0.0862}$
%&$0.9507\std{0.0224}$
%&$5.1030\std{0.2044}$
%&$0.9543\std{0.0223} $ 
%&$\textbf{3.9605}\std{0.1844}$
%&$0.9501\std{0.0243}$
%&$5.1843\std{0.2138}$\\
%DiGAE&$0.9498\std{0.0353}$ 
%&${5.0672}\std{0.2145}$
%&$0.9394\std{0.0745}$
%&$5.2498\std{0.1524}$
%&$0.9534\std{0.0143}$ 
%&$5.0522\std{0.1653}$
%&$0.9518\std{0.0515}$
%&$5.0513\std{0.1748}$\\
%LGNN&$0.9485\std{0.0553}$
%&${5.0194}\std{0.0816}$
%&$0.9471\std{0.0438}$ 
%&$5.0365\std{0.1846}$
%&$0.9498\std{0.0173}$
%&$5.2534\std{0.1024}$
%&$0.9484\std{0.0342}$
%&$5.0162\std{0.1034}$\\ 
%\midrule
%Average & \underline{\textbf{0.9509}} & \underline{\textbf{5.1017}} & \underline{\textbf{0.9457}} & \underline{\textbf{5.1298}} & \underline{\textbf{0.9525}} & \underline{\textbf{4.7554}} & \underline{\textbf{0.9501}} & \underline{\textbf{5.0839}}\\
%\bottomrule
 %&  &  &  &  &  &  & \\
%\bottomrule
GNN Model On Chicago Data & \multicolumn{2}{c|}{GraphConv} & \multicolumn{2}{c|}{SAGEConv}  & \multicolumn{2}{c|}{GCNConv} & \multicolumn{2}{c|}{GATConv} \\ \cmidrule{1-9}
Score Method-CP  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.7984\std{0.1181}$ 
&$3.6659\std{0.3313}$
&$0.8297\std{0.1264}$
&${3.6350}\std{0.2231}$
&$0.8234\std{0.1213}$ 
&$3.6918\std{0.2454}$
&$0.9524\std{0.0333}$
&$3.3493\std{0.5910}$\\
DiGAE&$0.8081\std{0.1257} $ 
&${3.5721}\std{0.1951}$
&$0.8196\std{0.1215}$
&$3.5978\std{0.1884}$
&$0.8135\std{0.1361} $ 
&$3.5846\std{0.2050}$
&$0.8135\std{0.1319}$
&$3.6346\std{0.2432}$\\
LGNN&$0.9174\std{0.0238}$ 
&$6.7157\std{0.1325}$
&$0.9152\std{0.0256}$
&$6.5865\std{0.1577}$
&$0.9151\std{0.0246}$ 
&$6.5265\std{0.1426}$
&$0.9075\std{0.0618}$
&${6.0679}\std{0.1862}$\\ 
\midrule
Average & 0.8477 & 4.6512 & 0.8548 & 4.5998 & 0.8507 & 4.6010 & 0.8912 & 4.3506 \\
\midrule 
Score Method-CQR  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9514\std{0.0144} $ 
&${3.5041}\std{0.1312}$
&$0.9517\std{0.0141}$
&$3.6075\std{0.2107}$
&$0.9578\std{0.0420}$
&$4.0504\std{1.2916}$
&$0.9524\std{0.0333}$
&$3.3752\std{0.5866}$\\
DiGAE&$0.9205\std{0.0498} $ 
&${3.4171}\std{0.1172}$
&$0.9223\std{0.0469}$
&$3.4391\std{0.1260}$
&$0.9250\std{0.0479} $ 
&$3.4873\std{0.1271}$
&$0.9089\std{0.0611}$
&$3.6485\std{0.2348}$\\
LGNN&$0.9284\std{0.0296}$ 
&${3.7099}\std{0.1029}$
&$0.9305\std{0.0258}$
&$3.6442\std{0.1233}$
&$0.9290\std{0.0284}$ 
&$3.7940\std{0.1050}$
&$0.9379\std{0.0261}$
&$4.3605\std{0.5445}$\\ 
\midrule  
Average & 0.9334 & 3.3716 & 0.9348 & 3.4865 & 0.9373 & 3.7086 & 0.9331 & 3.6752\\
\midrule
Score Method-CQR-RR(Ours)  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9578\std{0.0134} $ 
&${3.1297}\std{0.1401}$
&$0.9578\std{0.0189}$
&$3.0985\std{0.1478}$
&$0.9527\std{0.0123}$ 
&$3.1614\std{0.1622}$
&$0.9520\std{0.0145}$
&$ \textbf{2.8927}\std{0.1223}$\\
DiGAE&$0.9513\std{0.0415} $ 
&$\textbf{3.0262}\std{0.1412}$
&$0.9501\std{0.0312}$
&$\textbf{2.8976}\std{0.1393} $
&$0.9507\std{0.0456}$
&$2.9347\std{0.1139}$
&$0.9442\std{0.0735}$ 
&$3.0321\std{0.2134}$\\
LGNN&$0.9438\std{0.0396}$
&${3.3562}\std{0.0355}$
&$0.9473\std{0.0423}$ 
&$3.1422\std{0.0423}$
&$0.9497\std{0.0323}$
&$\textbf{2.9913}\std{0.0732}$
&$0.9507\std{0.0324}$
&$3.5195\std{0.1231}$\\
%\midrule
%Average & \underline{\textbf{0.9493}} & \underline{\textbf{3.1721}} & \underline{\textbf{0.9507}} & \underline{\textbf{3.0315}} & \underline{\textbf{0.9516}} & \underline{\textbf{3.0545}} & \underline{\textbf{0.9492}} & \underline{\textbf{3.2741}}\\
\bottomrule

\end{tabular}    
\end{adjustbox}
\vspace{0.0005in}
\justify
 The results are based on the conditional coverage (equation 10 in supplementary material) and inefficiency (equation 9 in supplementary material) on the edge weight prediction task. The complete table is shown as Table 1 in the supplementary material. The models were tested using several widely-used graph convolutional layers, including GraphConv 
 \cite{morris2019weisfeiler}, SAGEConv \cite{hamilton2017inductive}, GCNConv \cite{kipf2016semi}, and GATConv \cite{velivckovic2017graph}. The best conditional coverage and inefficiency for each graph convolutional layer are highlighted in bold. Across diverse datasets and graph convolutional layers, CQR-GAE and CQR-RR-GAE demonstrate strong performance in both inefficiency and conditional coverage, while CQR-DiGAE and CQR-RR-DiGAE excel in minimizing inefficiency. 
\end{table*}

\subsubsection{RR-GNN on Node Classification}\label{sec: problem3}
The node classification problem is a fundamental task in graph-based machine learning, where the goal is to predict a discrete label or class for each node in a given graph.

We first train the Conformal GNN model, a function $f: \displaystyle \gG \rightarrow \displaystyle \{1,\dots,K\}^n$, that maps the node features to the corresponding labels with $K$ classes.
Compared with regression, node classification uses the cross-entropy loss as the loss function:
\begin{align}
\hat{\vy} &= f_{\text{GNN}}(\mX,\mA) \\
\mathcal{L} &= -\frac{1}{N}\sum_{i=1}^{N}\sum_{k=1}^{K}y_{i,k}\log(\hat{y}_{i,k})
\end{align}
We then compute the residual between the predicted value and the actual label, which serves as the label for the Residual GNN. We apply a softmax operation to obtain a vector representing the probability of the node belonging to each class:
\begin{align}
\hat{\vl} &= \text{softmax}(\hat{\vy})
\end{align}
The residual $\hat{\vr}$ is obtained by:
\begin{align}
\hat{\vr} &= \hat{\vl}- \vy
\end{align}
where $\vy$ is the one-hot label indicating the class to which the node belongs. 
% ~\ref{alg: nodd classification} 
In Algorithm 2 in supplementary material, we set a small positive real number $\epsilon$ ($10^{-9}$) to prevent the denominator from being zero.  In addition, we need the differentiable quantile method in equation (21). Since the non-conformity score is usually differentiable, only a differentiable quantile calculation is required, for which well-established methods are available \cite{chernozhukov2010quantile,blondel2020fast}.


\section{Results} \label{sec: r}
\subsection{Empirical analysis}\label{sec: empirical}
In this section, we showcase the application of the proposed RR-GNN on 15 datasets for edge weight prediction, node regression, and node classification problems. We conduct a comparative
analysis of the performance of RR-GNN and four competitors based on two metrics. For the data split, 30\% of the data was designated for training, 30\% for validation, 20\% for testing, and the remaining 20\% for calibration. One example of parameter setting in edge weight prediction on Chicago data is shown in Table 2 in supplementary material.


% \subsection{Transportation Network Snapshot}
\vspace{0.1in}
\noindent
\textbf{Datasets:} To evaluate the effectiveness of the RR-GNN algorithm, we conduct experiments on four categories of benchmark graph datasets: 1) traffic datasets, 2) citation connection datasets, 3) social network datasets, and 4) additional datasets.  

We apply RR-GNN to the traffic network and traffic flow data from 
Chicago and Anaheim to predict edge weights and node-level traffic volume \cite{bar2021transportation}. The Chicago dataset consists of 541 nodes representing road junctions and 2150 edges representing road segments with directions, while the Anaheim dataset consists of 413 nodes and 858 edges.
In this context, each node is characterized by a two-dimensional feature $\mX_{i,:}\in \displaystyle \R^{2}$ representing its coordinates, and each edge is associated with a weight that signifies the traffic volume passing through the corresponding road segment. For the citation datasets, we use three widely used citation networks: Cora, PubMed, and CiteSeer. We apply RR-GNN to paper classification and citation prediction. Social network datasets like Twitch, CS, and Physics have become increasingly important resources for graph machine-learning research.
 
 %The dataset includes information on 168,114 users and 4,949,552 connections between them, representing activities such as friendships, channel subscriptions, and chat interactions. Each user is represented by a set of features describing their platform activity, such as the number of followers, videos watched, and channels subscribed to. 
%The CS dataset focuses on the social network of researchers in the computer science domain. With 18,772 nodes (researchers) and 81,894 edges (co-authorship relationships), this dataset provides rich node features such as publication history, research topics, and academic positions (e.g., professor, student, staff). Analyzing this dataset can yield insights into the collaboration patterns and academic hierarchies within the computer science research community. The Physics dataset captures the social network of researchers in the physics domain, with 34,546 nodes (researchers) and 420,877 edges (citations and collaborations). Each researcher is represented by features like their publication venues, citations, and research areas, as well as labels indicating their academic rank (e.g., junior, senior). This dataset enables the study of knowledge diffusion and academic status within the physics research community using graph-based methods.


%We adopt a similar data partitioning procedure from \cite{jia2020residual, huang2023uncertainty}, where we allocate 50\% of the data for the training set $\Etrain$, 10\% for the validation set $\Eval$, and the remaining 40\% for the combined calibration and test set $\Ect$. Figure \ref{fig: Chicago} provides an example of how the Chicago network data is divided into these different sets.

%The figure also depicts the prediction outcome of our proposed Equivariant Residual Conformal Quantile Regression with Graph Autoencoder (RRC-CQR-GAE) model, as described in Algorithm \ref{alg: TAR}. The middle plot shows the predicted edge weights, while the right-hand plot illustrates the width of the prediction intervals.
%By applying our RRC-CQR-GAE model to these real-world transportation datasets, we can provide users with reliable and locally adaptive prediction intervals for the edge weights, enabling more informed decision-making in transportation planning and management.


\begin{figure}[ht]
\centerline{\includegraphics[height=4cm,width=0.45\textwidth,clip=]{figures/Figure1-F2.png}}
\small
\caption{The prediction intervals for node regression generated by CF-GNN and RR-GAE. The x-axis represents the nodes, sorted by label. The y-axis represents the prediction intervals of the nodes. The error rate $\alpha$ is 0.05. Blue and red represent the results of RR-GAE and CF-GNN, respectively. }

\label{fig: comparisonnr}
\end{figure}

\begin{figure}[h]
\centerline{\includegraphics[height=5cm,width=0.5\textwidth,clip=]{figures/nc-r2.pdf}}
\small
\caption{The histogram of predicted values of the node classification task on the PubMed dataset. The error rate $\alpha$ is 0.05. Each sub-figure represents one class from 0 to 5. The x-axis is the predicted value from the model. The y-axis is the frequency corresponding to the predicted value. Blue and yellow represent the results of RR-GAE and CF-GNN, respectively. }
\label{fig: comparisonnc}
\end{figure}

\textbf{Metrics:} We use inefficiency (ineff) and worst-slab coverage (WSC) as evaluation metrics (details of ineff and WSC are given in the Appendix). Lower ineff and higher WSC indicate better performance. To estimate WSC, we independently sample 1000 vectors $v$ from the unit sphere in $\displaystyle \R^{2m}$ space. The parameters $a, b, \delta$ are fine-tuned via grid search. Additionally, 25\% of the test data is utilized to estimate the optimal $v, a, b, \delta$ values. The conditional coverage is then calculated on the remaining 75\% of the test data. 

\begin{table*}[h]
\caption{Results of Ours (RR-GNN) on Node Regression Datasets}
\label{tab:eff_all_models12}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\toprule
Dataset& \multicolumn{2}{c|}{GraphSAGE} & \multicolumn{2}{c|}{SGC}  & \multicolumn{2}{c|}{GCN} & \multicolumn{2}{c|}{GAT} \\ \cmidrule{1-9}
 Metrics & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
Anaheim: CF-GNN&$0.9520\std{0.0669} $ 
&$\textbf{1.9231}\std{0.0483}$
&$0.9559\std{0.0617}$
&$2.2031\std{0.0241}$
&$0.9519\std{0.0531}$ 
&$2.3782\std{0.0533}$
&$0.9523\std{0.0302}$
&$2.1499\std{0.0463}$\\
Anaheim: RR-GAE&$\textbf{0.9543}\std{0.0320} $ 
&${1.9647}\std{0.0197}$
&$\textbf{0.9577}\std{0.0657}$
&$\textbf{2.0188}\std{0.0246}$
&$\textbf{0.9585}\std{0.0413}$ 
&$\textbf{2.2179}\std{0.0254}$
&$\textbf{0.9638}\std{0.0302}$
&$\underline{\textbf{1.8996}}\std{0.0249}$\\ \midrule
Chicago: CF-GNN&$0.9494\std{0.0519} $ 
&${2.3426}\std{0.0384}$
&$0.9486\std{0.0247}$
&$1.0423\std{0.0372}$
&$0.9505\std{0.0447}$
&$2.0456\std{0.0443}$
&$0.9508\std{0.0569}$
&$1.1396\std{0.0686}$\\
Chicago: RR-GAE&$\textbf{0.9476}\std{0.0426} $ 
&$\textbf{2.2291}\std{0.0325}$
&$\textbf{0.9546}\std{0.0328}$
&$\textbf{1.2012}\std{0.0251}$
&$\textbf{0.9538}\std{0.0356}$ 
&$\underline{\textbf{1.5769}}\std{0.0252}$
&$\textbf{0.9540}\std{0.0362}$  
&$\textbf{1.1283}\std{0.0256}$\\ \midrule  
Education: CF-GNN&$0.9501\std{0.0242} $ 
&${2.3808}\std{0.0427}$
&$0.9500\std{0.0285}$
&$2.4892\std{0.0351}$
&$0.9483\std{0.0408}$
&$2.4380\std{0.0442}$
&$0.9502\std{0.0392}$
&$2.4209\std{0.0376}$\\
Education: RR-GAE&$\textbf{0.9599}\std{0.0417} $ 
&$\textbf{2.0573}\std{0.0280}$
&$\textbf{0.9586}\std{0.0225}$
&$\textbf{2.0447}\std{0.0239}$
&$\textbf{0.9580}\std{0.0333}$ 
&$\textbf{1.8731}\std{0.0260}$
&$\textbf{0.9594}\std{0.0386}$  
&$\underline{\textbf{1.9080}}\std{0.0221}$\\ \midrule
Election: CF-GNN&$0.9498\std{0.0211} $ 
&${0.9268}\std{0.0429}$
&$0.9495\std{0.0215}$
&$0.9279\std{0.0302}$
&$0.9506\std{0.0473}$
&$0.9009\std{0.0282}$
&$0.9488\std{0.0363}$
&$0.9136\std{0.0681}$\\
Election: RR-GAE&$\textbf{0.9558}\std{0.0215}$ 
&$\textbf{0.9213}\std{0.0279}$
&$\textbf{0.9567}\std{0.0242}$
&$\textbf{0.9487}\std{0.0259}$
&$\textbf{0.9510}\std{0.0432} $ 
&$\textbf{0.9343}\std{0.0341}$
&$\textbf{0.9567}\std{0.0317}$  
&$\underline{\textbf{0.6698}}\std{0.0201}$\\ \midrule
Income: CF-GNN&$0.9512\std{0.0264} $ 
&${2.7580}\std{0.0342}$
&$0.9504\std{0.0405}$
&$2.4892\std{0.0302}$
&$0.9511\std{0.0250}$
&$2.5272\std{0.0318}$
&$0.9508\std{0.0329}$
&$2.4396\std{0.0328}$\\
Income: RR-GAE&$\textbf{0.9552}\std{0.0618}$ 
&$\textbf{2.1003}\std{0.0492}$
&$\textbf{0.9519}\std{0.0513}$
&$\textbf{1.9616}\std{0.0358}$
&$\textbf{0.9566}\std{0.0501}$ 
&$\textbf{1.9203}\std{0.0354}$
&$\textbf{0.9545}\std{0.0347}$  
&$\underline{\textbf{1.8555}}\std{0.0423}$\\ \midrule
Unemploy: CF-GNN&$0.9526\std{0.0415}$ 
&${2.2298}\std{0.0523}$
&$0.9510\std{0.0320}$
&$2.4587\std{0.0491}$
&$0.9506\std{0.0294}$
&$2.5013\std{0.0326}$
&$0.9502\std{0.0354}$
&$2.4332\std{0.0376}$\\
Unemploy: RR-GAE&$\textbf{0.9569}\std{0.0419}$ 
&$\textbf{2.0816}\std{0.0218}$
&$\textbf{0.9517}\std{0.0313}$
&$\textbf{2.0534}\std{0.0367}$
&$\textbf{0.9523}\std{0.0369}$ 
&$\textbf{2.0480}\std{0.0190}$
&$\textbf{0.9523}\std{0.0448}$  
&$\underline{\textbf{1.9503}}\std{0.0312}$\\ \midrule
Twitch: CF-GNN&$\textbf{0.9524}\std{0.0443} $ 
&$\textbf{2.6634}\std{0.0365}$
&$0.9523\std{0.0392}$
&$2.6835\std{0.0394}$
&$0.9529\std{0.0257} $ 
&$2.5409\std{0.0404}$
&$0.9515\std{0.0275}$
&$2.6243\std{0.0460}$\\
Twitch: RR-GAE&$0.9515\std{0.0367}$ 
&${5.0491}\std{0.0513}$
&$\textbf{0.9541}\std{0.0284}$
&$\underline{\textbf{2.1005}}\std{0.0189}$
&$\textbf{0.9571}\std{0.0219}$ 
&$\textbf{2.2398}\std{0.0225}$
&$\textbf{0.9535}\std{0.0280}$
&$\textbf{2.1353}\std{0.0262}$\\ \bottomrule

\end{tabular}    
    \end{adjustbox}    
\end{table*}

\textbf{Models and baselines:}
We consider three base GNN models: 1) GAE (Section \ref{subsec: GAE}), 2) the line graph neural network (LGNN \cite{cai2021line}), and 3) a directed variant of GAE, called DiGAE.  We use two nonconformity scores: 1) CP \cite{huang2024uncertainty} and 2) conformal quantile regression (CQR). For the encoder in the base GNN models, we choose four different structures: 1) GraphSAGE \cite{hamilton2017inductive}, 2) GraphConv \cite{morris2019weisfeiler}, 3) GCN \cite{kipf2016semi}, and 4) GAT \cite{velivckovic2017graph}. We name the model that combines conformal prediction (CP, as described in \cite{huang2024uncertainty}) with the graph autoencoder (GAE, Section \ref{subsec: GAE}) CP-GAE. Similarly, we name the other models by combining the nonconformity score with the GNN model. For example, the model that uses CQR with GAE is referred to as CQR-GAE. Additionally, if the residual reweighting approach is applied on top of the CQR approach, we add RR to the name of the model. For example, the residual reweighted CQR-GAE is referred to as CQR-RR-GAE. The baseline models are the above models without residual reweighting.

We use four popular graph neural network (GNN) model structures for encoder and decoder - GCN \cite{kipf2016semi}, GraphConv \cite{morris2019weisfeiler}, GAT \cite{velivckovic2017graph}, and GraphSAGE \cite{hamilton2017inductive} - as the base graph convolution layers for both the CP and CQR based models.

\textbf{Experimental results\footnote{The code is available at GitHub.}:} 
For the task of edge weight prediction, we ran the experiment 10 times and split the data into training, validation, and the combined calibration and test sets for each dataset and model. We conduct 100 random splits of calibration and testing edges to run the baseline models and RR-GNN and to evaluate the empirical coverage. Our method achieved a 6.15\% to 28.87\% reduction in inefficiency (interval length) on the task of edge weight prediction, as shown in Tables 1 and 2. In addition, we conducted a paired t-test on the mean inefficiency values for our method (RR) compared to CP and CQR, taking the mean value of each (12 values in total in Table 3) after repeating the experiments 10 times. The p-values are 0.00035 and 0.00024. Coverage is defined as the probability that the ground truth value lies within the predicted confidence interval. Our method allows control of the coverage through the parameter $\alpha$, where the expected coverage is $1-\alpha$. In our manuscript, we set $\alpha = 0.05$, corresponding to a target coverage of 0.95. As shown in Tables 1--3 of the manuscript, the empirical coverages achieved by our method are close to 0.95, indicating that the coverage is well controlled and aligns with the expected theoretical value. 

We evaluated the conditional coverage (Equation 10 in supplementary material) of RR-GNN and baseline methods on the Chicago and Anaheim traffic datasets for the edge weight prediction task. %Basic models are 1) GAE, 2) DiGAE and 3) LGNN. Encoder structures are 1)  GraphCon, 2) SAGEConv, 3) GCNConv and 4) GATConv.
The results presented in Table~\ref{tab:eff_all_models1gsmg} show that, overall, the RR-GNN models outperform the others in terms of inefficiency (as defined in Equation 9 in supplementary material) and conditional coverage (Equation 10 in supplementary material). This indicates that the RR variants can strike a better balance between capturing the uncertainty in the predictions and maintaining a high level of accuracy. We also find from Table 1 that GAE and LGNN outperform DiGAE, highlighting the efficacy of the autoencoder approach in weight prediction. 
We showcase the prediction intervals produced by the LGNN model with RR (CQR) in Figure~\ref{fig: comparisonwep}.  Furthermore, Figure~\ref{fig: comparisonwep} also illustrates the adaptability of the RR models, which generate the smallest prediction intervals, with sizes that vary in line with the data characteristics.



%It is worth noting that for the RR approach, tuning the regularization hyperparameter can be a notably challenging task. The performance of the RR models is highly sensitive to the choice of this hyperparameter.
%These findings suggest that the benefits of incorporating the RR approach may be context-dependent and require careful hyperparameter tuning to achieve the desired balance between coverage and efficiency in transportation network applications.

For the node regression task, we apply the models to 7 different datasets: 1) Anaheim traffic dataset, 2) Chicago traffic dataset, 3) Education dataset, 4) Election dataset, 5) Income dataset, 6) Unemploy dataset, and 7) Twitch dataset. Table~\ref{tab:eff_all_models12} shows that our method outperforms the baseline model, CF-GNN \cite{huang2024uncertainty}, both on WSC score (coverage) and inefficiency on 7 datasets.  Besides, the visualization result in Figure~\ref{fig: comparisonnr} shows that our intervals are smaller than those from CF-GNN. Similarly, for the node classification task, we compared RR-GNN and CF-GNN on 8 datasets: 1) Cora, 2) DBLP, 3) CiteSeer, 4) PubMed, 5) Computer, 6) Photo, 7) CS and 8) Physics. Numerical and visual results can be seen in Table 3 and Figure~\ref{fig: comparisonnc}. Our model achieves better results in both accuracy and inefficiency.

In summary, leveraging the RR-based models can generate prediction intervals that are both efficient and well-calibrated, making them a more suitable choice for different network-based tasks, especially real-world transportation applications where accurate and reliable predictions are crucial for informed decision-making. 



%\begin{figure*}[h]
%    \centering
%        \includesvg[height=3.05cm]{figures/us_election_resultsOriFFF}
%        \caption{Original SVG Image}
%        \label{fig:original}
%    \hfill
%        \includesvg[height=3.5cm]{figures/us_election_resultsOursFFF}
%        \caption{Our SVG Image}
%        \label{fig:our}
%    \caption{Comparison of SVG Images}
%    \label{fig:comparison}
%\end{figure*}

Furthermore, we compared the predicted residuals of CF-GNN and RR-GNN. Figure 3 in the supplementary material shows the predicted residual of the roads' traffic volume for Chicago and Anaheim. The residual value of CF-GNN is higher than that of RR-GAE. Figure 2 in supplementary material also shows the residual of the U.S. election results, where we can see that the global residual/difference between the model output and ground truth from RR-GAE is much lower than that of the baselines. %We also conducted an ablation study to assess the impact of initially setting the edge weights for the validation, calibration, and test edge sets on edge weight prediction tasks on Anaheim and Chicago datasets. We tried three initial settings: 1) set all weights as 0, 2) set all weights as average, and 3) set all weights as a random number. The results of the ablation study can be found in Table~\ref{tab:ablationstudy} in the supplemental material. Overall, the second initialization method is the best choice.





%\begin{tabular}{llllll}
%\label{ours_result2}
%Model & GraphConv & SGAEConv & GCN & GAT\\
%\hline
%GAE & $5.98\%$ & $10.34\%$ & $19.83\%$ & $10.07\%$\\
%DiGAE & $9.76\%$ & $12.74\%$ & $10.46\%$ & $13.62\%$\\
%LGNN & $5.38\%$ & $8.60\%$ & $16.81\%$ & $13.34\%$\\
%\hline
%Average & $6.15\%$ & $9.45\%$ & $14.62\%$ & $11.04\%$
%\end{tabular}
%\begin{tabular}{lllll} 
%Dataset & Node & Edge & Time & Ineff \\
%\hline Anaheim & 413 & 858 & 0.0343 s & 3.6918 \\
%\hline Election & 3163 & 6003 & 0.2769 s & 0.7749 \\
%\hline Twitch & 7,126 & 35,324 & 0.7524 s & 2.2638 \\
%\hline DBLP & 318,120 & $1,049,866$ & 5.1746 s & 3.2316
%\end{tabular}

%From the results shown in the table, we observed that the computation time remains acceptable, as the complexity grows sublinearly with the increasing number of nodes and edges across different datasets. Furthermore, the performance (inefficiency) remains stable and does not degrade as the size of the data grows, demonstrating the strong scalability and efficiency of our method.

%\begin{tabular}{llllll}
%Model & GraphConv & SGAEConv & GCN & GAT \\
%\hline 
%GAE & 11.32\% & 10.91\% & 10.96\% & 9.92\% \\
%DiGAE & 13.26\% & 19.01\% & 19.72\% & 13.28\%  \\
%LGNN & 49.10\% & 51.63\% & 53.17\% & 42.72\%  \\
%\hline
%Average & 25.84\% & 27.81\% & 25.82\% & 23.88\%

%\end{tabular}


\begin{figure}[htbp]
\centerline{\includegraphics[height=5cm,width=0.5\textwidth,clip=]{figures/edge-weight-interval.jpg}}
\small
\caption{The graph shows the traffic volume prediction intervals generated on the Chicago traffic dataset. All methods set their error rate $\alpha$ at 0.05. The x-axis represents individual roads sorted by their actual/ground truth traffic volumes. The y-axis represents the predicted intervals. Different colors distinguish the results from different prediction methods. }
\label{fig: comparisonwep}
\end{figure}






%\begin{table}[H]
%\caption{Ablation Study of CQR Score Results on %the Dataset of Anaheim and Chicago}
%\label{tab:ablationstudy}
%\begin{adjustbox}{width=0.49\textwidth}
%\begin{tabular}{|l|c|c|c|c|}
%\toprule
%GNN Model & \multicolumn{2}{c|}{Anaheim Dataset} %& \multicolumn{2}{c|}{Chicago Dataset}   \\ %\cmidrule{1-5}
%Set All 0  & cover$^x$  & ineff & cover$^x$ & ineff \\\midrule				%
%GAE&${0.8908}\std{0.0627} $ 
%&${5.2968}\std{0.3054}$	
%&${0.9001}\std{0.0513}$
%&${3.4034}\std{0.1034}$\\	
%DiGAE&${0.9063}\std{0.0713} $ 
%&${5.1402}\std{0.3131}$
%&${0.9219}\std{0.0474}$ 
%&${3.4093}\std{0.1410}$\\ \midrule
 
%Set Means  & cover$^x$  & ineff & cover$^x$ & ineff \\\midrule
%GAE&${0.8909}\std{0.0980}$ 
%&${5.1462}\std{0.2833}$
%&${0.9017}\std{0.0543}$
%&${3.2502}\std{0.0987}$\\
%DiGAE&${0.9094}\std{0.1182} $ 
%&${\textbf{4.9396}}\std{0.3067}$
%&${0.9253}\std{0.0425}$ 
%&${\textbf{3.1729}}\std{0.1278}$\\  \midrule  
%Set Random Choose  & cover$^x$  & ineff & cover$^x$ & ineff\\\midrule		
%GAE&${0.8941}\std{0.0601}$
%&${5.1863}\std{0.2914}$ 
%&${0.9048}\std{0.0393}$
%&${3.2912}\std{0.0870}$\\
%DiGAE&${0.9106}\std{0.0896} $ 
%&${5.0127}\std{00.3143}$		
%&${0.9273}\std{0.0455}$
%&${3.1938}\std{0.1035}$\\\bottomrule
%\end{tabular}    
%    \end{adjustbox}
%\end{table}

\section{Conclusion}\label{sec: conclusion}
This paper introduces the Residual Reweighted Conformal Prediction Graph Neural Network (RR-GNN), which enhances graph neural networks (GNNs) by integrating conformal prediction (CP). While traditional GNNs yield point predictions, RR-GNN provides predictive regions reflecting varying confidence levels. Existing nonconformity measures often produce uniform-width regions, ignoring the fact that prediction difficulty differs across examples. RR-GNN overcomes this by employing a novel residual reweighting nonconformity measure that adjusts the width of each predictive region based on the expected accuracy for that example. We validate RR-GNN's effectiveness on 15 datasets, including real-world transportation and social networks, across tasks such as edge weight prediction and node classification. RR-GNN consistently delivers tighter predictive regions, higher accuracy, and improved efficiency compared to standard GNN methods, advancing uncertainty-aware predictions in graph machine learning.



\begin{thebibliography}{}
\setlength{\itemindent}{-\leftmargin}
\makeatletter\renewcommand{\@biblabel}[1]{}\makeatother
\bibitem{lam2022graphcast} Lam, Remi and Sanchez-Gonzalez, Alvaro and Willson, Matthew and Wirnsberger, Peter and Fortunato, Meire and Alet, Ferran and Ravuri, Suman and Ewalds, Timo and Eaton-Rosen, Zach and Hu, Weihua and others (2022).
    \newblock GraphCast: Learning skillful medium-range global weather forecasting.
    \newblock \textit{arXiv preprint arXiv:2212.12794}.

\bibitem{li2022graph} Li, Michelle M and Huang, Kexin and Zitnik, Marinka (2022).
    \newblock Graph representation learning in biomedicine and healthcare.
    \newblock \textit{Nature Biomedical Engineering} \textbf{6}, 1353--1369.

\bibitem{wu2022graph} Wu, Shiwen and Sun, Fei and Zhang, Wentao and Xie, Xu and Cui, Bin (2022).
    \newblock Graph neural networks in recommender systems: a survey.
    \newblock \textit{ACM Computing Surveys} \textbf{55}, 1--37.

\bibitem{hsu2022makes} Hsu, Hans Hao-Hsun and Shen, Yuesong and Tomani, Christian and Cremers, Daniel (2022).
    \newblock What makes graph neural networks miscalibrated?
    \newblock \textit{Advances in Neural Information Processing Systems} \textbf{35}, 13775--13786.

\bibitem{zhang2020mix} Zhang, Jize and Kailkhura, Bhavya and Han, T Yong-Jin (2020).
    \newblock \textit{Mix-n-match: Ensemble and compositional methods for uncertainty calibration in deep learning}.
    \newblock International conference on machine learning.
\bibitem{lakshminarayanan2017simple} Lakshminarayanan, Balaji and Pritzel, Alexander and Blundell, Charles (2017).
    \newblock Simple and scalable predictive uncertainty estimation using deep ensembles.
    \newblock \textit{Advances in neural information processing systems} \textbf{30}.

\bibitem{wang2021confident} Wang, Xiao and Liu, Hongrui and Shi, Chuan and Yang, Cheng (2021).
    \newblock Be confident! towards trustworthy graph neural networks via confidence calibration.
    \newblock \textit{Advances in Neural Information Processing Systems} \textbf{34}, 23768--23779.
    
\bibitem{vovk2005algorithmic} Vovk, Vladimir and Gammerman, Alexander and Shafer, Glenn (2005).
    \newblock \textit{Algorithmic learning in a random world}.
    \newblock Springer.
\bibitem{angelopoulos2020uncertainty} Angelopoulos, Anastasios and Bates, Stephen and Malik, Jitendra and Jordan, Michael I (2020).
    \newblock Uncertainty sets for image classifiers using conformal prediction.
    \newblock \textit{arXiv preprint arXiv:2009.14193}.
\bibitem{bates2021distribution} Bates, Stephen and Angelopoulos, Anastasios and Lei, Lihua and Malik, Jitendra and Jordan, Michael (2021).
    \newblock Distribution-free, risk-controlling prediction sets.
    \newblock \textit{Journal of the ACM (JACM)} \textbf{68}, 1--34.
\bibitem{lei2021conformal} Lei, Lihua and Cand{\`e}s, Emmanuel J (2021).
    \newblock Conformal inference of counterfactuals and individual treatment effects.
    \newblock \textit{Journal of the Royal Statistical Society Series B: Statistical Methodology} \textbf{83}, 911--938.
\bibitem{jin2023selection} Jin, Ying and Cand{\`e}s, Emmanuel J (2023).
    \newblock Selection by prediction with conformal p-values.
    \newblock \textit{Journal of Machine Learning Research} \textbf{24}, 1--41.
\bibitem{jin2023sensitivity} Jin, Ying and Ren, Zhimei and Cand{\`e}s, Emmanuel J (2023).
    \newblock Sensitivity analysis of individual treatment effects: A robust conformal inference approach.
    \newblock \textit{Proceedings of the National Academy of Sciences} \textbf{120}, e2214889120.
\bibitem{yin2024conformal} Yin, Mingzhang and Shi, Claudia and Wang, Yixin and Blei, David M (2024).
    \newblock Conformal sensitivity analysis for individual treatment effects.
    \newblock \textit{Journal of the American Statistical Association} \textbf{119}, 122--135.
\bibitem{gibbs2021adaptive} Gibbs, Isaac and Cand{\`e}s, Emmanuel (2021).
    \newblock Adaptive conformal inference under distribution shift.
    \newblock \textit{Advances in Neural Information Processing Systems} \textbf{34}, 1660--1672.
\bibitem{zaffran2022adaptive} Zaffran, Margaux and F{\'e}ron, Olivier and Goude, Yannig and Josse, Julie and Dieuleveut, Aymeric (2022).
    \newblock \textit{Adaptive conformal predictions for time series}.
    \newblock International Conference on Machine Learning.
\bibitem{papadopoulos2008normalized} Papadopoulos, Harris and Gammerman, Alex and Vovk, Volodya (2008).
    \newblock \textit{Normalized nonconformity measures for regression conformal prediction}.
    \newblock Proceedings of the IASTED International Conference on Artificial Intelligence and Applications (AIA 2008).
\bibitem{papadopoulos2011regression} Papadopoulos, Harris and Vovk, Vladimir and Gammerman, Alexander (2011).
    \newblock Regression conformal prediction with nearest neighbours.
    \newblock \textit{Journal of Artificial Intelligence Research} \textbf{40}, 815--840.
\bibitem{lei2018distribution} Lei, Jing and G'Sell, Max and Rinaldo, Alessandro and Tibshirani, Ryan J and Wasserman, Larry (2018).
    \newblock Distribution-free predictive inference for regression.
    \newblock \textit{Journal of the American Statistical Association} \textbf{113}, 1094--1111.
\bibitem{kipf2016semi} Kipf, Thomas N and Welling, Max (2016).
    \newblock Semi-supervised classification with graph convolutional networks.
    \newblock \textit{arXiv preprint arXiv:1609.02907}.
\bibitem{cai2021line} Cai, Lei and Li, Jundong and Wang, Jie and Ji, Shuiwang (2021).
    \newblock Line graph neural networks for link prediction.
    \newblock \textit{IEEE Transactions on Pattern Analysis and Machine Intelligence} \textbf{44}, 5103--5113.

\bibitem{wang2019heterogeneous} Wang, Xiao and Ji, Houye and Shi, Chuan and Wang, Bai and Ye, Yanfang and Cui, Peng and Yu, Philip S (2019).
    \newblock Heterogeneous graph attention network.
    \newblock \textit{The world wide web conference} \textbf{2019}, 2022--2032.

\bibitem{iyer2021bi} Iyer, Roshni G and Wang, Wei and Sun, Yizhou (2021).
    \newblock Bi-level attention graph neural networks.
    \newblock \textit{2021 IEEE International Conference on Data Mining (ICDM)} \textbf{2021}, 1126--1131.

\bibitem{kollias2022directed} Kollias, Georgios and Kalantzis, Vasileios and Id{\'e}, Tsuyoshi and Lozano, Aur{\'e}lie and Abe, Naoki (2022).
    \newblock \textit{Directed graph auto-encoders}.
    \newblock Proceedings of the AAAI Conference on Artificial Intelligence.
\bibitem{huang2024uncertainty} Huang, Kexin and Jin, Ying and Cand{\`e}s, Emmanuel and Leskovec, Jure (2024).
    \newblock Uncertainty quantification over graph with conformalized graph neural networks.
    \newblock \textit{Advances in Neural Information Processing Systems} \textbf{36}.
\bibitem{guo2017calibration} Guo, Chuan and Pleiss, Geoff and Sun, Yu and Weinberger, Kilian Q (2017).
    \newblock \textit{On calibration of modern neural networks}.
    \newblock International conference on machine learning.
\bibitem{volpi2018generalizing} {Volpi, Riccardo} and Namkoong, Hongseok and Sener, Ozan and Duchi, John C and Murino, Vittorio and Savarese, Silvio (2018).
    \newblock Generalizing to unseen domains via adversarial data augmentation.
    \newblock \textit{Advances in neural information processing systems} \textbf{31}.
\bibitem{bhagat2011node} Bhagat, Smriti and Cormode, Graham and Muthukrishnan, S (2011).
    \newblock Node classification in social networks.
    \newblock \textit{Social network data analytics}, 115--148.
\bibitem{kipf2016variational} Kipf, Thomas N and Welling, Max (2016).
    \newblock Variational graph auto-encoders.
    \newblock \textit{arXiv preprint arXiv:1611.07308}.
\bibitem{ahn2021variational} Ahn, Seong Jin and Kim, MyoungHo (2021).
    \newblock \textit{Variational graph normalized autoencoders}.
    \newblock Proceedings of the 30th ACM international conference on information \& knowledge management.
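% Duplicate of the earlier \bibitem{kollias2022directed}; commented out to avoid a multiply-defined label and a repeated entry.
%\bibitem{kollias2022directed} Kollias, Georgios and Kalantzis, Vasileios and Id{\'e}, Tsuyoshi and Lozano, Aur{\'e}lie and Abe, Naoki (2022).
%    \newblock \textit{Directed graph auto-encoders}.
%    \newblock Proceedings of the AAAI Conference on Artificial Intelligence.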
\bibitem{zulaika2022lwp} Zulaika, Unai and Sanchez-Corcuera, Ruben and Almeida, Aitor and Lopez-de-Ipina, Diego (2022).
    \newblock LWP-WL: Link weight prediction based on CNNs and the Weisfeiler--Lehman algorithm.
    \newblock \textit{Applied Soft Computing} \textbf{120}, 108657.

\bibitem{samanta2020nevae} Samanta, Bidisha and De, Abir and Jana, Gourhari and G{\'o}mez, Vicen{\c{c}} and Chattaraj, Pratim Kumar and Ganguly, Niloy and Gomez-Rodriguez, Manuel (2020).
    \newblock Nevae: A deep generative model for molecular graphs.
    \newblock \textit{The Journal of Machine Learning Research} \textbf{21}, 4556--4588.

\bibitem{clarkson2023distribution} Clarkson, Jase (2023).
    \newblock \textit{Distribution free prediction sets for node classification}.
    \newblock International Conference on Machine Learning.
\bibitem{zargarbashi23conformal} H. Zargarbashi, Soroush and Antonelli, Simone and Bojchevski, Aleksandar (2023).
    \newblock \textit{Conformal Prediction Sets for Graph Neural Networks}.
    \newblock Proceedings of the 40th International Conference on Machine Learning.
\bibitem{xiao2023spatial} Xiao, Congxi and Zhou, Jingbo and Huang, Jizhou and Xu, Tong and Xiong, Hui (2023).
    \newblock \textit{Spatial heterophily aware graph neural networks}.
    \newblock Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining.
\bibitem{shafer2008tutorial} Shafer, Glenn and Vovk, Vladimir (2008).
    \newblock A tutorial on conformal prediction.
    \newblock \textit{Journal of Machine Learning Research} \textbf{9}, 371--421.
    
\bibitem{romano2019conformalized} Romano, Yaniv and Patterson, Evan and Cand{\`e}s, Emmanuel (2019).
    \newblock Conformalized quantile regression.
    \newblock \textit{Advances in neural information processing systems} \textbf{32}.

\bibitem{steinwart2011estimating} Steinwart, Ingo and Christmann, Andreas (2011).
    \newblock Estimating conditional quantiles with the help of the pinball loss.
    \newblock \textit{Bernoulli} \textbf{17}, 211--225.

\bibitem{luo2024conformal} Luo, Rui and Colombo, Nicolo (2024).
    \newblock Conformal Load Prediction with Transductive Graph Autoencoders.
    \newblock \textit{arXiv preprint arXiv:2406.08281}.
% Duplicates of the earlier \bibitem{papadopoulos2011regression} and \bibitem{lei2018distribution}; commented out to avoid multiply-defined labels and repeated entries.
%\bibitem{papadopoulos2011regression} Papadopoulos, Harris and Vovk, Vladimir and Gammerman, Alexander (2011).
%    \newblock Regression conformal prediction with nearest neighbours.
%    \newblock \textit{Journal of Artificial Intelligence Research} \textbf{40}, 815--840.
%\bibitem{lei2018distribution} Lei, Jing and G'Sell, Max and Rinaldo, Alessandro and Tibshirani, Ryan J and Wasserman, Larry (2018).
%    \newblock Distribution-free predictive inference for regression.
%    \newblock \textit{Journal of the American Statistical Association} \textbf{113}, 1094--1111.
\bibitem{guan2023localized} Guan, Leying (2023).
    \newblock Localized conformal prediction: A generalized inference framework for conformal prediction.
    \newblock \textit{Biometrika} \textbf{110}, 33--50.
\bibitem{johansson2014regression} Johansson, Ulf and Bostr{\"o}m, Henrik and L{\"o}fstr{\"o}m, Tuve and Linusson, Henrik (2014).
    \newblock Regression conformal prediction with random forests.
    \newblock \textit{Machine learning} \textbf{97}, 155--176.
\bibitem{kath2021conformal} Kath, Christopher and Ziel, Florian (2021).
    \newblock Conformal prediction interval estimation and applications to day-ahead and intraday power markets.
    \newblock \textit{International Journal of Forecasting} \textbf{37}, 777--799.
\bibitem{cowell2006alternative} Cowell, Charles and Hopkins, Pamela Clinton and McWhorter, Rochell and Jorden, Debra L (2006).
    \newblock Alternative training models.
    \newblock \textit{Advances in Developing Human Resources} \textbf{8}, 460--475.
\bibitem{peste2021ac} Peste, Alexandra and Iofinova, Eugenia and Vladu, Adrian and Alistarh, Dan (2021).
    \newblock Ac/dc: Alternating compressed/decompressed training of deep neural networks.
    \newblock \textit{Advances in neural information processing systems} \textbf{34}, 8557--8570.
\bibitem{morris2019weisfeiler} Morris, Christopher and Ritzert, Martin and Fey, Matthias and Hamilton, William L and Lenssen, Jan Eric and Rattan, Gaurav and Grohe, Martin (2019).
    \newblock \textit{Weisfeiler and leman go neural: Higher-order graph neural networks}.
    \newblock Proceedings of the AAAI conference on artificial intelligence.

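% Duplicate of the earlier \bibitem{luo2024conformal}; commented out to avoid a multiply-defined label and a repeated entry.
%\bibitem{luo2024conformal} Luo, Rui and Colombo, Nicolo (2024).
%     \newblock \textit{Conformal Load Prediction with Transductive Graph Autoencoders}.
%     \newblock arXiv preprint arXiv:2406.08281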

\bibitem{hamilton2017inductive} Hamilton, Will and Ying, Zhitao and Leskovec, Jure (2017).
    \newblock Inductive representation learning on large graphs.
    \newblock \textit{Advances in neural information processing systems} \textbf{30}.
\bibitem{velivckovic2017graph} Veli{\v{c}}kovi{\'c}, Petar and Cucurull, Guillem and Casanova, Arantxa and Romero, Adriana and Lio, Pietro and Bengio, Yoshua (2017).
    \newblock Graph attention networks.
    \newblock \textit{arXiv preprint arXiv:1710.10903}.
\bibitem{chernozhukov2010quantile} Chernozhukov, Victor and Fern{\'a}ndez-Val, Iv{\'a}n and Galichon, Alfred (2010).
    \newblock Quantile and probability curves without crossing.
    \newblock \textit{Econometrica} \textbf{78}, 1093--1125.
\bibitem{blondel2020fast} Blondel, Mathieu and Teboul, Olivier and Berthet, Quentin and Djolonga, Josip (2020).
    \newblock \textit{Fast differentiable sorting and ranking}.
    \newblock International Conference on Machine Learning.

\bibitem{bar2021transportation} Bar-Gera, H and Stabler, B and Sall, E (2023).
    \newblock Transportation networks for research core team.
    \newblock \textit{Transportation Network Test Problems}. Available online: \url{https://github.com/bstabler/TransportationNetworks} (accessed on 10 September 2023).
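% Duplicate of the earlier \bibitem{hamilton2017inductive}; commented out to avoid a multiply-defined label and a repeated entry.
%\bibitem{hamilton2017inductive} Hamilton, Will and Ying, Zhitao and Leskovec, Jure (2017).
%    \newblock Inductive representation learning on large graphs.
%    \newblock \textit{Advances in neural information processing systems} \textbf{30}, .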


\end{thebibliography}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




























































\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 and 2024 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
