\documentclass[namedreferences,hyperref,optionalrh]{springer}
\usepackage{graphicx}  
\usepackage{color}      
\usepackage{float}
\def\UrlFont{\rm}
\usepackage{mathptmx}
\newcommand{\NC}[1]{{\color{red}nicolo: #1}}
\newcommand{\RL}[1]{{\color{blue}lorry: #1}}
\newcommand{\rev}[1]{{\color{red}#1}}


\newcommand{\std}[1]{^{\scriptstyle{\pm#1}}}
\newcommand{\BibTeX}{\textsc{Bib}\TeX}
\newcommand{\etal}{et al.}
\usepackage{svg}
\usepackage{algorithm, algpseudocode}
\usepackage{mathtools}
\usepackage{url}
\usepackage{soul}
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{adjustbox}
\usepackage{booktabs}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{question}[theorem]{Question}

\newcounter{algsubstate}
\renewcommand{\thealgsubstate}{\alph{algsubstate}}
\newenvironment{algsubstates}
  {\setcounter{algsubstate}{0}%
   \renewcommand{\State}{%
     \stepcounter{algsubstate}%
     \Statex {\footnotesize\thealgsubstate:}\space}}
  {}
  
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\vect}{vec}
\def \iid {\stackrel{\text{i.i.d.}}{\sim}}
\def \iidtext {\textrm{i.i.d.}} 
\def \score {\textrm{conformity score}}
\def \calib {\mathrm{calib}}
\def \train {\mathrm{train}} 
\def \test {\mathrm{test}}
\def \nei {\mathcal{N}}
\def \cD {\mathcal{D}}
\def \cV {\mathcal{V}}
\def \cE {\mathcal{E}}
\def \cX {\mathcal{X}}
\def \Atrain {A^{\textrm{train}}}
\def \Aval {A^{\textrm{val}}}
\def \Act {A^{\textrm{ct}}}
\def \Atest {A^{\textrm{test}}}
\def \Acalib {A^{\textrm{calib}}}
\def \Wtc {W^{\textrm{train-calib}}}
\def \Wtrain {W^{\textrm{train}}}
\def \Wval {W^{\textrm{val}}}
\def \Wct {W^{\textrm{ct}}}
\def \Wtest {W^{\textrm{test}}}
\def \Wcalib {W^{\textrm{calib}}}
\def \Whold {W^{\textrm{holdout}}}
\def \Etrain {E^{\textrm{train}}}
\def \Eval {E^{\textrm{val}}}
\def \Ect {E^{\textrm{ct}}}
\def \Etest {E^{\textrm{test}}}
\def \Ecalib {E^{\textrm{calib}}}
\def \Ehold {E^{\textrm{holdout}}}
\def \Etc {E^{\textrm{train-calib}}}
\def \Rtrain {R^{\textrm{train}}}
\def \loss {\mathcal{L}}
\newcommand{\revise}[1]{{\color{black} #1}} 

\usepackage{comment}
\newif\ifshow
\showtrue
%\showfalse
\ifshow
  \newenvironment{allcomments}{}{}
\else
  \excludecomment{allcomments}
\fi


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\begin{frontmatter}
\title{Error Reweighted Conformal Prediction with Transductive Graph Autoencoders}

\runningtitle{Conformal Load Prediction with Transductive Graph Autoencoders}

\begin{abstract}
In this paper, we apply conformal prediction (CP) to graph neural network (GNN)-based methods and propose an error reweighting approach, named error reweighted conformal prediction graph neural network (ER-GNN). Unlike traditional GNN methods, which produce point predictions, conformal predictors generate prediction intervals, i.e., regions that satisfy a given confidence level. Using regular nonconformity measures, the resulting prediction intervals tend to have a uniform or consistent width across all examples in the test set. However, it would be more natural for the size of the regions to vary according to how difficult each example is to predict. We define a residual reweighting nonconformity measure, which produces predictive regions of variable width depending on the expected accuracy of the algorithm on each example. We apply ER-GNN to edge weight prediction as well as node classification and regression tasks on 15 datasets, including real-world transportation, citation connection, and election datasets. Consequently, ER-GNN yields results with much tighter predictive regions, higher accuracy, and higher efficiency.

\end{abstract}

\keywords{Error Reweighting, Edge Weight Prediction, Node Regression and Classification, Graph Autoencoder, Conformal Quantile Regression, Conformal Prediction}
\end{frontmatter}
%-------------------------------------------------}



\begin{figure}[h]
\includesvg[height=3.5cm]{figures/us_election_resultsOriFFF.svg}
\includesvg[height=3.5cm]{figures/us2016-ours.svg}
\caption{Residual for predicted 2016 U.S. county-level presidential election. (a) The baseline model's residual~\citep{jia2020residual}, which is the normalized absolute difference between the predicted and ground truth vote count.  (b) ER-GNN's residual result. }
\label{fig: comparison2}
\end{figure}


\section{Introduction}

\textbf{Graph Neural Networks (GNNs)} have seen widespread adoption for their strong performance on graph-structured data across various applications like recommendation systems, knowledge graphs, and molecular modeling \citep{lam2022graphcast, li2022graph, wu2022graph}. As GNNs are increasingly deployed in high-stakes domains like healthcare and autonomous systems, reliably quantifying prediction uncertainty is paramount. A common approach to predicting uncertainty involves constructing prediction intervals intended to capture the true outcome. While several methods have been explored \citep{hsu2022makes, zhang2020mix, lakshminarayanan2017simple}, they often lack rigorous theoretical guarantees on interval validity \citep{wang2021confident}. This limits deploying GNNs with confidence where errors carry consequences. Improving uncertainty quantification for GNNs with probabilistic guarantees is critical to ensuring their safe, trusted application in real-world settings. Addressing this challenge will allow unlocking GNNs' full potential across science and engineering.

\textbf{Conformal Prediction} (CP) is a machine learning framework for uncertainty quantification that constructs prediction intervals for any underlying point predictor in a theoretically valid manner \citep{vovk2005algorithmic}. CP is based on a nonconformity score that measures the dissimilarity between a data point and others. CP leverages a ``calibration'' dataset to output prediction sets for new test samples that provably cover the true outcome at least $1 - \alpha$ of the time, where $\alpha$ is a user-specified error tolerance. Due to its principled formulation, rigorous guarantees, and distribution-free nature, CP has enabled uncertainty estimation across diverse applications. This includes computer vision \citep{angelopoulos2020uncertainty, bates2021distribution}, causal inference \citep{lei2021conformal, jin2023sensitivity, yin2024conformal}, time series \citep{gibbs2021adaptive, zaffran2022adaptive}, and drug discovery \citep{jin2023selection}. 

To apply CP to an algorithm, a nonconformity measure quantifying the difference between new and historical examples is required. The measure reflects disagreements according to the algorithm's feature-relationship assumptions. Crucially, multiple nonconformity measures can represent a single algorithm, each defining a distinct CP predictor \citep{papadopoulos2008normalized}. Additionally, adding an Error-Reweighting (ER) factor can refine prediction intervals \citep{papadopoulos2011regression, lei2018distribution}. By assigning weights to errors covariately, ER helps mitigate heteroscedasticity impacts on accuracy and reliability. Overall, carefully constructing the nonconformity measure and incorporating ER are pivotal to prediction performance.

%Huang et al. \citep{huang2024uncertainty} developed the conformalized graph neural network (CF-GNN), extending conformal prediction (CP) to GNNs. They proved CP validity for graphs by showing the nonconformity score function need only be invariant to calibration/test sample order, aligning with graph exchangeability theory.

Here, we propose a new error-reweighted nonconformity measure for the conformalized graph neural network (ER-GNN) by predicting the expected accuracy independently. ER-GNN can be applied to edge weight prediction, node classification, and node regression tasks on graph-structured data. In detail,
a residual GNN is trained in tandem with the main GNN model, using the residuals from the GNN’s prediction as the target. By integrating the error predictions of GNN into the conformal prediction framework, our approach can effectively calibrate the GNN outputs and produce valid and small prediction intervals.
The ER-GNN helps capture the heteroscedastic nature of the edge weights by learning the error structure of the GNN predictions, as shown in Figure~\ref{fig: comparison2} and Figure~\ref{fig:comparisontraffic}.

The rest of the paper is organized as follows. Section~\ref{sec: rw} provides background on GNNs, edge weight prediction as well as node classification and regression problems. Section~\ref{sec: m} outlines our reweighted GNN correction methods. Section~\ref{sec: r} presents the
experimental results.



\section{Related Work}\label{sec: rw}
\subsection{GNN-based Conformal Prediction}\label{subsec: CP}

Nowadays, many GNN-based models have been proposed to handle prediction tasks. Kipf et al.\ \citep{kipf2016semi} proposed an unsupervised representation learning method based on graph neural networks, which can be used to learn low-dimensional embedding representations of graph-structured data. Cai et al.\ \citep{cai2021line} proposed the line graph neural network (LGNN) for link prediction, which leverages line graph theory to map each edge in the original graph to a node in the line graph, thereby transforming the link prediction problem into a node classification task in the line graph. Kollias et al.\ \citep{kollias2022directed} introduced a new class of auto-encoders specifically designed for directed graphs (DiGAE). By leveraging parameterized message-passing modules, the proposed directed graph auto-encoder can effectively learn latent representations that capture the complex relationships in directed graph data, outperforming existing methods on node classification and link prediction tasks. Existing GNN architectures can easily be adapted for regression problems by simply changing the output layer and choosing a loss function such as the squared error in the predicted value.

However, existing GNN methods have focused on obtaining highly accurate point predictions, which has resulted in a lack of flexibility, real-time adaptability, and generalization capability. Instead of precise predictions, readers often need a general range as a reference. Conformal prediction provides prediction intervals instead of a single point value, which can adapt more easily to changes, update in real-time, better capture the uncertainty in the data, and lead to improved generalization. Furthermore, the prediction intervals offer greater interpretability, allowing users to better understand the reliability of the model's outputs. Huang et al.\ \citep{huang2024uncertainty} developed the conformalized graph neural network (CF-GNN), extending conformal prediction to GNNs. Specifically, they showed that validity holds as long as the nonconformity score function is invariant to permutations of the calibration and test samples.


\subsection{Reweighting Techniques}
The reliance on importance weighting schemes has emerged as a promising direction to improve the robustness of conformal prediction graph neural networks. Recent works \citep{guo2017calibration,volpi2018generalizing} have explored several novel reweighting approaches for CP. Guo et al.\ \citep{guo2017calibration} proposed a causal-inference-driven importance weighting technique. By leveraging the causal relationships between input features and model predictions, this method can identify the most influential samples and assign them higher weights. This technique helps mitigate the performance degradation caused by distribution shifts between the training and test data. Another study by Volpi et al.\ \citep{volpi2018generalizing} introduced a distribution matching-based reweighting strategy. The key idea is to align the training and test data distributions by minimizing the discrepancy between the two distributions, reducing the impact of distribution shifts on GNN performance. Rafiq et al.\ \citep{rafiq2022transfer} presented a causal inference-inspired importance weighting approach. This technique establishes a causal model linking the input features, hidden representations, and output labels. It then uses this causal understanding to detect samples with negative transfer effects on the model's predictions and down-weights those samples accordingly.

\subsection{Edge Weight Prediction}
\subsubsection{Similarity-based Methods}
This section surveys methods for edge weight prediction based on network topology. One approach involves computing proximity or ``similarity'' measures between nodes. Early techniques adapted from graph theory and network analysis include common-neighbor approaches like the Jaccard coefficient and Adamic-Adar, which hypothesize that more links form between nodes sharing many common neighbors. Other measures include preferential attachment and structural similarity based on shortest path length and random walks \citep{liben2003link}.

Common neighbors methods leverage the existing network structure, hypothesizing higher likelihood of links between nodes sharing more contacts. For edge weight prediction, these techniques must be modified from their original node-to-node formulation. Overall, early work in this area demonstrated the value of characterizing node relationships topologically using properties like neighborhood overlap and distance metrics within the network. This helped establish a basis for GNN-based models that have since achieved state-of-the-art performance on graph-structured prediction tasks.

\subsubsection{Probabilistic and Statistical Approaches}
Many early network formation models statistically analyzed probabilistic principles, opening doors to statistical edge weight prediction methods. These approaches usually assume network structure is known and build fitted probabilistic models, estimating parameters statistically to compute non-observed link probabilities for ranking. For example, Clauset et al. \citep{clauset2008hierarchical} represented hierarchical network divisions into distinct, hierarchically connected layers for modularity. Relatedly, the stochastic block model considers nodes distributed across communities/blocks, with edge weights directly dependent on block membership \citep{guimera2009missing}.

Overall, these statistical network modeling techniques demonstrated the utility of probabilistic analysis and community detection for edge prediction tasks when structure is known. They helped lay foundations for later graph neural network approaches that can automatically learn network properties for superior prediction performance without relying on fixed assumptions. Statistical network models provided early insights into characterizing topology from statistical perspectives.

\subsubsection{Dimension Reduction Approaches}
Recent works have explored network embedding and matrix decomposition for dimension reduction and edge weight prediction. These methods learn low-dimensional node representations capturing network structure \citep{kumar2020link}. Early approaches included spectral methods like Laplacian eigenmaps \citep{belkin2001laplacian} and embedding techniques like Locally Linear Embedding \citep{roweis2000nonlinear} and Isomap \citep{tenenbaum2000global}.

Random-walk-based methods like DeepWalk \citep{perozzi2014deepwalk} and Node2vec \citep{grover2016node2vec} generate embeddings by sampling node sequences. Matrix factorization decomposes the feature matrix into low-rank components \citep{acar2009link, ma2017nonnegative, sharan2008temporal} for representation. Autoencoders also provide dimension reduction via deep learning \citep{yang2014embedding}.

\subsection{Node Classification}
Node classification is a fundamental task in graph machine learning, aiming to predict the labels of nodes in a graph. Traditional approaches often rely on hand-crafted features and classical machine learning algorithms. With the advent of deep learning, graph neural networks (GNNs) have emerged as a powerful tool for node classification. Kipf and Welling \citep{kipf2016semi} proposed the Graph Convolutional Network (GCN), which generalizes the convolution operation to graph-structured data. Since then, numerous GNN-based methods have been proposed, such as GraphSAGE \citep{hamilton2017inductive}, Graph Attention Networks (GATs) \citep{velivckovic2017graph}, and Graph Isomorphism Networks (GINs) \citep{xu2018powerful}. These models incorporate different neighborhood aggregation and feature transformation strategies to capture the structural and feature information of graphs.

Despite the success of GNNs in node classification, the uncertainty quantification of their predictions remains a challenging problem. Bhagat et al. \citep{bhagat2011node} provide an overview of two broad categories of classification techniques for the node classification problem: methods based on the iterative application of traditional classifiers using graph information as features, and methods which propagate the existing labels via random walks. However, these methods do not provide rigorous guarantees on the coverage or reliability of their uncertainty estimates.

\subsection{Node Regression}
Node regression is to predict continuous target values associated with nodes in a graph. This problem has wide applications, such as predicting traffic flow in transportation networks \citep{li2017diffusion}, forecasting stock prices in financial markets \citep{seo2018structured}, and estimating molecular properties in computational chemistry \citep{wang2023graph}.

Graph neural networks (GNNs) have been extensively applied to node regression problems, as they can effectively capture the complex relationships between nodes and their neighborhoods. Li et al. \citep{li2017diffusion} proposed a spatio-temporal GNN for traffic flow forecasting, which combines graph convolutions to model spatial dependencies and recurrent neural networks to capture temporal dynamics. Seo et al. \citep{seo2018structured} introduced a dual-stage attention-based GNN for stock price prediction, where the attention mechanism selects relevant nodes and edges to improve the model's performance.

Uncertainty quantification is crucial in node regression tasks, as it allows us to assess the reliability of the model's predictions. Hasanzadeh et al.\ \citep{hasanzadeh2020bayesian} developed a Bayesian GNN that employs variational inference to estimate the posterior distribution of the model parameters, providing uncertainty estimates for the predictions. Wang et al.\ \citep{wang2023graph} proposed a probabilistic GNN that models the uncertainty by learning a distribution over the node representations, which can be used for uncertainty-aware molecular property prediction.

%While these methods have shown promising results, there are still several challenges and open problems in node regression on graphs:

%1. Theoretical Guarantees: Establishing stronger theoretical foundations for the validity and reliability of the uncertainty estimates provided by GNN-based node regression models.

%2. Out-of-distribution Generalization: Designing GNN architectures that can generalize well to unseen graph structures and node features, without overfitting to the training data.

\begin{figure}
\centerline{\includegraphics[width=1.00\textwidth,clip=]{figures/figure1_f.pdf}}
\small
\caption{The overall pipeline of the ER-GNN model. The whole pipeline includes three parts: 1) A conformal graph neural network (GNN) is trained on graph-structured data to perform edge weight prediction, node regression, and node classification, which generates prediction intervals for each task;
2) A separate Residual GNN model is trained on validation data to predict residuals/errors between the true values and Conformal GNN's predictions;
3) An error reweighting approach integrates the prediction intervals from the Conformal GNN and the residual weights predicted by the Residual GNN. Note that the Conformal GNN model is trained using a cross-training process with the residual GNN model, and the final prediction of Conformal GNN is corrected by the prediction of residual GNN.}
\label{fig: modelpipeline}
\end{figure}


\section{Method}\label{sec: m}
\subsection{Tasks Using GNNs}
\subsubsection{Guaranteed Edge Weight Prediction Using GNNs}\label{sec: problem}
Let $G=(V, E)$ be a graph with node set $V$ and edge set $E \subseteq V \times V$.
Assume the graph has $n$ nodes with $f$ features.
Let $X \in \mathbb{R}^{n\times f}$ be the node feature matrix, and $X_i \in \mathbb{R}^{f}$ be the feature vector of the $i^{th}$ node. 
The binary adjacency matrix of $G$, $A \in \{0, 1\}^{n\times n}$, encodes the binary (unweighted) structure of the graph. It is defined by:
\begin{equation} 
A_{ij} =
\begin{cases}
1, & \textrm{if } (i, j) \in E; \\
0, & \textrm{otherwise}.
\end{cases}
\end{equation}
We define the weight matrix as $W \in \mathbb{R}_{\geq 0}^{n\times n}$, where $W_{ij}$ denotes the weight of the edge connecting node $i$ to node $j$. In the context of a road system, for example, we can interpret $W_{ij}$ as the volume of traffic transitioning from junction $i$ to junction $j$.

%If $(i, j) \notin E$, it is possible to assign a small positive number, such as the minimum or average of the existing edge weights, to $W_{ij}$ to represent prior knowledge or assumptions about the unknown edge weight. The choice does not change the prediction because $W$ is always masked by the binary adjacency matrix $A$.
% \begin{allcomments}
%     \NC{Should we capitalize all random variables?}
% \end{allcomments}

We partition the edge set $E$ into three disjoint subsets, $\Etrain$, $\Eval$, and $\Etest$, such that $E = \Etrain \cup \Eval \cup \Etest$.
We assume that the weights of the edges in $\Etrain$ and $\Eval$ are known.
The objective is to estimate the unknown weights of the edges in $\Etest$.
Additionally, we assume that the entire graph structure, represented by the adjacency matrix $A$, is known.
%
To mask the validation and test sets, we define \begin{equation}
\Atrain \in \{0, 1\}^{n\times n}, \quad 
\Atrain_{ij} =
\begin{cases}
1, & \textrm{if } (i, j) \in \Etrain; \\
0, & \textrm{otherwise}.
\end{cases}
\end{equation}
$\Aval$ and $\Atest$ are defined  in the same way based on $\Eval$ and $\Etest$, respectively.

If $(i, j) \notin \Etrain$,  it is possible to assign a small positive number to the corresponding element $\Wtrain_{ij}$, such as an assigned value or the minimum of the existing edge weights. This can represent prior knowledge or assumptions about the unknown edge weight. In the following part, we use a positive constant $\delta > 0$ to represent this minimal or assigned value. Incorporating this unknown edge weight information effectively leverages the underlying graph structure. The resulting weighted adjacency matrix is:
\begin{equation}\label{eq: weighted adj train}
\Wtrain_{ij} =
\begin{cases}
W_{ij}, & \textrm{if } (i, j) \in \Etrain; \\
\delta, & \textrm{if } (i, j) \in \Eval \cup \Etest; \\
0, & \textrm{otherwise}.
\end{cases}
\end{equation}


% \begin{equation}
%     \Wtrain = \Atrain \odot W, \; 
%     \Wval = \Aval \odot W, \; 
%     \Wtest = \Atest \odot W, 
% \end{equation}
% where $\odot$ is the element-wise product. 
% The nonzero entries of $\Wtrain$ are the weights of the training edges.
% The algorithms will use that information to predict the (supposedly unknown) nonzero entries of $\Wval$ and $\Wtest$.
%
In the transductive setting (Figure~\ref{fig: transductive}(a)), the structure of the entire graph, represented by the adjacency matrix $A$, is known during the training, validation, and testing phases. To calibrate the prediction, we extract a subset from $\Etest$ as a calibration edge set. This ensures that the calibration and test samples are exchangeable.


% Extension to non-i.i.d. edge selections will be considered in Section \ref{sec: conformal}.

% \begin{allcomments}
%     \NC{Should we refer to "val" as the calibration set? or do we cut out a piece of $\Etest$ to calibrate?}
% \end{allcomments}
\begin{figure}
\centerline{\includegraphics[width=1.05\textwidth,clip=]{figures/figure1.pdf}}
\small
\caption{Schematic figure for transductive and inductive settings for edge weight prediction.
Different colors indicate the availability of the nodes during the training or testing phases.
Solid and dashed lines represent edges used for training and the predicted edge in the testing phases, respectively.
(a) Transductive edge weight prediction performs both training and inference on the same graph.
(b) Inductive edge weight prediction inference is performed on a new, unseen graph }
\label{fig: transductive}
\end{figure}
Consider edge weight prediction in traffic networks. The road system, $A$, and partial traffic volumes, $\Wtrain + \Wval$, are known. The task is to predict volumes, $\Wtest$, for remaining roads.

During training, models observe node features and graph structure to learn functions for node classification/regression and embedding. At inference, models deduce edge connections between nodes (Figure~\ref{fig: transductive}).

Three GNN approaches are evaluated. The first is a Graph Autoencoder (GAE) \citep{kipf2016variational} that trains and infers on the full graph. The second is DiGAE \citep{kollias2022directed}, a directed GAE variant. The third is the line graph neural network (LGNN) \citep{cai2021line} that transforms edges to nodes in line graphs.

\subsubsection{Graph Autoencoder (GAE)-based Method for Edge Weight Prediction} \label{subsec: GAE}
The GAE \citep{kipf2016variational} learns an embedding for the nodes of undirected unweighted graphs.
Using GAEs in link prediction tasks is a popular approach.
The practice has been extended to isolated nodes \citep{ahn2021variational}, directed graphs \citep{kollias2022directed}, weighted graphs \citep{zulaika2022lwp}, and graphs with different edge types \citep{samanta2020nevae}.

We let $Z\in \mathbb{R}^{n\times d}$ be the node embedding matrix obtained from a base GAE model.
The resulting embedding is 
\begin{equation}
Z = f_{\theta}(X, A),
\label{eq:gae encoder}
\end{equation}
where $f_{\theta}$ is the structure of the encoder, and $\theta$ is a learnable parameter.
%
We reconstruct the binary adjacency matrix from the inner product between node embeddings, i.e. 
\begin{equation}\label{eq:gae decoder}
P({\hat{A}\,|\,Z}) = \prod_{i=1}^n\prod_{j=1}^n P(\hat{A}_{ij}\,|\,Z_i,Z_j)\, , \,\,\, \text{with} \; P(\hat{A}_{ij}=1\,|\,Z_i,Z_j) = \sigma(Z_i^\top Z_j) \, ,
\end{equation}
where $\hat{A}$ is the reconstructed binary adjacency matrix, $Z_i$ is the embedding of node $i$, and $\sigma(\cdot)$ is the logistic sigmoid function.

The directed GAE proposed by Kollias et al.~\citep{kollias2022directed} is more flexible. To highlight the roles of nodes as either a source or a target in directed graphs, the directed GAE uses separate source and target embeddings, $Z^S$ and $Z^T$, instead of the single node embedding used in Equation~\eqref{eq:gae decoder}.
We also replace the binary adjacency matrix with the weighted adjacency matrix $\Wtrain$ defined in~\eqref{eq: weighted adj train}, which effectively leverages the entire graph structure.

% \begin{allcomments}
%     \NC{Question: we say we use the entire graph structure. But $\Wtrain$ does not seem to include the binary info in $\Aval$ or $\Atest$. Does it?}
% \end{allcomments}
The predicted weighted adjacency matrix is
\begin{equation}
    \hat{W} = Z^S {Z^T}^\top.
\end{equation}
To optimize the GNN parameters, we minimize
\begin{equation} \label{eq:Train_GAE}
    \loss_{\textrm{GAE}} = \| \Atrain \odot \hat{W} -\Wtrain \|_F
\end{equation}
through gradient descent, where $\|\cdot\|_F$ denotes the Frobenius norm.
We train the model until convergence and then select the parameters that minimize $\loss_{\textrm{GAE}}$ on the validation set, $\Wval$.

%\subsection{Line Graph Neural Network-LGNN} \label{subsec: line graph} 
%An alternative approach to predict edge weights is through an edge-centric line graph model. The idea is to convert the weight prediction task into a node regression problem. We define a line-graph GNN and train it using standard message-passing techniques.

%Given a weighted directed graph, $G$, the corresponding line graph, $L(G)$, is a graph where each node of $L(G)$ represents an edge of $G$. Two nodes of $L(G)$ are adjacent if and only if their corresponding edges share a common endpoint in $G$. Equivalently, $L(G)$ is the intersection graph of the edges of $G$. Each edge of $G$ becomes a node of $L(G)$, labeled by the set of its two endpoints.

%Let $L = L(G)$ and $X^L$ be the node feature matrix of $L$. To obtain $X^L$, we combine the node features of the corresponding source node and target node in the original graph $G$. We then define a GNN to process the nodes and the binary adjacency matrix of $L$. The predicted node values are
%\begin{equation}\label{eq: LGNN}
%    Z^L = f_{\theta}(X^L, A^L),
%\end{equation}
%Similar to the GAE approach, we tune the GNN parameters by minimizing  
%\begin{equation}
%    \loss_{\textrm{LGNN}} = \sum_{(i, j) \in \Etrain} \left(Z^L_{(i, j)} - \Wtrain_{ij} \right) ^ 2.
%    % \theta^* = \argmin\limits_{\theta} \sum_{(i, j) \in \Etrain} \left(Z^L_{(i, j)} - \Wtrain_{ij} \right) ^ 2
%\end{equation}
%By converting the original graph $G$ to its line graph $L(G)$, the load prediction task becomes a node regression problem, where the node values in $L(G)$ are used as the labels. We split the (augmented) node set of $L(G)$ into training, test, and calibration sets. The weights learned during the GAE training process correspond to the values of the training nodes in $L(G)$.




\subsubsection{Error Reweighted Conformalized Graph Neural Network for Edge Weight Prediction}\label{sec: conformal}
%In this section, we describe how to integrate Error Reweighted Model Training into a conformlized GNN model. Firstly, we train two GNN following the equation~\ref{eq:Train_GAE} crossly to get two GNN model according to Algorithm~\ref{closs_train}. Secondly, Algorithm \ref{alg: CQR} describes how to obtain the prediction intervals in this setup. 



%Conformal Prediction/Conformal Quantile Regression uncertainty estimation can be found in \citep{luo2024conformal}


%\subsubsection{Conformal Prediction- CP- GAE}\label{subsec: CP}
%We assume that we have access to the graph structure, $A$, the node features, $X$, and the edge weights of the training edges, $\Wtrain$.
%Let $(a, b)$ be the endpoints of a test edge.
%We aim to generate a prediction interval, $ C_{ab} = f_{\theta}((a, b), A, X, \Wtrain)  $ among whom $  C_{ab} \subset \mathbb{R}$, for the weight of the such a test edge.
%The prediction interval should be marginally valid, i.e. it should obey 
%\begin{equation}\label{eq: desired coverage}
%P\left( W_{ab} \in C_{ab} \right) \geq 1 - \alpha, 
%\end{equation}
%The user-defined error rate is denoted as $\alpha \in (0, 1)$. The probability in Equation \eqref{eq: desired coverage} is taken over the data-generating distribution.

%To ensure efficiency, we focus on the split Conformal Prediction (CP) approach \citep{papadopoulos2002inductive}. In this method, we divide the training data into two sets: a training set, which consists of the nonzero entries of $\Wtrain$, and a calibration set, which consists of the nonzero entries of $\Wcalib$. The training set is used to fit the prediction model $f_{\theta}$. For each sample in the calibration set, we then calculate a \score{} that evaluates how well the predictions match the observed labels. Lower scores typically indicate better predictions. Given a user-specified error rate $\alpha$ and the endpoints $(a, b)$ of a test edge, we compute the corresponding prediction interval $C_{ab}$ using the $(1-\alpha)$-th sample quantile of the calibration conformity scores. This approach allows us to efficiently generate reliable prediction intervals that meet the desired error tolerance, as specified by the user-defined error rate $\alpha$. The split CP method ensures that the uncertainty quantification is based on a separate calibration set, which helps maintain the validity of the prediction intervals.
%If the calibration edges and $(a, b)$ are exchangeable, $C_{ab}$ has the required coverage (\ref{eq: desired coverage}).
%This implies that the exchangeability requirement is only necessary between the calibration and test edges, aligning with the methodology of \citep{huang2023uncertainty}. In real-world traffic applications, we often encounter a fixed set of training edges, for instance, a designated area in a city with well-documented traffic flow data. Furthermore, a separate set might serve as both calibration and test sites, where traffic detectors are placed randomly. This arrangement ensures that the calibration and test edges are exchangeable.

%Algorithm \ref{alg: split CP} shows how to use split CP with a GAE model for predicting edge weights. 
%Proposition \ref{prop: exchangeability} shows that the load prediction intervals generated by applying split CP to the GAE model are marginally valid in the sense of \eqref{eq: desired coverage}.

%\begin{algorithm}
%\caption{Split Conformal Prediction for Graph Autoencoder}
%\label{alg: split CP}
%\hspace*{\algorithmicindent} \textbf{Input:} The binary adjacency matrix $A \in \{0, 1\}^{n\times n}$, node features $X\in \mathbb{R}^{n\times f}$, training edges and  weights $\Etrain$, $\Wtrain$, calibration edges and  weights $\Ecalib$, $\Wcalib$, and test edges $\Etest$, user-specified error rate $\alpha \in (0,1)$, GAE model $f_\theta$ with trainable parameter $\theta$.\\
%\begin{algorithmic}[1]
%\State Train the model $f_\theta$ with $\Wtrain$ according to (\ref{eq: train DiGAE}).
%\State Compute the score {QR/CQR/CP} which measures how well the calibration edge weights $\Wcalib$ agree with the model $f_\theta$:
%\begin{equation}\label{eq: score}
%    V_{ij} = \left| f_\theta \left((i, j); A, X, \Wtrain \right) - \Wcalib_{ij} \right|, \; (i, j) \in \Ecalib.
%\end{equation}
%\State Compute $d =$ the $k$th smallest value in $\{V_{ij}\}$, where $k=\lceil(|\Ecalib| +1)(1-\alpha)\rceil$.
%\State Construct a prediction confidence interval for test edges:  
%\begin{equation}
%\begin{split}
%   & C_{ab} \\
%= & \Big[f_\theta\left( (a, b); A, X, \Wtrain \right) - d, f_\theta\left( (a, b); A, X, \Wtrain \right) + d \Big], \; (a, b) \in \Etest. \nonumber
%\end{split}
%\end{equation}
%\end{algorithmic}
%\hspace*{\algorithmicindent} \textbf{Output:} Prediction confidence intervals for the test edges $(a, b) \in \Etest$ with the coverage guarantee:
%\begin{equation}
%    P\big(\Wtest_{ab} \in C_{ab} \big) \geq 1 - \alpha.
%\end{equation}
%\end{algorithm}
%The GAE model in Algorithm \ref{alg: split CP} uses the graph structure, i.e. the binary adjacency matrix, $A$, and the training edge weights, $\Wtrain$, and the node features, $X$.
%As the order of the nodes is arbitrary, the \score{} the calibration and test samples are exchangeable (see Assumption 1 of \citep{huang2023uncertainty}). 
%Intuitively, varying the choice of the calibration and test sets will not statistically alter the \score{}.

%Conformal Quantile Regression (CQR) combines the advantages of Conformal Prediction (CP) and Quantile Regression (QR) when dealing with heteroscedastic data \citep{romano2019conformalized}. CQR can provide more flexible and adaptive prediction intervals that account for the varying uncertainty across the graph. 

%To improve the computational efficiency of the GAE model, we modify the encoder (described in Section \ref{subsec: GAE}) to produce a triple output instead of a single embedding for each node. Specifically, the encoder now generates three embeddings for each node. The decoder then aligns these three embeddings to the mean, the $\alpha/2$ quantile, and the $(1-\alpha/2)$ quantile of the predicted edge weights, respectively. This approach differs from using three separate single-output GAE encoders, as it allows for the sharing of most network parameters across the three embeddings. By producing a triple output, the encoder can capture more information about the distribution of the edge weights, which can then be leveraged by the decoder to generate more accurate and reliable prediction intervals. Importantly, this modification to the GAE model improves its computational efficiency compared to using multiple independent encoders, as the majority of the model parameters are shared across the three embedding outputs.
%
%Let $\hat{W}$, $\hat{W}^{\alpha/2}$, and $\hat{W}^{1 - \alpha/2}$ be the mean, $\alpha/2$, and $(1-\alpha/2)$ quantiles of the edge weights, i.e.
%\begin{equation}\label{eq: CQR output}
%    f_\theta\left( (i, j); A, X, \Wtrain \right) = \left[ \hat{W}_{ij}, \hat{W}^{\alpha/2}_{ij}, \hat{W}^{1 - \alpha/2}_{ij}  \right]
%\end{equation}
%
%We train the embedding by minimizing 
%\begin{equation}\label{eq: train CQR-GAE}
%    \loss_{\textrm{CQR-GAE}} = \loss_{\textrm{GAE}} + \sum_{(i, j) %\in \Etrain} \rho_{\alpha/2}(\Wtrain_{ij}, \hat{W}^{\alpha/2}_{ij}) + \rho_{1 - \alpha/2}(\Wtrain_{ij}, \hat{W}^{1-\alpha/2}_{ij}), 
%\end{equation}
%where $\loss_{\textrm{GAE}}$ is the squared error loss defined in (\ref{eq: train DiGAE})
%The second term is the pinball loss of \citep{steinwart2011estimating, romano2019conformalized}, defined as 
%\begin{equation}
%    \rho_{\alpha}(y, \hat{y}) \coloneqq 
%    \begin{cases}
%        \alpha (y - \hat{y}) & \textrm{if } y > \hat{y} \\
%        (1 - \alpha) (y - \hat{y}) & \textrm{otherwise}
%    \end{cases}
%\end{equation}

%The CQR is used which is a  conformity score (\ref{eq: CQR score}) considering both undercoverage and overcoverage scenarios.


To begin with, we provide the motivation of ER-GNN. The basic framework of ER-GNN combines conformal prediction with a graph autoencoder to solve edge weight prediction problems as well as node regression and node classification problems. Compared with methods such as NAPS \citep{clarkson2023distribution} and Diffusion Adaptive Prediction Sets (DAPS) \citep{zargarbashi23conformal}, which assume homophily of the nodes and assign higher weights to linked nodes, ER-GNN does not require the homophily assumption. This is distinct from the approach of \citep{huang2023uncertainty}, which focuses on node prediction problems. Additionally, the assumption of homophily may not hold in traffic networks \citep{xiao2023spatial}, as traffic conditions can vary; for example, a small road adjacent to a busy street might experience less traffic.
Moreover, our experiments with the line graph demonstrate that the setup of an autoencoder framework and the transformation of an original graph into its line graph are not equivalent. 

When the calibration and test samples are exchangeable, both conformal prediction (CP)~\citep{shafer2008tutorial} and conformal quantile regression (CQR)~\citep{romano2019conformalized} yield prediction intervals that meet the marginal coverage condition~\citep{luo2024conformal}. 
However, the local adaptability of these intervals can be further improved by adding an Error-Reweighting (ER) factor \citep{papadopoulos2011regression, lei2018distribution}.

To this end, the idea behind ER is to assign covariate-dependent weights to the errors, which helps mitigate the impact of heteroscedasticity on the accuracy and reliability of the predictions.
The empirical simulations conducted by Guan et al.\ \citep{guan2023localized} have shown that combining the CQR and ER approaches can produce efficient and locally adaptive prediction intervals. This combined method requires training a residual model in addition to the main prediction model, which captures the local variations present in the data~\citep{kollias2022directed}.
Interestingly, in the case of CQR, the residual model can be obtained at no extra cost, as it can be derived from the distance between the $\alpha/2$-th and $(1 - \alpha/2)$-th predicted quantiles. By incorporating the ER factor into the CQR framework, the resulting prediction intervals can adapt more effectively to the local characteristics of the data, leading to improved efficiency and reliability compared to using CP or CQR alone.
% \rev{This approach differs from the local weighted split conformal method utilized in \citep{papadopoulos2008normalized}. While the latter method fits a secondary conditional mean regression model to the absolute residuals of the training set, we instead leverage the differences in quantiles as the residuals for our model. This approach bypasses the statistical constraint mentioned in \citep{romano2019conformalized}, where the residual tends to inflate for the test set because the prediction model aims to minimize the residuals on the training set.}
Additionally, we adopt the concept of a normalized conformity function \citep{johansson2014regression} or local reweighted conformal method \citep{papadopoulos2008normalized}, which involves training a separate model specifically to predict the error of the underlying model.
To achieve this, we train a separate residual GNN model along with the main model to predict the error of the edge weight prediction, using the residual, $\hat{R}_{ij}$, from the prediction of the GNN model as the target label:
\begin{equation}
\label{eq: RGAE}
    \hat{R}_{ij} = g_{\theta_{2}}\left( (i, j); A, X, \Rtrain \right) ,
\end{equation}
where $\Rtrain_{ij} = \Wtrain_{ij} - \hat{W}_{ij}$, $\hat{W}_{ij}=g_{\theta_{1}}\left( (i, j); A, X, \Wtrain\right)$ is the output of conformal GNN, and the ER-GAE is trained by minimizing
\begin{equation} \label{eq: train RGAE}
    \loss_{\textrm{residual GNN}} = \| \Atrain \odot \hat{R} - \Rtrain \|_F.    
\end{equation}
 Algorithm~\ref{closs_train} and Figure~\ref{fig: modelpipeline} show the implementation details of training the main and residual models in an alternating (cross-training) manner \citep{cowell2006alternative,peste2021ac}. A conformal graph neural network (GNN) is trained on graph-structured data, generating prediction
intervals. A separate residual GNN model is trained on validation data to
predict the residuals/errors between the true values and the conformal GNN's predictions. The conformal GNN model is
trained using a cross-training process with the residual GNN model.

\begin{algorithm}
\caption{Cross-Training Algorithm for ER-GNN}
\label{closs_train}
\begin{algorithmic}[1]
\State \textbf{Init:} Initialize the weights $\theta_1 \in \mathbb{R}^N$ of ${Model}_{C0}$ (Conformal GNN) and the weights $\theta_2 \in \mathbb{R}^N$ of $Model_{C1}$ (Residual GNN).
\For{$i$ in loop}
    \If{$i$ is odd}
        \State Train ${Model}_{C0}$ with gradient and update $\theta_{1}$ using the training data.
%       \State Update the residual as the  
    \Else
    \State Get the residual from ${Model}_{C0}$, taking validation data as input.
        \State Train Residual $Model_{C1}$  with gradient and update $\theta_{2}$ based on validation data.
    \EndIf
\EndFor
\end{algorithmic}
\end{algorithm}


\begin{proposition}\label{prop:crosstraining2}
The technique of cross-training two related models has no adverse effect on the models' final performance.
\end{proposition} 
\begin{proof}
Consider two related models A and B, where A's outputs label B. We iteratively train them.

The entire dataset $\mathcal{D}$ is split into training $\mathcal{D}_{train}$ and validation/test $\mathcal{D}_{val/test}$ sets.

We iteratively train A and B as:

Train A on $\mathcal{D}_{train}$ and evaluate on $\mathcal{D}_{val/test}$.
Use A's predictions on $\mathcal{D}_{train}$ as labels to train B, then evaluate B on $\mathcal{D}_{val/test}$.
Repeat until a stopping criterion, e.g. reaching a maximum number of iterations or no improvement on validation.

This approach does not affect final performance because:

A and B share the same training distribution $\mathcal{D}_{train}$.
They train independently in each iteration without influencing each other.
Evaluation set $\mathcal{D}_{val/test}$ remains fixed, allowing direct comparison.
We select the model with best $\mathcal{D}_{val/test}$ performance as final.
While B uses A's labels, both models train on the same distribution, and A's outputs should provide useful information for B.
\end{proof}



\begin{algorithm}
\caption{Error Reweighted Conformalized Graph Neural Network for Edge Weight Prediction}
\label{alg: CQR}
\hspace*{\algorithmicindent} \textbf{Input:} The binary adjacency matrix $A \in \{0, 1\}^{n\times n}$, edge weight matrix $W \in \mathbb{R}^{n\times n}$, node features $X\in \mathbb{R}^{n\times f}$, training edges and weights $\Etrain$ and $\Wtrain$, validation edges and weights $\Eval$ and $\Wval$, calibration edges and weights $\Ecalib$ and $\Wcalib$, and test edges $\Etest$, user-specified error rate $\alpha \in (0,1)$, two GNN models $f_{\theta_1}$ and $f_{\theta_2}$ with trainable parameters $\theta_1$ and $\theta_2$.\\
\begin{algorithmic}[1]
\State Train the models $f_{\theta_1}$ and $f_{\theta_2}$ with $\Wtrain$ and $\Wval$ according to Algorithm~\ref{closs_train}.
\State Predict the confidence interval $[\hat{W}^{\alpha/2}_{ij},\hat{W}^{1-\alpha/2}_{ij}]$ as the output of $f_{\theta_1}$ and the residual $\hat{R}_{ij}$ as the output of $f_{\theta_2}$, taking the calibration data as input.
\State Compute the \score{} which quantifies the residual of the calibration edge weights $\Wcalib$ projected onto the nearest quantile produced by $f_{\theta_1}$ and $f_{\theta_2}$:
\begin{equation}
\label{eq: CQR score}
V^{\textrm{ER}}_{ij} = \max\left \{ \frac{\hat{W}^{\alpha/2}_{ij} - \Wcalib_{ij}}{\big|\hat{R}_{ij}\big|}, \frac{\Wcalib_{ij} - \hat{W}^{1-\alpha/2}_{ij}}{\big|\hat{R}_{ij}\big|}  \right\}, \; (i, j) \in \Ecalib,    
\end{equation}
\State Compute $d =$ the $k$-th smallest value in $\{V^{\textrm{ER}}_{ij}\}$, where $k=\lceil(|\Ecalib| +1)(1-\alpha)\rceil$;
\State Construct a prediction interval for test edges:  
\begin{equation}
    C_{ab} = \Big[\hat{W}^{\alpha/2}_{ab} - d\big|\hat{R}_{ab}\big|, \hat{W}^{1-\alpha/2}_{ab} + d\big|\hat{R}_{ab}\big| \Big], \; (a, b) \in \Etest. \nonumber
\end{equation}
\end{algorithmic}
\hspace*{\algorithmicindent} \textbf{Output:} Prediction of confidence intervals for the test edges $(a, b) \in \Etest$ with the coverage guarantee:
\begin{equation}
    P\big(\Wtest_{ab} \in C_{ab} \big) \geq 1 - \alpha.
\end{equation}
\end{algorithm}
We use the standard deviation of these predictions as a proxy of the residual. 
% \begin{allcomments}
%     \NC{Does computing the difference between the quantiles help?
%     The error we would like to estimate is the difference between the estimated and true quantiles.
%     As for ER, it looks like the reweighting error depends on the uncertainty of the model on the training set. 
%     This may make the intervals unnecessarily big because the training residuals are smaller than the calibration residuals.
%     Do you agree?}
% \end{allcomments}
More concretely, we propose the score function in \eqref{eq: CQR score},
\begin{equation}
V^{\textrm{ER}}_{ij} = \max\left \{ \frac{\hat{W}^{\alpha/2}_{ij} - \Wcalib_{ij}}{\big|\hat{R}_{ij}\big|}, \frac{\Wcalib_{ij} - \hat{W}^{1-\alpha/2}_{ij}}{\big|\hat{R}_{ij}\big|}  \right\}, \; (i, j) \in \Ecalib,    
\end{equation}
which is reweighted according to the absolute value of the residual as predicted by the ER-GNN model (\ref{eq: RGAE}).


%
Let $d^{\textrm{ER}}$ be the $k$-th smallest value in $\{V^{\textrm{ER}}_{ij}\}$, where $k=\lceil(|\Ecalib| +1)(1-\alpha)\rceil$.
The ER prediction intervals are 
\begin{equation}\label{eq: interval ERC}
\begin{split}
   C_{ab} =  \Big[ &\hat{W}^{\alpha/2}_{ab} - d^{\textrm{ER}} \big|\hat{W}^{1-\alpha/2}_{ab} - \hat{W}^{\alpha/2}_{ab} \big|,  \\
   & \hat{W}^{1-\alpha/2}_{ab} + d^{\textrm{ER}}\big|\hat{W}^{1-\alpha/2}_{ab} - \hat{W}^{\alpha/2}_{ab}\big| \Big], \; (a, b) \in \Etest,
\end{split}
\end{equation}
for CQR-ER, and
\begin{equation}\label{eq: interval ER R-GAE}
   C_{ab} =  \Big[ \hat{W}^{\alpha/2}_{ab} - d^{\textrm{ER}} \big|\hat{R}_{ab}\big|,  \;
    \hat{W}^{1-\alpha/2}_{ab} + d^{\textrm{ER}}\big|\hat{R}_{ab}\big| \Big], \; (a, b) \in \Etest,
\end{equation}
when the residual is instead predicted by the separate residual model~\eqref{eq: RGAE}.
%for CQR-ER with a separate residual model, and
%%\begin{equation}\label{eq: interval ER CP}
%\begin{split}
%   C_{ab} = \Big[ & f_\theta \left((a, b); A, X, \Wtrain \right) - d^{\textrm{ER}} \big( s_{ab}^{\textrm{MC}} + \epsilon \big),  \\
%   & f_\theta \left((a, b); A, X, \Wtrain \right) + d^{\textrm{ER}} \big( s_{ab}^{\textrm{MC}} + \epsilon \big) \Big], \; (a, b) \in \Etest,
%\end{split}
%\end{equation}
%for CP-ER.



%\begin{proposition}\label{prop:exchangeability}
%The prediction intervals generated by split CP (Algorithm \ref{alg: split CP}), CQR (Algorithm \ref{alg: CQR}), and ER (Section \ref{subsec: ERC}), are marginally valid, i.e. obey (\ref{eq: desired coverage}).
%\end{proposition} 

%\begin{proof}
%    First, we show that the calibration and test conformity scores defined in (\ref{eq: score}) are exchangeable. 
%    Given the entire graph structure, $A$, all the node features, $X$, and the edge weights of the training edges, $\Wtrain$, 
 %   the node embeddings are trained based on $\Wtrain$, and the edge weights in the remaining $\Ect$ are set randomly, the division of $\Ect$ into $\Ecalib$ and $\Etest$ have no impact on the training process. Consequently, the \score{s} for $\Ecalib$ and $\Etest$ are exchangeable.
%    In practice, we split $\Ect$ into $\Ecalib$ and $\Etest$ randomly (as detailed in Section \ref{sec: empirical})  by converting the graph into its line graph and then selecting nodes uniformly at random.
    
    % the predicted edge weights $\{\hat{W}_{ij}\}_{(i, j)\in \Ecalib \cup \Etest} = f(X, A, \Wtrain)$ does not depend on how the remaining edges are divided into $\Ecalib$ and $\Etest$. 
    % When we choose $\Ecalib$ and $\Etest$ uniformly at random (as detailed in Section \ref{sec: empirical}), the distribution of calibration and test edges is exchangeable. 
    % In practice, this is achieved by converting the graph into its line graph and then selecting nodes randomly.

%    In addition to the proof presented in \citep{huang2023uncertainty}, we explore an alternative proof that is equivalent but applied within a line graph setting.

%\end{proof}

%\begin{figure}
%\centerline{\includegraphics[width=\textwidth,clip=]{figures/Chicago/Chicago.png}}
%\small
%\caption{The figure showcases the application of our proposed prediction models, which provide a coverage guarantee, using a snapshot of traffic network and traffic flow data from Chicago, IL, United States \citep{bar2021transportation}. The traffic network is divided into training roads (represented by black solid lines) and test roads (represented by red dashed lines).
%Our Conformal Quantile Regression with Graph Autoencoder and Reweight Technology (ERC-CQR-GAE) model, as described in Algorithm \ref{alg: TAR}, is developed to generate prediction intervals with a user-specified error rate of $\alpha=0.05$. The middle plot displays the predicted edge weights $\hat{W}$, where the line thickness increases proportionally with the predicted edge weights.
%The right plot illustrates the lengths of the prediction intervals, with darker lines indicating wider intervals or higher inefficiency as defined by Equation \ref{eq: ineff}. This visualization helps to identify the regions in the traffic network where the prediction intervals are wider, suggesting higher uncertainty in the predictions.
%By applying our CQR-GAE model to this real-world transportation dataset, we can provide users with reliable and locally adaptive prediction intervals for the edge weights, enabling more informed decision-making in transportation planning and management..}
%\label{fig: Chicago}
%\end{figure}


\subsubsection{Error Reweighted Conformalized Graph Neural Network for Node Regression}\label{sec: problem2}
We propose ER-GNN for the node regression task to predict a continuous target variable $y_i$ associated with each node $i$ in a graph. Compared with the edge weight prediction algorithm, which uses the node embeddings to predict the edge weights, node regression minimizes the distance between the direct output of the decoder and the labels. The algorithm consists of three steps: 1) train a GNN model, 2) define a non-conformity measure, and 3) compute prediction sets.

\begin{algorithm}
\caption{Error Reweighted Conformalized Graph Neural Network for Node Regression}
\label{alg: nodd regression}
\hspace*{\algorithmicindent} \textbf{Input:} The binary adjacency matrix $A \in \{0, 1\}^{n\times n}$, node features $X\in \mathbb{R}^{n\times f}$, training nodes and labels $N^{train}$, $Y^{train}$, calibration nodes and labels $N^{calib}$, $Y^{calib}$, and test nodes $N^{test}$, user-specified error rate $\alpha \in (0,1)$, two GNN models $f_{\theta_1}$ and $f_{\theta_2}$ with trainable parameters $\theta_1$ and $\theta_2$.\\
\begin{algorithmic}[1]
\State Train the models $f_{\theta_1}$ and $f_{\theta_2}$ with $Y^{train}$ according to Algorithm~\ref{closs_train}.
\State Compute the \score{} which quantifies the residual of the calibration node labels $Y^{calib}$ projected onto the nearest quantile produced by $f_{\theta_1}$ and $f_{\theta_2}$:
\begin{equation}\label{eq: CQR score1}
V^{\textrm{ER}}_{i} = \max\left \{ \frac{\hat{Y}^{\alpha/2}_{i} - Y^{calib}_{i}}{\big|\hat{R}_{i}\big|}, \frac{Y^{calib}_{i} - \hat{Y}^{1-\alpha/2}_{i}}{\big|\hat{R}_{i}\big|}  \right\}, \; (i) \in N^{calib},    
\end{equation}
\State Compute $d =$ the $k$th smallest value in $\{V^{\textrm{ER}}_{i}\}$, where $k=\lceil(|N^{calib}| +1)(1-\alpha)\rceil$;
\State Construct a prediction interval for test nodes:  
\begin{equation}
    C_{a} = \Big[\hat{Y}^{\alpha/2}_{a} - d\big|\hat{R}_{a}\big|, \hat{Y}^{1-\alpha/2}_{a} + d\big|\hat{R}_{a}\big| \Big], \; (a) \in N^{test}. \nonumber
\end{equation}
\end{algorithmic}
\hspace*{\algorithmicindent} \textbf{Output:} Prediction intervals for the test nodes $a \in N^{test}$ with the coverage guarantee:
\begin{equation}
    P\big(Y^{test}_{a} \in C_{a} \big) \geq 1 - \alpha.
\end{equation}
\end{algorithm}



First, we train a traditional GNN model for the node regression task using the available graph data. The GNN model used in our experiments is GAE. The GNN model learns a function $f\colon \mathcal{G} \rightarrow \mathbb{R}^n$, where $\mathcal{G}$ is the input graph and $f(\mathcal{G})_i$ represents the predicted target variable for node $i$.

\begin{align}
\mathbf{Z} &= f_{\text{encoder}}(\mathbf{X}, \mathbf{A}), \label{enbedding_equation} \\
\hat{\mathbf{Y}} &= f_{\text{decoder}}(\mathbf{Z}), \\
\mathcal{L} &= \lVert \mathbf{Y} - \hat{\mathbf{Y}} \rVert^2 .
\end{align}
Here, $\theta_{\text{encoder}}$ and $\theta_{\text{decoder}}$ denote the parameters of the encoder and decoder, respectively, which are fixed after training. The objective is to minimize the reconstruction error $\mathcal{L}$ by optimizing the encoder and decoder parameters.

In the second and third steps, a residual GNN predicts the residual of the node labels, which provides the weight for the non-conformity measure when computing the prediction sets. The details are given in Algorithm~\ref{alg: nodd regression}.



\begin{algorithm}
\caption{Error Reweighted Conformalized Graph Neural Network for Node Classification}
\label{alg: nodd classification}
\hspace*{\algorithmicindent} \textbf{Input:} The binary adjacency matrix $A \in \{0, 1\}^{n\times n}$, node features $X\in \mathbb{R}^{n\times f}$, training nodes and class labels $N^{train}$, $L^{train}$, calibration nodes and class labels $N^{calib}$, $L^{calib}$, and test nodes $N^{test}$, user-specified error rate $\alpha \in (0,1)$, two GNN models $f_{\theta_1}$ and $f_{\theta_2}$ with trainable parameters $\theta_1$ and $\theta_2$.\\
\begin{algorithmic}[1]
\State Train the models $f_{\theta_1}$ and $f_{\theta_2}$ with $L^{train}$ according to Algorithm~\ref{closs_train}.
\State Compute the \score{} which quantifies the residual of the calibration node classes $L^{calib}$ projected onto the nearest quantile produced by $f_{\theta_1}$ and $f_{\theta_2}$:
\begin{equation}\label{eq: CQR score2}
V^{\textrm{ER}}_{i} = \max\left \{ \frac{\hat{L}^{\alpha/2}_{i} - L^{calib}_{i}}{\big|\hat{R}_{i}\big| + \epsilon}, \frac{L^{calib}_{i} - \hat{L}^{1-\alpha/2}_{i}}{\big|\hat{R}_{i}\big| + \epsilon}  \right\}, \; (i) \in N^{calib},    
\end{equation}
\State Compute $d =$ the $k$th smallest value in $\{V^{\textrm{ER}}_{i}\}$, where
\begin{equation} \label{diffquantile}
    k=\operatorname{DiffQuantile}(\lceil(|N^{calib}| +1)(1-\alpha)\rceil);
\end{equation}
\State Construct a prediction interval for test nodes:  
\begin{equation}
    C_{a} = \Big[\hat{L}^{\alpha/2}_{a} - d\big(\big|\hat{R}_{a}\big| + \epsilon\big), \hat{L}^{1-\alpha/2}_{a} + d\big(\big|\hat{R}_{a}\big| + \epsilon\big) \Big], \; (a) \in N^{test}. \nonumber
\end{equation}
\end{algorithmic}
\hspace*{\algorithmicindent} \textbf{Output:} Prediction intervals for the test nodes $a \in N^{test}$ with the coverage guarantee:
\begin{equation}
    P\big(L^{test}_{a} \in C_{a} \big) \geq 1 - \alpha.
\end{equation}
\end{algorithm}

\subsubsection{Guaranteed Node Classification Using GNNs}\label{sec: problem3}

The node classification problem is a fundamental task in graph-based machine learning, where the goal is to predict a discrete label or class for each node in a given graph. Formally, let $G = (V, E)$ be a graph with $n$ nodes, where $V = \{1, 2, \dots, n\}$ is the set of nodes and $E \subseteq V \times V$ is the set of edges. Each node $i \in V$ has a corresponding feature vector $x_i \in \mathbb{R}^d$ and a true label $y_i \in \mathcal{Y}$, where $\mathcal{Y}$ is the set of possible labels.

Similar to the task of node regression, the approach consists of the following steps:  1) Train a GNN model, 2) Define a non-conformity measure, 3) Compute prediction sets.

1. \textbf{Train a GNN model}: Train a traditional graph neural network (GNN) model for the node classification task using the available graph data. We use GAE in our experiments. The GNN model learns a function $f: \mathbb{R}^d \rightarrow \mathcal{Y}$ that maps the node features to the corresponding labels.

The training process of the GAE model is as follows:

\begin{align}
\mathbf{Z} &= f_{\text{encoder}}(\mathbf{X}, \mathbf{A}) \\
\hat{\mathbf{Y}} &= f_{\text{decoder}}(\mathbf{Z}) \\
\mathcal{L} &= -\frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}y_{i,c}\log(\hat{y}_{i,c})
\end{align}

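where $\mathcal{L}$ is the cross-entropy training loss, $y_{i,c}$ and $\hat{y}_{i,c}$ are the one-hot label and the predicted probability of node $i$ for class $c$, and $C$ is the number of classes. Because the labels and the outputs are one-hot encoded, we post-process them after training and testing the model: we apply a softmax to the decoder output to obtain the class-membership probabilities of each node and multiply them by the index vector $\mathbf{I}=[0,1,2,\dots,C-1]$.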


That is, the class prediction is formed from the decoder output via the softmax function:
\begin{align}
\hat{L} &= \text{softmax}(\mathbf{\hat{Y}}) \times \mathbf{I}^\top
\end{align}

The motivation for this construction is to obtain a real-valued representation of the (discrete) class, which allows a flexible $d$ that better captures the error structure of the residual in Algorithm~\ref{alg: nodd classification}. After that, $\hat{R}$ is obtained by:

\begin{align}
\mathbf{\hat{R}} &= \hat{L}- L
\end{align}
where $L$ is the label indicating the class to which the node belongs. 

In Algorithm~\ref{alg: nodd classification}, we set $\epsilon$ to a small positive real number (1e-9) to prevent the denominator from being equal to 0. In addition, we need the differentiable quantile method in Equation~\eqref{diffquantile}. Since the non-conformity score is usually differentiable, only a differentiable quantile calculation is required, for which well-established methods are available \citep{chernozhukov2010quantile,blondel2020fast}.


\section{Results} \label{sec: r}
\subsection{Empirical Analysis}\label{sec: empirical}
In this section, we showcase the application of the proposed ER-GNN on 15 datasets for edge weight prediction, node regression, and node classification problems. We conduct a comparative
analysis of the performance of ER-GNN and four competitors based on two metrics.

% \subsection{Transportation Network Snapshot}
\vspace{0.1in}
\noindent
\textbf{Dataset:}

We apply our proposed algorithm to four kinds of datasets: 1) traffic datasets, 2) citation connection datasets, 3) social network datasets, and 4) others. For the traffic datasets, we use real-world traffic network and traffic flow data from 
Chicago and Anaheim~\citep{bar2021transportation}. The Chicago dataset consists of 541 nodes representing road junctions and 2150 edges representing road segments with directions, while the Anaheim dataset consists of 413 nodes and 858 edges.
In this context, each node is characterized by a two-dimensional feature $X_i\in \mathbb{R}^{2}$ representing its coordinates, and each edge is associated with a weight that signifies the traffic volume passing through the corresponding road segment.

For the Citation datasets, Cora, PubMed, and CiteSeer datasets are three widely used citation network datasets that are extensively employed in graph neural network (GNN) and node classification research. The Cora dataset is a citation network composed of computer science literature, containing 2,708 nodes and 5,429 edges. Each paper in the Cora dataset is represented by a 1,433-dimensional bag-of-words vector, indicating the presence of words, and there are 7 categories in the computer science domain used as labels. The PubMed dataset, on the other hand, is a citation network of biomedical literature, with 19,717 nodes and 44,338 edges. In PubMed, each paper is represented by a 500-dimensional bag-of-words vector, and there are 3 categories corresponding to different medical topics. The CiteSeer dataset is also a citation network of computer science literature, but it is smaller in scale, with 3,312 nodes and 4,732 edges. The node features in CiteSeer are represented by a 3,703-dimensional bag-of-words vector for each paper. CiteSeer contains 6 categories in the computer science domain as labels.


 Social network datasets like Twitch, CS, and Physics have become increasingly important resources for graph machine-learning research. The Twitch dataset is a widely used social network dataset in graph machine learning. It captures the complex interactions and relationships within the Twitch live-streaming platform, a leading gaming and creative content service. The dataset includes information on 168,114 users and 4,949,552 connections between them, representing activities such as friendships, channel subscriptions, and chat interactions. Each user is represented by a set of features describing their platform activity, such as the number of followers, videos watched, and channels subscribed to. 
The CS dataset, on the other hand, focuses on the social network of researchers in the computer science domain. With 18,772 nodes (researchers) and 81,894 edges (co-authorship relationships), this dataset provides rich node features such as publication history, research topics, and academic positions (e.g., professor, student, staff). Analyzing this dataset can yield insights into the collaboration patterns and academic hierarchies within the computer science research community. The Physics dataset captures the social network of researchers in the physics domain, with 34,546 nodes (researchers) and 420,877 edges (citations and collaborations). Each researcher is represented by features like their publication venues, citations, and research areas, as well as labels indicating their academic rank (e.g., junior, senior). This dataset enables the study of knowledge diffusion and academic status within the physics research community using graph-based methods.


%We adopt a similar data partitioning procedure from \citep{jia2020residual, huang2023uncertainty}, where we allocate 50\% of the data for the training set $\Etrain$, 10\% for the validation set $\Eval$, and the remaining 40\% for the combined calibration and test set $\Ect$. Figure \ref{fig: Chicago} provides an example of how the Chicago network data is divided into these different sets.

%The figure also depicts the prediction outcome of our proposed Equivariant Residual Conformal Quantile Regression with Graph Autoencoder (ERC-CQR-GAE) model, as described in Algorithm \ref{alg: TAR}. The middle plot shows the predicted edge weights, while the right-hand plot illustrates the width of the prediction intervals.
%By applying our ERC-CQR-GAE model to these real-world transportation datasets, we can provide users with reliable and locally adaptive prediction intervals for the edge weights, enabling more informed decision-making in transportation planning and management.


\begin{figure}
\centerline{\includegraphics[width=\textwidth,clip=]{figures/Node_RegressionF1.png}}
\small
\caption{Prediction intervals for node regression generated by CF-GNN and ER-GAE. The x-axis shows the nodes, sorted by label. The y-axis shows the prediction interval of each node. The error rate $\alpha$ is 0.05. Blue and red represent the results of ER-GAE and CF-GNN, respectively.}

\label{fig: comparisonnr}
\end{figure}

%\begin{figure}
%\centerline{\includegraphics[width=\textwidth,clip=]{figures/Node_ClassificatinF1.png}}
%\small
%\caption{The prediction interval generated by both CF-GNN and Ours (ER-GAE) on the task Node Classification. And both of them are sorted by average prediction which is constructed with a user-specified error rate of $\alpha=0.05$.  Notably, Ours outperforms baseline model in terms of inefficiency (\ref{eq: ineff}) }
%\label{fig: comparisonnc}
%\end{figure}

\begin{figure}
\centerline{\includegraphics[width=\textwidth,clip=]{figures/nc-r.pdf}}
\small
\caption{Histograms of the predicted values for the node classification task on the PubMed dataset. The error rate $\alpha$ is 0.05. Each sub-figure corresponds to one class, from 0 to 5. The x-axis is the predicted value from the model. The y-axis is the frequency of the corresponding predicted value. Blue and yellow represent the results of ER-GAE and CF-GNN, respectively.}
\label{fig: comparisonnc}
\end{figure}

\begin{figure}
\centerline{\includegraphics[width=\textwidth,clip=]{figures/edge-weight-interval.png}}
\small
\caption{The graph shows the traffic volume prediction intervals generated on the Chicago traffic dataset. All methods set their error rate $\alpha$ at 0.05. The x-axis represents individual roads sorted by their actual/ground truth traffic volumes. The y-axis represents the predicted intervals. Different colors distinguish the results from different prediction methods. }

\label{fig: comparisonwep}
\end{figure}

\vspace{0.1in}
\noindent
{\bf Evaluation Metrics:}
For evaluation, we use the marginal coverage, defined as 
\begin{equation}\label{eq: cover}
    \textrm{cover} = \frac{1}{|\Etest|} \sum_{(i,j)\in \Etest} \mathbbm{1}\big(\Wtest_{ij} \in C_{ij}\big),
\end{equation}
where $C_{ij}$ is the prediction interval for edge $(i, j)$. The second metric is inefficiency, which is defined as
\begin{equation}\label{eq: ineff}
    \textrm{ineff} = \frac{1}{|\Etest|} \sum_{(i,j)\in \Etest} |C_{ij}|,
\end{equation}
which measures the average length of the prediction interval.

In addition to the marginal coverage, we also consider the conditional coverage. Specifically, we use the method of \citep{romano2020classification, cauchois2020knowing} to measure the coverage over a slab of the feature space $S_{v, a, b}= \big\{x \in \mathbb{R}^{2f}: a \leq v^\top x \leq b \big\}$, where the direction $v \in \mathbb{R}^{2f}$ and the thresholds $a < b \in \mathbb{R}$ are chosen adversarially and independently from the data. The slab is applied to $[X_i \mathbin\Vert X_j]$, the concatenated node features of the two endpoints of an edge $(i, j)$. For any prediction interval $f_\theta^*$ and $\delta \in (0, 1)$, the \textit{worst slice coverage} is defined as 
\begin{equation}\label{eq: cond cover}
\begin{split}
    \textrm{WSC}(f_\theta^*, \delta) &= \inf\limits_{\substack{v \in \mathbb{R}^{2f}, \\ a < b \in \mathbb{R}}} \big \{ P \big( \Wtest_{ij} \in C_{ij} \mid [X_i \mathbin\Vert X_j] \in S_{v, a, b}  \big) \\
    & \phantom{----------} \textrm{s.t. } P([X_i \mathbin\Vert X_j] \in S_{v, a, b}) \geq \delta  \big \}.
\end{split}
\end{equation}
We use inefficiency (ineff) and worst slice coverage (WSC) as evaluation metrics. Lower ineff and higher WSC indicate better performance. To estimate the WSC, we independently sample 1000 vectors $v$ from the unit sphere in $\mathbb{R}^{2f}$. The parameters $a, b, \delta$ are tuned via grid search. Specifically, 25\% of the test data is used to estimate the optimal $v, a, b, \delta$ values, and the conditional coverage is then calculated on the remaining 75\% of the test data.





\vspace{0.1in}
\noindent
{\bf Models and baselines: }
We name the model that combines conformal prediction (CP, as described in work \citep{huang2024uncertainty}) with graph autoencoder (GAE, Section \ref{subsec: GAE}) as CP-GAE. Similarly, we name the model that uses CP with line graph neural network (LGNN, in work \citep{cai2021line}) as CP-LGNN. For the models that use conformal quantile regression (CQR, Algorithm \ref{alg: CQR}), we refer to them as CQR-GAE. Additionally, we name the models that incorporate Error Reweighted Residual Conformal (ER, Section \ref{sec: conformal}) as CQR-ER-GAE and CQR-ER-LGNN. We also experiment with the directed variant of GAE, called DiGAE. The corresponding models are CP-DiGAE, CQR-DiGAE, and CQR-ER-DiGAE.

We use four popular graph neural network (GNN) models - GCN \citep{kipf2016semi}, GraphConv \citep{morris2019weisfeiler}, GAT \citep{velivckovic2017graph}, and GraphSAGE \citep{hamilton2017inductive} - as the base graph convolution layers for both the CP and CQR based models.

\vspace{0.1in}
\noindent
{\bf Result\footnote{The code is available at \url{https://github.com/zhangzheng01310911/RGA}.}:} 


For each dataset and model, we run the experiment 10 times, splitting the data into training, validation, and combined calibration-and-test sets. We then draw 100 random splits of the calibration and test edges, on which we run the baseline models and ER-GNN and evaluate the empirical coverage.


The results presented in Table 1 show that, overall, the ER-GNN models outperform the others in terms of inefficiency (as defined in Equation~\eqref{eq: ineff}) and conditional coverage (Equation~\eqref{eq: cond cover}). This indicates that the ER variants can strike a better balance between capturing the uncertainty in the predictions and maintaining a high level of coverage. Table 1 also shows that GAE and DiGAE outperform LGNN, highlighting the efficacy of the autoencoder approach in weight prediction.


\begin{table}[ht]
\label{tab:eff_all_models1gsmg}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\toprule
GNN Model on Anaheim Data& \multicolumn{2}{c|}{GraphConv} & \multicolumn{2}{c|}{SAGEConv}  & \multicolumn{2}{c|}{MC-DropOut} & \multicolumn{2}{c|}{GATS} \\ \cmidrule{1-9}
Score Method-CP  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9156\std{0.0569} $ 
&${5.4093}\std{0.6783}$
&$0.9161\std{0.0617}$
&$6.2633\std{0.6041}$
&$0.9273\std{0.0556} $ 
&$6.3644\std{0.7133}$
&$0.9264\std{0.0702}$
&$6.4278\std{0.6963}$\\
DiGAE&$0.9163\std{0.0590} $ 
&${5.6076}\std{0.6497}$
&$0.9143\std{0.0662}$
&$6.3111\std{0.6624}$
&$0.9206\std{0.07034} $ 
&$6.4915\std{0.6903}$
&$0.9291\std{0.0539}$
&$6.3954\std{0.0416}$\\
LGNN&$0.9452\std{0.0287}$ 
&$6.9076\std{0.2908}$
&$0.9373\std{0.0360}$
&$6.4227\std{0.0455}$
&$0.9365\std{0.0388}$ 
&${6.3655}\std{0.5026}$
&$0.9391\std{0.0333}$
&$6.6962\std{0.3638}$\\ 
\midrule
Average & 0.9257 & 5.9748 & 0.9226 & 6.3324 & 0.9281 & 6.4071 & 0.9315 & 6.4398\\
\midrule
Score Method-CQR  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9548\std{0.0206}$ 
&${5.2680}\std{0.3499}$
&$0.9535\std{0.0285}$
&$5.8272\std{0.2352}$
&$0.9576\std{0.0419}$
&$4.2310\std{1.4752}$
&$0.9578\std{0.0346}$
&$4.1396\std{1.3386}$\\
DiGAE&$0.8984\std{0.0926} $ 
&${5.0580}\std{0.2792}$
&$0.8975\std{0.0982}$
&$5.6483\std{0.2399}$
&$0.9040\std{0.0873} $ 
&$5.7600\std{0.2960}$
&$0.9115\std{0.0691}$  
&$5.7889\std{0.2722}$\\
LGNN&$0.9010\std{0.0555}$ 
&${5.4381}\std{0.1453}$
&$0.9167\std{0.0480}$
&$5.9004\std{0.2302}$
&$0.9333\std{0.0430}$ 
&$6.1160\std{0.1818}$
&$0.9080\std{0.0607}$
&$6.0694\std{0.1861}$ \\ 
\midrule
Average & 0.9180 & 5.2547 & 0.9226 & 5.7920 & 0.9316 & 5.3690 & 0.9258 & 5.3326\\
\midrule  
Score Method-CQR-ER(Ours)  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9545\std{0.0223} $ 
&${5.2184}\std{0.0862}$
&$0.9507\std{0.0224}$
&$5.1030\std{0.2044}$
&$0.9543\std{0.0223} $ 
&$\textbf{3.9605}\std{0.1844}$
&$0.9501\std{0.0243}$
&$5.1843\std{0.2138}$\\
DiGAE&$0.9498\std{0.0353}$ 
&${5.0672}\std{0.2145}$
&$0.9394\std{0.0745}$
&$5.2498\std{0.1524}$
&$0.9534\std{0.0143}$ 
&$5.0522\std{0.1653}$
&$0.9518\std{0.0515}$
&$5.0513\std{0.1748}$\\
LGNN&$0.9485\std{0.0553}$
&${5.0194}\std{0.0816}$
&$0.9471\std{0.0438}$ 
&$5.0365\std{0.1846}$
&$0.9498\std{0.0173}$
&$5.2534\std{0.1024}$
&$0.9484\std{0.0342}$
&$5.0162\std{0.1034}$\\ 
\midrule
Average & \underline{\textbf{0.9509}} & \underline{\textbf{5.1017}} & \underline{\textbf{0.9457}} & \underline{\textbf{5.1298}} & \underline{\textbf{0.9525}} & \underline{\textbf{4.7554}} & \underline{\textbf{0.9501}} & \underline{\textbf{5.0839}}\\
\bottomrule
GNN Model On Chicago Data & \multicolumn{2}{c|}{GraphConv} & \multicolumn{2}{c|}{SAGEConv}  & \multicolumn{2}{c|}{MC-Dropout} & \multicolumn{2}{c|}{GATS} \\ \cmidrule{1-9}
Score Method-CP  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.7984\std{0.1181}$ 
&$3.6659\std{0.3313}$
&$0.8297\std{0.1264}$
&${3.6350}\std{0.2231}$
&$0.8234\std{0.1213}$ 
&$3.6918\std{0.2454}$
&$0.9524\std{0.0333}$
&$3.3493\std{0.5910}$\\
DiGAE&$0.8081\std{0.1257} $ 
&${3.5721}\std{0.1951}$
&$0.8196\std{0.1215}$
&$3.5978\std{0.1884}$
&$0.8135\std{0.1361} $ 
&$3.5846\std{0.2050}$
&$0.8135\std{0.1319}$
&$3.6346\std{0.2432}$\\
LGNN&$0.9174\std{0.0238}$ 
&$6.7157\std{0.1325}$
&$0.9152\std{0.0256}$
&$6.5865\std{0.1577}$
&$0.9151\std{0.0246}$ 
&$6.5265\std{0.1426}$
&$0.9075\std{0.0618}$
&${6.0679}\std{0.1862}$\\ 
\midrule
Average & 0.8477 & 4.6512 & 0.8548 & 4.5998 & 0.8507 & 4.6010 & 0.8912 & 4.3506 \\
\midrule 
Score Method-CQR  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9514\std{0.0144} $ 
&${3.3652}\std{0.1312}$
&$0.9517\std{0.0141}$
&$3.5878\std{0.2107}$
&$0.9578\std{0.0420}$
&$4.0504\std{1.2916}$
&$0.9524\std{0.0333}$
&$3.3292\std{0.5866}$\\
DiGAE&$0.9205\std{0.0498} $ 
&${3.3135}\std{0.1172}$
&$0.9223\std{0.0469}$
&$3.3872\std{0.1260}$
&$0.9250\std{0.0479} $ 
&$3.4241\std{0.1271}$
&$0.9089\std{0.0611}$
&$3.6158\std{0.2348}$\\
LGNN&$0.9284\std{0.0296}$ 
&${3.4362}\std{0.1029}$
&$0.9305\std{0.0258}$
&$3.4844\std{0.1233}$
&$0.9290\std{0.0284}$ 
&$3.6514\std{0.1050}$
&$0.9379\std{0.0261}$
&$4.0805\std{0.5445}$\\ 
\midrule  
Average & 0.9334 & 3.3716 & 0.9348 & 3.4865 & 0.9373 & 3.7086 & 0.9331 & 3.6752\\
\midrule
Score Method-CQR-ER(Ours)  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
GAE&$0.9554\std{0.0152} $ 
&${3.2751}\std{0.1413}$
&$0.9537\std{0.0189}$
&$3.2435\std{0.1478}$
&$0.9513\std{0.0123}$ 
&$3.3126\std{0.1622}$
&$0.9506\std{0.0145}$
&$3.1268\std{0.1223}$\\
DiGAE&$0.9499\std{0.0415} $ 
&${3.1342}\std{0.1483}$
&$0.9487\std{0.0302}$
&$\textbf{3.0435}\std{0.1423} $
&$0.9492\std{0.0424}$
&$3.1557\std{0.1529}$
&$0.9412\std{0.0724}$ 
&$3.1923\std{0.2125}$\\
LGNN&$0.9425\std{0.0344}$
&${3.4521}\std{0.0635}$
&$0.9454\std{0.0283}$ 
&$3.1845\std{0.0456}$
&$0.9482\std{0.0345}$
&$\textbf{3.0372}\std{0.0713}$
&$0.9493\std{0.0282}$
&$3.5361\std{0.1158}$\\
\midrule
Average & \underline{\textbf{0.9493}} & \underline{\textbf{3.2871}} & \underline{\textbf{0.9493}} & \underline{\textbf{3.1552}} & \underline{\textbf{0.9496}} & \underline{\textbf{3.1685}} & \underline{\textbf{0.9470}} & \underline{\textbf{3.2851}}\\
\bottomrule


\end{tabular}    
    \end{adjustbox}
    \vspace{0.0005in}
        \vspace{0.05in}\caption{Performance comparison of the proposed models, based on the conditional coverage (\ref{eq: cond cover}) and inefficiency (\ref{eq: ineff}) on the edge weight prediction task. The models were tested using several widely-used graph convolutional layers, including GraphConv \citep{morris2019weisfeiler}, SAGEConv \citep{hamilton2017inductive}, GCNConv \citep{kipf2016semi}, and GATConv \citep{velivckovic2017graph}. The best conditional coverage and inefficiency for each graph convolutional layer are highlighted in bold. Across diverse datasets and graph convolutional layers, CQR-GAE and CQR-ER-GAE demonstrate strong performance in both inefficiency and conditional coverage, while CQR-DiGAE and CQR-ER-DiGAE excel in minimizing inefficiency.}
\end{table}


Figure~\ref{fig: comparisonwep} illustrates the prediction interval produced by LGNN and ER(CQR) models. These prediction intervals are constructed with a user-specified error rate of $\alpha=0.05$. Furthermore, Figure~\ref{fig: comparisonwep} also illustrates the adaptability of the CQR models by generating prediction intervals of varying sizes, which aligns with the data characteristics. This suggests that the CQR-based approaches can provide more locally adaptive and reliable prediction intervals, compared to the CP-based models.

The ER-based models can generate prediction intervals that are both efficient and well-calibrated, making them a more suitable choice for real-world transportation applications where accurate and reliable predictions are crucial for informed decision-making.

%It is worth noting that for the ER approach, tuning the regularization hyperparameter can be a notably challenging task. The performance of the ER models is highly sensitive to the choice of this hyperparameter.
%These findings suggest that the benefits of incorporating the ER approach may be context-dependent and require careful hyperparameter tuning to achieve the desired balance between coverage and efficiency in transportation network applications.

For the other two tasks, node regression and node classification, we also conduct two experiments each. For node regression, Table 2 shows that our method outperforms the baseline model, CF-GNN~\citep{huang2024uncertainty}, in both WSC (conditional coverage) and inefficiency. In addition, the visualization in Figure~\ref{fig: comparisonnr} shows that our intervals are smaller than those of CF-GNN. Similarly, for the node classification task, numerical and visual results can be seen in Table 3 and Figure~\ref{fig: comparisonnc}. Our model achieves better results in terms of both accuracy and inefficiency.



\begin{figure}[h]
\includegraphics[width=0.23\textwidth]{figures/AnOri.png}
\includegraphics[width=0.23\textwidth]{figures/AnOursF.png}
\includegraphics[width=0.23\textwidth]{figures/ChiOri.png}
\includegraphics[width=0.23\textwidth]{figures/ChiOursF.png}
\caption{Residuals between predicted and actual traffic volumes across roads in two cities under different models. The residual is the absolute difference between the prediction and the ground truth. (a) Residuals of the predicted road traffic volumes in Anaheim for the baseline model~\citep{huang2024uncertainty}. (b) Residuals of the predicted road traffic volumes in Anaheim for ER-GAE. (c) Residuals of the predicted road traffic volumes in Chicago for the baseline model. (d) Residuals of the predicted road traffic volumes in Chicago for ER-GAE. For each city, the residuals from the two models were independently normalized to a 0--1 range for comparison purposes.}
\label{fig:comparisontraffic}
\end{figure}


\begin{table}[ht]
\label{tab:eff_all_models12}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\toprule
Dataset& \multicolumn{2}{c|}{GraphSAGE} & \multicolumn{2}{c|}{SGC}  & \multicolumn{2}{c|}{GCN} & \multicolumn{2}{c|}{GATS} \\ \cmidrule{1-9}
 Metrics & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
Anaheim: CF-GNN&$0.9520\std{0.0669} $ 
&$\textbf{1.9231}\std{0.0483}$
&$0.9559\std{0.0617}$
&$2.2031\std{0.0241}$
&$0.9519\std{0.0531} $ 
&$2.3782\std{0.0533}$
&$0.9523\std{0.0302}$
&$2.1499\std{0.0463}$\\
Anaheim: ER-GAE&$\textbf{0.9543}\std{0.0320} $ 
&${1.9647}\std{0.0197}$
&$\textbf{0.9563}\std{0.0562}$
&$\textbf{2.0338}\std{0.0224}$
&$\textbf{0.9535}\std{0.0407} $ 
&$\textbf{2.2328}\std{0.0304}$
&$\textbf{0.9590}\std{0.0332}$
&$\underline{\textbf{1.9136}}\std{0.0256}$\\ \midrule
Chicago: CF-GNN&$0.9448\std{0.0519} $ 
&${2.3426}\std{0.0384}$
&$0.9486\std{0.0247}$
&$\textbf{1.0423}\std{0.0372}$
&$0.9505\std{0.0447}$
&$2.0456\std{0.0443}$
&$0.9508\std{0.0569}$
&$\textbf{1.1396}\std{0.0686}$\\
Chicago: ER-GAE&$\textbf{0.9476}\std{0.0426} $ 
&$\textbf{2.2581}\std{0.0392}$
&$\textbf{0.9496}\std{0.0382}$
&$1.2342\std{0.0231}$
&$\textbf{0.9522}\std{0.0373}$ 
&$\underline{\textbf{1.5899}}\std{0.0268}$
&$\textbf{0.9520}\std{0.0371}$  
&$1.1423\std{0.0292}$\\ \midrule  
Education: CF-GNN&$0.9501\std{0.0242} $ 
&${2.3808}\std{0.0427}$
&$0.9500\std{0.0285}$
&$2.4892\std{0.0351}$
&$0.9483\std{0.0408}$
&$2.4380\std{0.0452}$
&$0.9502\std{0.0392}$
&$2.4209\std{0.0376}$\\
Education: ER-GAE&$\textbf{0.9588}\std{0.0426} $ 
&$\textbf{2.0715}\std{0.0289}$
&$\textbf{0.9567}\std{0.0372}$
&$\textbf{2.0607}\std{0.0239}$
&$\textbf{0.9566}\std{0.0373}$ 
&$\textbf{1.8871}\std{0.0260}$
&$\textbf{0.9583}\std{0.0386}$  
&$\underline{\textbf{1.9080}}\std{0.0221}$\\ \midrule
Election: CF-GNN&$0.9498\std{0.0211} $ 
&${0.9268}\std{0.0429}$
&$0.9495\std{0.0215}$
&$\textbf{0.9279}\std{0.0302}$
&$0.9506\std{0.0473}$
&$0.9009\std{0.0282}$
&$0.9488\std{0.0363}$
&$0.9136\std{0.0681}$\\
Election: ER-GAE&$\textbf{0.9514}\std{0.0326}$ 
&$\textbf{0.9203}\std{0.0279}$
&$\textbf{0.9567}\std{0.0372}$
&$0.9307\std{0.0239}$
&$\textbf{0.9510}\std{0.0873} $ 
&$\textbf{0.7743}\std{0.0320}$
&$\textbf{0.9525}\std{0.0317}$  
&$\underline{\textbf{0.6698}}\std{0.0201}$\\ \midrule
Income: CF-GNN&$0.9512\std{0.0264} $ 
&${2.7580}\std{0.0342}$
&$0.9504\std{0.0405}$
&$2.4892\std{0.0302}$
&$0.9511\std{0.0250}$
&$2.5272\std{0.0318}$
&$0.9508\std{0.0329}$
&$2.4396\std{0.0328}$\\
Income: ER-GAE&$\textbf{0.9524}\std{0.0726}$ 
&$\textbf{2.1560}\std{0.0492}$
&$\textbf{0.9505}\std{0.0482}$
&$\textbf{1.9616}\std{0.0358}$
&$\textbf{0.9554}\std{0.0463} $ 
&$\textbf{1.9343}\std{0.0360}$
&$\textbf{0.9531}\std{0.0338}$  
&$\underline{\textbf{1.8699}}\std{0.0403}$\\ \midrule
Unemploy: CF-GNN&$0.9526\std{0.0415}$ 
&${2.2298}\std{0.0523}$
&$0.9510\std{0.0320}$
&$2.4587\std{0.0491}$
&$0.9506\std{0.0294}$
&$2.5013\std{0.0326}$
&$0.9502\std{0.0354}$
&$2.4332\std{0.0376}$\\
Unemploy: ER-GAE&$\textbf{0.9556}\std{0.0426}$ 
&$\textbf{2.1036}\std{0.0308}$
&$\textbf{0.9527}\std{0.0331}$
&$\textbf{2.0607}\std{0.0379}$
&$\textbf{0.9507}\std{0.0373}$ 
&$\textbf{2.0620}\std{0.0260}$
&$\textbf{0.9506}\std{0.0429}$  
&$\underline{\textbf{1.9620}}\std{0.0362}$\\ \midrule
Twitch: CF-GNN&$\textbf{0.9524}\std{0.0443} $ 
&$\textbf{2.6634}\std{0.0365}$
&$0.9523\std{0.0392}$
&$2.6835\std{0.0394}$
&$0.9529\std{0.0257} $ 
&$2.5409\std{0.0404}$
&$0.9515\std{0.0275}$
&$2.6243\std{0.0460}$\\
Twitch: ER-GAE&$0.9503\std{0.0384}$ 
&${5.0643}\std{0.0547}$
&$\textbf{0.9524}\std{0.0350}$
&$\underline{\textbf{2.1292}}\std{0.0319}$
&$\textbf{0.9536}\std{0.0347}$ 
&$\textbf{2.2638}\std{0.0251}$
&$\textbf{0.9520}\std{0.0280}$
&$\textbf{2.1493}\std{0.0255}$\\ \bottomrule

\end{tabular}    
    \end{adjustbox}
    \vspace{0.0005in}
    \caption{Results of Ours (ER-GNN) on Node Regression Datasets}
\end{table}










\begin{table}[ht]
\label{tab:eff_all_models1}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\toprule
Dataset& \multicolumn{2}{c|}{GraphSAGE} & \multicolumn{2}{c|}{SGC}  & \multicolumn{2}{c|}{GCN} & \multicolumn{2}{c|}{GATS} \\ \cmidrule{1-9}
Metrics  & cover$^x$  & ineff & cover$^x$ & ineff & cover$^x$ & ineff& cover$^x$ & ineff\\\midrule
Cora: CF-GNN&$0.9456\std{0.0569} $ 
&${1.6284}\std{0.0483}$
&$0.9461\std{0.0603}$
&$1.6633\std{0.0441}$
&$0.9473\std{0.0556} $ 
&$1.6344\std{0.0418}$
&$0.9464\std{0.0702}$
&$1.6278\std{0.0334}$\\
Cora: ER-GAE&$\textbf{0.9463}\std{0.0509} $ 
&$\textbf{1.6076}\std{0.0397}$
&$\textbf{0.9468}\std{0.0662}$
&$\textbf{1.6017}\std{0.0465}$
&$\textbf{0.9476}\std{0.0732} $ 
&$\textbf{1.6315}\std{0.0303}$
&$\textbf{0.9491}\std{0.0539}$
&$\textbf{1.6254}\std{0.0396}$\\ \midrule
DBLP: CF-GNN&$0.9501\std{0.0523} $ 
&${1.5723}\std{0.0683}$
&$\textbf{0.9451}\std{0.0617}$
&$1.5274\std{0.0416}$
&$\textbf{0.9473}\std{0.0596} $ 
&$1.5644\std{0.0733}$
&$0.9467\std{0.0717}$
&$1.5729\std{0.0463}$\\
DBLP: ER-GAE&$\textbf{0.9503}\std{0.0510} $ 
&$\textbf{1.5607}\std{0.0487}$
&$0.9443\std{0.0462}$
&$\textbf{1.3921}\std{0.0624}$
&$0.9430\std{0.0713} $ 
&$\textbf{1.5491}\std{0.0278}$
&$\textbf{0.9491}\std{0.0539}$
&$\textbf{1.5720}\std{0.0322}$\\ \midrule
CiteSeer: CF-GNN&$0.9528\std{0.0203} $ 
&${1.1680}\std{0.0439}$
&$0.9525\std{0.0257}$
&$\textbf{1.1827}\std{0.0552}$
&$0.9496\std{0.0392}$
&$1.2310\std{0.0332}$
&$0.9508\std{0.0309}$
&$1.2396\std{0.0416}$\\
CiteSeer: ER-GAE&$\textbf{0.9540}\std{0.0926} $ 
&$\textbf{1.1679}\std{0.0605}$
&$\textbf{0.9594}\std{0.0582}$
&$1.1898\std{0.0399}$
&$\textbf{0.9518}\std{0.0373}$ 
&$\textbf{1.2153}\std{0.0290}$
&$\textbf{0.9548}\std{0.0491}$  
&$\textbf{1.2020}\std{0.0392}$\\ \midrule  
PubMed: CF-GNN&$0.9502\std{0.0207} $ 
&${1.4680}\std{0.0361}$
&$0.9508\std{0.0276}$
&$1.4272\std{0.0325}$
&$0.9516\std{0.0458}$
&$1.5310\std{0.0514}$
&$0.9512\std{0.0434}$
&$1.4396\std{0.0485}$\\
PubMed: ER-GAE&$\textbf{0.9512}\std{0.0426}$ 
&$\textbf{1.3275}\std{0.0392}$
&$\textbf{0.9520}\std{0.0482}$
&$\textbf{1.3897}\std{0.0339}$
&$\textbf{0.9521}\std{0.0473}$ 
&$\textbf{1.3732}\std{0.0296}$
&$\textbf{0.9515}\std{0.0419}$  
&$\textbf{1.3989}\std{0.0522}$\\ \midrule
Computers: CF-GNN&$0.9471\std{0.0276} $ 
&${3.3680}\std{0.3499}$
&$0.9492\std{0.0235}$
&$3.8272\std{0.0292}$
&$0.9457\std{0.0435}$
&$3.2310\std{0.0652}$
&$0.9478\std{0.0325}$
&$3.1396\std{0.0586}$\\
Computers: ER-GAE&$\textbf{0.9484}\std{0.0526} $ 
&$\textbf{2.7580}\std{0.0292}$
&$\textbf{0.9495}\std{0.0326}$
&$\underline{\textbf{2.6483}}\std{0.0428}$
&$\textbf{0.9466}\std{0.0419}$ 
&$\textbf{2.5631}\std{0.0387}$
&$\textbf{0.9479}\std{0.0691}$  
&$\underline{\textbf{2.7889}}\std{0.0272}$\\ \midrule
Photo: CF-GNN&$0.9511\std{0.0275} $ 
&${3.2680}\std{0.0395}$
&$0.9515\std{0.0263}$
&$2.2276\std{0.0354}$
&$0.9486\std{0.0419}$
&$2.2010\std{0.0387}$
&$0.9509\std{0.0391}$
&$2.1986\std{0.0286}$\\
Photo: ER-GAE&$\textbf{0.9530}\std{0.0926} $ 
&$\textbf{2.5624}\std{0.0459}$
&$\textbf{0.9519}\std{0.0982}$
&$\textbf{2.0176}\std{0.0346}$
&$\textbf{0.9498}\std{0.0873} $ 
&$\textbf{2.0142}\std{0.0560}$
&$\textbf{0.9512}\std{0.0467}$  
&$\textbf{1.8133}\std{0.0272}$\\ \midrule
CS: CF-GNN&$0.9438\std{0.0224} $ 
&${1.8669}\std{0.0347}$
&$0.9435\std{0.0284}$
&$1.6272\std{0.0452}$
&$\textbf{0.9476}\std{0.0416}$
&$3.6310\std{0.0325}$
&$0.9478\std{0.0317}$
&$2.7396\std{0.0286}$\\
CS: ER-GAE&$\textbf{0.9484}\std{0.0626} $ 
&$\textbf{1.8580}\std{0.0392}$
&$\textbf{0.9475}\std{0.0582}$
&$\textbf{1.6183}\std{0.0361}$
&$0.9440\std{0.0573} $ 
&$\underline{\textbf{2.7600}}\std{0.0260}$
&$\textbf{0.9485}\std{0.0391}$  
&$\textbf{2.3889}\std{0.0238}$\\ \midrule
Physics: CF-GNN&$0.9495\std{0.0243} $ 
&${1.2218}\std{0.0463}$
&$0.9507\std{0.0292}$
&$1.2430\std{0.0324}$
&$0.9489\std{0.0257} $ 
&$\textbf{1.2005}\std{0.0604}$
&$0.9505\std{0.0275}$
&$\textbf{1.2243}\std{0.0246}$\\
Physics: ER-GAE&$\textbf{0.9503}\std{0.0624}$ 
&$\textbf{1.2190}\std{0.0247}$
&$\textbf{0.9514}\std{0.0553}$
&$\textbf{1.2407}\std{0.0419}$
&$\textbf{0.9494}\std{0.0347}$ 
&$1.2128\std{0.0451}$
&$\textbf{0.9508}\std{0.0480}$
&$1.2317\std{0.0255}$\\ \bottomrule

\end{tabular}    
    \end{adjustbox}
    \vspace{0.0005in}
    \caption{Results of Ours (ER-GNN) on Node Classification Datasets}
\end{table}


%\begin{figure}[h]
%    \centering
%        \includesvg[height=3.05cm]{figures/us_election_resultsOriFFF}
%        \caption{Original SVG Image}
%        \label{fig:original}
%    \hfill
%        \includesvg[height=3.5cm]{figures/us_election_resultsOursFFF}
%        \caption{Our SVG Image}
%        \label{fig:our}
%    \caption{Comparison of SVG Images}
%    \label{fig:comparison}
%\end{figure}

Furthermore, we compare the residuals of the baseline model, CF-GNN~\citep{huang2024uncertainty}, with those of ours (ER-GNN), which performs the reweighting operation. The first visualization, on the Chicago and Anaheim traffic datasets, shows the middle residual output and can be seen in Figure~\ref{fig:comparisontraffic}. On the whole, the residual values of CF-GNN are higher than those of ER-GAE. Another visualization, shown in Figure~\ref{fig: comparison2}, gives a similar result: the global residual (the difference between the model output and the ground truth) of ER-GAE is much lower than that of the baselines\footnote{Code reference: \url{https://github.com/RoshanRShetty/Visualization-For-Presidential-Election-2016-.git}}.

We also conduct an ablation study to assess the impact of how the edge weights of the validation, calibration, and test edge sets are initialized. First, we set these edge weights to zero. Second, we assign them the average edge weight of the training edges. Third, we assign weights randomly by bootstrapping the training edge weights and allocating the sampled values to them. The results of the ablation study can be found in Table 4. Overall, the second initialization method is the best choice.

\begin{table}[ht]
\label{tab:ablationstudy}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{|l|c|c|c|c|}
\toprule
GNN Model & \multicolumn{2}{c|}{Anaheim Dataset} & \multicolumn{2}{c|}{Chicago Dataset}   \\ \cmidrule{1-5}
Set All 0  & cover$^x$  & ineff & cover$^x$ & ineff \\\midrule				
GAE&${0.8908}\std{0.0627} $ 
&${5.2968}\std{0.3054}$	
&${0.9001}\std{0.0513}$
&${3.4034}\std{0.1034}$\\	
DiGAE&${0.9063}\std{0.0713} $ 
&${5.1402}\std{0.3131}$
&${0.9219}\std{0.0474}$ 
&${3.4093}\std{0.1410}$\\ \midrule
 
Set Means  & cover$^x$  & ineff & cover$^x$ & ineff \\\midrule
GAE&${0.8909}\std{0.0980}$ 
&${5.1462}\std{0.2833}$
&${0.9017}\std{0.0543}$
&${3.2502}\std{0.0987}$\\
DiGAE&${0.9094}\std{0.1182} $ 
&${\textbf{4.9396}}\std{0.3067}$
&${0.9253}\std{0.0425}$ 
&${\textbf{3.1729}}\std{0.1278}$\\  \midrule  
Set Random Choose  & cover$^x$  & ineff & cover$^x$ & ineff\\\midrule		
GAE&${0.8941}\std{0.0601}$
&${5.1863}\std{0.2914}$ 
&${0.9048}\std{0.0393}$
&${3.2912}\std{0.0870}$\\
DiGAE&${0.9106}\std{0.0896} $ 
&${5.0127}\std{0.3143}$
&${0.9273}\std{0.0455}$
&${3.1938}\std{0.1035}$\\\bottomrule
\end{tabular}    
    \end{adjustbox}
    \vspace{0.0005in}
    \caption{Ablation Study of CQR Score Results on the dataset of Anaheim and Chicago}
\end{table}


As a potential future research direction, we plan to conduct an in-depth analysis of the conditional coverage for various network-based features, such as clustering coefficients, betweenness centrality, PageRank, and others, as explored in \citep{huang2023uncertainty}.

Ultimately, this proposed line of inquiry will contribute to a more comprehensive understanding of the suitability and applicability of the CP-ER-GNN and related conformal prediction techniques in transportation network analysis and prediction tasks.

% Among the models considered, CQR-GAE stands out by demonstrating the most effective performance in terms of inefficiency. This suggests that CQR-GAE exhibits superior predictive accuracy and precision when estimating the prediction intervals, making it a particularly reliable choice in practical applications.



\section{Conclusion}\label{sec: conclusion}

In this paper, we have applied conformal prediction (CP) to graph neural networks (GNNs) and proposed a residual reweighting approach, named residual reweighted conformal prediction graph neural network (ER-GNN), to extend the typical nonconformity measure.

Unlike traditional GNN methods, which produce point predictions, conformal predictors output predictive regions that satisfy a given confidence level. The regular nonconformity measure results in predictive regions with more or less the same width for all examples in the test set. However, it would be more natural for the size of the regions to vary according to how difficult each example is to predict.

To address this, we have defined a residual reweighting nonconformity measure, which produces predictive regions of variable width depending on the expected accuracy of the algorithm on each example. We have applied ER-GAE to edge weight prediction, as well as node classification and regression tasks on real-world transportation datasets, citation connection datasets and social network datasets.

As a consequence, ER-GAE yields results with much tighter predictive regions, higher accuracy, and higher efficiency compared to traditional GNN methods. This demonstrates the effectiveness of our proposed residual reweighted conformal prediction approach for enhancing the reliability and interpretability of graph neural network models.

Overall, this work advances the state-of-the-art in graph machine learning by leveraging the benefits of conformal prediction to produce uncertainty-aware and adaptive predictions on graph-structured data.

% Future directions:
% \begin{enumerate}
%     \item CP requires exchangeability between the calibration set and the test set. However, allocating a portion of the training data for calibration in split CP can be inefficient. Is it possible to incorporate part of the calibration set into the training set, yet still use them for calculating the \score{}? Can this be done while incorporating weights to account for any violation of exchangeability \citep{barber2023conformal} with the test set?
%     \item Withhold a small fraction of the calibration dataset, and use it to select a novel \score{} based on the graph structure. This is inspired by \citep{colombo2020training} and \citep{huang2023uncertainty}, and the latter further introduced a differentiable inefficiency loss to update the GNN model parameters. 
%     \item Instead of edge representation by decoding of node embeddings, we can pursue a direct edge embedding method \citep{he2022pytorch}. 
% \end{enumerate}

% We relax the requirements of exchangeability between the training data and the test/calibration data. Instead, we only need the test data to be exchangeable with the calibration data. This adjustment maintains the validity of the approach, ensuring the guaranteed coverage of CP. However, it is important to note that the efficiency of the model may be compromised due to the inherent mismatch between the training set and the test/calibration data. 

% \NC{
% Why do we say, "We relax"? Is the exchangeability between training and calibration/testing required for CP? Also, is this because the model is trained on the specific graph for which we predict the weights? Do people normally train the embedding on a separate graph?  
% }

\section{Declaration}
\begin{enumerate}
    \item Funding - The work described in this paper was partially supported by a grant from the City University of Hong Kong (Project No. 9610639).
    \item Conflicts of interest/Competing interests - Not Applicable.
    \item Ethics approval - Not Applicable.
    \item Consent to participate - Not Applicable.
    \item Consent for publication - Not Applicable.
    \item Availability of data and material - The data used in this study is available at \url{https://github.com/zhangzheng01310911/RGA}.
    \item Code availability - The code used in this study is available at \url{https://github.com/zhangzheng01310911/RGA}.
    \item Authors' contributions - R.L. and N.C. contributed to the conception and design of the work. R.L. and N.C. wrote the main manuscript text. R.L. and N.C. reviewed and revised the manuscript.
\end{enumerate}




%\acknowledgment US spelling: \verb+\acknowledgment+
%\acknowledgement British  spelling: \verb+\acknowledgement+

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \appendix
% \section{Algorithm of Weighted Conformal Prediction for Inductive Edge Weight Prediction} \label{app: weighted CP}

% \begin{algorithm}[tbp]\footnotesize
% \caption{Weighted Conformal Prediction for Graph Autoencoder}
% \label{alg: inductive}
% \hspace*{\algorithmicindent} \textbf{Input:} The binary adjacency matrix $A \in \{0, 1\}^{n\times n}$, node features $X\in \mathbb{R}^{n\times f}$, training edges and their weights $\Etrain$, $\Wtrain$, calibration edges and their weights $\Ecalib$, $\Wcalib$, holdout edges and their weights $\Ehold$, $\Whold$, and test edges $\Etest$, user-specified error rate $\alpha \in (0,1)$, a weighting model $g_{\phi}$ with trainable parameter $\phi$.\\
% \begin{algorithmic}[1]
% \State Obtain the node embeddings $Z$ and the conformity scores $V$ of the calibration edges from the GAE model $f_\theta^*$ that has been trained according to Algorithm \ref{alg: CQR}. 
% \State Train the weighting model $g_\phi$:
% \begin{algsubstates}
% \State Compute the weights $\omega=g_\phi(\Ehold, \Ecalib, Z)$, where $\omega_{e_1 e_2}$ represents the weight assigned to the calibration edge $e_2$ by the holdout edge $e_1$. 
% \State Construct the prediction intervals for $\Ehold$ based on weighted exchangeability:
% \begin{equation}
%     f_\theta^*\left( e_i; A, X, \Wtrain \right) = \Big[\hat{W}^{\alpha/2}_{e_i} - d_{e_i}, \hat{W}^{1-\alpha/2}_{e_i} + d_{e_i} \Big], \; e_i \in \Ehold, \nonumber
% \end{equation}
% where $d_{e_i} = Q_{1-\alpha} \big( \sum_{e_j \in \Ecalib} \Bar{\omega}_{e_i e_j } \delta_{V_{e_j}} \big)$ denotes the $(1-\alpha)$-quantile of a distribution and $\delta_{V_{e_j}}$ denotes a point mass at $V_{e_j}$, and $\Bar{\omega}_{e_i e_j} = \frac{\omega_{e_i e_j}}{\sum_{e_k \in \Ecalib} \omega_{e_i e_k}}$ is the normalized weight.
% \State Compute the coverage loss:
% \begin{equation} \label{eq: zero-one loss}
%     \loss_{\textrm{coverage}} = \frac{1}{|\Ehold|} \sum_{e_i \in \Ehold}  \mathbbm{1} \big( W_{e_i} \notin f_\theta^*\left( e_i; A, X, \Wtrain \right) \big).
% \end{equation}
% \State Approximate the coverage loss using the hinge loss:
% \begin{equation} \label{eq: hinge loss}
%     \loss_{\textrm{hinge}} = \frac{1}{|\Ehold|} \sum_{e_i \in \Ehold} \big[ 1 - z_{e_i} \big]_{+},
% \end{equation}
% where $z_{e_i} = \min\big\{ W_{e_i} - \hat{W}^{\alpha/2}_{e_i} + d_{e_i},  \hat{W}^{1-\alpha/2}_{e_i} + d_{e_i} - W_{e_i} \big\}$.
% \State Include an additional term with respect to the size of the prediction interval into the loss function:
% \begin{equation} 
%     \loss = \loss_{\textrm{hinge}} + \frac{1}{|\Ehold|} \sum_{e_i \in \Ehold} 2 d_{e_i}.
% \end{equation}

% \end{algsubstates}
% \State Construct a prediction interval for test edges:  
% \begin{equation}
%     f_\theta^*\left( e_i; A, X, \Wtrain \right) = \Big[\hat{W}^{\alpha/2}_{e_i} - d_{e_i}, \hat{W}^{1-\alpha/2}_{e_i} + d_{e_i} \Big], \; e_i \in \Etest, \nonumber
% \end{equation}
% where $d_{e_i} = Q_{1-\alpha} ( \sum_{e_j \in \Ecalib} \Bar{\omega}_{e_i e_j } \delta_{V_{e_j}} )$ and $\omega_{e_i e_j} = g_\phi(e_i, \Ecalib, Z)$.
% \end{algorithmic}
% \hspace*{\algorithmicindent} \textbf{Output:} Prediction intervals for the test edges $e_i \in \Etest$ with the coverage guarantee:
% \begin{equation}
%     P\big(\Wtest_{e_i} \in f_\theta^*\left( e_i; A, X, \Wtrain \right) \big) \geq 1 - \alpha.
% \end{equation}
% \end{algorithm}

\bibliographystyle{springer}
     % name your Bibtex file containing your references (.bib)
\bibliography{bibliography}  

     % Checking: look if the file containing the ``\bibitem'' exists
     %           so check if the .bbl file exist (bibTeX compilation)
\IfFileExists{\jobname.bbl}{} {\typeout{}
\typeout{****************************************************}
\typeout{****************************************************}
\typeout{** Please run "bibtex \jobname" to obtain} \typeout{**
the bibliography and then re-run LaTeX} \typeout{** twice to fix
the references !}
\typeout{****************************************************}
\typeout{****************************************************}
\typeout{}}



\end{document}
