\section{Experiments} \label{sec:experiments}

We conduct experiments on four real-world datasets to demonstrate the effectiveness of our proposed federated conformal prediction method on graph data with varying numbers of clients. 

\begin{table*}[htbp]
\centering
\caption{Conformal prediction (CP) set size comparison on four datasets partitioned into \( K = 3, 5, 10, \) and \( 20 \) clients, using the APS non-conformity score. Set sizes are reported for confidence levels \( 1 - \alpha = 0.95, 0.90, \) and \( 0.80 \). The methods \texttt{Loc}, \texttt{Fed}, and \texttt{Gen} correspond to models trained locally on each client, trained with standard federated averaging, and trained with our generative neighbor completion framework, respectively. Results are averaged over 5 runs and reported together with their standard deviations; the smallest set size in each setting is shown in bold.}
\label{results1}
\setlength{\tabcolsep}{2pt}
\scalebox{0.75}{
\begin{tabular}{@{}lcccc cccc@{}}
\toprule
& \multicolumn{4}{c}{\textbf{Cora}} 
& \multicolumn{4}{c}{\textbf{PubMed}} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
\textbf{Model} & \textbf{\( K=3 \)} & \textbf{\( K=5 \)} & \textbf{\( K=10 \)} & \textbf{\( K=20 \)} & \textbf{\( K=3 \)} & \textbf{\( K=5 \)} & \textbf{\( K=10 \)} & \textbf{\( K=20 \)} \\
\midrule
\texttt{Loc (0.95)} & 4.97 \std{0.02} & 5.27 \std{0.02} & 5.53 \std{0.02} & 5.90 \std{0.02} 
        & 1.94 \std{0.03} & 1.97 \std{0.03} & 1.98 \std{0.04} & 2.04 \std{0.01}  \\
        
\texttt{Fed (0.95)}   & 4.31 \std{0.02} & \textbf{4.94} \std{0.02} & 5.02 \std{0.02} & 5.79 \std{0.02} 
        & 1.80 \std{0.02} & 1.78 \std{0.02} & 1.83 \std{0.01} & 1.77 \std{0.02}  \\
        
\texttt{Gen (0.95)}   & \textbf{4.25} \std{0.02} & 5.09 \std{0.02} & \textbf{4.86} \std{0.02} & \textbf{5.40} \std{0.02} 
        & \textbf{1.72} \std{0.02} & \textbf{1.78} \std{0.01} & \textbf{1.80} \std{0.02} & \textbf{1.69} \std{0.02}  \\
        
               \cmidrule(lr){2-5} \cmidrule(lr){6-9}
               
\texttt{Loc (0.90)} & 4.12 \std{0.01} & 4.54 \std{0.01} & 4.83 \std{0.03} & 4.99 \std{0.03} 
        & 1.79 \std{0.00} & 1.86 \std{0.03} & 1.90 \std{0.03} & 1.95 \std{0.03}  \\
        
\texttt{Fed (0.90)}   & 3.34 \std{0.03} & 4.14 \std{0.03} & 4.32 \std{0.02} & 4.13 \std{0.01}
        & 1.61 \std{0.03} & \textbf{1.60} \std{0.01} & 1.68 \std{0.01} & 1.53 \std{0.02}  \\
        
\texttt{Gen (0.90)}   & \textbf{3.34} \std{0.02} & \textbf{4.10} \std{0.02} & \textbf{3.98} \std{0.01} & \textbf{3.90} \std{0.04} 
        & \textbf{1.55} \std{0.01} & \textbf{1.60} \std{0.01} & \textbf{1.62} \std{0.02} & \textbf{1.49} \std{0.01}  \\
        
               \cmidrule(lr){2-5} \cmidrule(lr){6-9}
               
\texttt{Loc (0.80)} & 3.17 \std{0.01} & 3.77 \std{0.01} & 3.87 \std{0.02} & 4.14 \std{0.02} 
        & 1.72 \std{0.01} & 1.78 \std{0.01} & 1.80 \std{0.01} & 1.69 \std{0.01}  \\
        
\texttt{Fed (0.80)}   & \textbf{2.45} \std{0.01} & \textbf{2.95} \std{0.01} & 2.93 \std{0.03} & 3.17 \std{0.03} 
        & 1.55 \std{0.01} & 1.60 \std{0.02} & 1.62 \std{0.00} & 1.49 \std{0.02}  \\
        
\texttt{Gen (0.80)}   & 2.51 \std{0.03} & 2.98 \std{0.05} & \textbf{2.92} \std{0.02} & \textbf{2.88} \std{0.03} 
        & \textbf{1.41} \std{0.04} & \textbf{1.42} \std{0.01} & \textbf{1.45} \std{0.03} & \textbf{1.37} \std{0.03}  \\
        
               
\midrule
& \multicolumn{4}{c}{\textbf{CiteSeer}} 
& \multicolumn{4}{c}{\textbf{Computers}} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
\textbf{Model} & \textbf{\( K=3 \)} & \textbf{\( K=5 \)} & \textbf{\( K=10 \)} & \textbf{\( K=20 \)} & \textbf{\( K=3 \)} & \textbf{\( K=5 \)} & \textbf{\( K=10 \)} & \textbf{\( K=20 \)} \\
\midrule
\texttt{Loc (0.95)} & 4.80 \std{0.02} & 4.95 \std{0.02} & 4.99 \std{0.02} & 4.99 \std{0.03}
        & 6.18 \std{0.03} & 6.31 \std{0.04} & 6.71 \std{0.02} & 6.45 \std{0.02}  \\
        
\texttt{Fed (0.95)}   & 3.89 \std{0.04} & 4.12 \std{0.01} & \textbf{4.19} \std{0.04} & 4.42 \std{0.01} 
        & 5.58 \std{0.03} & 5.96 \std{0.02} & 4.76 \std{0.03} & 6.10 \std{0.02}  \\
        
\texttt{Gen (0.95)}   & \textbf{3.72} \std{0.02} & \textbf{4.11} \std{0.01} & 4.55 \std{0.02} & \textbf{4.27} \std{0.01} 
        & \textbf{5.08} \std{0.02} & \textbf{5.86} \std{0.03} & \textbf{4.21} \std{0.02} & \textbf{5.30} \std{0.01}  \\
        
               \cmidrule(lr){2-5} \cmidrule(lr){6-9}
               
\texttt{Loc (0.90)} & 4.14 \std{0.01} & 4.62 \std{0.01} & 4.73 \std{0.03} & 4.87 \std{0.03} 
        & 5.27 \std{0.04} & 5.26 \std{0.04} & 5.66 \std{0.02} & 5.73 \std{0.03}  \\
        
\texttt{Fed (0.90)}   & 3.16 \std{0.01} & 3.09 \std{0.02} & 3.14 \std{0.03} & 3.82 \std{0.02} 
        & 4.78 \std{0.04} & 5.37 \std{0.02} & 4.09 \std{0.02} & 5.46 \std{0.01}  \\
        
\texttt{Gen (0.90)}   & \textbf{2.95} \std{0.02} & \textbf{2.96} \std{0.01} & \textbf{3.08} \std{0.02} & \textbf{3.65} \std{0.01} 
        & \textbf{4.59} \std{0.03} & \textbf{5.09} \std{0.02} & \textbf{3.57} \std{0.01} & \textbf{4.69} \std{0.01}  \\
        
               \cmidrule(lr){2-5} \cmidrule(lr){6-9}
               
\texttt{Loc (0.80)} & 3.35 \std{0.01} & 3.59 \std{0.02} & 3.84 \std{0.03} & 3.98 \std{0.02} 
        & 4.24 \std{0.04} & 4.28 \std{0.01} & 4.55 \std{0.03} & \textbf{3.96} \std{0.01}  \\
        
\texttt{Fed (0.80)}   & 2.45 \std{0.03} & 2.22 \std{0.02} & 2.17 \std{0.02} & 2.85 \std{0.01} 
        & \textbf{3.60} \std{0.01} & 4.28 \std{0.03} & 3.39 \std{0.01} & 4.66 \std{0.03}  \\
        
\texttt{Gen (0.80)}   & \textbf{2.19} \std{0.02} & \textbf{2.11} \std{0.02} & \textbf{2.14} \std{0.03} & \textbf{2.66} \std{0.02} 
        & 3.62 \std{0.03} & \textbf{3.97} \std{0.02} & \textbf{2.96} \std{0.02} & 3.97 \std{0.01}  \\
        
               
\bottomrule

\end{tabular}
}
\end{table*}

\subsection{Experimental Setup}

We evaluate our method on four widely used graph datasets: Cora, CiteSeer, PubMed \citep{yang2016revisiting}, and Amazon Computers \citep{shchur2018pitfalls}. To simulate a federated learning environment, we partition each graph into $K = 3$, $5$, $10$, and $20$ clusters using the METIS graph partitioning algorithm \citep{karypis1997metis}, which produces clusters of similar size while minimizing the number of edges cut between partitions. This partitioning introduces missing links between subgraphs, reflecting real-world scenarios where data is distributed across different clients with incomplete neighbor information.

We implement two-layer local permutation-invariant GCN models with mean pooling and employ the FedAvg algorithm \citep{mcmahan2017communication} to train the global GNN model. The local GNNs are trained with the Adam optimizer using a batch size of 32 and a learning rate of 0.01. For the VAE and VGAE, we use the official implementations from the PyTorch Geometric package \citep{fey2019fast}, with hidden dimensions of size 64 and 16, respectively. The percentage \( p \) and the number of clusters \( M \) were selected via a grid search over \( p \in \{0.5, 1, 2, 4, 6, 8, 10, 12\} \) and \( M \in \{2, 5, 10, 20\} \). The VAE decoder mirrors the encoder dimensions, while the VGAE utilizes an inner product decoder.

Each local subgraph is divided into training, calibration, and test sets in a 20\%/40\%/40\% ratio, with 20\% of the training set held out for validation. All experiments are conducted on NVIDIA RTX A5000-24GB GPUs.


As recommended by \citet{lu2023federated}, we apply temperature scaling to the conformal procedure. We average the locally learned temperatures across clients before initiating the federated conformal procedure.

To estimate the set-valued function \( C_\alpha \) within our framework, we compute the quantile of the conformal scores \( \{s^k_i\}_{i=1}^{n_k} \), \( k \in [K] \), where \( s^k_i = S(x^k_i, y^k_i) \); these scores are distributed across the \( K \) clients. We therefore employ distributed quantile estimation techniques, which have proven effective in traditional federated conformal prediction settings \citep{lu2023federated}. Specifically, we adopt quantile averaging \citep{luo2016quantiles} and T-Digest \citep{dunning2021t}, a quantile sketching algorithm designed for efficient online quantile estimation in distributed settings.

We present our main results using the APS non-conformity score and quantile averaging for distributed quantile estimation. APS was chosen for its robust performance, and a detailed comparison with other non-conformity scores (RAPS and LAC) is available in Appendix~\ref{appendix:nonconformity_comparison}. Similarly, our choice of quantile averaging is supported by a comparative study with the T-Digest method in Appendix~\ref{appendix:impact_of_quantile}. As a baseline, we report the average conformal prediction set sizes across clients when the GNN models are trained locally on each client (denoted as \texttt{Loc} in Table~\ref{results1}); this allows us to evaluate the empirical impact of the federated conformal procedure on graph data. The \texttt{Fed} entries in the table present the results of our federated conformal prediction method, and the results of our generative framework are indicated as \texttt{Gen}.

\subsection{Main Results and Efficiency Analysis}

Our experimental results, presented in Table~\ref{results1}, demonstrate that federated training (\texttt{Fed}) achieves smaller CP set sizes than local training (\texttt{Loc}) in nearly all settings. This improvement is particularly pronounced as the number of clients increases. For example, on the PubMed dataset with \( K = 20 \) clients at a confidence level of \( 1 - \alpha = 0.95 \), the federated approach yields an average CP set size of \( 1.77 \pm 0.02 \), compared to \( 2.04 \pm 0.01 \) for local training, indicating a significant reduction in uncertainty.

Examining the trends across rows, we observe the adverse effects of missing links on CP set sizes: as the number of clients increases (left to right along each row), the fragmentation of local graphs exacerbates uncertainty, leading to larger CP sets. This underscores the necessity of mitigating missing edge information in federated settings.  

Averaging across datasets and client configurations, transitioning from \texttt{Loc} to \texttt{Fed} yields a notable \( 15.4\% \) reduction in CP set sizes. This aligns with our theoretical justification, which enables the federated application of conformal prediction while leveraging information across clients to counteract the impact of missing local edges.

Our missing neighbor generator (\texttt{Gen}) further reduces the CP set sizes across varying configurations and datasets. This approach effectively reconstructs missing node connections, allowing the conformal predictor to approximate its optimal state, where all edges are present. For instance, on the Computers dataset with \( K = 20 \) and a confidence level of \( 1 - \alpha = 0.95 \), \texttt{Gen} achieves a CP set size of \( 5.30 \pm 0.01 \), improving upon both federated (\( 6.10 \pm 0.02 \)) and local training (\( 6.45 \pm 0.02 \)).

Overall, our neighbor completion strategy in \texttt{Gen} provides an additional \( 4.9\% \) improvement over \texttt{Fed}, cumulatively leading to CP sets that are \( 19.5\% \) smaller than those obtained with \texttt{Loc}. This demonstrates that, despite the challenges posed by missing links in federated settings, our approach successfully reconstructs latent structures, enhancing predictive efficiency in federated graph learning. 

\begin{figure}[htbp]
    \centering
        \includegraphics[width=0.46\textwidth]{figures/coverage_rates.png}
        \caption{\textbf{Coverage Rates} for \texttt{Fed} (left) and \texttt{Gen} (right) models across varying client numbers 
        \( K \) on the Cora dataset. The diagonal line represents the desired coverage rate.}
        \label{fig:coverage_rates}
\end{figure}

\subsection{Coverage Rates}

As established in our theoretical analysis (Section~\ref{sec:method}), applying conformal prediction in federated graph learning preserves the lower bound for CP coverage. We empirically validate this on the Cora dataset, as shown in Figure~\ref{fig:coverage_rates}, which illustrates the coverage rates across different numbers of clients. The results confirm that our method consistently meets the desired coverage threshold. Notably, for lower \( 1 - \alpha \) values, the model easily satisfies the required coverage, and its performance remains robust even at more challenging miscoverage rates. This highlights the effectiveness of CP as a reliable uncertainty quantification method in high-stakes applications. Furthermore, incorporating our generative model maintains the desired coverage, as demonstrated on the right side of Figure~\ref{fig:coverage_rates}, while significantly reducing CP set sizes (Table~\ref{results1}).

\subsection{Parameter Sensitivity}

As shown in Figure~\ref{fig:param_sensitivity}, across the tested numbers of cluster centers, our method consistently improves upon the CP set size obtained in the \texttt{Fed} setting. However, the relationship between the number of clusters and performance does not follow a clear monotonic trend. We hypothesize that, due to heterogeneity in client distributions, using a fixed \( M \) value across all clients may not always yield the most representative feature prototypes. An adaptive approach to selecting the number of cluster centers per client could potentially enhance performance, which we leave for future work. For the \( p \) parameter, we observe that adding a small number of edges to the client subgraphs generally improves performance. Further increasing the number of edges continues to provide benefits in many cases, suggesting that the added edges may act as a form of oversmoothing, making the model less sensitive to noise in the graphs.

\begin{figure}[htbp]
    \centering
        \includegraphics[width=0.46\textwidth]{figures/param_sensitivity.png}
        \caption{Effect of the number of clusters ($M$) and percentage ($p$) on CP set size under the three-client setting on the PubMed dataset. All other parameters are kept constant. The left plot shows CP set size as a function of $M$, while the right plot shows CP set size as a function of $p$. The dotted horizontal line represents the baseline CP set size for the federated setting (Fed = 1.802).}
        \label{fig:param_sensitivity}
\end{figure}



