% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams





\usepackage{multirow}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{dsfont}

\algrenewcommand\algorithmicrequire{\textbf{Input:}}
\algrenewcommand\algorithmicensure{\textbf{Output:}}

\usepackage{amsthm}

% Restated theorems.
% \newreptheorem{<env>}{<Title>} defines an environment rep<env> that takes one
% argument, a label. \begin{rep<env>}{<label>} ... \end{rep<env>} typesets an
% unnumbered statement whose heading is "<Title> \ref{<label>}", i.e. it reuses
% the number of the statement carrying that label.
% \makeatletter/\makeatother are needed because the helper names contain `@'.
\makeatletter
% Unnumbered helper environment; its heading is whatever \rep@title expands to when it is used.
\newtheorem*{rep@theorem}{\rep@title}
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother

\newtheorem{theorem}{Theorem}
\newreptheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newreptheorem{lemma}{Lemma}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{ALIN: An Active Learning Framework for Incomplete Networks}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Tung Khong}
\author[1]{Cuong Pham}
\author[1]{Cong Tran}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
\affil[1]{%
    Posts \& Telecommunications Institute of Technology, Hanoi, Vietnam
}


  \begin{document}
\maketitle
\begin{abstract}
Significant progress has been made in active learning algorithms for graph networks in various tasks. However, real-world applications frequently involve incomplete graphs with missing links, which pose a challenge that existing approaches might not adequately address. This paper presents an active learning approach tailored specifically for handling incomplete graphs, termed ALIN. Our algorithm employs graph neural networks (GNNs) to generate node embeddings and calculates losses for both node classification and link prediction tasks. By combining the losses with appropriate weights and iteratively updating the GNN, ALIN efficiently queries nodes in batches, thereby achieving a balance between training feedback and resource utilization. Our empirical experiments show that ALIN can surpass state-of-the-art baselines on the Cora, Citeseer, Pubmed, and Coauthor-CS datasets.
\end{abstract}

% \keywords{Active Learning, Incomplete Graphs, Graph Neural Networks, Node Classification, Link Prediction, Semi-Supervised Learning, Joint Loss Function}

\section{Introduction}

The concept of graphs (or networks) has become pervasive across numerous domains, such as citation graphs and social graphs. Similar to other forms of data, graph data are undergoing rapid expansion, presently attaining substantial magnitudes. Consequently, the expanding dimensions of these graphs pose formidable challenges in attempting to analyze such type of data comprehensively.

Graph embeddings, the technique that transforms a given graph into a lower-dimensional space while preserving its underlying structural attributes and other inherent characteristics, are now gaining considerable attention in research areas \citep{GOYAL201878, 9780569}. By generating node embeddings, a spectrum of graph analytical tasks, including but not limited to node classification, node clustering, and link prediction, can be executed with heightened efficiency, optimizing both temporal and spatial considerations \citep{ou2016asymmetric}. The semi-supervised graph embedding algorithms typically assume the training labeled data are given, which may not be always true in real practice \citep{song2022graph}. Given a labeling budget, the strategic selection of training labeled nodes to maximize eventual performance is thus of great importance. Addressing this concern, the concept of Active Learning (AL) has been introduced as a solution \citep{9669159}. AL strategies offer a highly efficient mechanism for enhancing the process of data annotation by prioritizing the identification and labeling of the most informative instances. This, in turn, serves to optimize the efficiency and overall performance of machine learning models.
Significantly, the domain of graph-based tasks, including many applications such as social network analysis, recommendation systems, and biological network inference, has benefited greatly from these developments \citep{9772660, vatter2023evolution}.

Recent AL-based approaches on graphs often assume the underlying network is fully known \citep{ma2022partition}. However, this assumption tends to be overly simplistic as the underlying network cannot be fully observed in many real-world applications of network analyses \citep{valente2007identifying, rice2012mobilizing}.
While, in theory, it is conceivable to allocate additional resources towards the exhaustive exploration of the entire network, the endeavor to acquire a comprehensive network structure frequently proves to be prohibitively costly, demanding in terms of labor, or entirely unfeasible in practice \citep{valente2007identifying}. 
For example, network data extracted from social media platforms are limited by privacy concerns: a demographic analysis of Facebook users in New York City in June 2011 found that a substantial 52.6\% of them had taken measures to conceal their friends' connections.\footnote{We refer to \citet{dey2012facebook} for the statistics.} Consequently, when working with graph data, one should assume the more practical case in which only a part of the network structure is available \citep{hou2022meta, teji2022predicting, tran2021community}. This raises a critical challenge: How do we adapt AL methods to effectively operate on such incomplete graphs?


To tackle this pressing challenge, we introduce a new active learning framework explicitly tuned to handle incomplete networks: ALIN (\textbf{A}ctive \textbf{L}earning for \textbf{I}ncomplete \textbf{N}etworks).\footnote{The source code used in this paper is available online (https://github.com/manhtung001/ALIN).} Our framework incorporates an edge-based scoring mechanism into the AL process. Conventionally, AL approaches on graphs have prioritized node-centric objectives, such as optimizing node classification accuracy. On incomplete graphs, this is no longer sufficient: there is an inherent need to strategically select nodes that also contribute to graph completeness. However, simply introducing edge scores can compromise the primary goal of node classification, leading to reduced overall accuracy. To strike a balance between enhancing graph completeness and preserving node classification accuracy, we introduce a two-phase training scheme. In the initial epochs, we focus on link prediction as an auxiliary task. This early phase aims to establish an effective synergy between node scores and edge scores, facilitating the creation of informative edges within the incomplete graph. In the subsequent epochs, our approach seamlessly transitions towards prioritizing the core task of node classification, ensuring that the final objective is met with high accuracy. By combining the objectives of improving graph completeness and enhancing node classification accuracy, our proposed AL framework addresses the unique challenges posed by incomplete graphs. This innovative approach not only extends the applicability of AL techniques to real-world scenarios but also opens doors to more comprehensive and accurate graph-based data analysis.

In this paper, we present a comprehensive set of contributions, each addressing a distinct facet of the active learning problem in the context of incomplete graphs:

\begin{itemize}

\item We introduce the novel Active Learning for Incomplete Networks (ALIN) framework, which is meticulously designed to tackle the unique challenges posed by incomplete graph structures, offering a robust end-to-end solution.

\item We extend the conventional node scoring approach by introducing edge scores. This innovation caters specifically to the optimization needs of incomplete graphs, allowing for more effective query node selection.

\item We propose a novel joint loss function that seamlessly combines node classification and link prediction. This integration ensures that the interplay between these two critical components is optimized. Furthermore, we introduce a method to harmonize these two losses, thereby achieving superior results in the ultimate task of node classification.

\item Our contributions are substantiated through an extensive series of experiments conducted on datasets. These experiments not only establish the superior performance of ALIN when compared to conventional active learning methods on benchmark graphs but also underscore the robustness of our approach across various datasets and with different GNN backbones.

\end{itemize}


\section{Related work}
The framework that we proposed is related to the
following three research lines.

\textbf{Active Learning}.
Traditional active learning algorithms operate by querying individual samples for labeling in a sequential manner. However, such an approach proves to be suboptimal when applied to deep learning models as it frequently retrains but updates little, and it is prone to overfitting \citep{ren2021survey}. Therefore, in deep active learning, the batch-mode setting, where a diverse set of instances are sampled and queried, is more often considered. In recent years, the optimal experimental design principle \citep{pukelsheim2006optimal, 10.1007/s10107-019-01464-2} motivated the machine learning community to minimize the use of training resources and avoid tuning on a validation set. Combining the settings of one-shot learning and batch-mode active learning, several recent studies \citep{contardo2017metalearning, wu2019active} adopted a one-step batch-mode active learning setting.

\begin{table*}[ht]
\begin{center}
\resizebox{1.7\columnwidth}{!}{%
\begin{tabular}{|c|cccc|c|c|}
\hline
\multirow{2}{*}{Experiment}                                 & \multicolumn{4}{c|}{Method}                                                                               & \multirow{2}{*}{Incomplete Network} & \multirow{2}{*}{Adaptive} \\ \cline{2-5}
                                                            & \multicolumn{1}{c|}{EER} & \multicolumn{1}{c|}{Heuristics} & \multicolumn{1}{c|}{Uncertainty} & GraphPart &                                     &                           \\ \hline
\citet{Zhu2003CombiningAL}               & \multicolumn{1}{c|}{x}   & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{}            &           & No                                  & No                        \\ \hline
\citet{Macskassy2009UsingGM}             & \multicolumn{1}{c|}{x}   & \multicolumn{1}{c|}{x}          & \multicolumn{1}{c|}{}            &           & No                                  & Yes                       \\ \hline
\citet{Gu2012TowardsAL}                  & \multicolumn{1}{c|}{x}   & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{}            &           & No                                  & No                        \\ \hline
\citet{NIPS2013_7810ccd4}               & \multicolumn{1}{c|}{}    & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{x}           &           & No                                  & No                        \\ \hline
\citet{cai2017active}                    & \multicolumn{1}{c|}{}    & \multicolumn{1}{c|}{x}          & \multicolumn{1}{c|}{x}           &           & No                                  & Yes                       \\ \hline
\citet{wu2019active}                     & \multicolumn{1}{c|}{}    & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{x}           &           & No                                  & Yes                       \\ \hline
\citet{ma2022partition}                  & \multicolumn{1}{c|}{}    & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{x}           & x         & No                                  & Yes                       \\ \hline
\textbf{\underline{ALIN} (ours)} & \multicolumn{1}{c|}{}    & \multicolumn{1}{c|}{}           & \multicolumn{1}{c|}{x}           & x         & Yes                                 & Yes                       \\ \hline
\end{tabular}
}
\caption{\label{tab:literature_review} Summary of active learning techniques for node classification on graphs. Here, the Adaptive column indicates that the active learner is updated based on the newly labeled instances.}
\end{center}
\end{table*}

Based on the query strategy, the majority of work can be divided into three categories \citep{Aggarwal2014ActiveLA}: heterogeneity-based, performance-based, and representativeness-based. Heterogeneity-based methods \citep{zhang2017active} label the instances that differ most from what the current model already knows. Performance-based methods \citep{guo2007optimistic} label the instances that minimize the label uncertainty of the remaining unlabeled instances. Representativeness-based methods \citep{li2013active} label the instances that best represent the underlying distribution of the training instances.

\textbf{Active Learning on Graphs}.
The majority of work can be divided into four categories, including: EER, Heuristics, Uncertainty, and GraphPart. EER (Expected Error Reduction) \citep{Zhu2003CombiningAL, Macskassy2009UsingGM, Gu2012TowardsAL} is a criterion in active learning that selects instances with the highest expected reduction in classification error, aiming to improve model performance efficiently. Heuristics \citep{Macskassy2009UsingGM, cai2017active} are rule-of-thumb strategies used in active learning to guide the selection of informative data points for labeling, often based on measures like uncertainty, diversity, or disagreement among models. Uncertainty sampling \citep{NIPS2013_7810ccd4, cai2017active, wu2019active, ma2022partition} is an active learning method that selects instances for labeling based on the uncertainty of their predicted class probabilities, targeting instances where the model is least confident in its predictions. Recently, GraphPart \citep{ma2022partition} first splits the graph into disjoint partitions and then selects representative nodes within each partition to query. It is worth noting that all prior work operated under the assumption of complete graphs, which diverges from reality given the incomplete nature characteristic of most real-world graphs. In Table \ref{tab:literature_review}, we summarize the aforementioned AL methods for the node classification task. 


\textbf{Link Prediction} 
 is a fundamental problem that attempts to estimate the likelihood of the existence of a link between two nodes \citep{Lu2010LinkPI}. This process enhances our comprehension of the connection between specific nodes and the evolution of the entire network.
Link prediction has been widely applied to a variety of fields such as biology \citep{Lei2013ANL} and social networks \citep{liben2007link,bonchi2011social}.
A multitude of methodologies exist for the prediction of links within networks. \citet{heemakshi2016survey} provided an extensive survey that encompasses diverse link prediction algorithms, with a particular emphasis on scrutinizing the limitations inherent in such methods. \citet{Lei2013ANL} presented an excellent survey by summarizing different approaches, introducing typical applications, and outlining future challenges of link prediction algorithms. Building upon this foundation, \citet{martinez2016survey} furnished a more contemporary perspective by incorporating recent methodologies and conducting a meticulous comparative analysis of similarity-based techniques.
Since it is difficult to identify a method that has the best performance in all complex networks, which strongly depends on the structural properties of the network, the authors in \citet{wu2022link} categorized various link prediction strategies, including common neighbors-based, paths-based, probabilistic and statistical models based, classifier based, and network embedding based techniques.

\section{Problem Formulation}

In this section, we describe a formal definition of the problem of active learning on an incomplete graph under iterative batch-mode settings and introduce a uniform set of notations.

Let us denote an underlying network by \(\mathcal{G}=(\mathcal{V}, \mathcal{E})\) with $N$ nodes \(v_i \in \mathcal{V}\) and edges \( (v_i,v_j) \in \mathcal{E}\). The nodes are associated with a feature matrix $\textbf{X} \in \mathbb{R}^{N \times F}$, whose $i$-th row $x_i$ is the feature vector of node $v_i$ and where $F$ denotes the dimensionality of the feature vectors. Additionally, there exists a label matrix $\textbf{Y} \in \mathbb{R}^{N \times C}$ for labeled nodes, where $C$ represents the number of node classes. Here, \(\textbf{Y}_{ij} = 1\) indicates that node \(i\) has label \(j\); equivalently, we write $y_i \in \{1, 2, \ldots, C\}$ for the label assigned to node $i$, so that $y_i = c$ means that node $i$ belongs to the $c$-th class. An oracle is available to label a query node along with its associated edges, within a given labeling budget $B$. We assume $y_i$ is drawn randomly from a distribution $\mathds{P}_{y|x_i}$ supported on $\{1, 2, \ldots, C\}$. We denote by $\eta_c(v) = \Pr[y = c|v]$ the probability that $y = c$ given node $v$, and $\eta(v) = (\eta_1(v), \dots, \eta_C(v))^T$.



In this study, we follow the iterative batch-mode setting \citep{wu2019active}. In this setting, for each iteration, we deplete a predetermined resource budget to select a batch of nodes for labeling, streamlining the querying process to minimize redundant retraining. This entails segmenting a given budget $B$ into $K$ equally sized partitions. For each iteration $k \in \{0,\dots,K\}$, an active learning algorithm $\mathcal{A}^{(k)}$ selects $b = \lfloor B/K \rfloor$ nodes for querying, which form a set of selected nodes, denoted as $\mathcal{Q}^{(k)}_{b}$. The primary objective underlying this approach is to harness the informative feedback derived from the training process, while simultaneously safeguarding against excessive resource consumption. This contrasts with the fundamental AL setup, where only a solitary node is chosen at a time, potentially imposing considerable training overhead.

Since we tackle a setting where graph data is incomplete, for each iteration $k \in \{0,\dots,K\}$, we are given an incomplete graph $\tilde{\mathcal{G}}^{(k)}=(\mathcal{V}, \tilde{\mathcal{E}}^{(k)})$ and an incomplete label set $\tilde{\textbf{Y}}^{(k)}$, where $\tilde{\mathcal{E}}^{(k)} \subset \mathcal{E}$ and $\tilde{\textbf{Y}}^{(k)} \subset \textbf{Y}$ are the edge set and the set of node labels available at the $k$-th iteration, respectively. At the $k$-th iteration, when $\mathcal{A}^{(k)}$ queries $b$ nodes, we obtain ${\tilde{\textbf{Y}}_{q}}^{(k)}$ and ${\tilde{\mathcal{E}}_{q}}^{(k)}$ as the sets of newly obtained node labels and edges after querying, respectively. Additionally, we denote by ${\tilde{\textbf{Y}}_{u}}^{(k)}$ and ${\tilde{\mathcal{E}}_{u}}^{(k)}$ the sets of updated node labels and updated edges at the $k$-th iteration, respectively. Thus, we have \({\tilde{\textbf{Y}}_{u}}^{(k)} = \tilde{\textbf{Y}}^{(k)} \cup {\tilde{\textbf{Y}}_{q}}^{(k)}\) and \({\tilde{\mathcal{E}}_{u}}^{(k)} = \tilde{\mathcal{E}}^{(k)} \cup {\tilde{\mathcal{E}}_{q}}^{(k)}\), and the budget $b$ is the maximum number of newly obtained node labels. In this setting, we assume that the node feature matrix \textbf{X} is fully observable.

We aim to train a GNN-based classification model $\mathcal{M}^{(k)}$ by iteratively updating its parameters $\theta^{(k)}$. The GNN model $\mathcal{M}^{(k)}$ maps (${\tilde{\mathcal{E}}_{u}}^{(k)}$, $\textbf{X}$) to prediction vectors $\hat{\textbf{Y}}^{(k)}$ and $\hat{\mathcal{E}}^{(k)}$. From the prediction and the observation, we compute the node classification loss \(l_{NC}({\tilde{\textbf{Y}}_{u}}^{(k)},\hat{\textbf{Y}}^{(k)})\) and the link prediction loss \(l_{LP}({\tilde{\mathcal{E}}_{u}}^{(k)}, \hat{\mathcal{E}}^{(k)})\). To combine both losses, we take a weighted sum of \(l_{NC}\) and \(l_{LP}\) with a hyperparameter weight $\beta$ and denote the result by $\mathcal{L}^{(k)}$. If $\mathcal{M}$ is the same for all active learning strategies, we can slightly abuse the notation $\mathcal{A}^{(k)} = \mathcal{M}_{\mathcal{A}^{(k)}}$ to emphasize the focus of active learning algorithms. We also assume that the class probabilities are given by a ground truth GCN; i.e., there exists a GCN \(\mathcal{M}^*\) that predicts $\Pr[y_i = c]$ on the entire training set.

Our goal is to minimize the loss under a given budget $b$ at the $k$-th iteration:

\begin{equation}
\min_{\theta^{(k)}, \mathcal{Q}^{(k)}_{b}} \mathcal{L}^{(k)}.
\end{equation}


\section{ALIN Framework}
\label{sec:implement-detail}

In this section, we present ALIN, a comprehensive solution designed to address the challenge of active learning within the context of an incomplete graph. The process underlying our framework can be dissected into two principal components: the query phase and the training phase, both composed of distinct functions as follows:
\begin{figure*}[t]
    \centering
    \includegraphics[width=1.8\columnwidth]{pipeline.pdf}
    \caption{A schematic overview of our proposed ALIN framework.}
    \label{fig:overview-of-framework}
\end{figure*}


\begin{itemize}
\item Query Phase: This phase encompasses node selection and subsequent updates. During the initial node selection, we utilize the InitNodes function. In subsequent iterations, we calculate node scores and edge scores and combine them to identify the most informative nodes. We then update the labels of the selected nodes and add the missing edges associated with them.

\item Training Phase: In this phase, we focus on the core of our methodology: a unified loss function that combines the node classification loss and the link prediction loss.
\end{itemize}

\textit{Example 1}. A schematic overview of our proposed ALIN framework is visualized in Fig.~\ref{fig:overview-of-framework}, where we depict the initial two iterations of the ALIN framework. In the first iteration, $(\textbf{X},{\tilde{\textbf{Y}}}^{(0)},{\tilde{\mathcal{E}}}^{(0)})$ contains eight unlabeled nodes and six missing edges. During the Query Phase, two nodes, namely 4 and 5 (highlighted by yellow circles), are chosen for labeling. Consequently, the three edges connected to these nodes are integrated, resulting in the updated $(\textbf{X},{\tilde{\textbf{Y}}_{u}}^{(0)},{\tilde{\mathcal{E}}_{u}}^{(0)})$. Subsequently, $(\textbf{X},{\tilde{\textbf{Y}}_{u}}^{(0)},{\tilde{\mathcal{E}}_{u}}^{(0)})$ is fed into the GNN model to generate node embeddings and to predict both $\hat{\textbf{Y}}^{(0)}$ and $\hat{\mathcal{E}}^{(0)}$. From $({\tilde{\textbf{Y}}_{u}}^{(0)},{\tilde{\mathcal{E}}_{u}}^{(0)})$ and $(\hat{\textbf{Y}}^{(0)}, \hat{\mathcal{E}}^{(0)})$, we compute \(l_{NC}\) and \(l_{LP}\) and their combination $\mathcal{L}^{(0)}$, which is subsequently backpropagated through the GNN. Moving on to the second iteration, we obtain $({\tilde{\textbf{Y}}}^{(1)},{\tilde{\mathcal{E}}}^{(1)}) \gets ({\tilde{\textbf{Y}}_{u}}^{(0)},{\tilde{\mathcal{E}}_{u}}^{(0)})$, and two additional nodes, namely 2 and 3 (highlighted by yellow circles), are selected for labeling. As a result, the two associated edges are incorporated, leading to the updated $(\textbf{X},{\tilde{\textbf{Y}}_{u}}^{(1)},{\tilde{\mathcal{E}}_{u}}^{(1)})$.

The technical details of the two phases are described in the following.

\subsection{Query Phase}

\subsubsection{InitNodes}
 As our framework relies on hidden representations of nodes or the predicted class distribution from the initial model, we operate within iterative settings, necessitating an initial model trained with the seed set. Consequently, we require the `InitNodes' function to select the initial set of nodes. This function allows us to employ various selection strategies, such as random selection or recent methods such as FeatProp \citep{wu2019active}, Centrality \citep{cai2017active}, and GraphPart \citep{ma2022partition}. For the implementation of ALIN, we employ GraphPart as the `InitNodes' function. GraphPart works by dividing the graph into separate partitions and then selecting representative nodes within each partition for active learning with GNNs. To mitigate interference across partitions in GraphPart, \citet{ma2022partition} also introduce GraphPartFar, a method that penalizes selecting nodes close to medoids chosen in prior partitions, thereby promoting diversity among the returned nodes. Corresponding to the two variants of GraphPart, we also present another variant of ALIN, termed ALINFar. The distinction between ALIN and ALINFar lies in their respective choice of the `InitNodes' function, i.e., ALIN employs GraphPart for this purpose, whereas ALINFar uses GraphPartFar as the initialization function.

\subsubsection{Combine Score}
In the context of querying the constituents of the graph, given a prescribed number of $b$ queries, the query function is constructed based on an equilibrium criterion encompassing the informational value of nodes into node score $\phi_{NS}^{(k)}$ and the informational value of edges into edge score $\phi_{ES}^{(k)}$. The amalgamation of $\phi_{NS}^{(k)}$ and $\phi_{ES}^{(k)}$ with a weight parameter denoted as $\alpha$ yields the composite score $\phi_{CS}^{(k)}$.

\textbf{\textit{Node Score.}} 
The use of entropy as a scoring metric provides valuable insights into the confidence of the GNN model's predictions for individual nodes. Higher entropy values indicate greater uncertainty, suggesting that a node's classification is less certain and may require further exploration or refinement in subsequent iterations of the process. In contrast, lower entropy values signify a higher level of confidence in the node's classification, making it less likely to be selected for additional query iterations. Following \citet{cai2017active}, the node score $\phi_{NS}$ of a candidate node $v_i$ at the $k$-th iteration is calculated as follows:
\begin{equation}
\phi_{NS}^{(k)}(v_i) = -\sum_{c=1}^{C}\mathds{M}^{(k)}_{ic}\log\mathds{M}^{(k)}_{ic},
\label{eq:entropy}
\end{equation}
where $\mathds{M}^{(k)}_{ic} = \mathds{P}(\tilde{\textbf{Y}}^{(k)}_{ic}=1|\tilde{\mathcal{G}}^{(k)},\tilde{\textbf{Y}}^{(k)},\textbf{X})$ is the probability of node $v_i$ belonging to class $c$ predicted by GNN  at the $k$-th iteration; $\tilde{\textbf{Y}}^{(k)}_{ic}=1$ indicates node \(i\) has label \(c\). Furthermore, the efficiency of our entropy-based node scoring approach plays a crucial role in accelerating the overall query process. With reduced computational overhead, our framework facilitates faster exploration of the graph and enhances the overall efficiency of the active learning framework. 

\textbf{\textit{Edge Score.}} In scenarios where the graph is incomplete, we aim to select not only nodes with high entropy but also nodes that allow the model to learn on a more complete graph. Intuitively, nodes with a larger difference between their predicted and observed degrees are prioritized for inclusion in the active learning process since they offer the potential for improving the overall graph representation and classification performance. Thus, the edge score $\phi_{ES}$ of node $v_i$ at the $k$-th iteration is calculated as follows:
\begin{equation}
\phi_{ES}^{(k)}(v_i) =\sum_{n=1}^{N}\mathds{P}(\tilde{\mathcal{E}}^{(k)}_{in}=1|\tilde{\mathcal{G}}^{(k)},\tilde{\mathcal{E}}^{(k)},\textbf{X}) - D_{v_i}(\tilde{\mathcal{E}}^{(k)}),
\label{eq:edge_score}
\end{equation}
where $\mathds{P}(\tilde{\mathcal{E}}^{(k)}_{in}=1|\tilde{\mathcal{G}}^{(k)},\tilde{\mathcal{E}}^{(k)},\textbf{X})$ is the predicted probability that node $v_i$ is connected to node $v_n$, so that the sum over $n$ is the predicted degree of $v_i$, and $D_{v_i}(\tilde{\mathcal{E}}^{(k)})$ is the degree of node $v_i$ in $\tilde{\mathcal{E}}^{(k)}$; $\tilde{\mathcal{E}}^{(k)}_{in}=1$ indicates that node \(i\) has a connection to node \(n\) at the $k$-th iteration. Intuitively, $\phi_{ES}^{(k)}(v_i)$ can be interpreted as the residual degree of a node $v_i$. By incorporating $\phi_{ES}^{(k)}$ alongside $\phi_{NS}^{(k)}$, our active learning framework ensures the selection of nodes that not only exhibit uncertainty but also contribute to enhancing the graph's completeness and discriminative power.

The combination of $\phi_{NS}^{(k)}$ and $\phi_{ES}^{(k)}$ represents a promising direction for active learning in graph-based settings. By leveraging uncertainty and graph completeness, our approach strikes a balance between exploration and exploitation, thereby achieving efficient and reliable active learning in real-world scenarios. The versatility of our approach makes it well-suited for a wide range of applications, including social networks, recommendation systems, and bioinformatics, among others.

\subsection{Training Phase}

GNN model training: In our active learning framework, we utilize two loss functions during the training of the Graph Neural Network (GNN) model. For node classification, we employ the cross entropy loss function, and for link prediction, we utilize the binary cross entropy with logits loss function. The rationale behind using a combined loss is that both $\phi_{NS}^{(k)}$ and $\phi_{ES}^{(k)}$ are influenced by the predictive power of the GNN model. By simultaneously retraining both the node classifier and the edge classifier models, we ensure that the combined score captures valuable information about both node attribute information and the structural information surrounding each node. This concurrent retraining of the edge classifier is motivated by the objective to query nodes that offer a high value of information for both aspects, thereby enhancing the overall representation of the graph. During the model retraining phase, we update the GNN model based on the newly acquired information from the queried nodes. This iterative improvement of the GNN model helps refine the parameters $\theta^{(k)}$ with each query round, resulting in an increasingly accurate and informative model.

\begin{figure*}[h!]
\begin{theorem}
\label{theorem:theorem_1}
Suppose that the labels in $\tilde{{\textbf{Y}}}_{u}^{(k)}$ are sampled independently from the distribution $y_v \sim \eta(v)$ and that the loss function $l^{(k)}_{NC}$ is bounded in $[-L, L]$. Then, under mild assumptions, with probability $1 - \delta$, the expected classification loss of $\mathcal{A}^{(k)}$ satisfies

\resizebox{\linewidth}{!}{
  \begin{minipage}{\linewidth}
  \begin{align}
      \frac{1}{n} l^{(k)}_{NC}(\mathcal{A}^{(k)}|\tilde{\mathcal{G}}^{(k)}, X, \tilde{{\textbf{Y}}}_{u}^{(k)}) & \leq \sum_{i=1}^{n} \sum_{c=1}^{C} \left[ \frac{\lambda}{n} (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_{b}} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + \frac{L}{n}((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] + \sqrt{\frac{L \log(1/\delta)}{2n}}
  \end{align}
    \end{minipage}
}
\end{theorem}
\end{figure*}

\subsection{Theoretical Analysis}

Recall that we let $(\mathcal{A}^{(k)})_i = (\mathcal{A}(\tilde{\mathcal{G}}^{(k)}, X))_i \in \mathbb{R}^{C}$ be the prediction for node $i$ under input $\tilde{\mathcal{G}}^{(k)}, X$, and $(\mathcal{A}^{(k)})_{i,c}$ be the $c$-th element of $(\mathcal{A}^{(k)})_i$ (i.e., the prediction for class $c$). $(\mathcal{M}^{*})_i \in \mathbb{R}^{C}$ is the prediction for node $i$ of the ground-truth GCN. Our approach shares similarities with the work of \citet{wu2019active} in the context of active learning. However, our framework diverges significantly, particularly in how we handle the incomplete graph structure. We focus on the uncertainty of nodes, which offers a more nuanced understanding of the incomplete graph compared to simply relying on translated features. The concept of ``translated features'' in \citet{wu2019active} refers to pairwise distances between hidden representations of nodes. These features are highly informative when computed on complete graphs, as they encapsulate the full relational structure and interactions between nodes. In a complete graph, every possible link is present, allowing for a comprehensive and accurate representation of node relationships through these features. However, this approach encounters significant challenges in the context of incomplete graphs, where some links are missing. In such scenarios, the absence of certain links can lead to a distorted or incomplete understanding of the node relationships, and translated features no longer accurately represent the actual structure of the network. Our methodology is designed to address this limitation by focusing on a probability-based approach rather than relying solely on translated features. This allows for more accurate and reliable analysis in scenarios where the graph structure is incomplete, ensuring that our conclusions are robust even in the face of missing data.



%{\color{pink}(My comment) I create a shorter version of above paragraph. Only need one version.} {\color{blue} In the context of node classification, $(\mathcal{A}^{(k)})i$ represents the prediction for node $i$ under input $\tilde{\mathcal{G}}^{(k)}, X$, with $(\mathcal{A}^{(k)}){i,c}$ being its $c$-th element of $(\mathcal{A}^{(k)})_i$ (i.e., the prediction for class $c$). The ground truth GCN prediction for node $i$ is denoted as $(\mathcal{M}^{*})_i$. While sharing similarities with \citet{wu2019active}, our methodology diverges, especially in handling incomplete graph structures. We emphasize the uncertainty of nodes, providing a more nuanced understanding of incomplete graphs compared to reliance on 'translated features' of \citet{wu2019active}, which represent pairwise distances between node representations in complete graphs. In complete graphs, these features effectively capture relational structures and interactions. However, in incomplete graphs where links are missing, translated features can misrepresent network structures. Our probability-based approach addresses this by offering more accurate analysis in scenarios with incomplete graph structures, ensuring robust conclusions despite missing data.}


% Our results share some common characteristics with \citet{wu2019active}, our proof is more involved in the sense that it relates to probability $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$ instead of the translated features (pairwise distances between hidden representations of nodes in GCN). The reason we don't use the translated features is because the translated features calculated on complete graphs so the translated features carry the most correct information. But for incomplete graphs, calculations based on the translated features will lead to confusion due to missing link.

At the $k$-th iteration, Theorem~\ref{theorem:theorem_1} formally shows that choosing the most uncertain nodes can lead to a low node classification loss.



To understand Theorem~\ref{theorem:theorem_1}, we note that the first term is the selection of an uncertain node $j \in \mathcal{Q}^{(k)}_{b}$, and the second term quickly decays with $n$, where $n$ is the total number of nodes in the graph $\mathcal{G}$. Therefore, the node classification loss of $\mathcal{A}^{(k)}$ on the graph $\tilde{\mathcal{G}}^{(k)}$ is mostly dependent on the selection of an uncertain node. The assumptions we made in Theorem~\ref{theorem:theorem_1} are standard in the literature, and we illustrate the details in the appendix.



\section{Experiments}
\subsection{Experiment Setup}

\textbf{\textit{Dataset.}} To compare against state-of-the-art methods, we experiment on 4 benchmark datasets, including the citation networks Citeseer, Cora, and Pubmed \citep{sen2008collective} and the co-authorship network Coauthor-CS \citep{shchur2018pitfalls}. The summary statistics of the datasets are provided in Table~\ref{tab:statistics_datasets}. The homophily ratio is defined following \citet{zhu2020beyond}. To simulate incomplete graphs in our experiments, we remove 30\% of the total edges.

% this table belong to section EXPERIMENT RESULTS ON GCN
\begin{table*}[t]
\begin{center}
\begin{tabular}{llllll}
\hline
Dataset    & \#Nodes & \#Edges   & \#Features & \#Classes (C) & Homophily \\ \hline
Cora       & 2,708   & 5,278     & 1,433      & 7             & 0.810     \\
Citeseer   & 3,327   & 4,552     & 3,703      & 6             & 0.736     \\
Pubmed     & 19,717  & 44,324    & 500      & 3             &  0.802     \\
Coauthor-CS      & 18,333  & 81,894   & 6,805      & 6             & 0.808     \\ \hline
\end{tabular}
\end{center}
\caption{\label{tab:statistics_datasets} Summary statistics of datasets.}
\end{table*}

\begin{table*}[h]
\begin{center}
\begin{tabular}{|l|ccc|ccc|}
\hline
\textbf{Baselines} & \multicolumn{3}{c|}{\textbf{Cora}}                                                                                                      & \multicolumn{3}{c|}{\textbf{Citeseer}}                                                                                                  \\ \cline{2-7} 
Budget                                       & 200                                         & 230                                         & 260                                         & 200                                         & 230                                         & 260                                         \\ \hline
Random                                       & 76.6 $\pm$ 0.8                              & 78.6 $\pm$ 2.2                              & 79.4 $\pm$ 0.5                              & 61.8 $\pm$ 0.3                                 & 61.0 $\pm$ 0.4                                 & 63.1 $\pm$ 1.4                                 \\
Density                                      & 73.1 $\pm$ 1.1                              & 74.5 $\pm$ 1.6                              & 76.4 $\pm$ 1.9                              & 61.6 $\pm$ 0.6                                 & 60.4 $\pm$ 1.3                                 & 58.4 $\pm$ 1.4                                 \\
Uncertainty                                  & 78.7 $\pm$ 0.7                              & \underline{80.6} $\pm$ 0.9 & 80.5 $\pm$ 1.2                              & 63.3 $\pm$ 1.1                                 & \underline{64.2} $\pm$ 1.3                           & 64.5 $\pm$ 0.7                                 \\
CoreSet                                      & 77.9 $\pm$ 1.8                              & 79.1 $\pm$ 0.8                              & 79.9 $\pm$ 0.2                              & 61.2 $\pm$ 0.2                                 & 61.7 $\pm$ 1.4                                 & 65.8 $\pm$ 0.6                                 \\
Degree                                       & 72.2 $\pm$ 0.4                              & 73.5 $\pm$ 0.8                              & 75.7 $\pm$ 1.2                              & 53.9 $\pm$ 1.8                                 & 54.9 $\pm$ 1.1                                 & 56.4 $\pm$ 1.7                                 \\
Pagerank                                     & 77.8 $\pm$ 0.5                              & 78.5 $\pm$ 0.1                              & 79.5 $\pm$ 0.6                              & 63.1 $\pm$ 0.2                                 & 63.6 $\pm$ 0.1                                 & 64.1 $\pm$ 0.2                                 \\
AGE                                          & 77.6 $\pm$ 0.9                              & 77.7 $\pm$ 1.2                              & 79.6 $\pm$ 0.2                              & 63.2 $\pm$ 0.2                                 & 64.1 $\pm$ 0.9                                 & 66.0 $\pm$ 1.5                                 \\
FeatProp                                     & 72.1 $\pm$ 1.9                              & 73.1 $\pm$ 0.8                              & 74.5 $\pm$ 0.6                              & 50.6 $\pm$ 1.3                                 & 55.8 $\pm$ 0.5                                 & 57.2 $\pm$ 2.2                                 \\
GraphPart                                    & 72.8 $\pm$ 1.8                              & 73.7 $\pm$ 1.2                              & 75.1 $\pm$ 0.6                              & 54.1 $\pm$ 0.4                                 & 54.8 $\pm$ 1.1                                 & 57.2 $\pm$ 1.7                                 \\
GraphPartFar                                 & 77.8 $\pm$ 0.3                              & 78.4 $\pm$ 0.5                              & 78.4 $\pm$ 0.6                              & 60.9 $\pm$ 2.0                                 & 61.4 $\pm$ 2.1                                 & 62.3 $\pm$ 1.7                                 \\ \hline
ALIN                                         & \textbf{79.8} $\pm$ 1.3    & 80.4 $\pm$ 0.6                              & \textbf{81.6} $\pm$ 1.0    & \underline{63.4} $\pm$ 1.1                           & 63.9 $\pm$ 1.2                                 & \underline{66.1} $\pm$ 0.9                           \\
ALINFar                                      & \underline{78.9} $\pm$ 1.2 & \textbf{81.2} $\pm$ 1.1    & \underline{81.4} $\pm$ 1.7 & \textbf{64.4} $\pm$ 1.3                        & \textbf{64.7} $\pm$ 1.1                        & \textbf{66.4} $\pm$ 0.6                        \\ \hline
\textbf{Baselines}          & \multicolumn{3}{c|}{\textbf{Pubmed}}                                                                                                             & \multicolumn{3}{c|}{\textbf{Coauthor-CS}}                                                                                                              \\ \cline{2-7} 
Budget                                       & 200                                         & 230                                         & 260                                         & 200                                         & 230                                         & 260                                         \\ \hline
Random                                       & 77.1 $\pm$ 0.5                              & 77.5 $\pm$ 1.3                              & 78.8 $\pm$ 1.0                              & 77.8 $\pm$ 0.4                              & 82.6 $\pm$ 2.6                              & 81.3 $\pm$ 3.1                              \\
Density                                      & 77.2 $\pm$ 0.4                              & 76.9 $\pm$ 1.2                              & 77.6 $\pm$ 0.9                              & 79.3 $\pm$ 1.5                              & 75.0 $\pm$ 0.8                              & 79.3 $\pm$ 4.0                              \\
Uncertainty                                  & 77.3 $\pm$ 0.4                              & 78.5 $\pm$ 1.5                              & 79.0 $\pm$ 0.3                              & 84.3 $\pm$ 1.0                              & 85.4 $\pm$ 1.5                              & 85.6 $\pm$ 0.4                              \\
CoreSet                                      & 76.4 $\pm$ 0.9                              & 77.3 $\pm$ 0.8                              & 77.1 $\pm$ 0.7                              & 57.2 $\pm$ 3.1                              & 62.7 $\pm$ 5.2                              & 64.1 $\pm$ 3.8                              \\
Degree                                       & 76.1 $\pm$ 1.2                              & 75.7 $\pm$ 0.5                              & 75.7 $\pm$ 0.5                              & 60.0 $\pm$ 0.2                              & 59.9 $\pm$ 0.1                              & 60.2 $\pm$ 0.5                              \\
Pagerank                                     & 75.8 $\pm$ 0.8                              & 76.2 $\pm$ 0.2                              & 77.5 $\pm$ 0.4                              & 84.4 $\pm$ 0.7                              & 84.0 $\pm$ 1.6                              & 84.3 $\pm$ 1.0                              \\
AGE                                          & \underline{78.3} $\pm$ 1.0                              & \underline{79.0} $\pm$ 0.2                              & \underline{79.6} $\pm$ 0.6                              & \underline{85.1} $\pm$ 0.3 & \underline{85.5} $\pm$ 1.1 & \textbf{86.6} $\pm$ 0.3    \\
FeatProp                                     & 74.0 $\pm$ 0.8                              & 73.4 $\pm$ 0.7                              & 73.6 $\pm$ 1.4                              & 72.7 $\pm$ 4.7                              & 77.7 $\pm$ 1.8                              & 78.3 $\pm$ 1.0                              \\
GraphPart                                    & 73.8 $\pm$ 0.6                              & 74.5 $\pm$ 0.8                              & 74.2 $\pm$ 0.9                              & 77.0 $\pm$ 3.6                              & 79.0 $\pm$ 2.6                              & 80.9 $\pm$ 2.4                              \\
GraphPartFar                                 & 75.1 $\pm$ 0.5                              & 75.1 $\pm$ 0.6                              & 74.8 $\pm$ 0.7                              & 80.2 $\pm$ 1.7                              & 84.9 $\pm$ 1.3                              & 85.0 $\pm$ 0.8                              \\ \hline
ALIN                                         & \textbf{79.0} $\pm$ 0.8                                           & \textbf{79.3} $\pm$ 0.5                                           & \textbf{80.1} $\pm$ 0.6                                           & \textbf{85.4} $\pm$ 1.2    & \textbf{86.1} $\pm$ 0.7    & \underline{85.8} $\pm$ 0.5 \\
ALINFar                                      & 75.0 $\pm$ 1.6                                           & 77.7 $\pm$ 1.7                                           & 78.0 $\pm$ 0.9                                           & 84.3 $\pm$ 1.1                              & 84.8 $\pm$ 0.3                              & 85.6 $\pm$ 0.5                              \\ \hline
\end{tabular}
\end{center}
\caption{\label{tab:experiment_result} Summary of the performance of GCN on each benchmark. The \textbf{bold} marker denotes the best performance and the \underline{underlined} marker denotes the second-best performance.}
\end{table*}


\textbf{\textit{GNN Models.}} We perform experiments over three popular GNN models, including a 3-layer GCN \citep{kipf2016semi} with hidden layers of 128 and 64 neurons, respectively, a 3-layer GraphSAGE \citep{hamilton2017inductive} with hidden layers of 128 and 64 neurons, respectively, and a GAT \citep{hamilton2017inductive} with 8 attention heads and 2 hidden layers of size 16 and 8, respectively. To train each model, we use an Adam optimizer with an initial learning rate of $1 \times 10^{-2}$ and weight decay of $5 \times 10^{-4}$. Since the active learning setup does not provide enough labeled samples to form a validation set, we train the GNN model for a fixed 200 epochs in all the experiments and evaluate over the full graph.
% TODO(review): the GAT citation above reuses the GraphSAGE key (hamilton2017inductive); confirm the intended reference.

\textbf{\textit{Competitive methods.}} We compare active learning methods that can be applied to the iterative setting, divided into two categories: 1) general-purpose methods that are agnostic to the graph structure, namely Random, Density, Uncertainty, and CoreSet; and 2) methods tailored for graph-structured data, including Centrality, AGE, FeatProp, GraphPartFar, ALINFar. 

\begin{itemize} 
\item \textbf{Random}: Randomly chooses nodes without any specific criteria.

\item \textbf{Density} \citep{cai2017active}: Initially applies clustering to the hidden representations of nodes. It then selects nodes with the highest density score, which is roughly inversely related to the $l_2$-distance between each node and its respective cluster center.

\item \textbf{Uncertainty} \citep{settles2008analysis}: Selects nodes with the highest entropy in their predicted class distribution.

\item \textbf{CoreSet} \citep{sener2017active}: Utilizes K-Center clustering on the hidden representations of nodes. Given the scalability issues of the MIP optimized version, a time-efficient greedy approximation, as described in the original work, is used.

\item \textbf{Centrality}: Chooses nodes with the highest values in graph centrality metrics. Notably, this approach only considers the graph structure and does not take into account node features. Empirical evidence from \citet{cai2017active} suggests that \textbf{Degree} centrality and \textbf{PageRank} centrality tend to outperform other metrics and thus we employ \textbf{Degree} and \textbf{PageRank} as two baselines for comparison.

\item \textbf{AGE} \citep{cai2017active}: Quantifies the informativeness of nodes by linearly combining three metrics: centrality, density, and uncertainty. It then selects nodes with the highest combined scores.

\item \textbf{FeatProp} \citep{wu2019active}: First conducts K-Means clustering on the aggregated node features and subsequently selects nodes that are closest to the cluster centers.

\item \textbf{GraphPart} and \textbf{GraphPartFar} \citep{ma2022partition}: First obtains a K-partition of a graph using the Clauset-Newman-Moore greedy modularity maximization method \citep{clauset2004finding}. Within each part, it clusters the aggregated node features and then chooses the nodes closest to the cluster centers.
\end{itemize}

Following \citet{wu2019active}, we evaluate each baseline with a series of label budgets and report the Macro-F1 performance for node classification over the full graph. We note that the results are the average of repeated experiments with 3 random seeds.

\subsection{Experiment Results on GCN}

The performance comparison between all competitive methods is presented in Table~\ref{tab:experiment_result}. Noteworthy findings are summarized as follows:
\begin{itemize}
\item Our proposed ALIN and ALINFar substantially outperform baseline methods across various budget constraints. Notably, these improvements persist until performance plateaus.

\item On smaller datasets like Cora and Citeseer, where the number of nodes and edges is relatively modest, our proposed framework exhibits remarkable superiority, surpassing baseline methods by a notable margin, typically around 1-1.5\%. This enhanced performance can be attributed to the framework's adept utilization of feedback from the training process.

\item For datasets with more extensive node and edge counts, such as Pubmed and Coauthor-CS, ALIN demonstrates clear advantages over baseline methods, particularly outperforming GraphPart, the state-of-the-art method. Notably, on larger datasets like Pubmed and Coauthor-CS, ALIN proves most effective with smaller budgets, typically around 200--230. However, with a more substantial budget (260) on Coauthor-CS, AGE edges ahead of ALIN by 0.8\% (Table~\ref{tab:experiment_result}), as larger budgets tend to lead to performance saturation.

\item GraphPartFar exhibits commendable performance on Cora, trailing ALINFar by a mere 1-2\%. However, as datasets expand in size, the loss of numerous edges affects partitioning significantly. Consequently, on Pubmed and Coauthor-CS, GraphPartFar lags behind ALINFar by approximately 4-5\%.

\item In the context of the second-best performing methods, both ALINFar and Uncertainty shine on Cora and Citeseer. However, on Pubmed and Coauthor-CS, Uncertainty's performance is lackluster, with AGE emerging as the second-best performer. The discrepancy arises because Uncertainty operates independently of the graph structure, which becomes problematic for larger datasets where edge loss entails more significant information loss. In contrast, AGE amalgamates centrality, density, and uncertainty scores, proving advantageous in scenarios where data lacks edges in extensive graphs. Consequently, AGE consistently outperforms Uncertainty in these settings.
\end{itemize}

Overall, our experimental results showcase the robustness and versatility of the ALIN framework and its extensions, shedding light on their adaptability across diverse graph datasets and budget constraints.




\section{Conclusion and Discussion}
In this work, we embarked on a comprehensive exploration of the active learning paradigm tailored specifically for Graph Neural Networks (GNNs) operating on incomplete graphs. Drawing inspiration from the synergy between node and edge information, we introduced a novel framework designed to harness the unique potential inherent in incomplete graph structures. Our experiments yielded compelling evidence of the efficacy of our proposed framework. We demonstrated that it not only outperforms existing state-of-the-art baseline active learning methods but does so consistently across a variety of real-world datasets and scenarios. These findings underscore the pivotal role of our approach in advancing active learning strategies in the context of incomplete graph data.

Avenues for future research include: 1) investigating dynamic edge scoring mechanisms that adapt to the evolving graph structure, and 2) exploring the synergy between our active learning framework and graph generative models, which could open doors to novel applications.



\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
All authors contributed to the study’s conception and
design. Cong Tran had the idea for the article. Material preparation, data collection, and analysis were performed by Tung Khong. Experiments were
conducted by Tung Khong. The first draft of the manuscript was written
by Tung Khong and all authors commented on previous versions of the manuscript.
All authors read and approved the final manuscript.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported by the Postdoctoral Scholarship Programme of Vingroup Innovation Foundation (VINIF), code
VINIF.2023.STS.58, and the  research project coded DT. 18/24, funded by the Ministry of Information and Communication, 2024.

\end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{ALIN: An Active Learning Framework for Incomplete Networks\\(Supplementary Material)}
\maketitle

\appendix


\section{ALIN Algorithm}


In this section, we describe the ALIN algorithm in detail. ALIN algorithm includes two principal components: the query phase and the training phase.
\begin{itemize}
\item Query Phase: This phase encompasses node selection and subsequent updates, detailed in lines 3--15 of Algorithm~\ref{alg:main}. During the initial node selection, we utilize the InitNodes function (line 4). In subsequent iterations, we calculate node scores (line 8) and edge scores (line 9), and combine them (line 10) to identify the most informative nodes (line 11). We then update the selected nodes and the lost edges associated with them (lines 13--15).
\item Training Phase: In this phase, we focus on the core of our methodology: a unified loss function that combines the node classification loss and the link prediction loss, as described in lines 16--19.
\end{itemize}

\begin{algorithm}
\caption{ALIN Algorithm}
\begin{algorithmic}[1] % Add this line to enable line numbering
\Require
  $\textbf{X}$, $\tilde{\mathcal{E}}^{(0)}$, $\tilde{\textbf{Y}}^{(0)}=\emptyset$, Hyperparameters ($B$, $\alpha$, $\beta$), Trainable model parameters $\theta^{(0)}$, Training iterations $T$, iteration $k=0$
\Ensure
  $\tilde{\mathcal{G}}^{(K)}$, $\theta^{(K)}$
\While{$ k < K$}
    \State $b \gets [B/K]$
    \If {$k = 0$} \Comment{Start Query Phase}
        \State $\mathcal{Q}^{(k)}_{b}\gets$ InitNodes$(\tilde{\mathcal{E}}^{(0)}, \textbf{X})$
    \Else
        \State $\tilde{\textbf{Y}}^{(k)} \gets \tilde{{\textbf{Y}}}_{u}^{(k-1)}$
        \State $\tilde{\mathcal{E}}^{(k)} \gets \tilde{{\mathcal{E}}}_{u}^{(k-1)}$
        \State $\phi_{NS}^{(k)} \gets$ calculate Node Score for each node following Eq. (\ref{eq:entropy})
        \State $\phi_{ES}^{(k)} \gets$ calculate Edge Score for each node following Eq.  (\ref{eq:edge_score})
        \State ${\phi_{CS}^{(k)}} \gets \alpha \cdot \phi_{NS}^{(k)} + (1-\alpha) \cdot \phi_{ES}^{(k)}$
        \State $\mathcal{Q}^{(k)}_{b} \gets$ Top $b$ nodes from $ \phi_{CS}^{(k)}$
    \EndIf
    \State From $\mathcal{Q}^{(k)}_{b}$ update selected nodes and associated edges from the oracle to ${\tilde{\textbf{Y}}_{q}}^{(k)}$, ${\tilde{\mathcal{E}}_{q}}^{(k)}$
    \State $\tilde{{\textbf{Y}}}_{u}^{(k)} \gets \tilde{\textbf{Y}}^{(k)} \cup {\tilde{\textbf{Y}}_{q}}^{(k)}$
    \State $\tilde{{\mathcal{E}}}_{u}^{(k)} \gets \tilde{\mathcal{E}}^{(k)} \cup {\tilde{\mathcal{E}}_{q}}^{(k)}$ \Comment{End Query Phase}
    \For{$t = 1$ to $T$} \Comment{Start Training Phase}
        \State From $\theta^{(k)}$, calculate $\hat{\textbf{Y}}^{(k)}$ and $\hat{\mathcal{E}}^{(k)}$
        \State $\mathcal{L}^{(k)} \gets \beta \cdot l_{NC}(\tilde{{\textbf{Y}}}_{u}^{(k)},\hat{\textbf{Y}}^{(k)}) + (1-\beta) \cdot l_{LP}(\tilde{{\mathcal{E}}}_{u}^{(k)}, \hat{\mathcal{E}}^{(k)})$
        \State Backpropagation to $\theta^{(k)}$
    \EndFor \Comment{End Training Phase}
    \State $k \gets k + 1$
\EndWhile
\end{algorithmic}
\label{alg:main}
\end{algorithm}

\section{Generalizability to Other GNNs}

We further present experiment results of all competitive methods on other GNN architectures in Table~\ref{tab:experiment_result_other_gnn}, in which the GCN backbone is replaced by GAT and GraphSAGE accordingly. 
Our proposed framework consistently demonstrates its capacity to enhance the accuracy of node classification tasks, even when transitioning to GAT and GraphSAGE. This substantiates the framework's robust applicability across diverse GNN models.

Notably, among the benchmark methods including Random, Density, CoreSet, Pagerank, GraphPartFar, and ALINFar, CoreSet exhibits exceptional performance on GAT, surpassing GraphSAGE by approximately 4\%. In contrast, the other baseline methods exhibit a narrower performance gap of around 1-2\% when comparing GAT to GraphSAGE. This superiority of CoreSet on GAT underscores GAT's effectiveness in capturing pertinent information from neighboring nodes, even in scenarios with missing connections, courtesy of its adaptive attention mechanism that judiciously weighs the significance of various neighbors.

Conversely, GraphSAGE, Uncertainty, Degree, and FeatProp consistently outperform GAT, achieving improvements of roughly 0.5-1\%. Notably, GraphSAGE is favored for its computational efficiency and scalability, making it well-suited for large graph datasets. Its resilience to the absence of edges between nodes is attributed to its neighbor sampling strategy.

It is noteworthy that both CoreSet and FeatProp rely on K-means clustering based on hidden node representations, which introduces sensitivity to different runs, particularly when applied to a limited number of labels. Consequently, these two baseline methods exhibit a considerably higher standard deviation compared to the other baselines.



\begin{table*}[t]
\begin{center}
\begin{tabular}{|l|ccc|lll|}
\hline
\textbf{Baselines} & \multicolumn{3}{c|}{\textbf{GAT}}                                  & \multicolumn{3}{c|}{\textbf{GraphSAGE}}                                      \\ \cline{2-7} 
Budget             & 200                  & 230                  & 260                  & \multicolumn{1}{c}{200} & \multicolumn{1}{c}{230} & \multicolumn{1}{c|}{260} \\ \hline
Random             & 78.4 $\pm$ 0.6          & 78.0 $\pm$ 1.1          & 79.5 $\pm$ 0.3          & 76.2 $\pm$ 0.5             & 78.2 $\pm$ 0.7             & 78.7 $\pm$ 0.4              \\
Density        & 77.2 $\pm$ 0.9    & 80.1 $\pm$ 0.3          & 79.9 $\pm$ 0.4          & 77.4 $\pm$ 0.4       & 80.6 $\pm$ 1.5       & 81.7 $\pm$ 1.6              \\
Uncertainty            & \underline{79.8} $\pm$ 1.0          & 81.3 $\pm$ 1.4          & 81.5 $\pm$ 1.0          & \underline{79.4} $\pm$ 0.7             & \underline{80.8} $\pm$ 1.3             & 81.3 $\pm$ 0.7              \\
CoreSet            & 66.5 $\pm$ 1.4          & 65.7 $\pm$ 2.7          & 66.1 $\pm$ 3.7          & 61.0 $\pm$ 4.2             & 62.5 $\pm$ 2.4             & 62.1 $\pm$ 3.2              \\
Degree             & 74.7 $\pm$ 1.1          & 75.3 $\pm$ 1.1          & 76.8 $\pm$ 0.4          & 76.4 $\pm$ 0.7             & 76.7 $\pm$ 0.6             & 79.1 $\pm$ 0.6              \\
Pagerank           & 77.7 $\pm$ 0.5          & 78.9 $\pm$ 0.3          & 80.3 $\pm$ 0.6          & 77.6 $\pm$ 0.3             & 77.9 $\pm$ 0.5             & 80.1 $\pm$ 0.7              \\
AGE                & 78.8 $\pm$ 0.1          & 79.9 $\pm$ 0.5          & 80.7 $\pm$ 0.1          & 77.3 $\pm$ 1.6             & 80.0 $\pm$ 0.4             & 80.8 $\pm$ 0.4              \\
FeatProp           & 72.2 $\pm$ 0.7          & 73.8 $\pm$ 0.4          & 75.9 $\pm$ 0.4          & 73.1 $\pm$ 0.6             & 75.0 $\pm$ 0.8             & 76.1 $\pm$ 1.2              \\
GraphPart          & 72.8 $\pm$ 0.9          & 74.3 $\pm$ 0.9          & 75.1 $\pm$ 0.3          & 74.4 $\pm$ 0.7             & 74.7 $\pm$ 1.0             & 75.0 $\pm$ 0.7              \\
GraphPartFar       & 77.9 $\pm$ 0.4          & 76.9 $\pm$ 0.6          & 78.6 $\pm$ 0.8          & 76.7 $\pm$ 0.9             & 76.8 $\pm$ 0.5             & 77.6 $\pm$ 0.1              \\ \hline
ALIN               & \textbf{80.5} $\pm$ 0.9 & \underline{81.4} $\pm$ 1.2    & \underline{81.8} $\pm$ 0.7    & 79.0 $\pm$ 0.5             & 79.9 $\pm$ 1.8             & \underline{81.6} $\pm$ 1.4        \\
ALINFar            & 79.7 $\pm$ 0.9          & \textbf{82.1} $\pm$ 0.2 & \textbf{82.7} $\pm$ 0.8 & \textbf{79.9} $\pm$ 0.8    & \textbf{81.7} $\pm$ 0.3    & \textbf{82.4} $\pm$ 0.4     \\ \hline
\end{tabular}
\end{center}
\caption{\label{tab:experiment_result_other_gnn} Summary of the performance of other GNNs on the Cora dataset. The \textbf{bold} marker denotes the best performance and the \underline{underlined} marker denotes the second-best performance.}
\end{table*}

\section{Hyperparameter Sensitivity}

We further present experiment results on hyperparameter tuning. In Fig.~\ref{fig:hyperparameter_sensitivity}, we investigate the impact of the hyperparameter $\alpha$, which adjusts the balance between the two terms for query node selection at line 10 of Algorithm~\ref{alg:main}, on the performance of both ALIN and ALINFar when $B = 230$ and $K = 8$ on the Cora dataset.
\begin{figure}[!htb]
  \centering
  \includegraphics[width=.4\linewidth]{hyper_sensitivity.png}
  \caption{The expected influence of ALIN and ALINFar for different values of the hyperparameter $\alpha$.}\label{fig:hyperparameter_sensitivity}
\end{figure}
In these experiments, we only show the results from the GCN model since those from the other GNN models follow a similar trend. We observe the following:
\begin{itemize}
\item The case of $\alpha= 0.5$  exhibits the best performance almost consistently.
\item When $\alpha=1$, ALIN and ALINFar only depend on the node score, resulting in low performance.
\item Setting $\alpha$ to 0.3 leads to much lower performance as this setting overemphasizes the edge score, detracting from our ultimate goal of node classification.
\end{itemize}
These findings emphasize the delicate interplay between node uncertainty, edge information, and the overarching goal of accurate node classification within the ALIN framework.


\section{Weight Growth Function}
In this section, we delve into the intricate dynamics of $\beta$ and its evolution across epochs, achieved through the utilization of a growth function. Recall that $\beta$ is the weight of the combined loss at line 18 of Algorithm~\ref{alg:main}, which plays a pivotal role in balancing the trade-off between two essential tasks: optimal link prediction and the ultimate goal of node classification.

At the outset of training, during the initial epochs, we set $\beta$ to a value of 0.05. This choice steers the model's focus primarily towards solving the optimal link prediction problem. In contrast, as we approach the final epochs, our objective is to set $\beta$ to 1, emphasizing the model's commitment to the ultimate task of node classification.

\begin{figure}[!ht]
  \centering
  \includegraphics[width=.5\linewidth]{weight_decay.png}
  \caption{Weight growth functions}\label{fig:weight_decay}
\end{figure}


Achieving this gradual transition in $\beta$ necessitates the implementation of a suitable growth function. We explored various weight growth functions, shown in Fig.~\ref{fig:weight_decay}, to identify the most effective approach. The results of these experiments are summarized in Table~\ref{tab:weight_Decay}, revealing distinct performance characteristics among different growth functions.

Notably, the Cosine Annealing, Step, and Exponential growth functions emerge as superior choices when compared to the Inverse Time growth function. Both the Cosine Annealing and Step growth functions exhibit an advantageous pattern of gradually increasing the $\beta$ parameter during the middle epochs. This characteristic aligns seamlessly with the requirements of the Link Prediction task, which thrives on sustained training over multiple epochs, rather than experiencing a premature reduction in emphasis. Furthermore, the Exponential growth function proves notable for its ability to swiftly approach a $\beta$ value close to 1 during the latter epochs. This rapid convergence to a higher $\beta$ value positions the Exponential growth function as a compelling choice, outperforming the Cosine Annealing growth function in terms of accuracy.

In summary, our approach to the weight growth of $\beta$ involves a thoughtful selection of growth functions, ultimately tailored to strike the right balance between optimizing link prediction and achieving robust node classification. The choice of growth function is a critical aspect of our framework, as it ensures that the model evolves and adapts its focus in a manner that aligns with the evolving requirements of the tasks at hand.



\begin{table}[h]
\begin{center}
\begin{tabular}{|l|l|ccc|}
\hline
Weight Growth Function                   & Baselines & \multicolumn{3}{c|}{Budget}                                                                                  \\ \cline{3-5} 
                                        &           & \multicolumn{1}{c|}{200}                  & \multicolumn{1}{c|}{230}                  & 260                  \\ \hline
\multirow{2}{*}{Step Growth}             & ALIN      & \multicolumn{1}{c|}{\textbf{79.8} $\pm$ 1.3} & \multicolumn{1}{c|}{\underline{80.4} $\pm$ 0.6}    & \textbf{81.6} $\pm$ 1.0 \\
                                        & ALINFar   & \multicolumn{1}{c|}{\underline{78.9} $\pm$ 1.2}    & \multicolumn{1}{c|}{\textbf{81.2} $\pm$ 1.1} & \underline{81.4} $\pm$ 1.7    \\ \hline
\multirow{2}{*}{Inverse Time Growth}     & ALIN      & \multicolumn{1}{c|}{63.3 $\pm$ 1.7}          & \multicolumn{1}{c|}{62.3 $\pm$ 2.3}          & 65.2 $\pm$ 0.6          \\
                                        & ALINFar   & \multicolumn{1}{c|}{63.0 $\pm$ 2.1}          & \multicolumn{1}{c|}{64.3 $\pm$ 1.7}          & 65.6 $\pm$ 0.4          \\ \hline
\multirow{2}{*}{Exponential Growth}      & ALIN      & \multicolumn{1}{c|}{78.2 $\pm$ 0.5}          & \multicolumn{1}{c|}{77.6 $\pm$ 1.5}          & 79.7 $\pm$ 0.6          \\
                                        & ALINFar   & \multicolumn{1}{c|}{78.6 $\pm$ 1.5}          & \multicolumn{1}{c|}{77.7 $\pm$ 1.1}          & 80.1 $\pm$ 0.5          \\ \hline
\multirow{2}{*}{Cosine Annealing Growth} & ALIN      & \multicolumn{1}{c|}{76.0 $\pm$ 0.6}          & \multicolumn{1}{c|}{78.1 $\pm$ 0.8}          & 76.8 $\pm$ 2.0          \\
                                        & ALINFar   & \multicolumn{1}{c|}{74.7 $\pm$ 1.8}          & \multicolumn{1}{c|}{75.8 $\pm$ 0.4}          & 76.7 $\pm$ 0.7          \\ \hline
\end{tabular}
\end{center}
\caption{\label{tab:weight_Decay} Summary of the performance of weight growth functions using GNN on Cora dataset. The numerical values represent the average Macro-F1 score of 3 independent trials. The \textbf{bold} marker denotes the best performance and the \underline{underlined} marker denotes the second-best performance.}
\end{table}

\section{Proof of Theorem 1}
Our approach shares similarities with the work of \citet{wu2019active}. For simplicity, for any model $\mathcal{M}^{(k)}$ at $k$-th iteration let $(\mathcal{M}^{(k)})_i = (\mathcal{M}(\tilde{\mathcal{G}}^{(k)}, X))_i \in \mathbb{R}^{C}$ be the prediction for node $i$ under input $\tilde{\mathcal{G}}^{(k)}, X$, and $\mathcal{M}^{(k)}_{i,c}$ be the $c$-th element of $(\mathcal{M}^{(k)})_i$ (i.e., the prediction for class $c$). We also make the following assumptions:


\textbf{Assumption 1.} We assume that \(\mathcal{A}^{(k)}\) overfits to the training data. Specifically, we assume the following two conditions: i) \(\mathcal{A}^{(k)}\) attains zero training loss on the set $\mathcal{Q}^{(k)}_{b}$, and ii) for any unlabeled data pair \((x_i, x_j)\) where \(i \notin \mathcal{Q}^{(k)}_{b}\) and \(j \in \mathcal{Q}^{(k)}_{b}\), it holds that \((\mathcal{A}^{(k)})_{i,y_j} \leq (\mathcal{A}^{(k)})_{j,y_j}\) and \((\mathcal{A}^{(k)})_{i,c} \geq (\mathcal{A}^{(k)})_{j,c}\) for all \(c \neq y_j\). The second condition implies that \(\mathcal{A}^{(k)}\) achieves low confidence on unseen samples and high confidence on trained samples. Additionally, we assume that the class probabilities are determined by a ground truth GCN, denoted as \(\mathcal{M}^*\), which predicts $\Pr[y_i = c]$ for the entire training set. This is a common assumption in the literature. Both \(\mathcal{A}^{(k)}\) and \(\mathcal{M}^*\) calculate probability outputs.

% \ Additionally, we assume that the class probabilities are determined by a ground truth Graph Convolutional Network (GCN), denoted as \(\mathcal{M}^*\), which predicts \(Pr[y_i = c]\) for the entire training set. This assumption aligns with common practices in the literature, and \citet{du2019gradient} has demonstrated that gradient descent reliably achieves zero training loss and precise predictions within polynomial time. Both \(\mathcal{A}^{(k)}\) and \(\mathcal{M}^*\) calculate probability outputs.

\textbf{Assumption 2.} We assume that $l_{NC}$ is bounded in \( [-L, L] \) and is Lipschitz with constant \( \lambda \). The loss function is naturally Lipschitz for many common loss functions such as mean squared error, hinge loss, and cross-entropy when the model output is constrained within certain bounds. This assumption finds frequent application in deep learning theory \citep[e.g.,][]{10.1007/s10107-019-01464-2, du2019gradient}.

\textbf{Assumption 3.} We assume that the ReLU function activates with probability $1/2$. This assumption, frequently made in the analysis of neural network loss surfaces, is also used by \citet{choromanska2015open, kawaguchi2016deep, xu2018representation}. It is consistent with practical observations where, typically, approximately half of the ReLU neurons activate.


% \textbf{Assumption 3:} We posit that the ReLU function activates with a probability of 
% 1
% /
% 2
% 1/2. This assumption, frequently made in the analysis of neural network loss surfaces, is also employed in studies such as \citep{choromanska2015open, kawaguchi2016deep, xu2018representation}. It is consistent with practical observations where, typically, approximately half of the ReLU neurons tend to activate.


With these assumptions in place, we can prove Theorem~\ref{theorem:theorem_1}.


% That theorem again


\begin{reptheorem}{theorem:theorem_1}[restated]
Suppose Assumptions 1-3 hold, and the label vector $\tilde{{\textbf{Y}}}_{u}^{(k)}$ is sampled independently from the distribution $y_v \sim \eta(v)$ for every $v \in V$. Then with probability $1 - \delta$ the expected classification loss of \(\mathcal{A}^{(k)}\) satisfies
% \begin{equation}
% \frac{1}{n}l_{NC}(\mathcal{A}^{(k)}|G, X, Y) \leq \frac{(\lambda + L)(\alpha/2)^K}{n} \sum_{i=1}^{n} \min_{j \in \mathcal{Q}^{(k)}_b}\| (S^K X)_i - (S^K X)_j \|_2  + \sqrt{\frac{L \log(1/\delta)}{2n}}
% \end{equation}
% \begin{equation}
%     \frac{1}{n} l_{NC}(\mathcal{A}^{(k)}|G, X, Y) \leq \frac{\lambda}{n} \sum_{c=1}^{C} (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_b} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + \frac{L}{n} \sum_{c=1}^{C} ((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) + \sqrt{\frac{L \log(1/\delta)}{2n}} 
% \end{equation}

\begin{equation}
\label{eq:1}
      \frac{1}{n} l^{(k)}_{NC}(\mathcal{A}^{(k)}|\tilde{\mathcal{G}}^{(k)}, X, \tilde{{\textbf{Y}}}_{u}^{(k)})  \leq \sum_{i=1}^{n} \sum_{c=1}^{C} \left[ \frac{\lambda}{n} (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_{b}} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + \frac{L}{n}((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] + \sqrt{\frac{L \log(1/\delta)}{2n}}  
\end{equation}
\end{reptheorem}



\textit{Proof.} Consider the following random process: Fix $y_j$ for $j \in \mathcal{Q}^{(k)}_{b}$ and therefore the resulting model $\mathcal{A}^{(k)}$, and suppose the (hidden) labels $y_i$ for $i \notin \mathcal{Q}^{(k)}_{b}$ are randomly sampled according to $\eta(v_i)$. Let $i \in V \setminus \mathcal{Q}^{(k)}_{b}$ be any node and $j \in \mathcal{Q}^{(k)}_{b}$. We have
\begin{equation}
\label{eq:2}
  \begin{aligned}
    \mathbb{E}_{y \sim \eta(i)} \left[ l_{NC}((\mathcal{A}^{(k)})_i, y) \right] & = \sum_{c=1}^{C} \Pr[y_i = c]l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) \\
      & =\sum_{c=1}^{C} \Pr[y_j = c]l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) + \sum_{c=1}^{C} (\Pr[y_i = c] - \Pr[y_j = c])l_{NC}((\mathcal{A}^{(k)})_{i,c}, c). \\
  \end{aligned}
\end{equation}
For the first term, we have
\begin{equation}
\label{eq:3}
  \begin{aligned}
    \sum_{c=1}^{C} \Pr[y_j = c]l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) & = \sum_{c=1}^{C} \Pr[y_j = c]\left[ l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) - l_{NC}((\mathcal{A}^{(k)})_{j,c}, c) \right] \\
                                                            & + \sum_{c=1}^{C} \Pr[y_j = c]l_{NC}((\mathcal{A}^{(k)})_{j,c}, c) \\
                                                           & = \sum_{c=1}^{C} \Pr[y_j = c]\left[ l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) - l_{NC}((\mathcal{A}^{(k)})_{j,c}, c) \right] \\
                                                           & \leq \lambda \sum_{c=1}^{C} \Pr[y_j = c] |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|
  \end{aligned}
\end{equation}
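The second equality holds because \(\mathcal{A}^{(k)}\) attains zero training loss on $\mathcal{Q}^{(k)}_{b}$ (Assumption 1), and the last inequality follows from the Lipschitz continuity of \( l_{NC} \) (Assumption 2). Now from Assumption 1, we have $(\mathcal{A}^{(k)})_{i,c} \geq (\mathcal{A}^{(k)})_{j,c}$ for $c \neq y_j$ and $(\mathcal{A}^{(k)})_{i,c} \leq (\mathcal{A}^{(k)})_{j,c}$ otherwise.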

Now for the second term in Eq.~(\ref{eq:2}) we use the property that $\mathcal{M}^*$ computes the ground truth:
\begin{equation}
\label{eq:4}
  \begin{aligned}
    \sum_{c=1}^{C} (\Pr[y_i = c] - \Pr[y_j = c])l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) & = \sum_{c=1}^{C} ((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})l_{NC}((\mathcal{A}^{(k)})_{i,c}, c) \\
                                                             & \leq \sum_{c=1}^{C} L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}).
  \end{aligned}
\end{equation}
The last inequality follows from $l_{NC} \in \left[ -L, L \right]$.

Combining the two bounds with Eq.~(\ref{eq:2}), we obtain
\begin{equation}
\label{eq:5}
  \begin{aligned}
    \mathbb{E}_{y \sim \eta(i)} \left[ l_{NC}((\mathcal{A}^{(k)})_i, y) \right] & \leq \lambda \sum_{c=1}^{C} \Pr[y_j = c] |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + \sum_{c=1}^{C} L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \\
    & \leq \sum_{c=1}^{C} \left[ \lambda (\mathcal{M}^*)_{j,c} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] \\
  \end{aligned}
\end{equation}

% To RHS of (5) be minimum, we need $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$ is minimum and $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$ is minimum.
% \begin{itemize}
% \item For $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$ is minimum: Flowing by assumption 1, $(\mathcal{A}^{(k)})_i$ is vector output of model $(\mathcal{A}^{(k)})$ for node $i$, that vector has $C$ dimensions. And $(\mathcal{A}^{(k)})_i$ is uncertainty on $V \setminus \mathcal{Q}^{(k)}_b$ (untrain), which means each value of elements in the output vector is quite the same. To $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$ is minimum, we need select node $j$ so that it is center of all  $(\mathcal{A}^{(k)})$, it mean the each value in output vector of node $j$ is the most same, so it mean node $j$ is the most uncertainty (highest entropy)
% \item For $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$ is minimum: because $\mathcal{M}^*$ no uncertainty on unseen data. As the ideal model, $\mathcal{M}^*$ perfectly captures the underlying class probabilities. It remains confident and accurate on unseen data because: (1) It doesn't overfit to specific training samples; (2) It accurately reflects the true data distribution, including unseen data. $\mathcal{M}^*$ consistently calibrated confidence: Its confidence levels accurately represent the true likelihood of its predictions, regardless of whether the data has been seen during training. Therefore, $\mathcal{M}^*$ treats seen or unseen data in the same way. So, we can choose any node $j$ no affect on  $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$.
% \end{itemize}

To minimize the right-hand side (RHS) of Eq. (\ref{eq:5}), it is necessary that both $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$ and $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$ are minimized.

\begin{itemize}
\item To achieve the minimum of $|(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}|$, note that $(\mathcal{A}^{(k)})_i$ is the $C$-dimensional output vector of model $\mathcal{A}^{(k)}$ for node $i$. By Assumption 1, $\mathcal{A}^{(k)}$ has low confidence (i.e., high uncertainty) on the nodes in $V \setminus \mathcal{Q}^{(k)}_b$, so the elements of this output vector are relatively similar. Therefore, the minimum is attained by selecting node $j$ as the central node of all $\mathcal{A}^{(k)}$ outputs, i.e., the node whose output vector elements are most alike, which means that node $j$ embodies the highest uncertainty.
\item For minimizing $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$, note that $\mathcal{M}^*$ exhibits certainty with unseen data. As an ideal model, $\mathcal{M}^*$ accurately represents the underlying class probabilities and does not overfit specific training samples. Its calibrated confidence ensures that its predictions' confidence levels are consistent with the actual likelihood, for both seen and unseen data. Consequently, selecting any node $j$ does not impact the value of $((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c})$.
\end{itemize}

Therefore, by selecting node $j$ with the highest uncertainty in Eq. (\ref{eq:5}), we obtain:
\begin{equation}
\label{eq:6}
  \begin{aligned}
    \mathbb{E}_{y \sim \eta(i)} \left[ l_{NC}((\mathcal{A}^{(k)})_i, y) \right] & \leq \sum_{c=1}^{C} \left[ \lambda (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_b} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] \\
  \end{aligned}
\end{equation}

    
Now notice that
\begin{equation}
\label{eq:7}
l_{NC}(\mathcal{A}^{(k)}|G, X, Y) = \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} l_{NC}((\mathcal{A}^{(k)})_i, y_i) + \sum_{j \in \mathcal{Q}^{(k)}_b} l_{NC}((\mathcal{A}^{(k)})_j, y_j) = \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} l_{NC}((\mathcal{A}^{(k)})_i, y_i).
\end{equation}

% l^{(k)}_{NC}(\mathcal{A}^{(k)}|\tilde{\mathcal{G}}^{(k)}, X, \tilde{{\textbf{Y}}}_{u}^{(k)})

Consider the following process: we first get \( \tilde{\mathcal{G}}^{(k)}, X \) as input, which induces \( \eta(i) \) for \( i \in [n] \). Note that \( \mathcal{M}^* \) gives the ground truth \( \eta(i) \) for every \( i \), so the distributions satisfy \( \eta(i) \equiv \eta_{X, G}(i) \). Then the algorithm \( \mathcal{A}^{(k)} \) chooses the set \( \mathcal{Q}^{(k)}_b \) to label. After that, we randomly sample \( y_j \sim \eta(j) \) for \( j \in \mathcal{Q}^{(k)}_b \) and use the labels to train model \( \mathcal{A}^{(k)} \). Finally, we randomly sample \( y_i \sim \eta(i) \) and obtain loss \( l^{(k)}_{NC}(\mathcal{A}^{(k)}|\tilde{\mathcal{G}}^{(k)}, X, \tilde{{\textbf{Y}}}_{u}^{(k)}) \). Note that the sampling of all \( y_i \) for \( i \in V \setminus \mathcal{Q}^{(k)}_b \) happens after we fix the model \( \mathcal{A}^{(k)} \), and knowing the exact values of \( y_j \) for \( j \in \mathcal{Q}^{(k)}_b \) does not give any information about \( y_i \) (since \( \eta(i) \) is only determined by \( \tilde{\mathcal{G}}^{(k)}, X \)). Now we use Hoeffding's inequality (Theorem~\ref{theorem:theorem_2}) with \( Z_i = l_{NC}((\mathcal{A}^{(k)})_i, y_i) \) for \( i \in V \setminus \mathcal{Q}^{(k)}_b \); we have \( -L \leq Z_i \leq L \) by our assumption, and recall that \( |V \setminus \mathcal{Q}^{(k)}_b| = n - b \). Letting \( \delta \) be the RHS of Eq.~(\ref{eq:11}), we have that with probability \( 1 - \delta \),
\begin{equation}
\label{eq:8}
\frac{1}{n-b} \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} l_{NC}((\mathcal{A}^{(k)})_i, y_i) - \frac{1}{n-b} \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} \mathbb{E}_{y \sim \eta(i), \sigma} \left[ l_{NC}((\mathcal{A}^{(k)})_i, y) \right] \leq \sqrt{\frac{L \log(1/\delta)}{2(n-b)}}
\end{equation}

Now plug in Eq. (\ref{eq:6}), multiply both sides by $(n - b)$ and rearrange. We obtain that
\begin{equation}
\label{eq:9}
  \begin{aligned}
    \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} l_{NC}((\mathcal{A}^{(k)})_i, y_i) & \leq \sum_{i \in V \setminus \mathcal{Q}^{(k)}_b} \sum_{c=1}^{C} \left[ \lambda (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_b} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] \\
    & + \sqrt{\frac{L \log(1/\delta)(n-b)}{2}}
  \end{aligned}
\end{equation}



Now note that since the random draws of $y_i$ are completely independent of the training of $\mathcal{A}^{(k)}$, we can also sample $y_i$ together with $y_j$ for \( j \in \mathcal{Q}^{(k)}_b \) after receiving $G, X$ and before the training of $\mathcal{A}^{(k)}$ ($\mathcal{A}$ does not have access to the labels anyway). So Eq.~(\ref{eq:9}) holds for the random drawings of all $y$'s. Dividing both sides of Eq.~(\ref{eq:9}) by $n$ and using Eq.~(\ref{eq:7}), we have
\begin{equation}
\label{eq:10}
  \begin{aligned}
     \frac{1}{n} l_{NC}(\mathcal{A}^{(k)}|G, X, Y) & \leq \frac{1}{n} \sum_{i=1}^{n} \sum_{c=1}^{C} \left[ \lambda (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_b} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + L((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] + \sqrt{\frac{L \log(1/\delta)(n-b)}{2n^2}} \\
    & \leq \sum_{i=1}^{n} \sum_{c=1}^{C} \left[ \frac{\lambda}{n} (\mathcal{M}^*)_{j,c} \min_{j \in \mathcal{Q}^{(k)}_b} |(\mathcal{A}^{(k)})_{i,c} - (\mathcal{A}^{(k)})_{j,c}| + \frac{L}{n}((\mathcal{M}^*)_{i,c} - (\mathcal{M}^*)_{j,c}) \right] + \sqrt{\frac{L \log(1/\delta)}{2n}}  \\
  \end{aligned}
\end{equation}



\section{Hoeffding's Inequality}
We state Hoeffding's inequality here for completeness.

\begin{theorem}[\citealp{hoeffding1994probability}]
\label{theorem:theorem_2}
Suppose $Z_1, \dots, Z_n$ are independent random variables such that $ a_i \leq Z_i \leq b_i$ almost surely for $ 1 \leq i \leq n$. Then we have
\begin{equation}
\label{eq:11}
       \Pr\left[ \frac{1}{n} \sum_{i=1}^{n} Z_i - \mathbb{E}\left[ \frac{1}{n} \sum_{i=1}^{n} Z_i \right] > t \right] \leq \exp \left( - \frac{2 n^2 t^2}{\sum_{i=1}^{n} (b_i - a_i)^2} \right).
\end{equation}
\end{theorem}


\end{document}
