\documentclass[accepted]{uai2023} % for 
\usepackage[american]{babel}


\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 

\usepackage{booktabs} 
\usepackage{tikz} 

 
\usepackage{xr}
\usepackage{times}
\usepackage{latexsym}
\usepackage{amsmath, amsfonts, amsthm}
\usepackage{xspace, subfigure}
\newtheorem{prop}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\usepackage[noend]{algpseudocode}
\usepackage{algorithm}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\newcommand{\our}{\text{GraphOBA}\xspace}
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}



\makeatletter
% \addFileDependency{#1}: records file #1 in LaTeX's file list via the kernel
% internal \@addtofilelist and, when \IfFileExists cannot find it, writes
% "No file #1." to the terminal/log with \typeout.
% NOTE(review): presumably this lets build tools (e.g. latexmk) see the
% external document as a dependency -- confirm against the build setup.
% Every line end inside the body is commented out so the macro expands to no
% stray space tokens, matching the convention used by \myexternaldocument.
\newcommand*{\addFileDependency}[1]{%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{#1}: wrapper around \externaldocument (xr package) that
% makes the labels of the document with base name #1 available to \ref here,
% and then registers both #1.tex and #1.aux through \addFileDependency.
% The argument is a base name WITHOUT extension; the extensions are appended
% below. Line ends are commented out so no spurious spaces are produced.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE
\myexternaldocument{kulkarni_590}


\newcommand{\swap}[3][-]{#3#1#2} 

\title{Optimal Budget Allocation for Crowdsourcing Labels for Graphs (Supplementary Material)}


\author[1]{\href{mailto:<aditkulk@iastate.edu>?Subject=Your UAI 2023 paper}{Adithya Kulkarni}{}}
\author[2]{\href{mailto:<mohnac@iastate.edu>?Subject=Your UAI 2023 paper}{Mohna Chakraborty}{}}
\author[3]{Sihong Xie}
\author[4]{Qi Li}

\affil[1,2,4]{%
    Computer Science Dept.\\
    Iowa State University\\
    Ames, Iowa, USA
}
\affil[3]{%
    Computer Science \& Engineering Dept.\\
    Lehigh University\\
    Bethlehem, Pennsylvania, USA
}

  
  
\begin{document}
  

\maketitle


\appendix
\section{Proof of Proposition \ref{prop2}}
\label{proof_of_preposition}
We use the proof technique proposed by \cite{xie2012sequential} to prove Proposition \ref{prop2}. By Proposition \ref{prop1}, the following value function is obtained:

\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{flalign*}
    V(S^{0}) & \dot= \underset{\pi}{\mathrm{sup}} \ \mathbb{E}^{\pi} \left[ \mathbb{E} \left( \sum_{v \in H_{T}} \mathbf{1}(v \in H^{*}) + \sum_{v \notin H_{T}} \mathbf{1}(v \notin H^{*})| \mathcal{F}_T \right) \right] & \\
    & = \underset{\pi}{\mathrm{sup}} \ \mathbb{E}^{\pi} \left( \sum_{v=1}^{N} h(P_{v}^{T} (+1)) \right). \numberthis
    \label{eq16}
\end{flalign*}
\end{minipage}
}

We define $G_{0} = \sum_{v=1}^{N} h(P_{v}^{0} (+1))$ and $G_{t+1} = \sum_{v=1}^{N} h(P_{v}^{t+1}(+1)) - \sum_{v=1}^{N} h(P_{v}^{t}(+1))$ to decompose the final accuracy $\sum_{v=1}^{N} h(P_{v}^{T}(+1))$ into stage-wise reward. Then, $\sum_{v=1}^{N} h(P_{v}^{T}(+1))$ can be decomposed as: $\sum_{v=1}^{N} h(P_{v}^{T}(+1)) \equiv G_{0} + \sum_{t=0}^{T-1} G_{t+1}$. Therefore, the value function can now be re-written as:

\begin{align*}
    V(S^{0}) &= G_{0}(S^{0}) + \underset{\pi}{\mathrm{sup}} \sum_{t=0}^{T-1} \mathbb{E}^{\pi} (G_{t+1}) &\\
    & = G_{0}(S^{0}) + \underset{\pi}{\mathrm{sup}} \sum_{t=0}^{T-1} \mathbb{E}^{\pi} (\mathbb{E}(G_{t+1} | \mathcal{F}_t)) &\\
    & = G_{0}(S^{0}) + \underset{\pi}{\mathrm{sup}} \sum_{t=0}^{T-1} \mathbb{E}^{\pi} (\mathbb{E}(G_{t+1} | S^{t}, v_{t})).\numberthis
    \label{eq17}
\end{align*}
The first equality holds because $G_{0}$ is deterministic and independent of the policy $\pi$; the second equality holds because of the tower property of conditional expectation; and the third equality holds because $G_{t+1}$ depends on $\mathcal{F}_t$ only through $S^{t}$ and $v_{t}$. We define the stage-wise expected reward gained by obtaining the label for the $v_{t}$-th instance at the state $S^{t}$ as:
\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{align*}
    R(S^{t}, v_t) =& \mathbb{E}(G_{t+1} | S^{t}, v_{t}) \\
    =& \mathbb{E} \left( \sum_{v=1}^{N}h(P_{v}^{t+1}(+1)) - \sum_{v=1}^{N}h(P_{v}^{t}(+1)) | S^{t}, v_{t} \right). \numberthis
    \label{eq18}
\end{align*}
\end{minipage}
}
Therefore, the value function takes the following form:
\begin{align*}
    V(S^{0}) = G_{0}(S^{0}) + \underset{\pi}{\mathrm{sup}} \ \mathbb{E}^{\pi} \left( \sum_{t=0}^{T-1} R(S^{t}, v_{t}) \,\middle|\, S^{0} \right). \numberthis
    \label{eq19}
\end{align*}

\section{Proof of Theorem \ref{theorem1}}
\label{proof_of_theorem}


To prove the theorem, we first elaborate on the process of belief propagation. Let us take a simple factor graph $FG = (V \cup F, E')$ that is a path graph. Let the path be $v_1 - f_1 - v_2 - f_2 - v_3$. The factors $f_1$ and $f_2$ represent the pairwise vertex dependency between the vertex pairs $(v_1, v_2)$ and $(v_2, v_3)$, respectively. The message from vertex $v_3$ to $f_2$ is initialized with the posterior probability of $v_3$ ($\omega_{v_3}$). At the current timestamp, let the chosen vertex be $v_1$. Therefore, the messages are propagated from $v_3$ to $v_1$ as part of \textit{forward propagation}. For simplicity, let us consider the label to be $+1$.
The message from vertex $v_3$ to factor $f_2$ is 
\begin{flalign*}
\underset{v_3 \rightarrow f_2}{\mu}(+1) &= \omega_{v_3}(+1).
\end{flalign*}
Following Eq. (\ref{eq2}), the message from factor $f_2$ to vertex $v_2$ is 
\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{flalign*}
    \underset{f_2 \rightarrow v_2}{\mu}(+1) &= \sum_{x_{f_2}' = +1, x_{v_2}' = +1} \left( \phi_{f_2}(x_{f_2}') \prod_{v^* \in \{v_2, v_3\} \setminus \{v_2\}} \underset{v^* \rightarrow f_2}{\mu}(+1)\right) \\
    &= \sum_{x_{f_2}' = +1, x_{v_2}' = +1} \left( \phi_{f_2}(x_{f_2}') \underset{v_3 \rightarrow f_2}{\mu}(+1) \right) \\
    &= \phi_{f_2}(+1) \cdot \omega_{v_3}(+1).
\end{flalign*}
\end{minipage}
}
Similarly, $\mu_{v_2 \rightarrow f_1} = \phi_{f_2}(+1) \cdot \omega_{v_3}(+1)$ and $\mu_{f_1 \rightarrow v_1} = \phi_{f_1}(+1) \cdot \phi_{f_2}(+1) \cdot \omega_{v_3}(+1)$.
From the \textit{forward propagation}, we can observe that the propagated messages depend on the factor initialization and the posterior probability of the start vertex. A similar observation can be made for \textit{backward propagation}, since both processes follow the same steps. Therefore, we can conclude that the messages are only updated due to factor initialization and posterior probabilities. Since the factor initialization is fixed and does not change with the timestamp, the messages are updated only due to changes in the posterior probabilities. From Eq. (\ref{eq3}), the marginal probability of each vertex depends on its posterior probability and the messages from its neighbors. Since messages are updated only due to changes in the posterior probabilities, we can conclude that the marginal probability of each vertex is updated only due to its posterior probability and the posterior probabilities of the leaf vertices in the graph.

Considering any factor graph $FG$, when a vertex $v_t$ is chosen, the messages are propagated from leaf vertices to the vertex $v_t$ and from vertex $v_t$ back to the leaf vertices. Each leaf vertex and $v_t$ pair is essentially a path graph. Therefore, our conclusion that the marginal probability of each vertex is updated only due to its posterior probability and posterior probabilities of leaf vertices in the graph is valid for any factor graph $FG$.

\subsection{Consistency of \our-OPT}
\label{optsection}
To prove the consistency of \our-OPT, we utilize the observations from \cite{chen2013optimistic}. As per \our-OPT, in each iteration we choose a vertex $v_t$ such that

\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}

\begin{align*}
    v_t = \underset{v}{\mathrm{argmax}} \left( R^{+}(S^{t}, v)\ \dot=\ \max(R_{1}(S^{t}, v), R_{2}(S^{t}, v))\right). \numberthis
\end{align*}

\end{minipage}
}
Let us consider the computation of the expected reward $R^{+}(S^{t}, v_{t})$. Since the update to the posterior probability of each vertex $v \in V$ only occurs due to the obtained label, at a given timestamp, when computing $R_{1}(S^{t}, v_{t})$ or $R_{2}(S^{t}, v_{t})$, only the posterior probability of vertex $v_{t}$ changes. Therefore, the value of the reward only depends on the effect of this change on the graph. As per Eq. (\ref{eq9}), the reward is the change in the sum of marginal probabilities in the graph:
\begin{align*}
    R(S^{t}, v_{t}) = \mathbb{E} (\sum_{v=1}^{N}h(P_{v}^{t+1}(+1)) - \sum_{v=1}^{N}h(P_{v}^{t}(+1)) | S^{t}, v_{t}). \numberthis
\end{align*}

To compute the change in the sum of marginal probabilities in the graph, we first compute the change in marginal probability of vertex $v$ at timestamp $t$ following \cite{chen2013optimistic}. We have 
\begin{align*}
    P_{v}^{t}(x_v) = \omega_{v}^{t}(x_v) \prod_{j \in \mathcal{N}(v)} \underset{j \rightarrow v}{\mu}(x_v).
\end{align*}
Therefore,

\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{align*}
    h(P_{v}^{t+1}(x_v)) - h(P_{v}^{t}(x_v)) = (h(\omega_{v}^{t+1}(x_v) \prod_{j \in \mathcal{N}(v)} \underset{j \rightarrow v}{\mu^{t+1}}(x_v)) \\ - h(\omega_{v}^{t}(x_v) \prod_{j \in \mathcal{N}(v)} \underset{j \rightarrow v}{\mu^{t}}(x_v))).
\end{align*}
\end{minipage}
}
Since the messages from neighbors do not change between two timestamps if $v$ is chosen at timestamp $t$ for obtaining the label, $h(P_{v}^{t+1}(x_v)) - h(P_{v}^t(x_v)) > 0$ only if $h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v)) > 0$.

Considering any vertex $v' \in V \setminus \{v\}$, if the vertex is not reachable from $v$, then $h(P_{v'}^{t+1}(x_{v'})) - h(P_{v'}^{t}(x_{v'})) = 0$. If it is reachable from $v$ (for simplicity, let $v' \in \mathcal{N}(v)$, but the steps are valid even if $v' \notin \mathcal{N}(v)$), then
\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{align*}
    h(P_{v'}^{t+1}(x_{v'})) - h(P_{v'}^{t}(x_{v'})) = (h(\omega_{v'}^{t}(x_{v'}) \prod_{j \in \mathcal{N}(v')} \underset{j \rightarrow v'}{\mu^{t+1}}(x_{v'})) \\ - h(\omega_{v'}^{t}(x_{v'}) \prod_{j \in \mathcal{N}(v')} \underset{j \rightarrow v'}{\mu^{t}}(x_{v'}))).
\end{align*}
\end{minipage}
}
since $\omega_{v'}^{t}(x_{v'})$ does not change between timestamps $t$ and $t+1$. Now considering the messages from neighbors, the messages from all the neighbors except the factor $f'$ that connects $v$ to $v'$ do not change between timestamps $t$ and $t+1$. Therefore,

\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{align*}
    P_{v'}^{t+1}(x_{v'}) - P_{v'}^{t}(x_{v'}) \propto \underset{f' \rightarrow v'}{\mu^{t+1}}(x_{v'}) - \underset{f' \rightarrow v'}{\mu^{t}}(x_{v'}).
\end{align*}
\end{minipage}
}
However, $\underset{f' \rightarrow v'}{\mu}$ is proportional to the posterior of $v$. So
\resizebox{.96\linewidth}{!}{
\begin{minipage}{\linewidth}
\begin{align*}
    h(P_{v'}^{t+1}(x_{v'})) - h(P_{v'}^{t}(x_{v'})) &\propto h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v)) \\
    &\propto \omega_{v}^{t+1}(x_v) - \omega_{v}^{t}(x_v).
\end{align*}
\end{minipage}
}
Since, $\forall v' \in V \setminus \{v\}$, the change in marginal probability is either $0$ or proportional to $h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v))$, we have that $h(P_{v'}^{t+1}(x_{v'})) - h(P_{v'}^{t}(x_{v'})) > 0$ only if $h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v)) > 0$. Therefore, $\sum_{v=1}^{N}h(P_{v}^{t+1}(+1)) - \sum_{v=1}^{N}h(P_{v}^{t}(+1)) > 0$ only if $h(\omega_{v}^{t+1}(+1)) - h(\omega_{v}^{t}(+1)) > 0$ and $\sum_{v=1}^{N}h(P_{v}^{t+1}(+1)) - \sum_{v=1}^{N}h(P_{v}^{t}(+1)) < 0$ if $h(\omega_{v}^{t+1}(+1)) - h(\omega_{v}^{t}(+1)) < 0$.

 The calculation of $h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v))$ is the same posterior calculation as in \cite{chen2013optimistic}. Following the same proof of \cite{chen2013optimistic}, we have $\underset{a_{v}^{t} + b_{v}^{t} \rightarrow \infty}{\lim} h(\omega_{v}^{t+1}(x_v)) - h(\omega_{v}^{t}(x_v)) = 0$. Therefore, for any $v \in V$, $\underset{a_{v}^{t} + b_{v}^{t} \rightarrow \infty}{\lim} h(P_{v}^{t+1}(+1)) - h(P_{v}^{t}(+1)) = 0$. Therefore,
$\underset{a_{v}^{t} + b_{v}^{t} \rightarrow \infty}{\lim} R(S^{t}, v_t) = 0$, and thus $\underset{a_{v}^{t} + b_{v}^{t} \rightarrow \infty}{\lim} R^{+}(S^{t}, v_t) = 0$. 
Applying other observations from \cite{chen2013optimistic}, we have that in any sample path ($v_0, y_{v_{0}}, \dots, v_{t-1}, y_{v_{t-1}}$), \our-OPT will label each instance infinitely many times as $T$ goes to infinity.
Due to our consideration that workers are reliable, if we label each vertex infinitely many times, we will converge to $\theta_{v}$ for each $v \in V$. Therefore, the accuracy will be $100\%$ almost surely implying that \our-OPT is a consistent policy.


\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{kulkarni_590/Figures/cora_total.pdf}
    \caption{Performance comparison on datasets that follow homophily setting. The plots show the performance on the entire datasets.}
    \label{fig:cora_total}
\end{figure*}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.48\textwidth]{kulkarni_590/Figures/appendix_figure2.pdf}
    \caption{Performance comparison on WebKB and Bitcoin datasets. The plots show the performance on the entire datasets.}
    \label{fig:webkb_total}
\end{figure}


\begin{figure}[t]
    \centering
    \includegraphics[width=0.48\textwidth]{kulkarni_590/Figures/appendix_figure3.pdf}
    \caption{Ablation study results of experiments with different beta and factor initialization on the Cora dataset. We report the performance of \our-EXP.}
    \label{fig:ablation_alpha}
\end{figure}

\subsection{Consistency of \our-EXP}

As per \our-EXP, in each iteration we choose the vertex $v_t$ that maximizes the expected reward
\begin{align*}
    R(S^{t}, v_{t}) = p_1 \cdot R_{1}(S^{t}, v_{t}) + p_2 \cdot R_{2}(S^{t}, v_{t}). \numberthis
\end{align*}
 

As part of the proof in Section \ref{optsection}, we show that $R_{1}(S^{t}, v_{t}) > 0$ and $R_{2}(S^{t}, v_{t}) < 0$ when $a_{v}^{t} \geq b_{v}^{t} + 1$, $R_{1}(S^{t}, v_{t}) < 0$ and $R_{2}(S^{t}, v_{t}) > 0$ when $b_{v}^{t} \geq a_{v}^{t} + 1$ and $R_{1}(S^{t}, v_{t}), R_{2}(S^{t}, v_{t}) > 0$ when $a_{v}^{t} = b_{v}^{t}$. Therefore, when $a_{v}^{t} = b_{v}^{t}$, $R(S^{t}, v_{t}) > 0$, but when $a_{v}^{t} \neq b_{v}^{t}$, $R(S^{t}, v_{t})$ can be $0$.

However, even though the change in the posterior probability of $v_t$ can be the same when computing $R_{1}$ and $R_{2}$, especially at the start of execution since all the vertices are initialized with the Beta prior distribution $\mathrm{Beta}(0.1, 0.1)$, the effect of this change on the graph depends on the pairwise vertex dependency initialization in the factor vertices. If the probability of vertices $v_1$ and $v_2$ having the same label is not equal to $0.5$, then $R_{1}(S^{t}, v_{t}) \neq R_{2}(S^{t}, v_{t})$, and if it is equal to $0.5$, then $R_{1}(S^{t}, v_{t}) = R_{2}(S^{t}, v_{t})$. Therefore, $R(S^{t}, v_{t}) \neq 0$ when $a_{v}^{t} \neq b_{v}^{t}$ if the probability of vertices $v_1$ and $v_2$ having the same label is not equal to $0.5$. Since we assume that the pairwise vertex dependency among vertices is known, $R(S^{t}, v_{t}) \neq 0$. Furthermore, there will be at least one vertex in the graph such that $R(S^{t}, v_{t}) > 0$, since not all vertices of the graph have the same label and the pairwise vertex dependency is not the same among all pairs of adjacent vertices.

Since $R(S^{t}, v_{t}) > 0$ for any positive integers $a_{v}^{t}$ and $b_{v}^{t}$, we can follow the proof technique used in Section \ref{optsection} and show that \our-EXP is a consistent policy.


\section{Additional Experiments}
\label{ablation_studies}

We conduct experiments without splitting the dataset into train and test sets. Figure \ref{fig:cora_total} compares our proposed approaches with the baselines on datasets that follow a homophily setting. We provide pseudo code for the optimal policy $\pi^{*}$ computation for \our-OPT and \our-EXP in Algorithm \ref{alg:optimal_policy}. From the results, we can observe that \our-EXP outperforms the baselines on all three datasets, and \our-OPT comes second. The results are similar to the results in Figure \ref{fig:cora} in the main paper, suggesting that the proposed reward function is efficient and the policies that follow the proposed reward function choose the right vertex to label at each timestamp $t$. Figure \ref{fig:webkb_total} compares \our-EXP and \our-OPT with the baselines on WebKB and Bitcoin datasets. The results show that the proposed approaches outperform the baselines for the WebKB dataset and achieve similar performance on the Bitcoin dataset. The results are similar to the results in Figure \ref{fig:webkb} in the main paper and suggest the importance of knowing the dependency among adjacent vertices.

\begin{algorithm}  \small
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\caption{Pseudo code for the optimal policy $\pi^{*}$ computation for \our-OPT/\our-EXP} 
\label{alg:optimal_policy}
\begin{algorithmic} 
\Require Unlabeled graph $G = (V, E)$, budget $T$
\Ensure Inferred true labels for each vertex $v \in V$
\For{$t = 0, \dots, T-1$}
\State Estimate reward for all vertices of graph $G$ following Eq. (\ref{eq13})/Eq. (\ref{eq14}). 
\State Choose the vertex $v$ with the highest estimated reward to request worker label $y_{v_{t}}$.
\State Propagate the labeling information throughout the graph $G$ using belief propagation following Eq. (\ref{eq1}) and Eq. (\ref{eq2}).
\EndFor
\State Compute marginal probability of each vertex $v \in V$ following Eq. (\ref{eq3}) to infer true label.

\State \Return Inferred true label of each vertex $v \in V$.
\end{algorithmic}
\end{algorithm}


We observe volatility in the performance of OPTKG+BP in Figure \ref{fig:cora_total} (c) for the Pubmed dataset, a large dataset with 19717 vertices. Since the budget considered in our experiments is lower than two times the number of vertices in the graph, OPTKG follows a round-robin policy. We conduct an additional experiment to understand the reason for the volatility; the results are shown in Table \ref{tab:volatility_study}. The goal of the experiment is to understand the distribution of vertices chosen by the policies to obtain worker labels. The vertices for which the worker labels are obtained are kept in the set of labeled vertices, and the remaining vertices form the set of unlabeled vertices. For each unlabeled vertex, we find the distance to the nearest labeled vertex and report the mean distance over all unlabeled vertices.

\begin{table}[h]

\caption{Experiment to understand the reason for volatility in the performance of OPTKG+BP in Figure \ref{fig:cora_total} (c). We report the mean distance of unlabeled vertices to the nearest labeled vertex.}
\begin{tabular}{p{0.35in}|p{0.42in}|p{0.42in}|p{0.64in}|p{0.62in}}
\hline
\textbf{Budget (T)} & \textbf{OPTKG +BP} & \textbf{Uniform +BP} & \textbf{GraphOBA-OPT} & \textbf{GraphOBA-EXP} \\ \hline
50                    & 4.437             & 3.713               & 3.132                 & 3.131                 \\ \hline
100                   & 3.889             & 3.400               & 2.865                 & 2.865                 \\ \hline
150                   & 3.805             & 3.103               & 2.655                 & 2.638                 \\ \hline
200                   & 3.694             & 2.988               & 2.520                 & 2.505                 \\ \hline
250                   & 3.556             & 2.863               & 2.420                 & 2.398                 \\ \hline
300                   & 3.360             & 2.771               & 2.359                 & 2.325                 \\ \hline
350                   & 3.173             & 2.702               & 2.293                 & 2.248                 \\ \hline
400                   & 3.065             & 2.631               & 2.205                 & 2.158                 \\ \hline
450                   & 2.999             & 2.552               & 2.153                 & 2.099                 \\ \hline
500                   & 2.924             & 2.511               & 2.101                 & 2.032                 \\ \hline
\end{tabular}

\label{tab:volatility_study}
\end{table}

The results in Table \ref{tab:volatility_study} show that OPTKG+BP has the largest mean distance compared to other methods. A higher mean distance implies that the labeled vertices are concentrated in a small part of the graph, whereas a lower mean distance implies the labeled vertices are distributed throughout the graph. If labeled vertices are concentrated, when a new worker label is obtained, the newly inferred labels for unlabeled vertices are sensitive to the newly obtained worker label since the new label information is propagated in one direction from the region with more concentrated labeled nodes to the more sparsely labeled regions. Due to this, the labels of all the unlabeled vertices change to the new label, resulting in volatility. If labeled vertices are distributed, the sensitivity for new worker labels is less since the new label information propagates in multiple directions, so that unlabeled nodes can receive information from multiple labeled nodes, resulting in more stability. Furthermore, the volatility decreases as the budget increases since the labeled vertices are no longer concentrated in one part of the graph. Figure \ref{fig:cora_total} (c) shows the decrease in volatility with the budget. For the experiments in Figure \ref{fig:cora_total}, we do not shuffle the indexes of the vertices, and since OPTKG follows a round-robin policy, it results in the labeled vertices being concentrated in a small part of the graph. The volatility of OPTKG+BP can be reduced by shuffling the indexes of the vertices.

From the discussion in Section \ref{results_and_discussion}, we observe that the performance of the proposed approach may be sensitive to the initialization of pairwise vertex dependency among adjacent vertices. Therefore, we conduct experiments with different initializations for pairwise vertex dependency and show the results in Figure \ref{fig:ablation_alpha}. In the figure, Pr(same label) represents the probability of connected vertices having the same label. From the results, we observe that initializing the pairwise vertex dependency among connected vertices with the probability of both vertices having the same label between $0.65$ and $0.7$ results in the best performance. The results suggest that initializing with moderate pairwise vertex dependency among connected vertices is preferred; initializing with very high values can result in a bias towards the label of the adjacent vertex, and initializing with very low values can result in over-sensitivity towards the labels provided by the workers.

Furthermore, we conduct experiments with different initialization of $\alpha$ and $\beta$ and show the results in Figure \ref{fig:ablation_alpha}. From the results, we observe that the performance of the proposed approach is not sensitive to the initialization of $\alpha$ and $\beta$.

\section{Dataset Preprocessing}
\label{dataset_preprocessing}

None of the five benchmark datasets considered for our experiments are binary-class datasets. Therefore, we first obtain the class distribution in the datasets to convert the datasets from multi-class to binary-class. Then, we combine classes to obtain a nearly equal distribution of vertices. We relabel each vertex with the new binary classes and use the updated datasets for our experiments. Note that we do not assign different labels to the vertices belonging to the same class. Therefore, for some of the datasets, the distribution is not equal.

\bibliography{kulkarni_590}

\end{document}
