% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version;
% also before submission to see how the non-anonymous paper would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{siunitx}
\usepackage{booktabs}
\usepackage{tabu}
\usepackage{csvsimple}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{tikz}
\usetikzlibrary{backgrounds,calc}
\usepackage{pgfplots}
\pgfplotsset{compat=1.5}
\pgfplotsset{
    cycle list={t_blue\\t_red\\t_green\\t_yellow\\},
}
\usepgfplotslibrary{fillbetween}
\usepackage{intcalc}
\usepackage{graphicx}

\usepackage{hyperref}
\hypersetup{% setup the hyperref-package options
    plainpages=false,           %   -
    %colorlinks=false,           %   - colorize links?
    pdfborder={0 0 0},          %   -
    breaklinks=true,            %   - allow line break inside links
    bookmarksnumbered=true,     %
    bookmarksopen=true,         %
	hypertexnames=true,         %
}

\usepackage[capitalise,nameinlink]{cleveref}
\crefname{ax}{axiom}{axioms}
\usepackage{acro}
\acsetup{format/first-long=\slshape} % chktex 6
\acsetup{barriers/use=true}
\acsetup{barriers/reset=true}
\acsetup{single}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

%% Colors
\usepackage{colortbl}
\definecolor{t_gray}{HTML}{888888}
\definecolor{t_blue}{HTML}{355fb3}
\definecolor{t_red}{HTML}{b33535}
\definecolor{t_green}{HTML}{3bb335}
\definecolor{t_yellow}{HTML}{b39735}
\definecolor{t_darkgray}{HTML}{454545}
\definecolor{t_darkblue}{HTML}{1e3666}
\definecolor{t_darkgreen}{HTML}{22661e}
\definecolor{t_darkred}{HTML}{661e1e}
\definecolor{t_darkyellow}{HTML}{66571e}
\definecolor{t_lightblue}{HTML}{8ea7d7}
\definecolor{t_lightred}{HTML}{dc8989}
\definecolor{t_lightgreen}{HTML}{8ddc89}

\input{macros}

% Acromyms:
\dac{ml}{ML}{machine learning}
\dac{nlp}{NLP}{natural language processing}
\dac{nn}{NN}{neural network}

\dac{mlp}{MLP}{multilayer perceptron}
\dac{rnn}{RNN}{recurrent neural network}
\dac{svm}{SVM}{support vector machine}
\dac{cnn}{CNN}{convolutional neural network}
\dac{gnn}{GNN}{graph neural network}
\dac{gcn}{GCN}{Graph Convolutional Network}
\dac{gin}{GIN}{Graph Isomorphism Network}
\dac{gpn}{GPN}{Graph Posterior Network}
\dac{lop}{LOP}{linear opinion pooling}
\dac{lopgpn}{LOP-GPN}{linear opinion pooled graph posterior network}
\dac{au}{AU}{aleatoric uncertainty}
\dac{eu}{EU}{epistemic uncertainty}
\dac{tu}{TU}{total uncertainty}
\dac{uq}{UQ}{uncertainty quantification}
\dac{appnp}{APPNP}{approximate personalized propagation of neural predictions}
\dac{postnet}{PostNet}{Posterior Network}
\dac{ce}{CE}{cross-entropy}
\dac{uce}{UCE}{uncertain cross-entropy}
\dac{ppr}{PPR}{personalized page-rank}
\dac{arc}{ARC}{accuracy-rejection curve}
\dac{ood}{OOD}{out-of-distribution}
\dac{id}{ID}{in-distribution}
\dac{auroc}{AUC-ROC}{area under the receiver operating characteristic curve}


\title{Linear Opinion Pooling for Uncertainty Quantification on Graphs}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<clemens.damke@ifi.lmu.de>?Subject=Your UAI 2024 paper}{Clemens Damke}{}}
\author[1,2]{\href{mailto:<eyke@ifi.lmu.de>?Subject=Your UAI 2024 paper}{Eyke Hüllermeier}{}}
% Add affiliations after the authors
\affil[1]{%
    Institute of Informatics\\
    LMU Munich\\
    Germany
}
\affil[2]{%
    Munich Center for Machine Learning (MCML)
}

\begin{document}
\maketitle

\begin{abstract}
We address the problem of uncertainty quantification for graph-structured data, or, more specifically, the problem to quantify the predictive uncertainty in (semi-supervised) node classification. Key questions in this regard concern the distinction  between two different types of uncertainty, aleatoric and epistemic, and how to support uncertainty quantification by leveraging the structural information provided by the graph topology. Challenging assumptions and postulates of state-of-the-art methods, we propose a novel approach that represents (epistemic) uncertainty in terms of mixtures of Dirichlet distributions and refers to the established principle of linear opinion pooling for propagating information between neighbored nodes in the graph. The effectiveness of this approach is demonstrated in a series of experiments on a variety of graph-structured datasets.
\end{abstract}

\section{Introduction}\label{sec:intro}

Quantifying the uncertainty of predictions made by machine learning models is critical for applications where safety is important and mistakes can be costly.
%notably in safety-critical systems, decision support, and model selection.
When assessing the uncertainty of a model's prediction, it is common and often useful to distinguish between two types of uncertainty: %\emph{Aleatoric} and \emph{epistemic} uncertainty.
\Ac{au} arises from the stochasticity inherent to the data-generating process, and cannot be reduced by sampling additional data.
For example, when tossing a fair coin, the outcome is uncertain, and this uncertainty is of purely aleatoric nature.
\Ac{eu}, on the other hand, is due to a lack of knowledge about the data-generating process; assuming that an appropriate model class is chosen, \ac{eu} can be reduced by collecting more data and vanishes in the limit of infinite data~\citep{hullermeier2021}.
For example, the lack of knowledge about the bias of a coin is of epistemic nature, and it increases the (total) uncertainty about the outcome of a coin toss. This uncertainty, however, can be reduced by tossing the coin repeatedly and estimating the bias from the outcomes.

In the context of graph-structured data, \ac{uq} is particularly challenging due to the structural information as an additional contributing factor to the uncertainty.
In this paper, we will focus specifically on the problem of \ac{uq} for (semi-supervised) node classification.
Applications of this problem include, for example, the classification of documents in citation networks~\citep{sen2008,bojchevski2018}, or the classification of users or posts in social networks~\citep{shu2017}.

Recently, \acp{gpn} have been proposed as a principled approach to \ac{uq} on graphs~\citep{stadler2021}.
The \ac{gpn} model combines \acp{postnet}~\citep{charpentier2020} with the \ac{appnp} node classification model~\citep{gasteiger2018}.
This combination is motivated by three axioms on how the structural information in a graph should affect the uncertainty of a model's predictions.
One of those axioms states that the aleatoric entropy of nodes with conflicting neighbors should be high.

In this paper, we discuss the validity of this assumption and situations in which it does not hold.
To address those situations, we propose a novel approach to \acl{uq} on graphs based on the idea of \ac{lop} from the field of decision and risk analysis~\citep{clemen2007,stone1961}.
Our approach, which we refer to as \ac{lopgpn}, uses a mixture of Dirichlet distributions to model the uncertainty of a node's label.
We demonstrate the effectiveness of our approach in a series of experiments on a variety of graph-structured datasets, showing that it outperforms existing methods in terms of both predictive accuracy and uncertainty quantification.

The remainder of this paper is organized as follows.
In \cref{sec:um} we give an overview of commonly used measures for \ac{uq}.
In \cref{sec:uq}, we review how \acp{gpn} use those measures to quantify their predictive uncertainty and then discuss the validity of this approach and describe its problems.
\Cref{sec:lop} presents the \acp{lopgpn} approach that addresses the problems described in the previous section.
In \cref{sec:eval}, we compare our approach with the original \ac{gpn} model and other baseline models.
Finally, \cref{sec:conclusion} concludes the paper and outlines directions for future work.

\section{Uncertainty Measures}\label{sec:um}

In the literature on \ac{uq}, there are different notions of what formally constitutes uncertainty.
Depending on the desired properties of the uncertainty measure, different notions may be more or less suitable.
We consider two ways to assess the suitability of a measure of uncertainty:
\begin{enumerate}
  \item Its adherence to a set of axioms~\citep{pal1993,bronevich2008,wimmer2023,sale2023a}.
    \item Its performance on a predictive task, such as outlier detection~\citep{charpentier2020}.
\end{enumerate}
Since we focus on \ac{uq} in the node classification setting, we provide a brief overview of the most common notions of uncertainty in the context of classification tasks.

\subsection{Entropy-based Uncertainty Measures}\label{sec:um:entropy}

One of the most common ways to represent predictive uncertainty of a $K$-class classifier is through the use of a second-order probability distribution $Q$, i.e., a distribution on the probability distributions $\theta = (\theta_1, \ldots , \theta_K)\in \Delta_K$, where $\Delta_K$ is the unit $(K-1)$-simplex and $\theta_k$ denotes the probability of the $k^{th}$ class.
Thus, the true distribution on the $K$ classes is considered as a random variable $\Theta \sim Q$.
Given a second-order distribution $Q$, we denote its expectation by
\begin{equation}
    \bar{\theta} \coloneqq \mathbb{E}_{Q}[\Theta] = \int_{\Delta_K} \theta \, \mathrm{d}Q(\theta) \, . \label{eq:q-exp}
\end{equation}
The \acf{tu} (about the outcome $Y$, i.e., the class eventually observed) can be quantified by the Shannon entropy of $\bar{\theta}$, i.e.,
\begin{equation}
    \mathrm{TU}(Q) \coloneqq H(\mathbb{E}_{Q}[\theta]) = -\sum_{k=1}^{K} \bar{\theta}_k \log \bar{\theta}_k . \label{eq:tu}
\end{equation}
Further, a decomposition of this uncertainty into an aleatoric and an epistemic part can be achieved on the basis of a well-known result from information theory, stating that entropy is the sum of conditional entropy and mutual information~\citep{kendall2017,depeweg2018}.
This result suggests to quantify \acf{au} as conditional entropy (of the outcome $Y$ given the first-order distribution $\Theta$):
%i.e., by the expectation of the entropy of $Y \, | \, \theta$ w.r.t.\ $\Theta \sim Q$:
\begin{equation}
    \mathrm{AU}(Q) \coloneqq \mathbb{E}_{Q} \left[ H(\Theta) \right] = -\smashoperator{\int_{\Delta_K}} \sum_{k=1}^{K} \theta_k \log \theta_k \, \mathrm{d}Q(\theta) \, . \label{eq:au}
\end{equation}
%Note that the \acl{au} is therefore just the conditional entropy $H(Y | \theta)$, for $Y \sim \Theta$.
Moreover, the \acf{eu} is then given by the difference between \ac{tu} and \ac{au}, i.e.,
\begin{align}
    \mathrm{EU}(Q) & \coloneqq \mathrm{TU}(Q) - \mathrm{AU}(Q) \label{eq:eu} \\
    & = I(Y; \Theta) = \mathbb{E}_G[D_{\mathrm{KL}}(\Theta \| \bar{\theta})] \, ,  \nonumber
\end{align}
where $I(\cdot; \cdot)$ denotes mutual information and $D_{\mathrm{KL}}(\cdot \| \cdot)$ the Kullback-Leibler divergence.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{gfx/entropy-uncertainy.pdf}
    \caption{%
        \acs*{tu}, \acs*{au}, and \acs*{eu} for three second-order distributions on $\Delta_2$,
        namely, $Q_1 = \mathrm{Beta}(5,5)$, $Q_2 = \mathcal{U}[0,1]$, and $Q_3 = \frac{1}{2}\mathrm{Beta}(100,10) + \frac{1}{2}\mathrm{Beta}(10,100)$.%
    }\label{fig:entropy-uncertainty}
\end{figure}
\Cref{fig:entropy-uncertainty} gives an intuition for the behavior of this additive decomposition of uncertainty (for $K=2$).
It shows how the \ac{au} goes down for second-order-distributions which are more concentrated around degenerate categorical distributions.
As described by \citet{wimmer2023}, there are however situations in which this decomposition is less plausible.
For example, $Q_2$ and $Q_3$ in \cref{fig:entropy-uncertainty} have the same \ac{tu}, despite the fact that $Q_3$ arguably expresses more knowledge about the true data-generating distribution $\theta^*$ than $Q_2$.
This raises the question, whether an additive decomposition of \ac{tu} into \ac{au} and \ac{eu} is reasonable at all.

Instead of expressing \ac{eu} in terms of the mutual information between $Y$ and $\Theta$, \citet{malinin2018} and \citet{kotelevskii2023} propose to express \ac{eu} via the differential entropy of the second-order distribution $Q$, i.e.,
\begin{equation}
    \mathrm{EU}_{\textrm{SO}}(Q) \coloneqq H(Q) = -\int_{\Delta_K} \log Q(\theta) \, \mathrm{d}Q(\theta). \label{eq:eu-so-entropy}
\end{equation}
In the following, we will refer to this notion of \ac{eu} as \emph{second-order epistemic uncertainty}.
Note that this notion of \ac{eu} is not without controversy, as the differential entropy can be negative, which is forbidden in some axiomatic characterizations of uncertainty, which use $0$ to represent a state of no uncertainty~\citep{wimmer2023}.
Apart from the entropy-based measures we just described, uncertainty is also often quantified in terms of other concentration measures, such as variance~\citep{sale2023a,duan2024}, confidence or Dirichlet pseudo-counts.
We will now briefly review the latter two notions of uncertainty.

\subsection{Least-confidence- and Count-based Uncertainty Measures}\label{sec:um:confidence}

Given a second-order distribution $Q$, a different notion of uncertainty is provided by the so-called \emph{least-confidence} of the expected distribution $\bar{\theta}$, defined as
\begin{equation}
    \mathrm{LConf}(Q) \coloneqq 1 - \max_k \bar{\theta}_k \, . \label{eq:lconf}
\end{equation}
Note the similarity of this measure to the \ac{tu} measure in \cref{eq:tu}; $\mathrm{LConf}(Q)$ can therefore be seen as a measure of \emph{total} uncertainty.
However, in the literature this measure is also used as a proxy for \emph{aleatoric} uncertainty~\citep{charpentier2020}; we will come back to this point in \cref{sec:uq}.

Finally, if $Q$ is described by a Dirichlet distribution $\mathrm{Dir}(\bm{\alpha})$, where $\bm{\alpha} = (\alpha_1, \dots, \alpha_K)$ is a vector of pseudo-counts,
the sum $\alpha_0 = \sum_{k=1}^K \alpha_k$ describes how concentrated $Q$ is around the expected distribution $\bar{\theta}$.
Note that the concentration of $Q$ is similarly captured by its differential entropy, as described in \cref{eq:eu-so-entropy}, i.e., $\mathrm{EU}_{\textrm{SO}}(\mathrm{Dir}(\bm{\alpha}))$ goes down as $\alpha_0$ grows.
The \ac{eu} of a Dirichlet distribution $Q$ can therefore be quantified by $\mathrm{EU}_{\textrm{PC}}(Q) = -\alpha_0$.
We will refer to this notion of \ac{eu} as \emph{pseudo-count-based epistemic uncertainty}~\citep{charpentier2020,huseljic2021,kopetzki2021}.

\section{Uncertainty Quantification}\label{sec:uq}

As just described, there are different ways to formalize uncertainty, and the choice of an uncertainty measure depends on the properties it is supposed to fulfill.
In the context of graphs, there is an additional factor that contributes to uncertainty and needs to be formalized, too, namely the structural information.
\Citet{stadler2021} propose an axiomatic approach to account for this structure-induced uncertainty, which they call \acf{gpn}.
As mentioned in the introduction, \acp{gpn} are essentially a combination of \acp{postnet}~\citep{charpentier2020} and the \ac{appnp} node classification model~\citep{gasteiger2018}.
We begin with a brief review of the \acp{postnet} and \acp{gpn}, and then discuss the validity of the axioms on which the \ac{uq} estimates of \acp{gpn} are essentially based.

\subsection{Posterior Networks}\label{sec:uq:postnet}

A \ac{postnet} is a so-called \emph{evidential deep learning} classification model~\citep{sensoy2018}, i.e., it quantifies predictive uncertainty via a second-order distribution $Q$, which is learned via a \emph{second-order loss function} $L_2$.
A standard (first-order) loss function $L_1 : \Delta_K \times \mathcal{Y} \to \mathbb{R}$ takes a predicted first-order distribution $\hat{\theta} \in \Delta_K$ and an observed ground-truth label $y \in \mathcal{Y}$ as input (where $\mathcal{Y}$ denotes the set of classes); the \acf{ce} loss is a common example of such a first-order loss function.
Similarly, a second-order loss $L_2$ takes a second-order distribution $Q$, i.e., a distribution over $\Delta_K$, as input, to which it again assigns a loss in light of an observed label $y \in \mathcal{Y}$.
\Ac{postnet} uses the so-called \acf{uce} loss~\citep{bilos2019}, which is defined as
\begin{align}
    L_2(Q, y) & \coloneqq \mathbb{E}_{Q} \left[ \mathrm{CE}(\Theta, y) \right]  \label{eq:uce}  \\
    & = -\int_{\Delta_K} \log P(y\, |\, \theta) \, \mathrm{d}Q(\theta) \, . \nonumber
\end{align}
As shown by \citet{bengs2022}, directly minimizing a second order loss, like the \ac{uce} loss, is problematic, since the minimum is reached if $Q$ is a Dirac measure that puts all probability mass on $\theta^* = {\argmin}_{\theta \in \Delta_K} \mathrm{CE}(\theta, y)$.
For all notions of \ac{eu} we discussed in \cref{sec:um}, the optimal $Q^*$ will therefore have no \ac{eu}, i.e., $\mathrm{EU} = 0$~(\cref{eq:eu}) and $\mathrm{EU}_{\textrm{SO}} = \mathrm{EU}_{\textrm{PC}} = -\infty$~(\cref{eq:eu-so-entropy}).
This problem is commonly addressed by adding a \emph{regularization term}, typically the differential entropy of $Q$, in the second-order loss function, which encourages $Q$ to be more spread out.
Whether one can obtain a faithful representation of \acl{eu} has been generally questioned by \cite{bengs2023}.
One should therefore be cautious when interpreting the \ac{eu} estimates of evidential deep learning models, such as \ac{postnet}.
We will not attempt to interpret uncertainty estimates in a quantitative manner, but rather focus on the question of whether they are qualitatively meaningful, e.g., by considering whether anomalous or noisy instances can be identified via their uncertainty.

\Ac{postnet} models the second-order distribution $Q$ as a Dirichlet distribution $\mathrm{Dir}(\mathbf{\alpha})$, where $\mathbf{\alpha} = (\alpha_1, \dots, \alpha_K)$ is a vector of pseudo-counts.
The predicted pseudo-counts $\alpha_k$ for a given instance $\mathbf{x}^{(i)} \in \mathcal{X}$ are defined as
\begin{equation}
    \alpha_k = 1 + N \cdot P \left(\mathbf{z}^{(i)}\,|\, y^{(i)} = k \right) \cdot P \left(y^{(i)} = k \right) ,
\end{equation}
where $\mathbf{z}^{(i)} = f(\mathbf{x}^{(i)}) \in \mathbb{R}^H$ is a latent neural network embedding of $\mathbf{x}^{(i)}$ and $N \in \mathbb{R}$ a so-called \emph{certainty budget}, determining the highest attainable pseudo-count for a given instance.
The class-conditional probability $P(\mathbf{z}^{(i)}\,|\, k)$ by a normalizing flow model for the class $k$ estimates the density of the instance.
Overall, the \ac{postnet} model therefore consists of a neural network encoder model $f$ and $K$ normalizing flow models, one for each class.

\subsection{Graph Posterior Networks}\label{sec:uq:gpn}

Next, we describe how the \acf{gpn} approach extends \ac{postnet} to the node classification setting for graphs.
We denote a graph as $G \coloneqq (\mathcal{V}, \mathcal{E})$, where $\mathcal{V}$ is a set of $N \coloneqq |\mathcal{V}|$ nodes and $\mathcal{E} \subseteq \mathcal{V}^2$ the set of edges.
The adjacency matrix of $G$ is denoted by $\mathbf{A} = (A_{i,j}) \in {\{0,1\}}^{N \times N}$, where $A_{i,j} = 1$ iff $(v_i, v_j) \in \mathcal{E}$.
For simplicity we also assume that $G$ is undirected, i.e., that $A$ is symmetric.
For each node $v^{(i)} \in \mathcal{V}$ we have a feature vector $\mathbf{x}^{(i)} \in \mathbb{R}^D$ and a label $y^{(i)} \in \mathcal{Y}$.
The goal of the node classification task is to predict the label of each node in $\mathcal{V}$, given the graph structure and the node features.

\Acp{gpn} classify the nodes of a given graph by first making a prediction for each node $v^{(i)}$ solely based on its features $\mathbf{x}^{(i)}$ using a standard \ac{postnet} model, i.e., without considering the graph structure.
The predicted feature-based pseudo-count vectors $\mathbf{\alpha}^{\mathrm{ft},(i)}$ for each vertex $v^{(i)}$ are then dispersed through the graph via a \ac{ppr} matrix $\Pi^{\mathrm{PPR}} \in \mathbb{R}^{N \times N}$ as follows:
\begin{align}
    \mathbf{\alpha}^{\mathrm{agg},(i)} &\coloneqq \sum_{v^{(j)} \in \mathcal{V}} \Pi^{\mathrm{PPR}}_{i,j} \mathbf{\alpha}^{\mathrm{ft},(j)} \label{eq:gpn-alpha} \\
    \text{where } \Pi^{\mathrm{PPR}} &\coloneqq {\left(\varepsilon \mathbf{I} + (1 - \varepsilon) \mathbf{\hat{A}}\right)}^L \label{eq:gpn-ppr}
\end{align}
Here, $\mathbf{I}$ is the identity matrix, $\varepsilon \in (0,1]$ the so-called \emph{teleport probability}, and $\mathbf{\hat{A}} \coloneqq \mathbf{A} \mathbf{D}^{-1}$ the normalized (random-walk) adjacency matrix, with $\mathbf{D} \coloneqq \mathrm{diag}(\mathbf{A} \mathbf{1})$ being the degree matrix of $G$.
For large $L$, $\Pi^{\mathrm{PPR}}$ approximates the personalized page-rank matrix of the graph via power iteration.
\Citet{gasteiger2018} proposed this page-rank inspired information dispersion scheme for the node classification task, which they refer to as \ac{appnp}.
The main difference between \ac{appnp} and \ac{gpn} is that \ac{appnp} disperses (first-order) class probability vectors $\theta^{\mathrm{ft},(i)}$ for each node $v^{(i)}$, whereas \ac{gpn} disperses pseudo-count vectors $\mathbf{\alpha}^{\mathrm{ft},(i)}$. The latter correspond to second-order parameters, but are also in direct correspondence to zero-order (pseudo-)data. Broadly speaking, \ac{appnp} disperses first-order information, whereas \ac{gpn} disperses zero-order information.
%Thus, while the latter averages on the level of probabilities, the former averages on the level of \emph{parameters}

To justify this pseudo-count dispersion scheme, \citet{stadler2021} propose three axioms on how the structural information in a graph should affect the uncertainty of a model's predictions:
\begin{enumerate}[label=\textbf{A\arabic*}]
    \item A node's prediction should only depend on its own features in the absence of network effects.
        A node with features more different from training features should have a higher uncertainty. \label[ax]{ax:gpn-ft}
    \item All else being equal, if a node $v^{(i)}$ has a lower epistemic uncertainty than its neighbors in the absence of network effects, the neighbors' predictions should become less epistemically uncertain in the presence of network effects. \label[ax]{ax:gpn-eu}
    \item All else being equal, if a node $v^{(i)}$ has a higher aleatoric uncertainty than its neighbors in the absence of network effects, the neighbors' predictions should become more aleatorically uncertain in the presence of network effects.
        Further, the aleatoric uncertainty of a node in the presence of network effects should be higher if the predictions of its neighbors in the absence of network effects are more conflicting. \label[ax]{ax:gpn-au}
\end{enumerate}
To show the validity of those axioms, \citet{stadler2021} define \ac{au} to be the least-confidence (\cref{eq:lconf}) and \ac{eu} as the sum of the pseudo-counts.
Using those definitions, the validity of the axioms follows from the fact that $\mathbf{\alpha}^{\mathrm{agg},(i)}$ is effectively a weighted average of the pseudo-counts of the (indirect) neighbors of $v^{(i)}$, with high weights for close neighbors and low weights for more distant ones.

\subsection{Validity of the GPN Axioms}\label{sec:uq:critique}

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{gfx/gpn-aggregation.pdf}
    \caption{Illustration of how \acs*{gpn} aggregates the two conflicting predictions for \textsf{A} and \textsf{B} with low \acs*{au} and low \acs*{eu} into predictions with high \acs*{au} and low \acs*{eu}.}\label{fig:gpn-aggregation}
\end{figure}
The axioms that motivate the aggregation of pseudo-counts in \acp{gpn} are based on two assumptions which may not always hold, namely, \emph{network homophily} and the \emph{irreducibility of conflicts}.

First, \emph{network homophily} refers to the assumption that an edge implies that the connected nodes are similar in some way; more specifically, in the context of \acp{gpn} that connected nodes should have similar second-order distributions, and thereby similar predictive uncertainties.
This is a common assumption which is shared by many \ac{gnn} architectures, based on the idea of repeatedly summing or averaging the features of each node's neighbors~\citep{kipf2017,xu2018}.
As already remarked by \citet{stadler2021}, non-homophilic graphs are not properly dealt with by \acp{gpn}, nor by other \ac{gnn} architectures in general~\citep{zhu2020}.
Nonetheless, since edges are typically used to represent some form of similarity, the homophily assumption is often reasonable.

Second, we define the \emph{irreducibility of conflicts} as the assumption that conflicting predictions cannot be resolved by aggregating the predictions of the conflicting nodes.
\Cref{fig:gpn-aggregation} illustrates the implications of this assumption for the binary node classification; there, without network effects, node \textsf{A} is very confident that its probability of belonging to the positive class is high, whereas node \textsf{B} is very confident that its probability of belonging to that class is low.
Thus, both nodes make conflicting predictions, while both having a low \ac{au} and a low \ac{eu}.
Due to the homophily assumption, a consensus has to be found between the two conflicting predictions.
As described in \cref{ax:gpn-au}, a \ac{gpn} will do this by increasing the \ac{au} of the aggregated prediction, while keeping the \ac{eu} low.
\Citet{stadler2021} argue that this is reasonable, because such a conflict is inherently irreducible and should therefore be reflected in the \acl{au} of the aggregated prediction.

To assess whether the irreducibility assumption is indeed reasonable, one has to clarify what \emph{irreducibility} is actually supposed to mean.
As mentioned in the introduction, irreducibility in the context of \ac{uq} refers to uncertainty that cannot be reduced by additional information, which, in a machine learning context, essentially means by sampling additional data~\citep{hullermeier2021}.
In the context of node classification, the data points are nodes; thus given a sample graph $G_N = (\mathcal{V}_N, \mathcal{E}_N)$ with $N$ vertices, increasing the sample size corresponds to sampling a graph $G_M = (\mathcal{V}_M, \mathcal{E}_M)$ with $M > N$ nodes from an assumed underlying data-generating distribution $P_\mathcal{G}$ over all graphs $\mathcal{G}$, such that $G_N$ is a subgraph of $G_M$.
The question of whether a conflict between a node $v^{(i)}$ and its neighbor $v^{(j)}$ is irreducible then becomes the question of whether the conflict persists in the limit of $M \to \infty$.
Let $\mathcal{N}_M(v^{(i)})$ be the set of neighbors of $v^{(i)}$ in $G_M$.
Assuming homophily, each node $v^{(\ell)}$ that is added to $\mathcal{N}_M(v^{(i)})$ should be \emph{similar} to $v^{(i)}$ with high probability.
Depending on the data-generating distribution $P_\mathcal{G}$, there are two possible scenarios:
\begin{enumerate}
    \item The neighborhood of $v^{(i)}$ does not grow with the sample size, i.e., $\mathbb{E}[|\mathcal{N}_M(v^{(i)})|] \in \mathcal{O}(1)$ as $M \to \infty$.
    \item The neighborhood of $v^{(i)}$ grows with the sample size, i.e., $\mathbb{E}[|\mathcal{N}_M(v^{(i)})|] \to \infty$ as $M \to \infty$.
\end{enumerate}
In the first case, the conflict between $v^{(i)}$ and $v^{(j)}$ is indeed irreducible, as no additional data can be sampled to resolve the conflict.
In this situation, \cref{ax:gpn-au} of \ac{gpn} is reasonable, the irreducible uncertainty, i.e., \ac{au}, should increase with conflicting predictions.
However, in the second case, the conflict is reducible, as the conflict will eventually be resolved by the addition of more similar nodes to the neighborhood of $v^{(i)}$, which will outweigh the conflicting node $v^{(j)}$.
In this situation, the conflict resolution approach of \ac{gpn} is not reasonable; \ac{au} should not go up, instead the reducible uncertainty, i.e., \ac{eu}, should increase.

We argue that the second case is more common in practical node classification tasks.
The Barabási-Albert model~\citep{barabasi1999} is a popular scale-free model, which describes the growth behavior of many real-world graphs, such as the World Wide Web, social networks, or citation networks~\citep{albert2002,redner1998,wang2008}.
In this model, the expected degree of the $N$-th sampled node $v^{(i)}$ after $M-N$ additional nodes have been sampled is equal to $|\mathcal{N}_N(v^{(i)})| \cdot \sqrt{\frac{M}{N}}$.
Thus, for $M \to \infty$, the expected neighborhood size of a node goes to infinity.

Examples for domains in which the neighborhood sizes do not grow with the size of a graph are molecular graphs or lattice graphs, such as 3D models or images, which can be interpreted as grids of pixels.
In those domains, node-level classification tasks are however less common, as one is typically interested in the classification of entire graphs, e.g., whether a given molecule is toxic or not.

To conclude, we argue that the axiomatic motivation for \acp{gpn} is oftentimes inappropriate.
Therefore, we propose a different approach to \ac{uq} for node classification which does not assume the irreducibility of conflicts from \cref{ax:gpn-au}.

\section{LOP-GPN}\label{sec:lop}

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{gfx/lop-gpn-aggregation.pdf}
    \caption{Illustration of how \acs*{lopgpn} preserves the \acs*{au} of conflicting predictions.}\label{fig:lop-gpn-aggregation}
\end{figure}
A \acf{lopgpn} is a variant of the standard \ac{gpn} model, which uses a mixture of Dirichlet distributions instead of a single Dirichlet distribution to model the uncertainty of a node's class in the presence of network effects.
This choice is motivated by the idea that each node $v^{(i)}$ in a given node classification task can be interpreted as a decision maker, which has to assign a label $y^{(i)}$ to itself based on its own features $\mathbf{x}^{(i)}$ and the features of its neighbors.
Using this interpretation, in the \ac{gpn} architecture, each decision maker $v^{(i)}$ first makes a prediction based on its own features, producing a Dirichlet distribution $\mathrm{Dir}(\bm{\alpha}^{\mathrm{ft},(i)})$, and then aggregates this distribution with those produced by its direct and indirect neighbors.
In the standard \acp{gpn} model this aggregate is again a Dirichlet distribution $\mathrm{Dir}(\bm{\alpha}^{\mathrm{agg},(i)})$ (see \cref{eq:gpn-alpha}).

The question of how to aggregate the decisions or opinions of a set of experts is a well-known problem in the field of decision and risk analysis~\citep{clemen2007}.
One of the most common aggregation approaches in this field is the called \acf{lop}~\citep{stone1961}.
\Ac{lop} simply aggregates distributions by taking a weighted average of their densities, resulting in a mixture of the original distributions.
Combining \ac{lop} with \ac{appnp}, the aggregated probability density of a node $v^{(i)}$ is given by
\begin{align}
    Q^{\mathrm{agg},(i)} &\coloneqq \sum_{v^{(j)} \in \mathcal{V}} \Pi^{\mathrm{PPR}}_{i,j} Q^{\mathrm{ft},(j)}, \label{eq:lopgpn-p}
\end{align}
where $Q^{\mathrm{ft},(j)}$ is the density of the Dirichlet distribution $\mathrm{Dir}(\bm{\alpha}^{\mathrm{ft},(j)})$ and $\Pi^{\mathrm{PPR}}$ as in \cref{eq:gpn-ppr}.
Using such mixtures, \ac{lopgpn} does not assume the irreducibility of conflicts, i.e., unlike \ac{gpn}, \ac{au} is not increased in the presence of conflicts.
This follows trivially from the fact that the \ac{au} of a \ac{lop} distribution is just the weighted linear combination of the \acp{au} of the original distributions, i.e.,
\begin{align}
    \mathrm{AU}(Q^{\mathrm{agg},(i)}) &= %\mathbb{E}_{\Theta \sim Q^{\mathrm{agg},(i)}} \left[ H(\Theta) \right] \nonumber
    \int_{\Delta_K} H(\theta) \, \mathrm{d}Q^{\mathrm{agg},(i)} \nonumber \\
    &= \sum_{v^{(j)}} \Pi^{\mathrm{PPR}}_{i,j} \int_{\Delta_K} H(\theta) \, \mathrm{d}Q^{\mathrm{ft},(j)} \nonumber \\
    &= \sum_{v^{(j)}} \Pi^{\mathrm{PPR}}_{i,j} \mathrm{AU}(Q^{\mathrm{ft},(j)}) . \label{eq:lopgpn-conflict-au}
\end{align}
\Cref{fig:lop-gpn-aggregation} illustrates the implications of this approach for the binary node classification.
Next, we will describe how \ac{lopgpn} is trained and how it can be implemented efficiently.

\subsection{Second-order Loss}\label{sec:lop:loss}

Analogous to \cref{eq:uce}, the loss of \ac{lopgpn} is given by
\begin{align}
    \mathcal{L} \coloneqq \sum_{i=1}^{N} \underbrace{\mathbb{E}_{Q^{\mathrm{agg},(i)}} \left[ \mathrm{CE}(\Theta^{(i)}, y^{(i)}) \right] - H(Q^{\mathrm{agg},(i)})}_{\mathcal{L}^{(i)}} . \label{eq:lopgpn-optim}
\end{align}
Due to the use of Dirichlet mixtures, this loss is not directly minimizable, as there is no closed-form expression for the regularization term $H(Q^{\mathrm{agg},(i)})$.
We therefore use the following bounds on the entropy of a mixture distribution instead (see \citet{melbourne2022}):
\begin{align}
    H(Q^{\mathrm{agg},(i)}) &\geq \smashoperator{\sum_{v^{(j)} \in \mathcal{V}}} \Pi^{\mathrm{PPR}}_{i,j} H(Q^{\mathrm{ft},(j)}), \label{eq:lopgpn-entropy-bound}\\
    H(Q^{\mathrm{agg},(i)}) &\leq H(\mathrm{Cat}(\Pi^{\mathrm{PPR}}_{i})) + \smashoperator{\sum_{v^{(j)} \in \mathcal{V}}} \Pi^{\mathrm{PPR}}_{i,j} H(Q^{\mathrm{ft},(j)}), \nonumber
    %H(Q^{\mathrm{agg},(i)}) &\geq \smashoperator{\sum_{v^{(j)} \in \mathcal{V}}} \Pi^{\mathrm{PPR}}_{i,j} H(Q^{\mathrm{ft},(j)}),
\end{align}
where $\mathrm{Cat}(\Pi^{\mathrm{PPR}}_{i})$ is the categorical distribution described by the $i^{th}$ row vector of $\Pi^{\mathrm{PPR}}$.
We use the upper bound on $H(Q^{\mathrm{agg},(i)})$ as a surrogate for $\mathrm{EU}_{\mathrm{SO}}$.
Similarly, the lower entropy bound implies the following upper bound on the loss for each vertex $v^{(i)}$:
\begin{align*}
    \mathcal{L}^{(i)} \leq &\smashoperator{\sum_{j=1}^N} \Pi^{\mathrm{PPR}}_{i,j} \left( \mathbb{E}_{Q^{\mathrm{ft},(i)}} \left[ \mathrm{CE}(\Theta^{(i)}, y^{(i)}) \right] - H(Q^{\mathrm{ft},(j)}) \right) %. \nonumber \\
    %& - H(\mathrm{Cat}(\Pi^{\mathrm{PPR}}_{i})) . \label{eq:lopgpn-optim-bound}
\end{align*}
This bound on the loss is differentiable and can therefore be minimized using standard gradient-based optimization algorithms.

\subsection{Sparse APPNP}\label{sec:lop:sparsity}

Despite the similarities between \ac{gpn} and \ac{lopgpn}, the computational complexity of the \acs{appnp}-based aggregation step is significantly higher for \ac{lopgpn}.
We can express \cref{eq:gpn-alpha} as a matrix-vector multiplication, i.e.,%
\begin{align}
    \bm{\alpha}^{\mathrm{agg}} &= \Pi^{\mathrm{PPR}} \bm{\alpha}^{\mathrm{ft}} = \hat{A}_{\varepsilon}^L \bm{\alpha}^{\mathrm{ft}},
\end{align}
where $\bm{\alpha}^{\mathrm{agg}}, \bm{\alpha}^{\mathrm{ft}} \in \mathbb{R}_+^{N \times K}$ are pseudo-count matrices and $\hat{A}_{\varepsilon} = \varepsilon \mathbf{I} + (1 - \varepsilon) \mathbf{\hat{A}}$.
Since typically $K \ll N$, it is best to evaluate this product from right to left.
Then, assuming that $A$ is sparse, the complexity of this operation is $\mathcal{O}(L \cdot |\mathcal{E}| \cdot K)$.

\Ac{lopgpn} on the other hand uses the values of $\Pi^{\mathrm{PPR}}$ directly as mixture weights, i.e., the $L$-th power of $\hat{A}_{\varepsilon}$ has to be computed explicitly.
For large graphs, this is computationally infeasible.
To address this issue, a sparse approximation of \ac{appnp} is used to compute $\Pi^{\mathrm{PPR}}$.
Between each of the $L$ sparse matrix multiplications, all probability mass below a certain threshold $\delta$ is moved back to the diagonal of the matrix.
This limits the percentage of non-zero entries to ${(N \delta)}^{-1}$ and makes it possible to apply \ac{lopgpn} even to large graphs.
In the experimental evaluation, sparsification was used for all datasets except CoraML and CiteSeer.

\section{Evaluation}\label{sec:eval}

We evaluate \ac{lopgpn} in two ways:
First, we use \acp{arc} to compare the quality of different predictive uncertainty measures of \ac{lopgpn} to those of \ac{gpn} on a set of standard node classification benchmarks.
Second, we compare the \ac{ood} detection performance of our model against a set of node classification models.

\subsection{Experimental Setup}\label{sec:eval:setup}

Due to the similarity between \ac{lopgpn} and \ac{gpn}, we base our experiments on those of \citet{stadler2021}, i.e., we use the same dataset splits and hyperparameters and build upon their reference implementation.\footnote{Implementation available at \url{https://github.com/Cortys/gpn-extensions}}

\paragraph{Datasets}
We use the following node classification benchmarks:
Three citation network datasets, namely, \textbf{CoraML}, \textbf{CiteSeer} and \textbf{PubMed}~\citep{mccallum2000,giles1998,getoor2005,sen2008,namata2012},
two co-purchase datasets, namely, \textbf{Amazon Photos} and \textbf{Amazon Computers}~\citep{mcauley2015}
and the large-scale \textbf{OGBN Arxiv} dataset with about $170 \mathrm{k}$ nodes and over $2.3$ million edges~\citep{hu2020}.
Since OGBN Arxiv is presplit into train, validation and test sets, we use the provided splits.
The results of the other datasets are obtained by averaging over 10 dataset splits with train/val/test sizes of 5\%/15\%/80\%.
Note that all six datasets represent either citation networks or co-purchase networks; the assumptions of unbounded growth of neighborhood sizes and network homophily (see \cref{sec:uq:critique}) are therefore reasonable and the use of \ac{lopgpn} well-motivated.

\paragraph{Models}
We compare \textbf{\ac{lopgpn}} against the following baseline models:
Two variants of \textbf{\ac{gpn}}~\citep{stadler2021}, \textbf{\ac{appnp}}~\citep{gasteiger2018}, \textbf{Matern-GGP}~\citep{borovitskiy2021} and \textbf{GKDE}~\citep{zhao2020}.

As described in \cref{sec:uq:gpn}, \textbf{\ac{appnp}} directly disperses class probabilities, i.e., it is a first-order method that cannot (meaningfully) distinguish between \ac{au} and \ac{eu}; the entropy of its first-order predictions is therefore used as an estimate of \acl{tu}.

\textbf{Matern-GGP}~\citep{bojchevski2018} is a Gaussian process model using the so-called \emph{graph Matérn kernel}.
For each vertex, it predicts a posterior second-order multivariate Gaussian $\Theta \sim \mathcal{N}(\bar{\theta}, \Sigma)$ with $\bar{\theta} \in \Delta_K$ and $\Sigma$ a diagonal covariance matrix.
This implies that some probability mass will lie outside of $\Delta_K$; thus, the additive entropy-based uncertainty decomposition described in \cref{sec:um:entropy} is not well-defined for this distribution.
We therefore do not use the additive entropy decomposition for Matern-GGP and instead use $H(\bar{\theta})$ as a proxy for \ac{au} and the trace $\mathrm{Tr}(\Sigma)$ as a proxy for \ac{eu}.
Note that this definition of \ac{au} coincides with the definition of \ac{tu} in the additive decomposition (\cref{eq:tu}).
Second, note that $\mathrm{Tr}(\Sigma)$ is a monotonic transformation of the differential entropy of the second-order Gaussian $\mathcal{N}(\bar{\theta}, \Sigma)$ and thereby closely related to $\mathrm{EU}_{\mathrm{SO}}$ (\cref{eq:eu-so-entropy}).\footnote{%
The differential entropy of the multinomial Gaussian $\mathcal{N}(\bar{\theta}, \Sigma)$ is $\frac{1}{2}\ln\left((2 \pi e)^k \det \Sigma \right) = \frac{1}{2}\mathrm{Tr}(\ln \Sigma) + c \geq \frac{1}{2}\ln\mathrm{Tr}(\Sigma) + c$ with the constant $c = \frac{k}{2}\ln(2 \pi e)$.
%Thus, an increase of $\mathrm{Tr}(\Sigma)$ implies an increase of the differential entropy.%
}
Last, we note that the Matern-GGP model was not applied to the large-scale OGBN Arxiv dataset due to memory constraints.

The \emph{Graph-based Kernel Dirichlet distribution Estimation} \textbf{GKDE} model~\citep{zhao2020}, is a parameter-free method that estimates a Dirichlet distribution for each node based on the features of its neighbors.

The two evaluated variants of \textbf{\ac{gpn}} are \emph{\ac{gpn} (rw)} and \emph{\ac{gpn} (sym)}.
\ac{gpn} (rw) uses \acs{appnp} with random-walk normalization, i.e., with $\hat{\mathbf{A}} = \mathbf{A} \mathbf{D}^{-1}$.
\ac{gpn} (sym) uses symmetric normalization, i.e., with $\hat{\mathbf{A}} = \mathbf{D}^{-1/2} \mathbf{A} \mathbf{D}^{-1/2}$ (see \citet{kipf2017}).
We evaluate both types of normalization because \citet{stadler2021} used symmetric normalization in their experiments, while \ac{lopgpn} requires random-walk normalization to ensure that valid mixture densities are produced.
To show that any observed differences between \ac{lopgpn} and \ac{gpn} are due to the use of \ac{lop} and not due to the use of random-walk normalization, we compare \ac{lopgpn} against both variants.

\subsection{Accuracy-Rejection Curves}\label{sec:eval:arc}

% \newcommand*{\accrejPlot}[4]{\begin{tikzpicture}
%     \small
%     \begin{axis}[
%         width=0.32\linewidth,
%         height=3.2cm,
%         xlabel={\ifthenelse{\equal{#2}{sample_epistemic}}{\ifthenelse{\equal{#1}{Arxiv}}{OGBN-Arxiv}{#1}}{}},
%         ylabel={#4},
%         grid=major,
%         ymin=#3, ymax=1,
%         xmin=0, xmax=0.99,
%         legend pos=outer north east,
%         legend style={font=\tiny, draw=none},
%         legend image post style={line width=2pt},
%     ]

%     % Plot means
%     % \ifthenelse{\(\equal{#2}{sample_aleatoric}\or\equal{#2}{sample_aleatoric_entropy}\or\equal{#2}{sample_epistemic}\)\and\not\equal{#1}{Arxiv}}{%
%     %     % \addplot [color=t_gray, line width=1pt] table [x=p, y=maternGGPMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
%     %     % \addplot [color=t_gray, line width=1pt] table [x=p, y=appnpMean, col sep=comma] {tables/acc_rej_#2_#1.csv};%
%     % }{}
%     \addplot [color=t_green, line width=1pt] table [x=p, y=gpnMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
%     \addplot [color=t_blue, line width=1pt] table [x=p, y=gpnRWMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
%     \addplot [color=t_red, line width=1pt] table [x=p, y=gpnLOPMean, col sep=comma] {tables/acc_rej_#2_#1.csv};

%     % Plot standard errors
%     % \ifthenelse{\(\equal{#2}{sample_aleatoric}\or\equal{#2}{sample_aleatoric_entropy}\or\equal{#2}{sample_epistemic}\)\and\not\equal{#1}{Arxiv}}{%
%     %     % \addSE{tables/acc_rej_#2_#1.csv}{maternGGP}{t_gray}
%     %     % \addSE{tables/acc_rej_#2_#1.csv}{appnp}{t_gray}%
%     % }{}
%     \addSE{tables/acc_rej_#2_#1.csv}{gpn}{t_green}
%     \addSE{tables/acc_rej_#2_#1.csv}{gpnRW}{t_blue}
%     \addSE{tables/acc_rej_#2_#1.csv}{gpnLOP}{t_red}

%     \ifthenelse{\equal{#1}{Arxiv}\or\equal{#1}{Computers}}{%
%         \legend{GPN (sym), GPN (rw), LOP-GPN}%
%     }{}
%     \end{axis}
% \end{tikzpicture}}
% % \newcommand*{\fullAccrejPlots}[1]{
% %     \accrejPlot{CoraML}{#1}{0.8}%
% %     \hfill\accrejPlot{CiteSeerFull}{#1}{0.8}%
% %     \hfill\accrejPlot{AmazonPhotos}{#1}{0.8}\\%
% %     \accrejPlot{AmazonComputers}{#1}{0.7}%
% %     \hfill\accrejPlot{PubMedFull}{#1}{0.8}%
% %     \hfill\accrejPlot{Arxiv}{#1}{0.5}%
% % }
% \newcommand*{\accrejPlots}[2]{
%     % \accrejPlot{CoraML}{#1}{0.8}%
%     \hfill\accrejPlot{Photos}{#1}{0.8}{Acc.\ (#2)}%
%     \hfill\accrejPlot{PubMed}{#1}{0.8}{}%
%     \hfill\accrejPlot{Arxiv}{#1}{0.5}{}%
% }
% % \begin{figure*}[t]
% %     \centering
% %     \accrejPlots{prediction_aleatoric}
% %     \caption{\Acl*{arc} for least-confidence uncertainty}
% %     \label{fig:accrej-confidence}
% % \end{figure*}
% \begin{figure*}[ht]
%     \centering
%     \accrejPlots{sample_total_entropy}{TU}\\%
%     \accrejPlots{sample_aleatoric_entropy}{AU}\\%
%     \accrejPlots{sample_epistemic_entropy_diff}{EU}\\%
%     \accrejPlots{sample_epistemic_entropy}{$\mathrm{EU}_{\textrm{SO}}$}\\%
%     \accrejPlots{sample_epistemic}{$\mathrm{EU}_{\textrm{PC}}$}\\%
%     \caption{\Acl*{arc} for different uncertainty measures for CoraML, PubMed and OGBN Arxiv}
%     \label{fig:accrej}
% \end{figure*}
% \begin{figure*}[h]
%     \centering
%     \accrejPlots{sample_total_entropy}
%     \caption{\Acl*{arc} for \acl*{tu}}
%     \label{fig:accrej-total}
% \end{figure*}
% \begin{figure*}[h]
%     \centering
%     \accrejPlots{sample_aleatoric_entropy}
%     \caption{\Acl*{arc} for \acl*{au}}
%     \label{fig:accrej-aleatoric}
% \end{figure*}
% \begin{figure*}[h]
%     \centering
%     \accrejPlots{sample_epistemic_entropy_diff}
%     \caption{\Acl*{arc} for \acl*{eu}}
%     \label{fig:accrej-epistemic}
% \end{figure*}
% \begin{figure*}[h]
%     \centering
%     \accrejPlots{sample_epistemic_entropy}
%     \caption{\Acl*{arc} for second-order epistemic uncertainty}
%     \label{fig:accrej-so-epistemic}
% \end{figure*}
% \begin{figure*}[h]
%     \centering
%     \accrejPlots{sample_epistemic}
%     \caption{\Acl*{arc} for Pseudo-count-based Epistemic uncertainty}
%     \label{fig:accrej-alpha-epistemic}
% \end{figure*}
\newcommand*{\accrejPlot}[4]{\begin{tikzpicture}[inner frame sep=0]
    \small
    \begin{axis}[
        width=0.2\linewidth,
        height=2.9cm,
        xlabel={\ifthenelse{\equal{#1}{Arxiv}}{#4}{}},
        ylabel={\ifthenelse{\equal{#2}{sample_total_entropy}}{\scriptsize\ifthenelse{\equal{#1}{Computers}\or\equal{#1}{Photos}}{A.#1}{\ifthenelse{\equal{#1}{Arxiv}}{OGBN #1}{#1}}}{}},
        grid=major,
        ymin=#3, ymax=1,
        xmin=0, xmax=0.99,
        label style={font=\footnotesize},
        tick label style={font=\tiny},
        y tick label style={/pgf/number format/.cd, fixed, fixed zerofill, precision=2, /tikz/.cd},
        legend pos=outer north east,
        legend style={font=\tiny, draw=none},
        legend image post style={line width=2pt},
    ]

    % Plot means
    % \ifthenelse{\(\equal{#2}{sample_aleatoric}\or\equal{#2}{sample_aleatoric_entropy}\or\equal{#2}{sample_epistemic}\)\and\not\equal{#1}{Arxiv}}{%
    %     % \addplot [color=t_gray, line width=1pt] table [x=p, y=maternGGPMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    %     % \addplot [color=t_gray, line width=1pt] table [x=p, y=appnpMean, col sep=comma] {tables/acc_rej_#2_#1.csv};%
    % }{}
    \addplot [color=t_green, line width=1pt] table [x=p, y=gpnMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    \addplot [color=t_blue, line width=1pt] table [x=p, y=gpnRWMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    \addplot [color=t_red, line width=1pt] table [x=p, y=gpnLOPMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    % \addplot [color=t_green, line width=1pt] table [x=p, y=cuqPPRMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    % \addplot [color=t_blue, line width=1pt] table [x=p, y=cuqGCNMean, col sep=comma] {tables/acc_rej_#2_#1.csv};
    % \addplot [color=t_red, line width=1pt] table [x=p, y=cuqGATMean, col sep=comma] {tables/acc_rej_#2_#1.csv};

    % Plot standard errors
    % \ifthenelse{\(\equal{#2}{sample_aleatoric}\or\equal{#2}{sample_aleatoric_entropy}\or\equal{#2}{sample_epistemic}\)\and\not\equal{#1}{Arxiv}}{%
    %     % \addSE{tables/acc_rej_#2_#1.csv}{maternGGP}{t_gray}
    %     % \addSE{tables/acc_rej_#2_#1.csv}{appnp}{t_gray}%
    % }{}
    \addSE{tables/acc_rej_#2_#1.csv}{gpn}{t_green}
    \addSE{tables/acc_rej_#2_#1.csv}{gpnRW}{t_blue}
    \addSE{tables/acc_rej_#2_#1.csv}{gpnLOP}{t_red}
    % \addSE{tables/acc_rej_#2_#1.csv}{cuqPPR}{t_green}
    % \addSE{tables/acc_rej_#2_#1.csv}{cuqGCN}{t_blue}
    % \addSE{tables/acc_rej_#2_#1.csv}{cuqGAT}{t_red}

    \ifthenelse{\equal{#2}{sample_epistemic_entropy}}{%
        \legend{GPN (sym), GPN (rw), LOP-GPN}%
    }{}
    \end{axis}
\end{tikzpicture}}
\newcommand*{\accrejPlots}[2]{
    \hfill\accrejPlot{#1}{sample_total_entropy}{#2}{$\mathrm{TU}$}%
    \hfill\accrejPlot{#1}{sample_aleatoric_entropy}{#2}{$\mathrm{AU}$}%
    \hfill\accrejPlot{#1}{sample_epistemic_entropy_diff}{#2}{$\mathrm{EU}$}%
    \hfill\accrejPlot{#1}{sample_epistemic}{#2}{$\mathrm{EU}_{\mathrm{PC}}$}%
    \hfill\accrejPlot{#1}{sample_epistemic_entropy}{#2}{$\mathrm{EU}_{\mathrm{SO}}$}%
}
\begin{figure*}[ht!]
    \centering
    \accrejPlots{CoraML}{0.8}\\[-10pt]%
    \accrejPlots{CiteSeer}{0.8}\\[-10pt]%
    \accrejPlots{Photos}{0.85}\\[-10pt]%
    \accrejPlots{Computers}{0.75}\\[-10pt]%
    \accrejPlots{PubMed}{0.8}\\[-10pt]%
    \accrejPlots{Arxiv}{0.45}%
    \caption{%
        \Acl*{arc} for different uncertainty measures.
        The x-axis represents the fraction of rejected test instances; the y-axis represents the test accuracy for a given rejection rate.
        The (small) shaded areas behind the curves represent the estimate's standard error.
    }
    \label{fig:accrej}
\end{figure*}
\Acfp{arc} are produced by discarding the predictions for instances where the predictor exhibits the highest uncertainty, and then calculating the accuracy for the remaining subset.
For an uncertainty measure which captures predictive uncertainty well, the accuracy should monotonically increase with the rejection rate.
We use the following uncertainty measures to produce \acp{arc}:
% Least-Confidence Uncertainty (\cref{eq:lconf}),
Entropy-based \ac{tu}, \ac{au} and \ac{eu} (\cref{eq:tu,eq:au,eq:eu}),
second-order \acl{eu} (\cref{eq:eu-so-entropy}),
and pseudo-count-based \acl{eu}.
\Cref{fig:accrej} show the \acp{arc} for the different uncertainty measures for the six evaluated datasets.
% We will now briefly summarize the results.
First, note that \ac{lopgpn} consistently outperforms or at least matches \ac{gpn} for almost all rejection rates and all datasets.
This indicates that dropping the assumption of the \emph{irreducibility of conflicts} made by \ac{gpn} is beneficial in practice.
The only exception to this are the curves for the mutual information-based \ac{eu} on the CiteSeer, Amazon Photos and Amazon Computers datasets; here, the accuracies of \ac{lopgpn} drop-off at high rejection rates.
As previously illustrated in \cref{fig:entropy-uncertainty}, the mutual information-based \ac{eu} can be smaller for the uninformed uniform second-order distribution than for a (more-informed) bimodal Dirichlet mixture distribution.
Analogous to that example, we hypothesize that some of the uninformed mixture distributions produced by \ac{lopgpn} are incorrectly assigned low \ac{eu} values, leading to the observed drop-off in accuracy for the nodes with the lowest \ac{eu} estimates.
The (arguably) more reasonable pseudo-count- and differential entropy-based \ac{eu} measures do not exhibit this behavior.
There, the accuracies go up for increasing rejection rates; this is a sign that these measures capture predictive uncertainty in a meaningful way.
% One notable exception is the entropy-based \ac{eu}; here, only \ac{lopgpn} shows the desired behavior while \ac{gpn} fails to capture epistemic uncertainty via mutual information.


\subsection{Out-of-Distribution Detection}\label{sec:eval:ood}

% \begin{table*}[ht!]
%     \caption{OOD Detection Performance and relative change in uncertainty of OOD vs ID vertices (\acs*{tu} / \acs*{au} / \acs*{eu})}
%     \label{tbl:ood}
%     \centering
%     {\scriptsize%
%     \csvreader[
%         column count=49,
%         tabular={c r | ll | ll},
%         separator=comma,
%         table head={%
%             & \multicolumn{1}{c}{\textbf{Model}} &%
%             % \multicolumn{1}{c}{\textbf{ACC}} & %
%             % \multicolumn{1}{c}{\textbf{ID-ACC}} & %
%             \multicolumn{1}{c}{\textbf{OOD-AUC-ROC (\%)}} & \multicolumn{1}{c}{\textbf{Uncertainty \%-Change}} &%
%             %\multicolumn{1}{c}{\textbf{ID-ACC}} & \multicolumn{1}{c}{\textbf{OOD-AUC-ROC}} &%
%             % \multicolumn{1}{c}{\textbf{ID-ACC}} & %
%             \multicolumn{1}{c}{\textbf{OOD-AUC-ROC (\%)}}  & \multicolumn{1}{c}{\textbf{Uncertainty \%-Change}} \\%
%             %
%             & \multicolumn{1}{c}{} &%
%             \multicolumn{2}{c}{Leave-out Classes} &%
%             %\multicolumn{2}{c}{$\mathbf{x}^{(v)} \sim \mathrm{Ber}(0.5)$} &%
%             \multicolumn{2}{c}{$\mathbf{x}^{(v)} \sim \mathcal{N}(0,1)$}%
%             \\\toprule%
%         },
%         before reading={\setlength{\tabcolsep}{3pt}},
%         table foot=\bottomrule,
%         head to column names,
%         late after line={\ifthenelse{\equal{\model}{APPNP}\and\not\equal{\id}{0}}{\\\midrule}{\\}},
%         % filter={\not\equal{\model}{APPNP}}
%         filter={\not\equal{\id}{31}\and\not\equal{\dataset}{Amazon\\Computers}\and\not\equal{\dataset}{CiteSeer}}
%     ]{tables/id_ood.csv}{}{%
%         \ifthenelse{\equal{\model}{APPNP}}{%
%             \ifthenelse{\equal{\id}{30}}{%
%                 \multirow{5}{*}[0em]{\shortstack{\dataset}}%
%             }{\multirow{6}{*}[0em]{\shortstack{\dataset}}}%
%         }{} &%
%         \textbf{\model} &%
%         % \evalres{\acc}{\accSE} &%
%         % \evalres{\oodLocIdAcc}{\oodLocIdAccSE} &%
%         \rawres{\oodLocOodTotalEntropy}{\oodLocOodTotalEntropySE} / %
%         \rawres{\oodLocOodAleatoricEntropy}{\oodLocOodAleatoricEntropySE} / %
%         \rawres{\oodLocOodEpistemicEntropyDiff}{\oodLocOodEpistemicEntropyDiffSE} %
%         % \evalres{\oodLocOodEpistemicEntropy}{\oodLocOodEpistemicEntropySE}
%         &%
%         %
%         \rawres{\oodLocTotalEntropyChange}{} / %
%         \rawres{\oodLocAleatoricEntropyChange}{} / %
%         \rawres{\oodLocEpistemicEntropyDiffChange}{} %
%         % \evalres{\oodLocEpistemicEntropyChange}{}
%         &%
%         %
%         % \evalres{\oodNormalIdAcc}{\oodNormalIdAccSE} &%
%         \rawres{\oodNormalOodTotalEntropy}{\oodNormalOodTotalEntropySE} / %
%         \rawres{\oodNormalOodAleatoricEntropy}{\oodNormalOodAleatoricEntropySE} / %
%         \rawres{\oodNormalOodEpistemicEntropyDiff}{\oodNormalOodEpistemicEntropyDiffSE} %
%         % \evalres{\oodNormalOodEpistemicEntropy}{\oodNormalOodEpistemicEntropySE}
%         &%
%         %
%         \rawres{\oodNormalTotalEntropyChange}{} / %
%         \rawres{\oodNormalAleatoricEntropyChange}{} / %
%         \rawres{\oodNormalEpistemicEntropyDiffChange}{} %
%         % \evalres{\oodNormalEpistemicEntropyChange}{}%
%     }}
% \end{table*}
\begin{table*}[ht!]
    \caption{\Acs*{ood} detection performance of \acs*{ood} vs \acs*{id} vertices and \acs*{id} accuracies for three \acs*{ood} scenarios.}
    \label{tbl:ood}
    \centering
    {\tiny%
    \csvreader[
        column count=99,
        tabular={c r | r rrrrr | r rrrrr | r rrrrr},
        separator=comma,
        table head={%
            %
            & \multicolumn{1}{c}{} &%
            % \multicolumn{1}{c}{} &%
            \multicolumn{6}{c}{Leave-out Classes} &%
            \multicolumn{6}{c}{$\mathbf{x}^{(v)} \sim \mathrm{Ber}(0.5)$} &%
            \multicolumn{6}{c}{$\mathbf{x}^{(v)} \sim \mathcal{N}(0,1)$}%
            \\%
            & \multicolumn{1}{r}{} &%
            % \multicolumn{1}{r}{} & %
            \multicolumn{1}{c}{\textbf{ID}} & %
            \multicolumn{5}{c}{\textbf{OOD-AUC-ROC}} &%
            \multicolumn{1}{c}{\textbf{ID}} & %
            \multicolumn{5}{c}{\textbf{OOD-AUC-ROC}} &%
            \multicolumn{1}{c}{\textbf{ID}} & %
            \multicolumn{5}{c}{\textbf{OOD-AUC-ROC}} \\
            & \multicolumn{1}{r}{} &%
            \multicolumn{1}{c}{\textbf{Acc}} & %
            \multicolumn{1}{c}{$\mathrm{TU}$} & \multicolumn{1}{c}{$\mathrm{AU}$} & \multicolumn{1}{c}{$\mathrm{EU}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{PC}}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{SO}}$} &%
            \multicolumn{1}{c}{\textbf{Acc}} & %
            \multicolumn{1}{c}{$\mathrm{TU}$} & \multicolumn{1}{c}{$\mathrm{AU}$} & \multicolumn{1}{c}{$\mathrm{EU}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{PC}}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{SO}}$} &%
            \multicolumn{1}{c}{\textbf{Acc}} & %
            \multicolumn{1}{c}{$\mathrm{TU}$} & \multicolumn{1}{c}{$\mathrm{AU}$} & \multicolumn{1}{c}{$\mathrm{EU}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{PC}}$} & \multicolumn{1}{c}{$\mathrm{EU}_{\mathrm{SO}}$} %
            \\\toprule%
        },
        before reading={\setlength{\tabcolsep}{3pt}},
        table foot=\bottomrule,
        head to column names,
        late after line={\ifthenelse{\equal{\model}{APPNP}\and\not\equal{\id}{0}}{\\\midrule}{\\}},
        % filter={\not\equal{\model}{APPNP}}
        filter={\not\(\equal{\dataset}{OGBN\\Arxiv}\and\equal{\model}{Matern-GGP}\)}
    ]{tables/id_ood.csv}{}{%
        \ifthenelse{\equal{\model}{APPNP}}{%
            \ifthenelse{\equal{\dataset}{OGBN\\Arxiv}}{%
                \multirow{5}{*}[0em]{\shortstack{\dataset}}%
            }{\multirow{6}{*}[0em]{\shortstack{\dataset}}}%
        }{} &%
        \textbf{\model} &%
        % \rawres{\acc}{\accSE}{\accBest} &%
        \rawres{\oodLocIdAcc}{\oodLocIdAccSE}{\oodLocIdAccBest} &%
        \rawres{\oodLocOodTotalEntropy}{\oodLocOodTotalEntropySE}{\oodLocOodTotalEntropyBest} & %
        \rawres{\oodLocOodAleatoricEntropy}{\oodLocOodAleatoricEntropySE}{\oodLocOodAleatoricEntropyBest} & %
        \rawres{\oodLocOodEpistemicEntropyDiff}{\oodLocOodEpistemicEntropyDiffSE}{\oodLocOodEpistemicEntropyDiffBest} & %
        \rawres{\oodLocOodEpistemic}{\oodLocOodEpistemicSE}{\oodLocOodEpistemicBest} & %
        \rawres{\oodLocOodEpistemicEntropy}{\oodLocOodEpistemicEntropySE}{\oodLocOodEpistemicEntropyBest} %
        &%
        %
        \rawres{\oodBerIdAcc}{\oodBerIdAccSE}{\oodBerIdAccBest} &%
        % \rawres{\oodBerOodAcc}{\oodBerOodAccSE}{\oodBerOodAccBest} &%
        \rawres{\oodBerOodTotalEntropy}{\oodBerOodTotalEntropySE}{\oodBerOodTotalEntropyBest} & %
        \rawres{\oodBerOodAleatoricEntropy}{\oodBerOodAleatoricEntropySE}{\oodBerOodAleatoricEntropyBest} & %
        \rawres{\oodBerOodEpistemicEntropyDiff}{\oodBerOodEpistemicEntropyDiffSE}{\oodBerOodEpistemicEntropyDiffBest} & %
        \rawres{\oodBerOodEpistemic}{\oodBerOodEpistemicSE}{\oodBerOodEpistemicBest} & %
        \rawres{\oodBerOodEpistemicEntropy}{\oodBerOodEpistemicEntropySE}{\oodBerOodEpistemicEntropyBest} %
        &%
        %
        \rawres{\oodNormalIdAcc}{\oodNormalIdAccSE}{\oodNormalIdAccBest} &%
        % \rawres{\oodNormalOodAcc}{\oodNormalOodAccSE}{\oodNormalOodAccBest} &%
        \rawres{\oodNormalOodTotalEntropy}{\oodNormalOodTotalEntropySE}{\oodNormalOodTotalEntropyBest} & %
        \rawres{\oodNormalOodAleatoricEntropy}{\oodNormalOodAleatoricEntropySE}{\oodNormalOodAleatoricEntropyBest} & %
        \rawres{\oodNormalOodEpistemicEntropyDiff}{\oodNormalOodEpistemicEntropyDiffSE}{\oodNormalOodEpistemicEntropyDiffBest} & %
        \rawres{\oodNormalOodEpistemic}{\oodNormalOodEpistemicSE}{\oodNormalOodEpistemicBest} & %
        \rawres{\oodNormalOodEpistemicEntropy}{\oodNormalOodEpistemicEntropySE}{\oodNormalOodEpistemicEntropyBest} %
    }}
\end{table*}
Next, we evaluate the performance of \ac{lopgpn} and the other models on \acf{ood} detection tasks.
Similar to \citet{stadler2021}, we use three different \ac{ood} detection tasks.
First, we evaluate the models' ability to detect nodes belonging to classes that were not present in the training set.
Second, we randomly drop features from some nodes with probability $0.5$ and evaluate the models' ability to detect those nodes as outliers.
Third, we add Gaussian noise to the features of some nodes, which should then be detected as outliers.
For all detection tasks, we use the entropy-based \ac{tu}, \ac{au} and \ac{eu} measures, as well as $\mathrm{EU}_{\mathrm{PC}}$ and $\mathrm{EU}_{\mathrm{SO}}$, as criteria.
We use the \emph{Area Under Receiving Operator Characteristics Curve} (AUC-ROC) to measure the performance of the models on the \ac{ood} nodes; the performance on the \ac{id} training data is measured via the accuracy.
% Additionally, we show the relative change in the three types of uncertainty when comparing \ac{id} and \ac{ood} vertices.

\Cref{tbl:ood} shows the \ac{ood} detection performance and \ac{id} accuracies.
Overall, \ac{lopgpn} performs very well on the feature dropout and Gaussian noise tasks, outperforming the other models most often.
Looking at the \ac{id} accuracies in the Gaussian noise setting, \ac{lopgpn} is uniquely able to achieve high accuracies on all but the OGBN Arxiv dataset.
On the leave-out-classes detection task, \ac{lopgpn} performs slightly worse but overall still similarly to the other models.

To summarize our experimental results, \ac{lopgpn} was able to achieve strong classification accuracies and meaningful uncertainty estimates, as shown in \cref{fig:accrej} and \cref{tbl:ood}.
This supports our hypothesis that the irreducibility of conflicts assumption made by \ac{gpn} is not always justified in real-world node classification tasks.

\section{Conclusion}\label{sec:conclusion}

%We address the problem of uncertainty quantification for graph-structured data, or, more specifically, the problem to quantify the predictive uncertainty in (semi-supervised) node classification. Key questions in this regard concern the distinction  between two different types of uncertainty, aleatoric and epistemic, and how to support uncertainty quantification by leveraging the structural information provided by the graph topology. Challenging assumptions and postulates of state-of-the-art methods,

In this paper, we proposed a new approach to uncertainty quantification in (semi-supervised) node classification, which is able to represent both aleatoric and epistemic uncertainty.
Broadly speaking, while existing methods realize information dispersion on the level of the data (or pseudo-counts) or the level of aleatoric uncertainty (averaging first-order distributions), our approach makes use of the graph structure to combine information directly on the epistemic level.
To this end, we refer to the established principle of linear opinion pooling and represent epistemic uncertainty in terms of mixtures of Dirichlet distributions.
First experiments on a variety of graph-structured datasets are promising and show the effectiveness of our approach, also compared to state-of-the-art methods used as baselines.

In future work, we plan to study the problem of uncertainty propagation on graphs in more depths and to compare different approaches in a more systematic way.
Intuitively, an optimal approach should find a good compromise between combining information on the aleatoric and the epistemic level, respectively.
However, for now, it is not at all clear how such an approach could be realized.


% \begin{contributions} % will be removed in pdf for initial submission
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{literature}

% \newpage

% \onecolumn

% \title{Linear Opinion Pooling for Uncertainty Quantification on Graphs\\(Supplementary Material)}
% \maketitle

% \appendix

% Below, we list additional experimental results, which where left out of the main paper due to space limitations.

% \begin{table*}[ht!]
%     \caption{Complete OOD Detection Performance and relative change in uncertainty of OOD vs ID vertices (\acs*{tu} / \acs*{au} / \acs*{eu})}
%     \label{tbl:ood-complete}
%     \centering
%     {\scriptsize%
%     \csvreader[
%         column count=49,
%         tabular={c r | ll | ll},
%         separator=comma,
%         table head={%
%             & \multicolumn{1}{c}{\textbf{Model}} &%
%             % \multicolumn{1}{c}{\textbf{ACC}} & %
%             % \multicolumn{1}{c}{\textbf{ID-ACC}} & %
%             \multicolumn{1}{c}{\textbf{OOD-AUC-ROC (\%)}} & \multicolumn{1}{c}{\textbf{Uncertainty \%-Change}} &%
%             %\multicolumn{1}{c}{\textbf{ID-ACC}} & \multicolumn{1}{c}{\textbf{OOD-AUC-ROC}} &%
%             % \multicolumn{1}{c}{\textbf{ID-ACC}} & %
%             \multicolumn{1}{c}{\textbf{OOD-AUC-ROC (\%)}}  & \multicolumn{1}{c}{\textbf{Uncertainty \%-Change}} \\%
%             %
%             & \multicolumn{1}{c}{} &%
%             \multicolumn{2}{c}{Leave-out Classes} &%
%             %\multicolumn{2}{c}{$\mathbf{x}^{(v)} \sim \mathrm{Ber}(0.5)$} &%
%             \multicolumn{2}{c}{$\mathbf{x}^{(v)} \sim \mathcal{N}(0,1)$}%
%             \\\toprule%
%         },
%         before reading={\setlength{\tabcolsep}{3pt}},
%         table foot=\bottomrule,
%         head to column names,
%         late after line={\ifthenelse{\equal{\model}{APPNP}\and\not\equal{\id}{0}}{\\\midrule}{\\}},
%         % filter={\not\equal{\model}{APPNP}}
%         filter={\not\equal{\id}{31}}
%     ]{tables/id_ood.csv}{}{%
%         \ifthenelse{\equal{\model}{APPNP}}{%
%             \ifthenelse{\equal{\id}{30}}{%
%                 \multirow{5}{*}[0em]{\shortstack{\dataset}}%
%             }{\multirow{6}{*}[0em]{\shortstack{\dataset}}}%
%         }{} &%
%         \textbf{\model} &%
%         % \evalres{\acc}{\accSE} &%
%         % \evalres{\oodLocIdAcc}{\oodLocIdAccSE} &%
%         \rawres{\oodLocOodTotalEntropy}{\oodLocOodTotalEntropySE} / %
%         \rawres{\oodLocOodAleatoricEntropy}{\oodLocOodAleatoricEntropySE} / %
%         \rawres{\oodLocOodEpistemicEntropyDiff}{\oodLocOodEpistemicEntropyDiffSE} %
%         % \evalres{\oodLocOodEpistemicEntropy}{\oodLocOodEpistemicEntropySE}
%         &%
%         %
%         \rawres{\oodLocTotalEntropyChange}{} / %
%         \rawres{\oodLocAleatoricEntropyChange}{} / %
%         \rawres{\oodLocEpistemicEntropyDiffChange}{} %
%         % \evalres{\oodLocEpistemicEntropyChange}{}
%         &%
%         %
%         % \evalres{\oodNormalIdAcc}{\oodNormalIdAccSE} &%
%         \rawres{\oodNormalOodTotalEntropy}{\oodNormalOodTotalEntropySE} / %
%         \rawres{\oodNormalOodAleatoricEntropy}{\oodNormalOodAleatoricEntropySE} / %
%         \rawres{\oodNormalOodEpistemicEntropyDiff}{\oodNormalOodEpistemicEntropyDiffSE} %
%         % \evalres{\oodNormalOodEpistemicEntropy}{\oodNormalOodEpistemicEntropySE}
%         &%
%         %
%         \rawres{\oodNormalTotalEntropyChange}{} / %
%         \rawres{\oodNormalAleatoricEntropyChange}{} / %
%         \rawres{\oodNormalEpistemicEntropyDiffChange}{} %
%         % \evalres{\oodNormalEpistemicEntropyChange}{}%
%     }}
% \end{table*}
% \section{Additional Accuracy Rejection Curves}

% \section{Additional Accuracy Rejection Curves}

% \newcommand*{\fullAccrejPlots}[1]{
%     \accrejPlot{CoraML}{#1}{0.8}%
%     \hfill\accrejPlot{CiteSeerFull}{#1}{0.8}%
%     \hfill\accrejPlot{AmazonPhotos}{#1}{0.8}\\%
%     \accrejPlot{AmazonComputers}{#1}{0.7}%
%     \hfill\accrejPlot{PubMedFull}{#1}{0.8}%
%     \hfill\accrejPlot{Arxiv}{#1}{0.5}%
% }
% \newcommand*{\accrejPlotsAlt}[2]{
%     \accrejPlot{CoraML}{#1}{0.8}{Acc.\ (#2)}%
%     \hfill\accrejPlot{CiteSeerFull}{#1}{0.8}{}%
%     \hfill\accrejPlot{AmazonComputers}{#1}{0.8}{}%
% }
% \begin{figure*}[ht!]
%     \centering
%     \accrejPlotsAlt{prediction_aleatoric}{$\mathrm{LConf}$}\\%
%     \accrejPlotsAlt{sample_total_entropy}{TU}\\%
%     \accrejPlotsAlt{sample_aleatoric_entropy}{AU}\\%
%     \accrejPlotsAlt{sample_epistemic_entropy_diff}{EU}\\%
%     \accrejPlotsAlt{sample_epistemic_entropy}{$\mathrm{EU}_{\textrm{SO}}$}\\%
%     \accrejPlotsAlt{sample_epistemic}{$\mathrm{EU}_{\textrm{PC}}$}\\%
%     \caption{\Acl*{arc} for different uncertainty measures for CoraML, CiteSeerFull and Amazon Computers}
%     \label{fig:accrej-alt}
% \end{figure*}

\end{document}
