%\documentclass{article}
\documentclass[accepted]{uai2022}

%\pdfpagewidth=8.5in
%\pdfpageheight=11in

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{times}
\renewcommand*\ttdefault{txtt}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{graphicx}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{subcaption,graphicx}
\usepackage{url}
%\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
%\urlstyle{same}
\usepackage{comment}
%\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{mathrsfs}
\usepackage{caption}
\urlstyle{same}
\DeclareCaptionFont{9pt}{\fontsize{9pt}{10pt}\selectfont}
\captionsetup{font={9pt}}

\usepackage{listings}
\lstset{basicstyle=\small\ttfamily,columns=fullflexible}

\renewcommand{\footnotesize}{\fontsize{9pt}{11pt}\selectfont}


\newcommand{\mvec}{\operatorname{vec}\diagfences}

% Francesco start
\newcommand{\blambda}{\boldsymbol{\lambda}}
\newcommand{\bv}{\boldsymbol{v}}
\usepackage{soul} %used for \st{text}
\usepackage{bm}
% Francesco end



\usepackage{booktabs}
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}

\definecolor{myyellow}{rgb}{0.9290 0.6940 0.1250}
\definecolor{Gray}{gray}{0.9}

\newcommand{\commentR}[1]{\ifcomments \textbf{\textcolor{magenta}{RJ: #1}} \fi}

\title{Principle of Relevant Information for Graph Sparsification (Supp. Material)}

\author[1]{\href{mailto:yusj9011@gmail.com?Subject=Your UAI 2022 paper}{Shujian~Yu}{}}
\author[2]{\href{mailto:francesco.alesiani@neclab.eu?Subject=Your UAI 2022 paper}{Francesco Alesiani}{}}
\author[3]{Wenzhe~Yin}
\author[1,5,6]{Robert~Jenssen}
\author[4]{Jose~C.~Principe}
% Add affiliations after the authors
\affil[1]{%
    %Machine Learning Group\\
    UiT - The Arctic University of Norway\\
    Norway
}
\affil[2]{%
    %dd\\
    NEC Laboratories Europe\\
    Germany
}
\affil[3]{%
    %Informatics Institute\\
    University of Amsterdam\\
    Netherlands
  }
\affil[4]{%
    %Department of Electrical and Computer Engineering\\
    University of Florida\\
    USA
  }
\affil[5]{%
    Norwegian Computing Center\\
    Norway
  }
\affil[6]{%
    University of Copenhagen\\
    Denmark
  }

\usepackage{mathtools}
\DeclareMathOperator{\Mat}{Mat}
\DeclareMathOperator{\diag}{diag}
\DeclarePairedDelimiter{\diagfences}{(}{)}
\newtheorem{theorem}{Theorem}
%\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
%\newtheorem{property}[theorem]{Property}
\newtheorem{conjecture}[theorem]{Conjecture}
%\newcommand{\diag}{\operatorname{diag}\diagfences}
\newcommand{\tr}{\operatorname{tr}\diagfences}
\newcommand{\Be}{\operatorname{Bernoulli}\diagfences}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\Tr}{tr}
\DeclareMathOperator{\range}{range}
%\DeclareMathOperator*{\min}{min} 
\DeclareMathOperator*{\argmin}{argmin} % thin space, limits underneath in displays
\DeclareMathOperator{\Sigmoid}{Sigmoid}

\newtheorem{property}{Property}[section]
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}

\newtheorem{innercustomthm}{Theorem}
\newenvironment{customthm}[1]
  {\renewcommand\theinnercustomthm{#1}\innercustomthm}
  {\endinnercustomthm}

%\usepackage[ruled, noend, noline]{algorithm2e}
%\SetKwInOut{Parameter}{parameter}
\usepackage{xcolor}

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
    backgroundcolor=\color{backcolour},   
    commentstyle=\color{codegreen},
    keywordstyle=\color{magenta},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codepurple},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,                 
    numbers=left,                    
    numbersep=5pt,                  
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=2
}

\lstset{style=mystyle}

% Francesco (start)
\usepackage{soul} %used for \st{text}
% https://tex.stackexchange.com/questions/65453/track-changes-in-latex
%\usepackage{changes} % used to track changes
% Francesco (end)

%\setlength {\marginparwidth }{2cm}

\begin{document}

\onecolumn
\maketitle

\graphicspath{{figures/}}

%\newpage
\appendix

\section{Proofs and Additional Information}


\subsection{Additional information on the rigor of Assumption~\ref{assumption}}\label{sec:assumption_rigor}

\begin{assumption}\label{assumption}
Given an undirected graph $G=\{V,E\}$, let $G'=G+\{u,v\}$, where $V(G')=V(G)$ and $E(G')=E(G)\cup\{u,v\}$, we have
$S_{\text{vN}}(L_{G'})\geq S_{\text{vN}}(L_G)$,
i.e., the von Neumann entropy $S_{\text{vN}}(L_G)$ is monotonically non-decreasing in the number of edges $|E|$.
\end{assumption}


{\color{black}
Note that one can find counterexamples to Assumption~\ref{assumption}. The question of whether the factor $\frac{d_{G'}-2}{d_{G'}}$ can be removed from Eq.~(12) in Theorem~\ref{assumption} was investigated in~\citep{dairyko2017note}. The answer is negative: adding an edge may slightly decrease the von Neumann entropy. For example, the complete bipartite graph $K_{2,n-2}$ ($n>5$) satisfies $S_{\text{vN}}(K_{2,n-2})>S_{\text{vN}}(K_{2,n-2}+e)$ after adding an edge $e$.

However, Assumption~\ref{assumption} does hold for most edges. To further corroborate our argument, we performed an additional experiment in which we generated a set of random graphs with $20$ nodes by the Erd\H{o}s-R\'enyi (ER) model, with the average node degree $\bar{d}$ of roughly $1.6$, $2.5$, $3$, $4$ and $5$, respectively. For each graph, we added one edge to the original graph and re-evaluated the von Neumann entropy $S_{\text{vN}}(L)$. We traversed all possible edge additions and calculated the percentage of them for which the change in $S_{\text{vN}}(L)$ is non-negative. As can be seen from the following table, Assumption~\ref{assumption} holds in more than $85\%$ of the cases. We made similar observations for larger graphs with $200$ nodes.

\begin{table}[htb]%\small
    \centering
    \caption{The percentage of possible edge additions that do not decrease the von Neumann entropy, for a random graph with $20$ nodes generated by the Erd\H{o}s-R\'enyi (ER) model.}
    \setlength{\tabcolsep}{1mm}{
      \begin{tabular}{@{}cccccc@{}}
      \toprule % from booktabs package
      \bfseries Degree  & $1.6$ & $2.5$ & $3$ & $4$ & $5$ \\
      \midrule % from booktabs package
      \bfseries Percentage ($\%$)  & $95.65$  & $88.57$ & $88.82$ & $88.13$ & $87.25$ \\
      \bottomrule % from booktabs package
    \end{tabular}
    }
\end{table}

}



\subsection{Proof of Corollary~\ref{corollary}}


\begin{corollary}\label{corollary}
Under Assumption~\ref{assumption}, suppose $G_s=\{V_s,E_s\}$ is a sparse graph obtained from $G=\{V,E\}$ (by removing edges), let $G_s'=G_s+\{u,v\}$, where $\{u,v\}$ is an edge from the original graph $G$, $V(G_s')=V(G_s)$ and $E(G_s')=E(G_s)\cup\{u,v\}$, we have
$D_{\text{QJS}}(L_{G_s'}\|L_G)\leq D_{\text{QJS}}(L_{G_s}\|L_G)$,
i.e., adding an edge is prone to decrease the QJS divergence.
\end{corollary}

Before proving Corollary~\ref{corollary}, we first present Lemma~\ref{lemma}, 
{ \color{black}
Lemma~\ref{lemma2} and Lemma~\ref{lemma3}. The proof of Corollary~\ref{corollary} is based on the results described in Lemma~\ref{lemma3}, which is based on the conclusion of Lemma~\ref{lemma2}. We present Lemma~\ref{lemma} as a more restrictive version of Lemma~\ref{lemma2}. 
}

\begin{lemma}\label{lemma}
Let $\boldsymbol{\lambda}=\{\lambda_i\}$ be the eigenvalues of (trace normalized) Graph Laplacian $\tilde{L}$. Suppose the $i$-th eigenvalue $\lambda_i$ has a minor and negligible increase and the remaining eigenvalues decrease proportionately to their existing values, such that $\sum_i\lambda_i=1$. Then, the total derivative of von Neumann entropy $S(\boldsymbol{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$ with respect to $\lambda_i$ is given by:
\begin{equation}
\frac{dS}{d\lambda_i} = \frac{\partial S}{\partial\lambda_i}+\sum_{j\neq i}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}}  = -\frac{S(\boldsymbol{\lambda})+\log_2{\lambda_i}}{1-\lambda_i}.
\end{equation}
% \begin{equation}
% %\begin{split}
% \frac{dS}{d\lambda_i} = \frac{\partial S}{\partial\lambda_i}+\sum_{i\neq j}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}}  = -\frac{S(\boldsymbol{\lambda})+\log_2{\lambda_i}}{1-\lambda_i}.
% %\end{split}
% \end{equation}
% {\color{blue} this is what I computed, but I am not sure about the following steps
% \begin{align}
% \frac{dS}{d\lambda_i} = \frac{\partial S}{\partial\lambda_i}+\sum_{i\neq j}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}}  =
% 2 g_i - G(\lambda)
% \end{align}
% }
\end{lemma}

\begin{proof}
% {\color{red} to be checked} 
% {\color{blue} We first apply write the total derivative by parts, since the eigenvalues are related by the requirement that they sum to $1$. 
% $S(\lambda) = -\sum_i \lambda_i \ln \lambda_i  = \sum_i s(\lambda_i), G(\lambda) = -\sum_i (\ln \lambda_i+1) = \sum_i g(\lambda_i), s_i = s(\lambda_i) = - \lambda_i \ln \lambda_i, g_i = g(\lambda_i) = - \ln \lambda_i -1  $ then $ \frac{dS}{d\lambda_i} = 2 g_i - G(\lambda)$, since $\lambda_j = 1 - \sum_{k \ne j } \lambda_k$ and $\frac{d\lambda_j}{d\lambda_i} = -1$}
{\color{black}

For simplicity, suppose we increase the element $\lambda_i$ up to the value $\lambda_i+ d\lambda_i$ for some infinitesimally small value $d\lambda_i$. As we increase this element, we decrease all the other elements proportionately to their values so that the constraint holds. Thus, for some infinitesimal value $\delta$ we update $\boldsymbol{\lambda}$ as:
\begin{align*}
    \lambda_1 \mapsto \lambda_1 (1-\delta) \\
    \vdots \\
    \lambda_{i-1} \mapsto \lambda_{i-1} (1-\delta) \\
    \lambda_{i+1} \mapsto \lambda_{i+1} (1-\delta) \\
    \vdots \\
    \lambda_N \mapsto \lambda_N (1-\delta)
\end{align*}
or, in general, $\lambda_{j} \mapsto \lambda_j (1-\delta)$ for $j \ne i$.
The constraint $\sum_i\lambda_i=1$ must still hold after the update, i.e.,
\begin{align*}
    \sum_{j \ne i} \lambda_j(1-\delta) + \lambda_i+ d \lambda_i &= 1 \\
    (1-\lambda_i)(1-\delta) + \lambda_i+ d \lambda_i &= 1 \\
    1 -\lambda_i - (1-\lambda_i)\delta + \lambda_i+ d \lambda_i &= 1 \\
    - (1-\lambda_i)\delta + d \lambda_i &= 0
\end{align*}
thus we have:
\begin{equation}
    \delta = d\lambda_i/(1-\lambda_i).
\end{equation}

Then,
\begin{equation}
    \frac{d\lambda_j}{d\lambda_i} = \frac{-\lambda_j \delta}{d\lambda_i} = -\frac{\lambda_j}{1-\lambda_i}, \quad j\neq i.
\end{equation}

Therefore, the total derivative of von Neumann entropy $S(\boldsymbol{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$ with respect to $\lambda_i$ is given by:
\begin{equation}
    \begin{split}
        \frac{dS}{d\lambda_i} & = \frac{\partial S}{\partial\lambda_i}+\sum_{j\neq i}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}} \\
        & = -\left(\frac{1}{\ln 2}+\log_2 \lambda_i\right) + \frac{1}{1-\lambda_i} \sum_{j\neq i} \lambda_j\left(\frac{1}{\ln 2} + \log_2 \lambda_j \right) \\
        & = - \frac{(1-\lambda_i)\log_2 \lambda_i}{1-\lambda_i} + \frac{1}{1-\lambda_i}\sum_{j\neq i} \lambda_j \log_2\lambda_j \\
        & = - \frac{1}{1-\lambda_i} \left\{ -\sum_{j=1}^{N}{\lambda_j\log_2{\lambda_j}} + \log_2\lambda_i \right\} \\
        & = -\frac{1}{1-\lambda_i} \left\{S(\boldsymbol{\lambda}) +\log_2\lambda_i\right\}.
    \end{split}
\end{equation}

% I have :
% \begin{align}
%     \frac{dS}{d\lambda_i} & = \frac1{1-\lambda_i} [S(\lambda) - \ln \lambda_i ]
% \end{align}

}
\end{proof}

{\color{black}
In addition to the previous lemma, we show a more general form that does not assume that all other eigenvalues decrease proportionately to their values.

\begin{lemma}\label{lemma2}
Let $\bm{\lambda}=\{\lambda_i\}$ be the eigenvalues of (trace normalized) Graph Laplacian $\tilde{L}$. 
% Suppose the $i$-th eigenvalue $\lambda_i$ has a minor and negligible increase and the remaining eigenvalues decrease proportionately to their existing values, such that $\sum_i\lambda_i=1$. 
Then, the directional total derivative of von Neumann entropy $S(\bm{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$ with respect to $\lambda_i$ along a change in the eigenvalues defined by $\bm{v}$ such that $\blambda' = \blambda + \delta \bv$, for $\delta \to 0$ is given by:
\begin{align}
\frac{dS}{d\lambda_i} |_{\bv} &= \frac{\partial S}{\partial\lambda_i}+\sum_{j\neq i}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}} |_{\bv} \nonumber \\
&= -\frac1{v_i} \sum_{j} v_j \log_2 \lambda_j = -\frac1{v_i} \E_{\bv} \log_2 \blambda
\end{align}
where $\bv$ is the directional variation of the eigenvalues, $\blambda' = \blambda + \delta \bv$, which satisfies $\sum_j v_j = 0$ and $0 \le v_j + \lambda_j \le 1$. We denote by $\E_{\bv}$ the sum weighted by the elements of $\bv$ (i.e., $\E_{\bv} \log_2 \blambda = \sum_{j} v_j \log_2 \lambda_j $).
In compact form (and with the abuse of the
expectation operator)
\begin{align}
\frac{dS}{d\blambda} |_{\bv} = -\bv^{-1} \E_{\bv} \log_2 \blambda
\end{align}
where the inverse of the vector $\bv$ is element-wise.
\end{lemma}


\begin{proof}
We first observe that 
\begin{equation*}
    \frac{d\lambda_j}{d\lambda_i}|_{\bv} = \frac{\delta v_j}{\delta v_i} = \frac{v_j}{v_i}, \quad j\neq i.
\end{equation*}

by the definition of the variation. Therefore, the total derivative of von Neumann entropy $S(\boldsymbol{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$ with respect to $\lambda_i$, along the direction $\bm{v}$, is given by:
\begin{equation*}
    \begin{split}
        \frac{dS}{d\lambda_i} |_{\bv} &= \frac{\partial S}{\partial\lambda_i}+\sum_{j\neq i}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}} |_{\bv} \\
        % \frac{dS}{d\lambda_i} & = \frac{\partial S}{\partial\lambda_i}+\sum_{i\neq j}{\frac{\partial S}{\partial\lambda_j}\frac{d\lambda_j}{d\lambda_i}} \\
        & = -(\frac{1}{\ln 2}+\log_2 \lambda_i) - \sum_{j \neq i} \frac{v_j}{v_i} \left(\frac{1}{\ln 2} + \log_2 \lambda_j \right) \\
        & = -\frac{1}{\ln 2}-\log_2 \lambda_i - \frac{1}{v_i} \sum_{j \neq i} v_j \left(\frac{1}{\ln 2} + \log_2 \lambda_j \right) \\
        & = -\frac{1}{\ln 2}-\log_2 \lambda_i - \frac{1}{v_i} \sum_{j \neq i} v_j \frac{1}{\ln 2} - \frac{1}{v_i} \sum_{j \neq i} v_j \log_2 \lambda_j  \\
        & = -\frac{1}{\ln 2}-\log_2 \lambda_i - \frac{1}{v_i} \frac{-v_i}{\ln 2} - \frac{1}{v_i} \sum_{j \neq i} v_j \log_2 \lambda_j  \\
        & = -\frac{1}{\ln 2}-\log_2 \lambda_i + \frac{1}{\ln 2} - \frac{1}{v_i}  \sum_{j \neq i} v_j \log_2 \lambda_j  \\
        & = -\log_2 \lambda_i - \frac{1}{v_i}  \sum_{j \neq i} v_j \log_2 \lambda_j  \\
        & = -\frac{v_i}{v_i} \log_2 \lambda_i - \frac{1}{v_i}  \sum_{j \neq i} v_j \log_2 \lambda_j  \\
        & = -\frac{1}{v_i}  \sum_{j} v_j \log_2 \lambda_j 
    \end{split}
\end{equation*}
\end{proof}


\begin{lemma}\label{lemma3}
Let $\boldsymbol{\lambda}=\{\lambda_i\}$ be the eigenvalues of (trace normalized) Graph Laplacian $\tilde{L}$ whose von Neumann entropy is  $S(\boldsymbol{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$. Let  $\blambda' = \blambda + \bv$, such that $\sum_i v_i = 0 $ and  $0 \le v_j + \lambda_j \le 1$ and $S(\blambda') \ge S(\blambda)$, then 
\begin{align}
\E_{\bv} \log_2 \blambda' \ge \E_{\bv} \log_2 \blambda 
\end{align}
and
\begin{align}
\bm{v} \frac{dS}{d\blambda} (\blambda') |_{\bv} \le \bm{v} \frac{dS}{d\blambda} (\blambda) |_{\bv}
\end{align}
\end{lemma}


\begin{proof}
From the definition we consider $\bv = \blambda' - \blambda$, thus, omitting the vector indices and using vector notation, where operations are performed element-wise and sum is over the elements of the resulting vector:
\begin{align*}
    \E_{\bv} \log_2 \blambda & = \sum_j (\blambda_j' - \blambda_j) \log_2 \blambda_j \\ 
    &= \sum_j \blambda_j' \log_2 \blambda_j - \sum_j \blambda_j \log_2 \blambda_j \\
    &= \sum_j \blambda_j' \log_2 \blambda_j + S(\blambda)
\end{align*}
similarly
\begin{align*}
    \E_{\bv} \log_2 \blambda' & = \sum_j (\blambda_j' - \blambda_j) \log_2 \blambda_j' \\ 
    &= \sum_j \blambda_j' \log_2 \blambda_j' - \sum_j \blambda_j \log_2 \blambda_j' \\
    &= -S(\blambda') - \sum_j \blambda_j \log_2 \blambda_j'
\end{align*}
the difference is 
\begin{align*}
    \E_{\bv} \log_2 \blambda' - \E_{\bv} \log_2 \blambda  & = -S(\blambda') - S(\blambda) \\
    & - \sum_j \blambda_j \log_2 \blambda_j' - \sum_j \blambda_j' \log_2 \blambda_j \ge 0
\end{align*}
which is the sum of two non-negative terms:
\begin{align*}
    -S(\blambda) - \sum_j \blambda_j \log_2 \blambda_j' \ge 0 \\
    -S(\blambda') - \sum_j \blambda_j' \log_2 \blambda_j \ge 0 
\end{align*}
The last two inequalities follow from the non-negativity of the KL divergence; indeed, we use the inequality $D(\boldsymbol{p} \,\|\, \boldsymbol{q}) = \sum_i p_i \log_2 \frac{p_i}{q_i}
= \E_{\boldsymbol{p}} \log_2 \boldsymbol{p} - \E_{\boldsymbol{p}} \log_2 \boldsymbol{q}  = -H(\boldsymbol{p}) - \E_{\boldsymbol{p}} \log_2 \boldsymbol{q}\ge 0$.

Since, by Lemma~\ref{lemma2},
\begin{align*}
\frac{dS}{d\blambda} (\blambda) |_{\bv} = -\bv^{-1} \E_{\bv} \log_2 \blambda,
\end{align*}
it follows from the first property ($\E_{\bv} \log_2 \blambda' \ge \E_{\bv} \log_2 \blambda $) that 
\begin{align*}
\bv \frac{dS}{d\blambda} (\blambda') |_{\bv} \le \bv \frac{dS}{d\blambda} (\blambda) |_{\bv}
\end{align*}
where the comparison is element wise, i.e.
\begin{align*}
\bv_i \frac{dS}{d\blambda_i} (\blambda') |_{\bv} \le \bv_i \frac{dS}{d\blambda_i} (\blambda) |_{\bv}
\end{align*}
\end{proof}




}




Now, we present the proof of Corollary~\ref{corollary}.
\begin{proof}
Suppose the addition of an edge causes the $i$-th eigenvalue $\lambda_i$ to undergo a minor change $\Delta\lambda_i$. By first-order approximation, we have: 
\begin{equation}\label{eq:1st_1}
    S_{\text{vN}}\left(\frac{L_G+L_{G_s'}}{2}\right)-S_{\text{vN}}\left(\frac{L_G+L_{G_s}}{2}\right)\approx\frac{1}{2}\frac{dS_{\text{vN}}(L_{\bar{G}})}{d\lambda_i}\Delta\lambda_i,
\end{equation}
and
\begin{equation}\label{eq:1st_2}
    S_{\text{vN}}(L_{G_s'})-S_{\text{vN}}(L_{G_s})\approx\frac{dS_{\text{vN}}(L_{G_s})}{d\lambda_i}\Delta\lambda_i,
\end{equation}
in which $L_{\bar{G}}=\frac{1}{2}\left(L_G+L_{G_s}\right)$ denotes the averaged Laplacian and $\frac{dS}{d\lambda_i}$ is the total derivative of von Neumann entropy $S(\boldsymbol{\lambda})=-\sum_{i=1}^{N}{\lambda_i\log_2{\lambda_i}}$ with respect to $\lambda_i$.

\begin{comment}
% to be removed
{\color{red}
{ TO REMOVE(red text):
By Assumption~\ref{assumption}, we have $S_{\text{vN}}(L_{\bar{G}})\geq S_{\text{vN}}(L_{G_s})$. Thus, according to Lemma~\ref{lemma}, we obtain:
\begin{equation}\label{eq:1st_3}
    \frac{dS_{\text{vN}}(L_{\bar{G}})}{d\lambda_i}\le\frac{dS_{\text{vN}}(L_{G_s})}{d\lambda_i}.
\end{equation}
}
}
\end{comment}

% new
{\color{black}

By Assumption~\ref{assumption}, we have $S_{\text{vN}}(L_{\bar{G}})\geq S_{\text{vN}}(L_{G_s})$. By applying Lemma~\ref{lemma3}, we obtain:
\begin{equation}\label{eq:1st_3_new}
    \frac{dS_{\text{vN}}(L_{\bar{G}})}{d\lambda_i} \Delta \lambda_i \le\frac{dS_{\text{vN}}(L_{G_s})}{d\lambda_i}\Delta \lambda_i
\end{equation}
along the direction $\bv = \blambda(L_{\bar{G}}) - \blambda(L_{G_s})$, where $\blambda(L_{\bar{G}})$ and $ \blambda(L_{G_s})$ are the eigenvalues of the two normalized graph Laplacian matrices. Here we used the variation $ v_i = \Delta \lambda_i$ in Lemma~\ref{lemma3}.

}


Combining Eqs.~(\ref{eq:1st_1}) to (\ref{eq:1st_3_new}), we get:
\begin{equation}
\begin{split}
    S_{\text{vN}}\left(\frac{L_G+L_{G_s'}}{2}\right)-S_{\text{vN}}\left(\frac{L_G+L_{G_s}}{2}\right) \\
    \le\frac{1}{2} \left(S_{\text{vN}}(L_{G_s'})-S_{\text{vN}}(L_{G_s})\right).
\end{split}
\end{equation}

Thus,
\begin{equation}
\begin{split}
    S_{\text{vN}}\left(\frac{L_G+L_{G_s'}}{2}\right)-\frac{1}{2}\left(S_{\text{vN}}(L_{G_s'})+S_{\text{vN}}(L_G)\right)\\
    \le S_{\text{vN}}\left(\frac{L_G+L_{G_s}}{2}\right)-\frac{1}{2}\left(S_{\text{vN}}(L_{G_s})+S_{\text{vN}}(L_G)\right),
\end{split}
\end{equation}
which completes the proof.

\end{proof}

%\subsection{Additional information on Theorem-3}

% 

% 

\subsection{Additional Information and Proof of Theorem~\ref{th:grad}}



\begin{customthm}{4}\label{th:grad}
The gradient of Eq.~(11) in the main manuscript with respect to edge selection vector $\mathbf{w}$ is:
% \begin{eqnarray}
% \nabla_w D_\beta(\rho, \sigma_w) = -\diag{\left( M^T \left[ \left(1-\frac{\beta}{2}\right)\ln \sigma_w +\frac{\beta}{2}\ln \bar{\sigma}_w\right]M \right)}
% \end{eqnarray}
% \D_\beta(\rho, \sigma_w) 
% \begin{align}
% \nabla_w 
% \mathcal{J}_{\text{Graph-PRI}} = -\diag{\left( M^T \left[ \left(1-\beta\right)\ln \sigma_w +\beta\ln \bar{\sigma}_w\right]M \right)}
% \end{align}
\begin{align}
\nabla_\mathbf{w} 
\mathcal{J}_{\text{Graph-PRI}} = U g,
\end{align}
where $\tilde{\mathbf{w}}$ is the normalized $\mathbf{w}$ ($\tilde{\mathbf{w}} = \mathbf{w} / \sum_{i=1}^M w_i$), $\tilde{\mathbf{1}}_{M}  = \frac{1}{M} \mathbf{1}_{M} $ is the normalized version of the all-ones vector, $\bar{\sigma}_\mathbf{w} = \frac{1}{2} \left( \tilde{\sigma}_\mathbf{w} +\tilde{\rho} \right) = \frac{1}{2} B  \diag{\left(\tilde{\mathbf{w}} + \tilde{\mathbf{1}}_{M} \right)} B^T$, $g = -\diag{\left( B^T \left[ \left(1-\beta\right)\ln \tilde{\sigma}_\mathbf{w} +\beta\ln \bar{{\sigma}}_\mathbf{w}\right]B \right)}$, and $U = \{ u_{ij} \} \in \mathbb{R}^{M\times M}$ with $u_{ij} = -\frac{\tilde{w}_j}{1-\tilde{w}_i}$ for all $i \ne j$ and $u_{ii} = 1$.
\end{customthm}


{\color{black}
Theorem~\ref{th:grad} shows the closed-form gradient of the argument of Eq.~(11) in the main manuscript. This gradient can be used to reduce the computational or memory requirements of computing the gradient, as compared to the use of automatic differentiation. It can also help in understanding the contribution of the gradient and in designing approximations of the gradient. In Theorem~\ref{th:grad}, $\mathbf{w}$ is the edge selection vector, while $\tilde{\mathbf{w}}$ is its normalized version, i.e., $\tilde{\mathbf{w}} = \mathbf{w} / \sum_{i=1}^M w_i$. Similarly,  $\tilde{\mathbf{1}}_{M}  = \frac{1}{M} \mathbf{1}_{M} $ is the normalized version of the all-ones vector. In Theorem~\ref{th:grad}, $g$ is the gradient of the von Neumann entropy with respect to the normalized Laplacian matrix, while $U$ is a matrix that normalizes the gradient with respect to the edge selection vector values. 

The Gumbel-softmax distribution can be used on both $H(G)$ and $S_{\text{vN}}(L_G)$ and does not require the results of Theorem~\ref{th:grad}.
}


\begin{proof} [Proof of Theorem \ref{th:grad}]
Theorem~\ref{th:grad} follows by substituting the definition in Eq.~(10) of the main manuscript and using the result of Theorem~\ref{th:grad_entropy}.
% $ \frac{d J}{d w_i} = \frac{\partial J}{\partial w_i}+\sum_{i\neq j}{\frac{\partial J}{\partial w_j}\frac{d w_j}{d w_i}} $, since $w_j = 1 - \sum_{k \ne i} w_k$, then $\frac{d w_j}{d w_i}=-1$.
The total derivative of the cost function w.r.t.\ the normalized selection vector $\tilde{\mathbf{w}}$ is given by $ \frac{d J}{d w_i} = \frac{\partial J}{\partial w_i}+\sum_{j\neq i}{\frac{\partial J}{\partial w_j}\frac{d w_j}{d w_i}} $. With the normalized selection vector, we have that $\sum_{k} w_k = 1$ before and after the change.
% , then $\frac{d w_j}{d w_i}=-\frac1{M-1}$. Indeed, if $w_i \to w_i + \delta$ and $w_j \to w_j + \alpha, j \ne i$, then $\alpha = - \frac{\delta}{M-1}$ and $\frac{d w_j}{d w_i} = \frac{\alpha}{\delta} = -\frac1{M-1} $. This can lead to negative values if $w_j<\frac1{M-1}$. 
% If on the other side, 
If we consider, 
{\color{black}
as in Lemma~\ref{lemma},
}
$w_i \to w_i + \delta$ and $w_j \to w_j (1 - \gamma)$ for $j \ne i$, then $\gamma = \frac{\delta}{1-w_i}$ and $\frac{d w_j}{d w_i} = - \frac{\gamma w_j}{\delta} = -\frac{w_j}{1-w_i} $.
\end{proof}

\begin{customthm}{5}\label{th:grad_entropy}
The gradient of the von Neumann entropy w.r.t. the edge selection vector $\mathbf{w}$ is
% \begin{align} \label{eq:grad_entropy}
% \nabla_\mathbf{w} S(\sigma_\mathbf{w}) = -\diag \left(M^T \log (M \diag(\mathbf{w})M^T) M\right) -\mathbf{1}_{M},
% \end{align}
% \begin{align} \label{eq:grad_entropy}
% \nabla_\mathbf{w} S(\sigma_\mathbf{w}) = -\diag \left(B^T \log (B \diag(\mathbf{w})B^T) B\right) -\mathbf{1}_{M},
% \end{align}
\begin{align} \label{eq:grad_entropy}
\nabla_\mathbf{w} S(\sigma_\mathbf{w}) = -\diag \left(B^T \log (B \diag(\mathbf{w})B^T) B\right),
\end{align}
where 
$S(\sigma) = -\tr{\sigma \log \sigma - \sigma} = -\sum_i (\lambda_i \log \lambda_i - \lambda_i)$
and $\sigma_\mathbf{w} = B \diag(\mathbf{w})B^T$.
% $S(\tilde{\sigma}) = -\tr{\tilde{\sigma} \log_2 \tilde{\sigma}}$ 
% $S(\sigma) = -\tr{\sigma \log \sigma}$ 
% and $\sigma_w = M \diag(\mathbf{w})M^T$.
% \begin{eqnarray} \label{eq:grad_entropy}
% \nabla_w S(M \diag(\mathbf{w})M^T) = -\diag \left(M^T \ln (M \diag(\mathbf{w})M^T) M\right),
% \end{eqnarray}
% where $S(\sigma) = \tr{\sigma \log \sigma -\sigma}$
\end{customthm}

\begin{proof}[Proof of Theorem \ref{th:grad_entropy}]
Theorem \ref{th:grad_entropy} follows from
% $\nabla_\sigma S(\sigma) = - \log{\sigma} - I$
$\nabla_\sigma S(\sigma) = - \log{\sigma}$
% (\textcolor{red}{I feel it should be just $-\log{\sigma}$ ?})
and the use of gradient of the trace of a function of a matrix. Here we use the un-normalized Laplacian matrix for simplicity.
\end{proof}




% \begin{theorem}[Water-Filling] \label{th:waterfilling}
% The solution of Von Neumann entropy maximization problem
% \begin{eqnarray} \label{eq:entropy_sum}
% \max_\sigma S\left(\frac{\sigma + \rho}{2}\right)
% \end{eqnarray}
% with $\text{tr}{\sigma} = 1, \text{tr}{\rho} = 1$, is given by
% \begin{eqnarray}
% % \lambda^\sigma_i+\lambda^\rho_i &=& 1 \\
% \lambda^\sigma_i = [K-\lambda^\rho_i]_+
% \end{eqnarray}
% and $\sigma = V \diag{\lambda^\sigma} V^T$, where $\rho = V \diag{\lambda^\rho} V^T$ is the eigenvalue decomposition of $\rho$,  $[x]_+=\max(0,x)$ is the function returning the positive part of $x$ and $K = \sum_i (\lambda_i^\sigma+\lambda_i^\rho) = 2/m$.
% \end{theorem}

% \begin{proof}[Thrm.\ref{th:waterfilling}]
% To write the solution of Eq.\ref{eq:entropy_sum} we derive the KKT conditions. Since we assume the two graphs laplacian share the same eigenvector we can write the alternative minimiation problem
% \begin{align*}
% \min_{\lambda_i^\sigma} & - \sum_i \frac{(\lambda_i^\sigma+\lambda_i^\rho)}{2} \ln \frac{(\lambda_i^\sigma+\lambda_i^\rho)}{2} + \frac{(\lambda_i^\sigma+\lambda_i^\rho)}{2} + \\
% & \lambda (\sum_i \lambda_i^\sigma -1) -\sum_i \nu_i \lambda_i^\sigma +\sum_i \eta_i (\lambda_i^\sigma-1) 
% \end{align*}
% whose KKT condition is  
% $\ln (\lambda_i^\sigma+\lambda_i^\rho) + \lambda - \nu_i + \eta_i = 0$, with $\nu_i = \eta_i = 0$ if $0 < \lambda_i^\sigma < 1$. This leads to $\lambda_i^\sigma+\lambda_i^\rho = K K_i$, with $K_i=1$ if $0<\lambda_i^\sigma<1$. Thus $\lambda_i^\sigma = [K - \lambda_i^\rho]_+$. This is true if $\lambda_i^\rho<K$, whereas if $\lambda_i^\rho>K$ then $\lambda_i^\sigma + \lambda_i^\rho = K K_i$, with proper $K_i$. We can compute $K$ by $m K = \sum_i \lambda_i^\sigma+\lambda_i^\rho = 2$, thus $K = 2/m$. 

%\end{proof}

% First we know that $\nabla_\sigma S(\sigma+\rho) = \ln (\sigma + \rho)$. We then ask the gradient to be zero $\ln (\sigma + \rho)=0$. If we build $\sigma = V \diag{\lambda^\sigma} V^T$, where $\rho = V \diag{\lambda^\rho} V^T$ is the eigenvalue decomposition of $\rho$, and since $S(\sigma+\rho) = \sum_i[(\lambda^\sigma_i+\lambda^\rho_i)\ln (\lambda^\sigma_i+\lambda^\rho_i) - (\lambda^\sigma_i+\lambda^\rho_i)]$, the condition of zero-gradient can be written as $0=\nabla_\sigma S(\sigma+\rho) = [\ln (\lambda^\sigma_i+\lambda^\rho_i)] = 0$ or equivalently $\lambda^\sigma_i+\lambda^\rho_i = 1$. Since $\lambda^\sigma_i \ge 0$ the result follows. We also note that if $\lambda^\rho_i=0$ then $\lambda^\sigma_i=1$, this means that the new graph may contain edges not original present in $\rho$.



% \begin{theorem}[Water-Filling] \label{th:waterfilling}
% The solution of Von Neumann entropy minimization problem
% \begin{eqnarray} \label{eq:entropy_sum}
% \min_\sigma S(\sigma + \rho)
% \end{eqnarray}
% is given by
% \begin{eqnarray}
% % \lambda^\sigma_i+\lambda^\rho_i &=& 1 \\
% \lambda^\sigma_i = [1-\lambda^\rho_i]_+
% \end{eqnarray}
% and $\sigma = V \diag{\lambda^\sigma} V^T$, where $\rho = V \diag{\lambda^\rho} V^T$ is the eigenvalue decomposition of $\rho$ and $[x]_+=\max(0,x)$ is the function returning the positive part of $x$.
% \end{theorem}

% \begin{proof}[Thrm.\ref{th:waterfilling}]
% The solution of Eq.\ref{eq:entropy_sum} has zero-gradient. First we know that $\nabla_\sigma S(\sigma+\rho) = \ln (\sigma + \rho)$. We then ask the gradient to be zero $\ln (\sigma + \rho)=0$. If we build $\sigma = V \diag{\lambda^\sigma} V^T$, where $\rho = V \diag{\lambda^\rho} V^T$ is the eigenvalue decomposition of $\rho$, and since $S(\sigma+\rho) = \sum_i[(\lambda^\sigma_i+\lambda^\rho_i)\ln (\lambda^\sigma_i+\lambda^\rho_i) - (\lambda^\sigma_i+\lambda^\rho_i)]$, the condition of zero-gradient can be written as $0=\nabla_\sigma S(\sigma+\rho) = [\ln (\lambda^\sigma_i+\lambda^\rho_i)] = 0$ or equivalently $\lambda^\sigma_i+\lambda^\rho_i = 1$. Since $\lambda^\sigma_i \ge 0$ the result follows. We also note that if $\lambda^\rho_i=0$ then $\lambda^\sigma_i=1$, this means that the new graph may contain edges not original present in $\rho$.
% \end{proof}

%{\color{blue} We note that Thrm.\ref{th:waterfilling} does not assume that the graph $\sigma$ is limited by the existing graph (e.g. using the same incident matrix $E$). We considering the case where we can only remove edges in the following section. }


\section{Principle of Relevant Information (PRI) for scalar random variables}\label{sec:appendix_PRI}

In information theory, a natural extension of the well-known Shannon entropy is R{\'e}nyi's $\alpha$-entropy~\citep{renyi1961measures}. For a random variable $\mathbf{X}$ with PDF $f(x)$ in a finite set $\mathcal{X}$, the $\alpha$-entropy of $\mathbf{X}$ is defined as:

\begin{equation}
H_\alpha(f)=\frac{1}{1-\alpha}\log \int_\mathcal{X} f^\alpha(x)dx.
\label{1.1}
\end{equation}

On the other hand, motivated by the famed Cauchy-Schwarz (CS) inequality:
\begin{equation}
\Big\lvert \int f(x)g(x)dx \Big\rvert^2 \leq \int \lvert f(x)\rvert^2 dx \int \lvert g(x)\rvert^2 dx,
\end{equation}
with equality if and only if $f(x)$ and $g(x)$ are linearly dependent (e.g., $f(x)$ is just a scaled version of $g(x)$), a measure of the ``distance'' between the PDFs can be defined, which was named the CS divergence~\citep{jenssen2006cauchy}, with:
\begin{equation} \label{1.2}
\begin{split}
D_{cs} (f\|g) & = -\log\left(\int fg\right)^2 + \log\left(\int f^2\right) + \log\left(\int g^2\right) \\
& = 2H_2(f;g) - H_2(f) - H_2(g),
\end{split}
\end{equation}
where the term $H_2(f;g)=-\log\int f(x)g(x)dx$ is also called the quadratic cross entropy~\citep{principe2010information}.

Combining Eqs.~(\ref{1.1}) and (\ref{1.2}), the PRI under the second-order R{\'e}nyi entropy can be formulated as:
\begin{equation}\label{1.3}
\begin{aligned}
f_{\text{opt}}& =\arg \min_f H_2(f)+\beta(2H_2(f;g)-H_2(f)-H_2(g))\\
& \equiv \arg \min_f (1-\beta)H_2(f) + 2\beta H_2(f;g),
\end{aligned}
\end{equation}
where the second line holds because the extra term $\beta H_2(g)$ is a constant with respect to $f$. %$\bf{X}$.

As can be seen, the objective of na\"ive PRI for i.i.d.\ random variables (i.e., Eq.~(\ref{1.3})) resembles its new counterpart on graph data (i.e., Eq.~(11) in the main manuscript). The main difference is that we replace $H_2(f)$ with $S_{\text{vN}}(\tilde{\sigma})$ and $H_2(f;g)$ with $S_{\text{vN}}\left(\frac{\tilde{\sigma}+\tilde{\rho}}{2}\right)$ to capture structural information.


% \footnote{The number of samples in $\bf{Y}$ is not required to be same as in $\bf{X}$. However, our application uses the same number for simplicity.}

We estimate $H_2(f)$ and $H_2(f;g)$ with the Parzen-window density estimator and optimize Eq.~(\ref{1.3}) by gradient descent.
Fig.~\ref{fig:org_pri} shows the structures learned from the original Intersect data set for different values of $\beta$.

Interestingly, when $\beta=0$, we obtain a single point, very similar to what happens for Graph-PRI, which learns a nearly star-shaped graph whose edges concentrate on one node. Similarly, when $\beta\to\infty$, both na\"ive PRI and Graph-PRI return the original input as the solution.


\begin{figure*}[ht]
	\centering
	    \begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/data.pdf}
        \caption{original data}
	\end{subfigure}%
	\begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/zeros.pdf}
        \caption{$\beta=0$}
	\end{subfigure}	
	\begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/one_new.pdf}
        \caption{$\beta=1$}
	\end{subfigure}	 \\
	    \begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/three_new.pdf}
        \caption{$\beta=3$}
	\end{subfigure}%
	\begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/six_new.pdf}
        \caption{$\beta=6$}
	\end{subfigure}	
	\begin{subfigure}{.3\textwidth}
	    \centering
        \includegraphics[width=\textwidth]{prifigure/hun_new.pdf}
        \caption{$\beta=100$}
	\end{subfigure}		
	\caption{Illustration of the structures revealed by the na\"ive PRI for (a) Intersect data set. As the values of $\beta$ increase the solution passes through (b) a single point, (c) modes, (d) and (e) principal curves at different dimensions, and in the extreme case of (f) $\beta \rightarrow \infty$ we get back the data themselves as the solution.}
	\label{fig:org_pri}
\end{figure*}



\section{Details of the datasets used in Section~4.2 and Section~4.3} \label{sec:appendix_data}

\subsection{Multi-task learning}
\noindent
\textbf{Synthetic data.}
This dataset consists of $20$ regression tasks with $100$ samples each. Each task is a $30$-dimensional linear regression problem in which the last $10$
variables are independent of the output variable $y$. The $20$ tasks are
related in a group-wise manner: the first $10$ tasks form a group and the
remaining $10$ tasks belong to another group. Tasks' coefficients in the same group are completely related to each other, while totally unrelated to
tasks in another group.

Tasks' data are generated as follows: weight vectors corresponding
to tasks $1$ to $10$ are $\mathbf{w}_k= \mathbf{w}_a \odot \mathbf{b}_k + \xi $, where $\odot$ is the element-wise Hadamard product; and tasks $11$ to $20$ are $\mathbf{w}_k= \mathbf{w}_b \odot \mathbf{b}_k + \xi $, where $\xi \sim \mathcal{N}(\mathbf{0},0.2\mathbf{I}_{20})$. Vectors $\mathbf{w}_a$ and $\mathbf{w}_b$ are generated from $\mathcal{N}(\mathbf{0},\mathbf{I}_{20})$, while $\mathbf{b}_k\sim \mathcal{U}(0, 1)$ are uniformly distributed $20$-dimensional random vectors.

Input and output variables for the $t$-th ($t = 1,\cdots,20$) task, $X_t$ and $y_t$, are generated as $X'_t \sim \mathcal{N}(\mathbf{0},\mathbf{I}_{20})$ and
$y_t = X'_t \mathbf{w}_t + \mathcal{N}(0,1)$.
$10$-dimensional unrelated variables  $X''_t \sim \mathcal{N}(\mathbf{0},\mathbf{I}_{10})$ are then concatenated to $X'_t$ to form the final input data $X_t=[X'_t\quad X''_t]$.


\noindent
\textbf{Parkinson's disease dataset.}
This is a benchmark multi-task regression data set, comprising a range of biomedical voice measurements taken from $42$ patients with early-stage Parkinson's disease. For each patient, the goal is to predict the motor Unified Parkinson's Disease Rating Scale (UPDRS) score based on an $18$-dimensional record: age, gender, and $16$ jitter and shimmer voice measurements. For the categorical variable ``gender'', we applied label encoding that converts genders into a numeric representation. We treat UPDRS prediction for each patient as a task, resulting in $42$ tasks and $5{,}875$ observations in total.



\subsection{Brain network classification}

For both datasets, the Automated Anatomical Labeling (AAL) template was used to extract ROI-averaged time series from the $116$ ROIs. Meanwhile, to construct the initial brain network topology (i.e., the adjacency matrix $A$), we only keep an edge if its weight (i.e., the absolute correlation coefficient) is among the top $20\%$ of all absolute correlation coefficients in the network.


As for the node features, we only use the correlation coefficients for simplicity. That is, the node feature for node $i$ can be represented as $\mathbf{x}_i=[\rho_{i1},\rho_{i2},\cdots,\rho_{in}]^T$, in which $\rho_{ij}$ is the Pearson's correlation coefficient for node $i$ and node $j$. One can expect performance gain by incorporating more discriminative network property features such as the local clustering coefficient~\citep{rubinov2010complex}, although this is not the main scope of our work. 

% and the local degree profile~\citep{cai2018simple}

The first one is the eyes open and eyes closed (EOEC) dataset~\citep{zhou2020toolbox}, which contains the rs-fMRI data of $48$ ($22$ females) college students (aged $19$--$31$ years) in both eyes open and eyes closed states. The task is to predict which of the two states a subject is in based on brain network FC.

The second one is from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database\footnote{\url{http://adni.loni.usc.edu/}}. We use the rs-fMRI data collected and preprocessed by~\citet{kuang2019concise}, which include $31$ AD patients aged $60$--$90$ years. They were matched by age, gender, and education to mild cognitive impairment (MCI)\footnote{MCI is a transitional stage between AD and NC.} subjects and $37$ normal control (NC) subjects, for a total of $106$ selected participants. In this work, we only focus on distinguishing the MCI group from the NC group.

\noindent
\textbf{EOEC} is publicly available from
\url{https://github.com/zzstefan/BrainNetClass/}.


\noindent
\textbf{ADNI} preprocessed by~\citet{kuang2019concise} is publicly available from
\url{http://gsl.lab.asu.edu/software/ipf/}.


\section{Network architecture and hyperparameter tuning}

\subsection{fMRI-based Brain Network Classification}
The classification problem is solved using graph neural networks composed of two graph convolutional networks of size $32$ and with ReLU activation function. We also use node feature drop with probability $10^{-1}$.
The node pooling is the sum of the node features, while the node classification minimizes the cross entropy loss. Hyperparameter search is applied to all methods with a time budget of $3{,}000$ seconds, over $3$ runs. The learning rates, $\lambda$, $\beta$, and the softmax temperature are optimized using early pruning. Each graph neural network is fed with graphs generated from the full correlation matrix by selecting edges among the strongest $20\%$ absolute correlation values. For the Graph-PRI method, we used the GCN~\citep{kipf2017semi} as the graph classification network.

For SVM, we use the Gaussian kernel and set the kernel size to $1$. For LASSO, we set the hyperparameter to $0.1$. For the t-test, we set the significance level to $0.05$.

\section{Minimal Implementation of Graph-PRI in PyTorch}\label{sec:appendix_code}


% %\vspace{-0.7em}
% \begin{algorithm}[htb]
% \caption{PRI for Graph Sparsification}
% \label{alg:PRI_Graph}
% \begin{algorithmic}[1]
% \Require
% $\rho = MM^T$, $\beta$, learning rate $\eta$, mini-batch size $B$
% \Ensure
% $\sigma_{\mathbf{w}}$
% \State $M\gets$ incident matrix of $\rho$;
% \State Initialize $\mathbf{\theta}=\{\theta_1,\theta_2,\cdots,\theta_M\}$;
% \While {not converged}
% \For{$i=1,2,\cdots,B$}
%   \State $\mathbf{w}^i\gets$ Gumbel-softmax sampling from $p_{\mathbf{\theta}}=\Sigmoid(\mathbf{\theta})$;
%   \State $\sigma_{\mathbf{w}^i} = M \diag{(\mathbf{w}^i)} M^T$;
% \EndFor
% %\State $w(i^*)=v(i^*)$ \Comment{or $1$ if we want to have graph of unit edge value}\;
% \State $\mathbf{\theta} \gets \mathbf{\theta} - \eta \frac{1}{B} \sum_{i=1}^B \nabla_{\mathbf{\theta}} D_\beta(\rho, \sigma_{\mathbf{w}^i})$;
% \EndWhile
% \State $\mathbf{w}\gets$ Gumbel-softmax sampling from $p_{\mathbf{\theta}}=\Sigmoid(\mathbf{\theta})$; \\
% \Return $\sigma_{\mathbf{w}} = M \diag{(\mathbf{w})} M^T$;
% \end{algorithmic}
% \end{algorithm}
% %\vspace{-0.7em}

\begin{algorithm}[htb]
\caption{PRI for Graph Sparsification}
\label{alg:PRI_Graph}
\begin{algorithmic}[1]
\Require
$\rho = BB^T$, $\beta$, learning rate $\eta$, number of samples $S$
\Ensure
$\sigma_{\mathbf{w}}$
\State $B\gets$ incidence matrix of $\rho$;
\State Initialize $\mathbf{\theta}=\{\theta_1,\theta_2,\cdots,\theta_M\}$;
\While {not converged}
\State $L = 0$;
\For{$i=1,2,\cdots,S$}
  \State $\mathbf{w}^i\gets$ $\text{GumbelSoftmax}(\mathbf{\theta})$; 
  \State $\sigma_{\mathbf{w}^i} = B \diag{(\mathbf{w}^i)} B^T$; 
  \State $L = L + \mathcal{J}_\beta(\rho, \sigma_{\mathbf{w}^i})$;
\EndFor
%\State $w(i^*)=v(i^*)$ \Comment{or $1$ if we want to have graph of unit edge value}\;
\State $L = \frac{1}{S} L$
\State $\mathbf{\theta} \gets \mathbf{\theta} - \eta \nabla_{\mathbf{\theta}} L$;
\EndWhile
\State $\mathbf{w}\gets$ $\text{GumbelSoftmax}(\mathbf{\theta})$; \\
\Return $\sigma_{\mathbf{w}} = B \diag{(\mathbf{w})} B^T$;
\end{algorithmic}
\end{algorithm}

We additionally provide a PyTorch implementation of Graph-PRI.

% \scriptsize{
% \lstinputlisting[linewidth=\columnwidth,breaklines=true]
% {code/pri.py}}
% \normalsize

\begin{lstlisting}[language=Python, caption=Graph-PRI PyTorch]

import networkx as nx
import numpy as np
import torch
import torch.nn.functional as F

def vn_entropy(k, eps=1e-20):
    """Von Neumann entropy of a symmetric matrix after trace normalization.

    Args:
        k: square symmetric torch tensor.
        eps: small constant added inside the logarithm.

    Returns:
        Scalar tensor -sum_i l_i * log(l_i + eps), where l_i are the
        strictly positive eigenvalue magnitudes of k / trace(k).
    """
    k = k / torch.trace(k)  # rescale to unit trace
    # torch.symeig is deprecated/removed in recent PyTorch releases;
    # torch.linalg.eigvalsh is the differentiable replacement.
    eigv = torch.abs(torch.linalg.eigvalsh(k))
    eigv = eigv[eigv > 0]  # zero eigenvalues contribute nothing
    return -torch.sum(eigv * torch.log(eigv + eps))

def entropy_loss(sigma, rho, beta):
    """Graph-PRI objective of a candidate matrix sigma against rho.

    For beta > 0 the value is
        0.5 * (1 - beta) / beta * vn_entropy(sigma)
        + vn_entropy(0.5 * (sigma + rho));
    otherwise only vn_entropy(sigma) is returned.
    """
    assert(beta>=0), "beta shall be >=0"
    if not beta > 0:
        # No divergence term: pure entropy minimization.
        return vn_entropy(sigma)
    weight = 0.5 * (1 - beta) / beta
    self_term = weight * vn_entropy(sigma)
    cross_term = vn_entropy(0.5 * (sigma + rho))
    return self_term + cross_term

def sparse(G, tau, n_samples, max_iteration, lr, beta):
    """Sparsify G by minimizing the Graph-PRI loss over edge selections.

    Args:
        G: networkx Graph
        tau: temperature passed to the Gumbel-softmax sampler
        n_samples: number of Gumbel-softmax samples averaged per step
        max_iteration: number of optimizer steps
        lr: learning rate of the Adam optimizer
        beta: trade-off weight forwarded to entropy_loss

    Returns:
        sigma: E diag(w) E^T for the final sampled edge selection
        w: sampled 0/1 edge-selection vector (one entry per edge)
    """
    # Oriented incidence matrix of G as a dense double tensor.
    E = nx.incidence_matrix(G, oriented=True)
    E = torch.from_numpy(E.toarray().astype(np.double))

    rho = E @ E.T

    m = G.number_of_edges()
    # Two logits per edge; column 1 is read as "keep this edge".
    # dtype must match E, otherwise the matrix products below fail.
    theta = torch.randn(m, 2, dtype=E.dtype, requires_grad=True)
    optimizer = torch.optim.Adam([theta], lr=lr)

    for itr in range(max_iteration):
        cost = 0
        for sample in range(n_samples):
            # Sampling
            z = F.gumbel_softmax(theta, tau, hard=True)
            w = z[:, 1]  # already 1-D; squeeze() would break m == 1
            sigma = E @ torch.diag(w) @ E.T
            cost = cost + entropy_loss(sigma, rho, beta)

        cost = cost / n_samples
        cost.backward()
        optimizer.step()
        optimizer.zero_grad()

    z = F.gumbel_softmax(theta, tau, hard=True)
    w = z[:, 1]

    sigma = E @ torch.diag(w) @ E.T  # sparse laplacian

    return sigma, w
\end{lstlisting}


\begin{comment}
\section{Zoomed plot of Fig.~\ref{fig:brain}} \label{sec:appendix_zoom}

%We finally demonstrate the zoomed plot of Fig.~\ref{fig:brain} as shown below.

\begin{figure*}%[t!]
\centering
	\begin{subfigure}{.9\textwidth}
	\centering
	\includegraphics[width=\textwidth]{MCI_circular.pdf}
	\end{subfigure}\\
	\begin{subfigure}{.9\textwidth}
	\centering
	\includegraphics[width=\textwidth]{figures/brain_color.pdf}
	%\caption{Normal control group}
	%\label{fig:openflights_sparse}
	\end{subfigure}
	\caption{The contributing functional connectivity links for MCI patients. We visualize edges with a probability of more than $50\%$ been selected by our generated edges. The colors of neural systems are described as: sensorimotor network (\textcolor{braincolor1}{SMN}), occipital network (\textcolor{braincolor2}{ON}), fronto-parietal network (\textcolor{braincolor3}{FPN}), default mode network (\textcolor{braincolor4}{DMN}), cingulo-opercular network (\textcolor{braincolor5}{CON}), and cerebellum network (\textcolor{braincolor6}{CN}), respectively.}
\end{figure*}

\begin{figure*}
\centering
	\begin{subfigure}{.9\textwidth}
	\centering
	\includegraphics[width=\textwidth]{NC_circular.pdf}
	\end{subfigure}
	\begin{subfigure}{.9\textwidth}
	\centering
	\includegraphics[width=\textwidth]{figures/brain_color.pdf}
	%\caption{Normal control group}
	%\label{fig:openflights_sparse}
	\end{subfigure}	
	\caption{The contributing functional connectivity links for normal control group. We visualize edges with a probability of more than $50\%$ been selected by our generated edges. The colors of neural systems are described as: sensorimotor network (\textcolor{braincolor1}{SMN}), occipital network (\textcolor{braincolor2}{ON}), fronto-parietal network (\textcolor{braincolor3}{FPN}), default mode network (\textcolor{braincolor4}{DMN}), cingulo-opercular network (\textcolor{braincolor5}{CON}), and cerebellum network (\textcolor{braincolor6}{CN}), respectively.}
	%\label{fig:brain}
\end{figure*}
\end{comment}

%\input{UAI_camera_ready.bbl}
%\bibliographystyle{named}
%{\fontsize{8}{9}\selectfont \bibliography{IT}}
\bibliography{yu_641}

\end{document}
