\pagebreak

\appendix

\section{Supplementary Materials}

\subsection{Description of \texorpdfstring{$\mathcal{G}$}{G}-Mixup hyperparameters}\label{appendix:hyperparamets}

We describe the hyperparameters relevant to the $\mathcal{G}$-Mixup method. We also state the values of these hyperparameters used in the original paper, as stated in the paper; we note that some of these values were different in the original authors' code, and so we changed them in our code.

\begin{itemize} \item Augmentation ratio $\alpha$: Multiplying the number of training graphs by this ratio gives the number of synthetic graphs generated by applying $\mathcal{G}$-Mixup to the graphons estimated from the training data. The authors use $\alpha=0.2$ in their experiments.

\item Mixup ratio $\lambda$ and mixup interval $[\Lambda_1,\Lambda_2]$: Given graphons $W_1,W_2$ and mixup ratio $\lambda \in [0,1]$, the authors' algorithm produces a new mixed-up graphon 
\begin{equation}\lambda W_1 + (1-\lambda)W_2.\label{eqn:graphon mixup}\end{equation} 
The mixup ratio $\lambda$ is randomly sampled from the interval $[\Lambda_1, \Lambda_2]$. The authors use the mixup interval $[0.1,0.2]$ in their experiments.  

\item Augmentation number $n_{\text{aug}}$: In the $\mathcal{G}$-Mixup algorithm, we generate $n_{\text{aug}}$ mixed-up graphons from any two distinct classes among all of the classes; the mixup ratios of these mixed-up graphons are given by $\lambda_i$ for $i = 1,2,\ldots,n_{\text{aug}}$. We generate $\lfloor \alpha n/n_{\text{aug}}\rfloor$ synthetic graphs from each mixed-up graphon, where $n$ is the size of the original training set. For binary classification with $\Lambda_1 = \Lambda_2$, this parameter is made irrelevant. The authors use $n_{\text{aug}}=10$ in their experiments.

% \item Number of vertices for the synthetic graphs: The algorithm used for generating graphs requires the number of nodes to be specified. The authors used the average number of nodes in the training set to set this parameter after searching.  

\item Graphon resolution: If $n$ is the graphon resolution, then in the graphon estimation step, an $n\times n$ matrix is used to represent the graphon. To sample from the graphon, an $n\times n$ adjacency matrix is then generated. Isolated nodes are then removed to provide a synthetic graph. Thus, the resolution influences the size of synthetic graphs, but does not determine it. In particular, it is an upper bound on the number of nodes in the synthetically generated graphs. The authors use the median number of nodes in the training set as the resolution.
\end{itemize}

\subsection{Theoretical result details}\label{appendix: theory}

We first provide some of the relevant definitions and background for Section~\ref{result 1} (Result 1).

\begin{definition}
A \textbf{discriminative motif} $F_G$ of a graph $G$ is the subgraph with the minimal number of nodes and edges that can decide the class of the graph $G$. Let $\mathcal{F}_{\mathcal{G}}$ denote the set of discriminative motifs for a set $\mathcal{G}$ of graphs.
\end{definition}

The authors use this notion of discriminative motifs as their measure for ``key topologies'' among graphs of a certain class or label.
% In practice, we note this might be too strong of a measure, as it is possible for discriminative motifs to not exist at all for a given graph data set. 

Given an arbitrary graph or graphon, we want to measure ``how often'' such a motif appears in the graph or graphon. The following definition will be used as this measure.

\begin{definition}
Let $F$ be an arbitrary graph. 
\begin{itemize}
    \item Let $G$ be a graph. Then the \textbf{homomorphism density} of $F$ with respect to the graph $G$ is
    \[t(F,G)=\frac{\text{hom}(F,G)}{|V(G)|^{|V(F)|}},\]
    where $\text{hom}(F,G)$ denotes the total number of graph homomorphisms from $F$ to $G$, and $|V(F)|,|V(G)|$ denote the number of nodes of the graphs $F,G$ respectively.
    \item Let $W$ be a graphon. Then the \textbf{homomorphism density} of $F$ with respect to the graphon $W$ is
    \[t(F,W)=\int_{[0,1]^{|V(F)|}}\prod_{(i,j)\in E(F)}W(x_i,x_j)\prod_{i\in V(F)}dx_i,\]
    where $E(F),V(F)$ denote the edge and vertex sets of $F$ respectively.
\end{itemize}
\end{definition}

Finally, the following norm on graphons provides a way to measure the ``similarity'' of graphons.

\begin{definition}
Let $W:[0,1]^2\to\mathbb{R}$ be a measurable function. Then the \textbf{cut norm} of $W$ is defined as
\[\lVert W\rVert_{\square}=\sup_{S,T\subset[0,1]}\left|\int_{S\times T}W(x,y)\,dx\,dy\right|,\]
where the supremum is taken over all measurable subsets $S,T\subset [0,1]$.
\end{definition}

We note that the use of the box ($\square$) in the cut norm notation is standard, and is used to distinguish the cut norm from other norms.

\begin{remark}
The cut norm is indeed a norm. In particular, it is homogeneous: if $\alpha\in \mathbb{R}$ is a scalar and $W:[0,1]^2\to\mathbb{R}$ is a measurable function, then $\alpha W$ is also a measurable function on $[0,1]^2$, and 
\[\lVert \alpha W\rVert_\square = |\alpha|\lVert W\rVert_\square.\]
In particular, note that
\[\lVert W\rVert_\square = \lVert -W\rVert_\square.\]
\end{remark}

We now provide our detailed proof of Lemma~\ref{lem:A.2}.

\begin{proof}[Proof of Lemma~\ref{lem:A.2}]
We expand upon the proof of this result in \cite[Lemma 4.1]{Lovasz:2006}.

First, notice that an equivalent definition for the cut norm of a graphon $U$ is
\begin{equation}\label{eqn:cut norm equiv defn}
    \lVert U\rVert_\square=\sup_{f,g}\left|\int_{[0,1]^2}U(x,y)f(x)g(y)\,dx\,dy\right|,
\end{equation}
where the supremum is taken over all measurable functions $f,g:[0,1]\to [0,1]$.

Let $V(F)=\{1,\ldots,n\}$ and $E(F)=\{e_1,\ldots,e_m\}$, where $e_t=(i_t,j_t)$. Then
\begin{align*}
    t(F,W)-t(F,W')&=\int_{[0,1]^n}\left(\prod_{e_t\in E(F)}W(x_{i_t},x_{j_t})-\prod_{e_t\in E(F)}W'(x_{i_t},x_{j_t})\right)\prod_{i\in V(F)}dx_i.
\end{align*}

For $t=1,\ldots,m$, define
\begin{align*}
    X_t(x_1,\ldots,x_n)&=\left(\prod_{k=1}^{t-1}W(x_{i_k},x_{j_k})\right)(W(x_{i_t},x_{j_t})-W'(x_{i_t},x_{j_t}))\left(\prod_{k=t+1}^m W'(x_{i_k},x_{j_k})\right)\\
    &=W(x_{i_1},x_{j_1})\cdots W(x_{i_t},x_{j_t})W'(x_{i_{t+1}},x_{j_{t+1}})\cdots W'(x_{i_m},x_{j_m})\\
    &-W(x_{i_1},x_{j_1})\cdots W(x_{i_{t-1}},x_{j_{t-1}})W'(x_{i_{t}},x_{j_{t}})\cdots W'(x_{i_m},x_{j_m}).
\end{align*}
Notice that for $t=1,\ldots,m-1$, the first term of $X_t(x_1,\ldots,x_n)$ cancels out with the second term of $X_{t+1}(x_1,\ldots,x_n)$, so that in the sum over $t$ only the first term of $X_m$ and the second term of $X_1$ remain. Thus,
\begin{align*}
\prod_{e_t\in E(F)}W(x_{i_t},x_{j_t})-\prod_{e_t\in E(F)}W'(x_{i_t},x_{j_t})&=\sum_{t=1}^{m}X_t(x_1,\ldots,x_n).
\end{align*}
Fix all variables $x_k\in [0,1]$, where $k\neq i_t,j_t$; we then have that
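\[f(x_{i_t})\,g(x_{j_t})=\prod_{k=1}^{t-1}W(x_{i_k},x_{j_k})\prod_{k=t+1}^m W'(x_{i_k},x_{j_k}),\]
where $f$ collects the factors that depend on $x_{i_t}$ and $g$ collects the remaining factors; since $F$ is a simple graph, no edge other than $e_t$ joins $i_t$ and $j_t$, so each factor depends on at most one of $x_{i_t},x_{j_t}$, and $f,g$ are both measurable functions $[0,1]\to [0,1]$ in $x_{i_t}$ and $x_{j_t}$ respectively. Then by Equation~\eqref{eqn:cut norm equiv defn},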
\[\left|\int_{[0,1]^2}X_t(x_1,\ldots,x_n)\,dx_{i_t}\,dx_{j_t}\right|\leq\lVert W-W'\rVert_\square,\]
and so
\begin{align*}
    \left|\int_{[0,1]^n}X_t(x_1,\ldots,x_n)\prod_{i=1}^n dx_i\right|&\leq \int_{[0,1]^{n-2}}\left|\int_{[0,1]^2}X_t(x_1,\ldots,x_n)\,dx_{i_t}\,dx_{j_t}\right|\prod_{i\neq i_t,j_t}dx_i\\
    &\leq\lVert W-W'\rVert_\square.
\end{align*}
Thus, we see that
\begin{align*}
    |t(F,W)-t(F,W')|&=\left|\int_{[0,1]^n}\sum_{t=1}^m X_t(x_1,\ldots,x_n)\prod_{i=1}^n dx_i\right|\\
    &\leq \sum_{t=1}^m\left|\int_{[0,1]^n}X_t(x_1,\ldots,x_n)\prod_{i=1}^n dx_i\right|\\
    &\leq m\lVert W-W'\rVert_\square.
\end{align*}
Recall that $m=e(F)$, and so we have the desired inequality.
\end{proof}

\subsection{Figures for experimental results}\label{appendix:loss figures}

The following figures show the loss curves for the experiments from Sections~\ref{section:gen-result} and~\ref{section:proteins}, as well as the original paper's loss curves.

 \begin{figure}[H]
    \centering
    \includegraphics[scale=0.9]{images/original paper fig 4 binary.png}
    \caption{The training/validation/test loss curves on IMDB-B and REDDIT-B with GCN as backbone. The curves are depicted on ten runs. This is part of Figure 4 in the original paper \cite{Han:2022}.}
    \label{fig:paper_loss_fig}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=.48\textwidth]{images/redbin_train_loss_100_200_0.png}
    \includegraphics[width=.48\textwidth]{images/redbin_val_loss_100_200_0.png}
    \includegraphics[width=.48\textwidth]{images/redbin_test_loss_100_200_0.png}
    \caption{The training/validation/test curves on REDDIT-B with GCN as a backbone. The curves are depicted on ten runs. The line is the mean of the corresponding loss, while the shaded area is $\pm$ the standard deviation of the losses.}
    \label{fig:redbintraintestvallosscurves}
\end{figure}




\begin{figure}
    \centering
    \includegraphics[scale=0.6]{images/imdb_b loss.png}
    \caption{The training/validation/test curves on IMDB-BINARY with GCN as a backbone. The curves are depicted on ten runs. The line is the mean of the corresponding loss, while the shaded area is $\pm$ the standard deviation of the losses.}
    \label{fig:imdbbintraintestvallosscurves}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[scale=0.6]{images/proteins_loss.png}
    \caption{The training/validation/test curves on PROTEINS with GCN as a backbone. The curves are depicted on ten runs. The line is the mean of the corresponding loss, while the shaded area is $\pm$ the standard deviation of the losses.}
    \label{fig:proteinstraintestvallosscurves}
\end{figure}

