\clearpage
\appendix
\onecolumn


% TOC from https://tex.stackexchange.com/questions/419249/table-of-contents-only-for-the-appendix

% Make the "Part I" text invisible
\renewcommand \thepart{}
\renewcommand \partname{}
\doparttoc % Tell to minitoc to generate a toc for the parts
\faketableofcontents % Run a fake tableofcontents command for the partocs

\addcontentsline{toc}{section}{Appendix} % Add the appendix text to the document TOC
\part{Appendix} % Start the appendix part


\parttoc % Insert the appendix TOC

\section{More Details on the Problems of Mean Based Methods}
\label{sec:appA}
We illustrate the problem of mean-based methods in \autoref{fig:metric-illustration} where we show that using cross-validation with the difference in means or the more complex Wasserstein-1 distance\footnote{Wasserstein-1 distance is also an integral probability metric like MMD but uses a different class of functions in the optimization problem. Wasserstein-1 was chosen for this illustration because it can be computed efficiently and has no hyperparameters.} does not provide useful signal for selecting a model whereas negative log-likelihood provides a strong signal.
In summary, prior evaluation approaches for implicit generative models are limited in their ability to measure performance on thin support regions.
In \autoref{fig:metric-illustration}, we also show that our \name approach provides reliable signal for selecting the correct model in this toy example.

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.9\columnwidth]{figures/relative-density.png}
    \includegraphics[width=0.9\columnwidth]{figures/metric-illustration.png}
    \caption{
    While using standard cross-validation with mean difference or Wasserstein-1 metrics does not provide reliable signal to select the right model, our vertical validation (VV) method with mean difference or Wasserstein-1 provides reliable information on the best model and matches the ranking of the negative log-likelihood.
    This illustration uses the 2D dataset from \autoref{fig:shifted-split-illustration} as ground truth, and the relative density of the ``thin region'' is varied to represent different estimated models (top).
    For both standard validation and vertical validation, we use 10 folds and 30 repetitions and show the standard deviation for each method.
    }
    \label{fig:metric-illustration}
\end{figure}
\section{Detailed Illustration of VV's Reweighting Process}
\label{sec:AppB}
In the main paper in \autoref{fig:scv-illustration}, we presented a figure that illustrates the splitting steps of our approach, and here in 
\autoref{fig:scv-illustration-3} we illustrate the complete reweighting process of our \name approach. In this illustration, we focus on a particular split and split feature (for $j = 1 $ and $\ell = 2$) and where the total number of properties is $m = 2$.

\begin{figure*}[htp]
% \resizebox{\textwidth}{.3\textwidth}{
%height=.1\textwidth
\vspace{-1em}
\centering
\centering
%\includegraphics[width=\textwidth]{figures/figures_graph_2}
\includegraphics[page=9,width=\textwidth, trim=0.8cm 0.5cm 1.5cm 0.5cm, clip]{figures/figures_graph.pdf}
\vspace{-.5em}
\caption{
(1) using $\Xtrain^{(\ell,j)}$ we train a generative model which then generates $\Xgen^{(\ell,j)}$. For the generated dataset we can compute the properties of interest ($\bar{Z_1}$ and $\bar{Z_2}$). (2) Use $\Xtest^{(\ell,j)}$ to calculate the weights $W^{(\ell,j)}$ which will then be used in (3) to re-weight the samples in $\Xgen^{(\ell,j)}$ by applying those weights to the calculated properties. The Density figures above show the difference in distribution between the re-weighted and un-weighted versions of the generated data as well as how they compare to the held-out part of the data. (4) Finally we can compute the KS statistic between the weighted $\Xgen^{(\ell,j)}$ and $\Xtest^{(\ell,j)}$ as detailed in \autoref{eqn:phi_ks}.}
\label{fig:scv-illustration-3}
\vspace{-1.5em}
\end{figure*}



\section{Illustration of Beta Splits}
\label{sec:illustrate_beta_splits}
In section \ref{sec:beta_splits} we presented the Beta Splits, and here we add more figures that illustrate how Beta-based splitting works with different parameters. \Cref{fig:beta_sharp1} shows an example for $k = 4$ beta distributions (and $\sharpness = 1$) and \Cref{fig:beta_sharp10} shows the same example but with sharpness of 10, i.e., $\sharpness=10$. In \Cref{fig:eps}, we show an illustration of using the uniform mixing parameter $\epsilon$ to ensure some support on the whole range of the uniform.
%The rightmost plot shows the probabilities of each split, and to the left of that is the aggregation of the remaining plots to the left.

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/beta_sharp_1.png}
  \caption{The first 4 figures are the normalized PDFs for 4 Beta distributions parameterized by a set of shifting parameters. The fifth figure is the aggregation of the 4 figures to the left, and the rightmost figure shows the conditional probabilities of a projected point falling in a split based on the previously described Beta distributions, where each split has a different color.}
  \label{fig:beta_sharp1}
\end{figure} 

%
\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/beta_sharp_10.png}
  \caption{The same plot as \autoref{fig:beta_sharp1} except where the sharpness is 10, i.e. $\sharpness = 10$. We notice now that the conditional probabilities computed have sharper edges as seen in the rightmost figure.}
  \label{fig:beta_sharp10}
\end{figure}

\begin{figure}[H]
\vspace{-1em}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/eps.png}
  \caption{Leftmost is the mixture of betas for the different splits along with the uniform $\epsilon = 0.1$ (and using $\sharpness = 10$). Middle is the stacked probabilities. Rightmost are the probabilities normalized by the density.}
  \label{fig:eps}

\end{figure}
\section{Illustration of the Splitting Approach Described in section 4.1}
\label{sec:splitting}
\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.8\columnwidth]{figures/splitting.png}
    \caption{
    An illustration demonstrating the created splits described in the main section
    }
    \label{fig:splitting}
\end{figure}

\section{Implementation Details of \name}
\label{sec:implementation-other}
We discuss a few implementation details here including how to project samples to the unit interval via the empirical CDF, how to estimate importance weights via kernel mean matching, and how to generate enough samples from the model (as they depend on the importance weights).

\subsection{A Generalization of the Empirical CDF}
\label{sec:ecdf}
To project samples onto the unit interval, we use a generalization of the empirical CDF (ECDF) where the test points may be different from the training points and where ties (e.g., in discrete spaces) can be handled appropriately.
The core idea is that for test points we want to project, denoted by $\mathcal{A}$, we find the nearest point in the base dataset ($\mathcal{B}$) and evenly spread out the projections corresponding to the ECDF interval. We begin with an illustration to explain this in both continuous and discrete cases in \Cref{fig:cont_ecdf} and \Cref{fig:discrete_ecdf} respectively.
To define it formally, we first begin with a simpler randomized uniform projection method and then develop the non-randomized version that yields a deterministic low discrepancy sequence, also known as a \emph{centered regular lattice} \citep[Chapter 1]{dick2010digital}.

\begin{figure}[H]
\centering
\includegraphics[width=.48\textwidth]{figures/ecdf_cont.pdf}
\includegraphics[width=.5\textwidth]{figures/test_samples_cont.pdf}
\caption{On the left in orange are continuous valued  base samples of dataset $\mathcal{B}$ after being projected onto the uniform space using the ECDF. On the right in green are the continuous valued test samples of dataset $\mathcal{A}$ after being projected onto the uniform space using the nearest point in the base dataset $\mathcal{B}$ }
\label{fig:cont_ecdf}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=.45\textwidth]{figures/ecdf_discrete.pdf}
\includegraphics[width=.45\textwidth]{figures/test_samples_discrete.pdf}

\caption{On the left in orange are discrete valued  base samples of dataset $\mathcal{B}$ after being projected onto the uniform space using the ECDF and breaking ties between the same points at random. On the right in green are the discrete valued test samples of dataset $\mathcal{A}$ after being projected onto the uniform space using the nearest point in the base dataset $\mathcal{B}$. 
Note that if the test samples are outside the original data (as the two test points at 3.0), they will be mapped to the nearest base point (2.0 in this figure) and then projected evenly onto the interval spanning the corresponding part of the ECDF.
}
% Ah, I get it now...whew, I thought there was a bug for a second.
\label{fig:discrete_ecdf}
\end{figure}



%\david{Put the illustration + explanation here.}
%\david{Add pseudo-code}

\paragraph{Preliminary Notation.}
First, let us define the left and right empirical CDFs (the standard ECDF is the right empirical CDF).
Informally, these will give us the bottom and top of each ``step'' in the empirical CDF.
Formally, given a 1D dataset $\mathcal{C}$, we define:
\begin{align}
    \hat{F}_{\mathcal{C}}^{(-)}(a) 
    &= \frac{1}{|\mathcal{C}|}\sum_{c_i \in \mathcal{C}} \mathbf{1}(c_i < a)  \\
    \hat{F}_{\mathcal{C}}^{(+)}(a) 
    &= \frac{1}{|\mathcal{C}|} \sum_{c_i \in \mathcal{C}} \mathbf{1}(c_i \leq a) \,,
\end{align}
where $\mathbf{1}(\cdot)$ is an indicator function that is 1 if the condition is true and 0 otherwise.
Note that the only difference between these is that in the left ECDF $\hat{F}_{\mathcal{C}}^{(-)}$, the condition does not include equality.

\paragraph{Na\"ive Randomized Projection to Unit Interval.}
First, we will consider a randomized projection to the uniform based on the left and right ECDFs defined above.
Formally, with two scalar datasets, $\mathcal{A} = \{ a_i \in \R \}_{i=1}^{|\mathcal{A}|}$ and a base dataset $\mathcal{B} = \{ b_i \in \R \}_{i=1}^{|\mathcal{B}|}$, we define our function as: %ATTENTION% \thu{notation $\cG$ and $\cG_\text{base}$ need to be fixed }
\begin{align}
    F_{\text{rand}}(\mathcal{A} ; \mathcal{B}) &\triangleq \{(1-V_i) \hat{F}_{\mathcal{B}}^{(-)}(b^*(a_i)) + V_i \hat{F}_{\mathcal{B}}^{(+)}(b^*(a_i)) : a_i \in \mathcal{A} \}  \,,
\end{align}
where $b^*(a_i) := \argmin_{b \in \mathcal{B}} \|a_i - b\|_2^2$ represents the closest point in $\mathcal{B}$ to $a_i$ and $V_i \sim \text{Uniform}(0,1)$ is an independent uniform random variable corresponding to each value $a_i \in \mathcal{A}$.
This projects each point $a_i$ uniformly over the interval $[\hat{F}_{\mathcal{B}}^{(-)}(b^*(a_i)), \hat{F}_{\mathcal{B}}^{(+)}(b^*(a_i))]$.
In the special case where $\mathcal{A} = \mathcal{B}$, i.e., the test points are the same as the base dataset, then this produces a marginally uniform distribution over $[0,1]$ because each test point is randomly projected on the $1/|\mathcal{B}|$ interval of the ECDF corresponding to itself.
Even if there are ties in the dataset (e.g., with a discrete dataset), this will still produce a uniform marginal distribution because the ECDF will jump by $n_{\text{ties}}/|\mathcal{B}|$, where $n_{\text{ties}}$ is the number of ties for a particular value in $\mathcal{B}$.


\paragraph{Low Discrepancy Projection to Unit Interval.}
The main drawback to the projection method described above is that it is random and may not evenly space out points across the unit interval [0,1], i.e., the sequence may not have low discrepancy compared to evenly spaced points \citep{dick2010digital}.
% See https://en.wikipedia.org/wiki/Equidistributed_sequence#Discrepancy
Therefore, we propose a non-randomized version of the above projection method that evenly spaces test points across the corresponding interval instead of choosing a random point in the interval, i.e., we replace $V_i$ with a deterministic function.
This form of spacing is known as a centered regular lattice \citep[Chapter 1]{dick2010digital}.
Formally, we define our non-randomized function as: 
\begin{align}
    F(\mathcal{A} ; \mathcal{B}) 
    &:= \{(1-v(a_i)) \hat{F}_{\mathcal{B}}^{(-)}(b^*(a_i)) + v(a_i) \hat{F}_{\mathcal{B}}^{(+)}(b^*(a_i)) : a_i \in \mathcal{A} \} \,,
\end{align}
where $b^*(a_i)$ is the closest point as defined in the randomized version but we replace a random $V_i$ with a deterministic function of the sets defined as follows:
\begin{align}
    v(a_i) &:= \frac{\textnormal{sorted\_index}(a_i, \mathcal{A}(a_i)) - 0.5}{|\mathcal{A}(a_i)|}  \\
    \mathcal{A}(a_i) &:= \{ a_j \in \mathcal{A}: b^*(a_j) = b^*(a_i) \} \,,
\end{align}
where $\mathcal{A}(a_i)$ represents the equivalence class of all points in $\mathcal{A}$ that have the same closest point in $\mathcal{B}$ and $\textnormal{sorted\_index}(a_i, \mathcal{A}(a_i))$ returns the index (with one-indexing) of the element in the multi-set of $\mathcal{A}(a_i)$ after sorting the elements where ties are broken arbitrarily.
Note that $v(a_i) \in [0,1]$ by construction and it evenly spaces the points in the equivalence class of $\mathcal{A}(a_i)$ over the interval corresponding to the equivalence class.
For example, if $a_i=1$ and $\mathcal{A}(a_i) = \{1,0.5,1.5\}$, then $v(a_i) = \frac{\textnormal{sorted\_index}(a_i,\mathcal{A}(a_i)) -0.5}{|\mathcal{A}(a_i)|} = \frac{2-0.5}{3} = \frac{1.5}{3} = 0.5$, i.e., it would project this point to the center of the interval $[\hat{F}_{\mathcal{B}}^{(-)}(b^*(a_i)), \hat{F}_{\mathcal{B}}^{(+)}(b^*(a_i))]$.



\subsection{Computing the Importance Weights}

We used Kernel Mean Matching (KMM) \citep{Huang2006CorrectingSS,Yu2012AnalysisOK}, as implemented in~\cite{de2021adapt}, to directly estimate the ratio in \autoref{eqn:wts} using the generated and held-out samples. For this approach we used an RBF kernel and experimented with different values of the kernel bandwidth, finally settling on $\frac{10}{\sigma(Z_{i,\ell})}$, where $\sigma(Z_{i,\ell})$ is the standard deviation of the held-out portion of the data.

\subsection{Procedure for Generating Samples from the Model}
%Describe how you did this here.
Because the samples are re-weighted based on the importance weights, the effective number of samples for statistical tests is less than the number of generated samples.
Therefore, we choose to generate the same number of \emph{effective} samples for each method where the number of effective samples is defined as in the main paper but repeated here for clarity:
%\begin{align}
%    n_{\text{eff}}(w) = \frac{(\sum_{i=1} w_i)^2}{\sum_{i=1} w_i^2}.
%\end{align}
\begin{align}
    N_{\text{eff}}(\mathcal{G}_{\sgen,W}^{(\ell, j)})= \frac{(\sum_{(\Gbar_i, W(\Gbar_i))\in \mathcal{G}_{\sgen,W}^{(\ell, j)}} W^{(\ell,j)}(\Gbar_i))^2}{\sum_{(\Gbar_i, W(\Gbar_i))\in \mathcal{G}_{\sgen,W}^{(\ell, j)}} W^{(\ell,j)}(\Gbar_i)^2}.
\end{align}

Specifically, we set a target threshold of the number of effective samples $t=\min(1000, |\Xtest^{(\ell,j)}|)$.
We iteratively generated batches of $t$ samples and checked if the concatenated samples reached the number of effective samples, and we stopped generating once it reached the desired value.

\clearpage




\section{Proofs}
\label{sec:proofs}

\thmone*

\begin{proof}

In the proof below we suppress the dependency on $i$ and $\ell$ when needed for simplicity.
To prove that $P(U_{i,\ell}) =\mathrm{Uniform}[0,1]$ we first prove that this is true for $ \pbetamix (U_{i,\ell} | S_{i,\ell}) $ by marginalizing over the joint distributions of $U_{i,\ell}$ and $S_{i,\ell}$ to get:
%\david{First show that Using the mixture of Beta distributions is uniform using Seigers, integrate the $ p_\mathrm{Beta}(U_{i,\ell}|S_{i,\ell}\!=\!j) =\frac{1}{\sharpness}\sum_{i'=1}^\sharpness p_{\mathrm{Beta}[\alpha_j\sharpness+i',\beta_j\sharpness+i']}(U_{i,\ell}) $
%And show that for any sharpness because of Seigers this is uniform.
%}

\begin{align}
    & \sum_{j = 1}^k \pbetamix (S = j , U) \\
    & = \sum_{j = 1}^k p(S=j) \pbetamix(U|S=j) \\
    & = \sum_{j = 1}^k p(S=j) \frac{1}{\sharpness} \sum_{a = 1}^{\sharpness} p_{\mathrm{Beta}[\alpha_{j,a},\beta_{j,a}]}(U) \\
    & = \frac{1}{\sharpness k} \sum_{j = 1}^k \sum_{a = 1}^{\sharpness} p_{\mathrm{Beta}[\alpha_{j,a},\beta_{j,a}]}(U) \\
    & = \frac{1}{\sharpness k} \sum_{j = 1}^k \sum_{a = 1}^{\sharpness} p_{\mathrm{Beta}[(j-1)\sharpness+a,k\sharpness + 1 - ((j-1)\sharpness + a)]}(U) \\
    \intertext{Let $n = k \sharpness$, $r = (j-1)\sharpness+a$ then we can rewrite the above as:}
    & = \frac{1}{n} \sum_{r = 1}^{n } p_{\mathrm{Beta}[r,n + 1 - r]}(U) 
    \label{eq:use_segers} \\
    & = p_{\mathrm{Unif}[0,1]}(U) \label{eq:lastone}
\end{align}

\autoref{eq:use_segers} has the same form as the result in Section 2.1 of \citet{segers2017empirical} with $d = 1$ and $n = k \sharpness$, which we use to arrive at the last equality, \autoref{eq:lastone}.


%\david{Now show that it holds for the epsilon version. For the $(1-\epsilon)$ case, use the result above. For the uniform epsilon part, it is trivial to show it is also uniform since it doesn't depend on S.}
Next, we show that this holds true for $p(U|S)$ as follows:
\begin{align}
&\sum_{j=1}^k p(S=j,U) \\
&= \sum_{j = 1}^k p(S=j) p(U|S = j) \\
&= \sum_{j = 1}^k p(S=j) [(1\!-\!\epsilon)\pbetamix(U|S=j) + \epsilon p_{\mathrm{Unif}[0,1]}(U)]\\
&= \epsilon  p_{\mathrm{Unif}[0,1]}(U) + (1-\epsilon) \sum_{j = 1}^k p(S = j) \pbetamix(U|S = j)\\
&= p_{\mathrm{Unif}[0,1]}(U)\,.
\end{align}

Invoking the previous results, we see that the above is also uniform.


To prove that the splits will be biased, we will use mutual information as follows:
Let $p(U,S)$ denote the joint distribution of $p(U_{i,\ell},S_{i,\ell})$.
The mutual information for $\epsilon < 1$ can be written as:
\begin{align}
    I(U,S) 
    &\equiv \mathrm{KL}(p(U,S), p(U)p(S)) \\
    &= \E_{p(S)}[\E_{p(U|S)}[\log \frac{p(U,S)}{p(U)p(S)}]] \\
    &= \E_{p(S)}[\E_{p(U|S)}[\log \frac{p(U|S)p(S)}{p(U)p(S)}]] \\
    &= \E_{p(S)}[\E_{p(U|S)}[\log \frac{p(U|S)}{p(U)}]] \\
    &= \E_{p(S)}[\mathrm{KL}(p(U|S), p(U))] \\
    &>0 \,,
\end{align}
where the last inequality is because $p(U)$ is a uniform distribution but $p(U|S=j)$ is not uniform whenever $\epsilon < 1$, i.e., if $\epsilon < 1$, then $p(U|S=j) \neq p(U), \forall j$, and thus the KL must be positive for all terms in the expectation.

\end{proof}

\paragraph{Remark on Mutual Information Between Splits}
We also briefly discuss additional design points of the mutual information between splits and graphs.
If $\epsilon=0$ and $\sharpness = 1$, since $p(U) = p_{\mathrm{Unif}[0,1]}$, we also know that the KL terms are equal to negative differential entropy of the Beta distributions which is known in closed form, i.e.,
\begin{equation}
    \begin{aligned}[b]
    &\mathrm{KL}(p_{\mathrm{Beta}[\alpha,\beta]}, p_{\mathrm{Unif}[0,1]}) \equiv -H(p_{\mathrm{Beta}[\alpha,\beta]}) \\
    &=-[\log \mathrm{B}(\alpha,\beta) - (\alpha-1)\gamma(\alpha)- (\beta-1)\gamma(\beta) + 
    (\alpha+\beta-2)\gamma(\alpha + \beta)] \,,
\end{aligned}
\end{equation}
where $\mathrm{B}(\cdot,\cdot)$ denotes the Beta function and $\gamma(\cdot)$ denotes the digamma function.

Furthermore, if $0\leq \epsilon \leq 1$ and we consider the term $\mathrm{KL}((1-\epsilon)p_{\mathrm{Beta}[\alpha,\beta]} + \epsilon p_{\mathrm{Unif}[0,1]}, p_{\mathrm{Unif}[0,1]})$ which we will refer to as $\mathrm{KL}_{\epsilon}$, we know that at $\epsilon = 1$ the term becomes $\mathrm{KL}( p_{\mathrm{Unif}[0,1]}, p_{\mathrm{Unif}[0,1]}) = 0$, and on the other extreme at $\epsilon = 0$, it becomes $\mathrm{KL}(p_{\mathrm{Beta}[\alpha,\beta]}, p_{\mathrm{Unif}[0,1]})$ which is known in closed form by the result above. Thus for any $0 < \epsilon < 1$, we are guaranteed to have: $0 < \mathrm{KL}_{\epsilon} < -H(p_{\mathrm{Beta}[\alpha,\beta]})$.


\thmtwo*

\begin{proof}
Let $F$ be the true data CDF we would like to generate, i.e., the true CDF of the graph property we are testing.
 Let $\hat{F}^{(i)}_{n_i}$ be the empirical CDF obtained from the implicit model after $n_i$ samples.\footnote{
 While we prove the theorem statement with respect to an unweighted empirical CDF for the generated samples, we expect a similar result to hold for a weighted empirical CDF where we use the number of effective samples as defined in the main paper in place of $n_i$. }
 Let $\hat{F}^{(h)}_{n_h}$ be the empirical heldout CDF from $n_h$ heldout samples.
 Finally, let $\hat{F}^{(m)}_{n_m}$ be the empirical memorization CDF with $n_m$ memorization samples (we will use $\hat{F}^{(m)}_{n_m}$ rather than $F_m$ in the theorem statement).
 %
By the Dvoretzky-Kiefer-Wolfowitz inequality with Massart's universal constant~\citep{massart1990tight} we have that for an arbitrary empirical distribution $\hat{H}_n$ with $n$ samples and its true distribution $H$, 
\[
 P(\phi_\text{KS}(\hat{H}_n, H) > d) \leq 2 \exp(-2 nd^2), \ \forall d > 0.
\]
%
Note that $\phi_\text{KS}$ satisfies the triangle inequality, and hence, 
\begin{equation}
\label{eq:triangle}    
\phi_\text{KS}(\hat{F}^{(i)}_{n_i}, \hat{F}^{(h)}_{n_h}) \leq \phi_\text{KS}(\hat{F}^{(i)}_{n_i}, F) + \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}).
\end{equation}
By the total law of probabilities,  
\[
P(\phi_\text{KS}(\hat{F}^{(i)}_{n_i}, F) < d) \geq 1 - 2 \exp(-2 n_i d^2)
\]
and 
\[
P(\phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) < d) \geq 1 - 2 \exp(-2 n_h d^2),
\]
which together with \Cref{eq:triangle} yields 
\begin{align*}    
P(\phi_\text{KS}(\hat{F}^{(i)}_{n_i}, \hat{F}^{(h)}_{n_h}) < d) &\geq (1 - 2 \exp(-2 n_i d^2)) (1 - 2 \exp(-2 n_h d^2)) \\
& \geq 1 - 2 \exp(-2 n_i d^2) - 2\exp(-2 n_h d^2) + 4\exp(-2 (n_h + n_i) d^2).
\end{align*}
By the total law of probabilities, we obtain
\begin{align*}    
P(\phi_\text{KS}(\hat{F}^{(i)}_{n_i}, \hat{F}^{(h)}_{n_h}) > d) & \leq 2 \exp(-2 n_i d^2) + 2\exp(-2 n_h d^2) - 4\exp(-2 (n_h + n_i) d^2)\\
& \leq 2 \exp(-2 \min(n_i,n_h) d^2) + 2\exp(-2 \min(n_i,n_h) d^2) \\
&\leq 4 \exp(-2 \min(n_i,n_h) d^2) \\
&\leq 4 \exp(-2 \min(n_i,n_h) (\frac{d}{2})^2) \,.
\end{align*}

Now, set $\epsilon \in [0,1]$, then we have: $P(\phi_{\text{KS}}(\Xheldw, \cG_{\textnormal{gen},W}^{(\ell,j)}; h_{\ell'})  >  \epsilon)  \leq 4 \exp\left(- 2  \min(|\Xgen^{(\ell, j)}| , |\Xtest^{(\ell, j)}|) \left( \frac{\epsilon}{2}\right)^2  \right)$

\end{proof}




\section{Additional Experimental Details and Results}
\label{sec:add_exp}
In this section, we provide additional experimental details concerning the datasets, training setup, as well as additional results and figures.
\subsection{More Information About the Datasets}

\paragraph{Community datasets:} We used two variations of this dataset (referred to as Comm20 in \autoref{sec:exp_val} and as Comm in \autoref{sec:scv}) in our experiments. Both variations share the following properties: they are synthetic graphs made up of exactly two equally sized communities, with the number of nodes ranging from $12$ to $20$. Each of the communities is generated by the Erd\H{o}s--R\'enyi (E-R) random graph model \citep{E-R-Model} with $p = 0.7$, and the number of edges added between the two communities is with probability of $0.05$ times the number of nodes in the graph.
The variation used in \autoref{sec:exp_val} was generated using the parameters above using the code provided in the git repository of the official GDSS model implementation \footnote{\url{https://github.com/harryjo97/GDSS/blob/master/data/data_generators.py}} after adjusting the seeds in such a way to avoid exactly repeating the same graph, and we generated 500 samples.

The variation used by the experiments in \autoref{sec:scv} was the original dataset available at the same GDSS repository \footnote{\url{https://github.com/harryjo97/GDSS/blob/master/data/community_small.pkl}} which contains 100 graphs. After close inspection of this dataset, we noted that it has multiple repeated graphs, due to running the code with similar seeds more than once. In some sense these replications can be considered a form of leakage when splitting the data, however with our approach since we are doing vertical splitting, identical samples are more likely to belong to only v-train or only v-test. We leave it as a future work to thoroughly understand the effects of such repetitions. 


\paragraph{Qm9:} This dataset\footnote{\url{https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip}} contains 134k drug-like molecules which are made up of at most 9 heavy (non-hydrogen) atoms: Carbon (C), Oxygen (O), Nitrogen (N), and Fluorine (F), and Hydrogen (H) bonding.

The dataset we ended up using after some preprocessing and filtering had a total of 130,831 samples. We arrived at this number by preprocessing the dataset to remove the hydrogen atoms from all the molecules, and removing all the molecules where any of the 5 properties cannot be calculated by the rdkit package. The 5 chosen properties for this dataset are average degree ($\phideg$), molecular weight ($\phimwt$), Topological Polar Surface Area (TPSA) ($\phitpsa$) \citep{Prasanna2009TopologicalPS}, ring counts ($\phirc$), and the logarithm of the partition coefficient (logp) ($\philogp$).

\subsection{Training Details for Each Graph Generative Model}

We chose the models: \textbf{DiGress} \citep{Vignac2022DiGressDD}, \textbf{GDSS} \citep{Jo2022ScorebasedGM} and \textbf{GGAN} \citep{krawczuk2021gggan} to evaluate.

\paragraph{DiGress Training.}

For the experiments of \autoref{sec:scv}, we trained DiGress on the Comm dataset. We set the number of epochs to 100,000 with a batch size of 256, a learning rate of 0.0002, and an AdamW optimizer. The model parameters used for training were $T = 500$ diffusion steps, with a cosine noise schedule and 8 layers. Those were the default parameters provided in the config files in the official code repository\footnote{\url{https://github.com/cvignac/DiGress}}.

For training DiGress with Qm9 dataset, we set the number of epochs to be 1000 with a batch size of 512, a learning rate of 0.0002, a weight decay of $10^{-12}$ and an AdamW optimizer. The model parameters used for training were T = 500 diffusion steps, with a cosine noise schedule and 9 layers.

For the experiments of \autoref{sec:vtrain-val-vtest}, we used the same settings as mentioned above for the respective dataset but trained with early stopping with a patience of 20 and monitoring the Negative log-likelihood (NLL) of the validation data (v-val).

\paragraph{GDSS Training}
The code used for GDSS is adapted from their official repository\footnote{\url{https://github.com/harryjo97/GDSS/tree/master}}. The hyperparameters applied to train on the Community and Qm9 datasets are taken from the config files provided by
the authors \citep{Jo2022ScorebasedGM}.
For the Community datasets\footnote{\url{https://github.com/harryjo97/GDSS/blob/master/config/community_small.yaml}}, the number of epochs is $5000$ with a batch size of $128$, a learning rate of $0.01$, with an Adam optimizer and Exponential Moving Average (EMA).
For Qm9 dataset\footnote{\url{https://github.com/harryjo97/GDSS/blob/master/config/qm9.yaml}}, the number of epochs is $300$ with a batch size of $1024$, a learning rate of $0.005$, with an Adam optimizer and Exponential Moving Average (EMA).

For the experiments in \autoref{sec:vtrain-val-vtest}, we incorporated an early stopping criterion. Specifically, training was terminated if the difference between the MMD loss (estimated partial scores, equation 5 in \citet{Jo2022ScorebasedGM}) for the training and validation sets did not decrease below $10^{-10}$ for 5 consecutive epochs. This prevents overfitting and saves computational resources.


\paragraph{GGAN Training}
GGAN code is taken from SPECTRE \citep{martinkus2022spectre} repository \footnote{\url{https://github.com/KarolisMart/SPECTRE/tree/main}} with \verb|--use_fixed_emb| argument while training. Only Comm dataset is used with this model. The hyper-parameters to train the model are also taken from the suggested commands for Comm in the mentioned repository: the number of epochs is up to $12000$ with a batch size of $10$, a learning rate of $0.0001$, with an Adam optimizer (both discriminator and generator). 

%We also use early stop for all the training with GG-GAN. The early stopping criterion is set with a loss threshold of 
%$\geq 1e-10$ and a patience parameter (waiting period) of 5 epochs. 

\subsection{Choosing the Split Feature} %in \autoref{sec:exp_val} and \autoref{sec:vtrain-val-vtest}}
\label{appendix:choose_split}
In \autoref{sec:exp_val} we chose a single split property out of 3 possible properties, and in \autoref{sec:vtrain-val-vtest} we chose one out of 5 possible properties. 
Intuitively, a split property which is completely independent of the other test properties would not cause any (indirect) shift of the other properties.
Thus, the marginal distributions of each property would be equal for every split.
This would not enable good evaluation of the generalization performance on the thin support since a model could just copy the marginal distributions of the properties from the training split.
Therefore, we choose to find a split property that has high dependence with all other split properties.
Concretely, in both cases we calculated the Spearman correlation between all pairs of features and computed the average absolute correlation that each feature has with the rest, then chose the feature with the highest average correlation as the split feature. As we have mentioned in the discussion, this is still an area of exploration, but the motivation behind this approach was that choosing a split feature that is somewhat correlated with the others will cause the distribution of the other features to change accordingly, which is the desirable effect in our case.


\subsection{ECDF Plots for Model Comparison}
%In this section we present an exhaustive list of ECDF plots for all splits, split properties and test properties.  
In this section we present a list of ECDF plots for some splits that we inspected for further analysis.
%The plots are presented in this order: for the same test property under multiple split properties first, then for other test properties in the same fashion. We would like to point out that we didn't include the case where the test property is the same as the split property in calculating the final results, however we included the figures related to that case here since it's useful to see the behaviour in this case.

%In order not to overwhelm the reader, we point out to some plots that we used to draw our conjectures about some model's behavior, but we include all the plots for completeness.
 %that we examined to draw some conjectures about the model's behavior.
 In the main paper we mentioned two conjectures that we arrived at by inspecting some ECDF plots; we point the reader to those ECDF plots below.
First, the conjecture that GGAN generally matches the modes of the distribution while DiGress and GDSS tend to be more smooth, this behaviour is emphasized in \autoref{fig:comm20_models_vv_3_1}, \autoref{fig:comm20_models_vv_3_2}, \autoref{fig:comm20_models_vv_4_2}, \autoref{fig:comm20_models_vv_5_2}.

Second, the conjecture that GDSS and DiGress are not able to concentrate the generation near the middle of the distribution for test property average shortest path length, which can be seen in \autoref{fig:comm20_models_vv_1_3}, and \autoref{fig:comm20_models_vv_2_3}.
\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_3_1.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average shortest path length ($\ell = 3$) when test property is average degree ($\ell' = 1$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_3_1}
\end{figure} 



\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_4_2.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average clustering coefficient ($\ell = 4$) when test property is average number of triads ($\ell' = 2$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_4_2}
\end{figure} 


\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_5_2.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average maximal cliques ($\ell = 5$) when test property is average number of triads ($\ell' = 2$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_5_2}
\end{figure} 




\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_3_2.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average shortest path length ($\ell = 3$) when test property is average number of triads ($\ell' = 2$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_3_2}
\end{figure} 

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_1_3.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average degree ($\ell = 1$) when test property is average shortest path length ($\ell' = 3$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_1_3}
\end{figure} 

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{figures/VV_comm20_models_2_3.pdf}
  \caption{Weighted ECDF for DiGress, GDSS and GGAN as compared to held out data for split property average number of triads ($\ell = 2$) when test property is average shortest path length ($\ell' = 3$). The legend reads modelName($\phi_{ks})$ value}
  \label{fig:comm20_models_vv_2_3}
\end{figure} 



\subsection{Examples of Generated Molecules}
\label{sec:visuals}
From the top 100 molecules generated from DiGress and GDSS we filter for validity and novelty, and we visualize the top 4 weighted molecules in different scenarios.


\begin{figure}[ht]
%\resizebox{0.5\textwidth}{!}{
\begin{subfigure}{.5\textwidth}
  \centering
  % include first image
 \includegraphics[width=.7\linewidth]{figures/digress_vval.png}  
  \caption{Top 4 valid and novel molecules sampled from DiGress \\  when evaluated against v-val}
  \label{fig:sub-first}
\end{subfigure}
\begin{subfigure}{.5\textwidth}
  \centering
  % include second image
  \includegraphics[width=.7\linewidth]{figures/digress_test.png}  
  \caption{Top 4 valid and novel molecules sampled from DiGress \\  when evaluated against v-test}
  \label{fig:sub-second}
\end{subfigure}
\hspace{0.2em}
\newline
\begin{subfigure}{.5\textwidth}
  \centering
  % include third image
  \includegraphics[width=.7\linewidth]{figures/gdss_vval.png}  
 
  \caption{Top 4 valid and novel molecules sampled from GDSS \\  when evaluated against v-val}
  \label{fig:sub-third}
\end{subfigure}
\begin{subfigure}{.5\textwidth}
  \centering
  % include fourth image
  \includegraphics[width=.7\linewidth]{figures/gdss_vtest.png}  
  \caption{Top 4 valid and novel molecules sampled from GDSS \\  when evaluated against v-test}
  \label{fig:sub-fourth}
\end{subfigure}
\caption{Visualization of the top 4 highest weighted molecules (after filtering for novelty and validity) that are sampled from DiGress and GDSS and assigned weights by our approach when testing against the held portion being either v-val or v-test. These are the same molecules in \autoref{fig:dist_val_test} that were plotted with respect to the TPSA distribution.}
\label{fig:example_molecules}
%}
\end{figure}






