\section{Proof for Consistency}

\thmone 

\begin{proof} \footnote{
 While we prove the theorem statement with respect to an unweighted empirical CDF for the generated samples, we expect a similar result to hold for a weighted empirical CDF where we use the number of effective samples, $N_{\text{eff}}(\mathcal{G}_{\sgen,W}^{(\ell, j)})$, as defined in the main paper in place of $n_g$. }
 
 
 In what follows we omit the superscript $(\ell,j)$ from our notation for simplicity.
Let 
\begin{itemize}
    \item $F$ be the true data CDF;
    \item $\hat{F}^{(g)}_{n_g}$ be the empirical CDF obtained from $n_g$ samples in $\Xgen^{(\ell,j)}$;
    \item $\hat{F}^{(t)}_{n_t}$ be the empirical CDF obtained from $n_t$ samples in $\Xtrain^{(\ell,j)}$;
    \item $\hat{F}^{(h)}_{n_h}$ be the empirical CDF obtained from $n_h$ samples in $\Xtest^{(\ell,j)}$.
\end{itemize}
 %
 Note that $\phi_\text{KS}$ satisfies the triangle inequality, and hence, we can use it on $\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}, F$, giving:
\begin{equation}
\label{eq:triangle}    
\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}) \leq \phi_\text{KS}(\hat{F}^{(g)}_{n_g}, F) + \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}).
\end{equation}

 %
By the Dvoretzky--Kiefer--Wolfowitz inequality with Massart's universal constant~\citep{massart1990tight}, we know that for an arbitrary empirical distribution $\hat{H}_n$ built from $n$ i.i.d.\ samples and its true distribution $H$, 
\[
 P(\phi_\text{KS}(\hat{H}_n, H) > d) \leq 2 \exp(-2 nd^2), \ \forall d > 0.
\]
%

Combining this inequality~\citep{massart1990tight} with the complement rule $P(A^c) = 1 - P(A)$ \mai{cite total law of probabilities}, we have:
\[
P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, F) > d) \leq 2 \exp(-2 n_g d^2) \Rightarrow P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, F) \leq d) \geq 1 - 2 \exp(-2 n_g d^2)
\]
and 
\[
P(\phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) > d) \leq 2 \exp(-2 n_h d^2) \Rightarrow P(\phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) \leq d) \geq 1 - 2 \exp(-2 n_h d^2),
\]
which, together with \Cref{eq:triangle} and assuming that the generated and held-out samples are independent, yields \mai{stopped here} 
\begin{align*}    
P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}) \leq 2 d) &\geq (1 - 2 \exp(-2 n_g d^2)) (1 - 2 \exp(-2 n_h d^2)) \\
& = 1 - 2 \exp(-2 n_g d^2) - 2\exp(-2 n_h d^2) + 4\exp(-2 (n_h + n_g) d^2).
\end{align*}
Taking the complementary event, we obtain
\begin{align*}    
P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}) > 2 d) &\leq 2 \exp(-2 n_g d^2) + 2\exp(-2 n_h d^2) - 4\exp(-2 (n_h + n_g) d^2)\\
&\leq 2 \exp(-2 n_g d^2) + 2\exp(-2 n_h d^2) \\
&\leq 4 \exp(-2 \min(n_g,n_h) d^2).
\end{align*}

\end{proof}

\iffalse
Restating \Cref{eq:barphi} without the weights using a notation consistent with this proof,
\begin{align}
\label{eq:barphi2}
    \bar{\phi}_{\text{KS}}(\hat{F}^{(t)}_{n_t}, \hat{F}^{(h)}_{n_h}, \hat{F}^{(g)}_{n_g}) & = \frac{\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(g)}_{n_g})}{\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(t)}_{n_t})}.
\end{align}
%
We have a bound for the numerator of \Cref{eq:barphi2}:

\[P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}) > 2 d) \leq 4 \exp(-2 \min(n_g,n_h) d^2).\]


%
We now derive a bound for the denominator. From the theorem statement we know $\Delta = \phi_{\text{KS}}(F,\hat{F}^{(t)}_{n_t})$. 
%
Again, by~\citep{massart1990tight}, we know 
\begin{align*} 
P(\phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) > d) &\leq 2 \exp(-2 n_h d^2) \\
P(\phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - (\phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h})) > d) &\leq 2 \exp(-2 n_h d^2) \\ 
P(\phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) < \phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - d) &\leq 2 \exp(-2 n_h d^2) \\ 
P(\phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h}) < \Delta - d) &\leq 2 \exp(-2 n_h d^2).
\end{align*} 

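Then, because $\phi_\text{KS}$ satisfies the triangle inequality, we have $\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(t)}_{n_t}) \geq \phi_\text{KS}(F, \hat{F}^{(t)}_{n_t}) - \phi_\text{KS}(F, \hat{F}^{(h)}_{n_h})$, so the bound on the denominator of \Cref{eq:barphi2} is: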
\bruno{stopped here}
\[
P(\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(t)}_{n_t}) <
\Delta - d ) \leq 2 \exp(-2 n_h d^2).
\]

\thu{I need help from here. I don't think $\epsilon$ is set with the correct value in here.}

Hence, the probability that $\bar{\phi}_{\text{KS}}(\hat{F}^{(t)}_{n_t}, \hat{F}^{(h)}_{n_h}, \hat{F}^{(g)}_{n_g})$ from \Cref{eq:barphi2} is greater than $d = \epsilon \Delta/(1 - \epsilon)$ for some $\epsilon \in [0,1]$, is 
\begin{align*}    
P(\bar{\phi}_{\text{KS}}(\hat{F}^{(t)}_{n_t}, \hat{F}^{(h)}_{n_h}, \hat{F}^{(g)}_{n_g}) > \epsilon) &\leq \max\big( 2 \exp\big( - 2 n_h \big(\frac{\epsilon}{1- \epsilon}\big)^2 \Delta^2\big), \\
& \qquad 4 \exp\big( - 2 \min(n_g,n_h) \big(\frac{\epsilon}{1- \epsilon}\big)^2 \Delta^2\big)\big)\\
& \leq 4 \exp\big( - 2 \min(n_g,n_h) \big(\frac{\epsilon}{1- \epsilon}\big)^2 \Delta^2\big).
\end{align*}
\iffalse
\thu{[Rewrite -- need validation]

Hence, the probability that $\bar{\phi}_{\text{KS}}(\hat{F}^{(t)}_{n_t}, \hat{F}^{(h)}_{n_h}, \hat{F}^{(g)}_{n_g})$ from \Cref{eq:barphi2} is greater than $\epsilon$, where $\epsilon = 2d$

\begin{align*}    
P(\phi_\text{KS}(\hat{F}^{(g)}_{n_g}, \hat{F}^{(h)}_{n_h}) > 2 d) &\leq 4 \exp(-2 \min(n_g,n_h) d^2) \\
\Rightarrow P\big(\frac{\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(g)}_{n_g})}{\phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(t)}_{n_t})} > 2 d \big) &\leq 4 \exp(-2 \min(n_g,n_h) d^2), \ \text{because } \phi_{\text{KS}}(\hat{F}^{(h)}_{n_h},\hat{F}^{(t)}_{n_t}) \in [0,1] \\
\Rightarrow P(\bar{\phi}_{\text{KS}}(\hat{F}^{(t)}_{n_t}, \hat{F}^{(h)}_{n_h}, \hat{F}^{(g)}_{n_g}) > 2d) &\leq 4 \exp(-2 \min(n_g,n_h) d^2) \\ 
\end{align*}

If this bound is correct, we need to fix the theorem. And remove the denominator bound.
}

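\fi % Closes only the inner \iffalse (the "Rewrite -- need validation" note). NOTE(review): the outer \iffalse opened before "Restating \Cref{eq:barphi}" has no matching \fi in this part of the file; confirm one exists later, otherwise everything below is skipped.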



\newpage
\section{Proof for Sensitivity}

\thu{Needs to be validated and fixed to be more formal. This is only the draft.}

\thmtwo

\begin{proof} \footnote{
 While we prove the theorem statement with the unweighted KS statistic, we expect a similar result to hold for the weighted KS statistic or any other two-sample test statistic.}

Based on Theorem~10.2 of \url{https://faculty.washington.edu/yenchic/19A_stat535/Lec10_bootstrap.pdf} \thu{needs to find a better source or redo the derivation}, and starting from the Berry--Esseen bound, we can derive:

\[\phi_{\text{KS}}(\Xtrain^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell}) = \cO\left(\frac{1}{\sqrt{m}}\right), \quad \text{where } m \text{ is the size of } \Xtrain^{(\ell, j)}.\]

Then $\lim_{m \to \infty} \phi_{\text{KS}}(\Xtrain^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell}) = 0$, i.e., as $m$ grows, the KS statistic can no longer distinguish the distribution of $\Xtrain^{(\ell, j)}$ from that of $\cG_{\textnormal{boot}}^{(\ell,j)}$. 

With normal cross validation, the split needs to be independently and uniformly distributed. In other words, $S_{i,\ell}$ and $U_{i,\ell}$ are independent, which makes $\Xtrain^{(\ell, j)}$ and $\Xtest^{(\ell, j)}$ have the same distribution. Thus, as $m$ gets large enough, we have:

\[\phi_{\text{KS}}(\Xtrain^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell}) = \phi_{\text{KS}}(\Xtest^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell}) = 0.\]

Thus, if we take the distribution of $\Xtest^{(\ell, j)}$ to be the target distribution that we want the model to generate, then $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \Xgen^{(\ell, j)} ; h_{\ell})$ should be small, i.e., close to $0$, when the model performs well. However, with normal cross-validation, we cannot tell whether the model has generated the target distribution or has merely memorized the distribution of $\Xtrain^{(\ell, j)}$ (i.e., it has bootstrapped the training dataset), because the value of $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell})$ also equals $0$.

In brief, with $m$ large enough and using normal cross validation for splitting, there is no way to identify whether  $\Xgen^{(\ell, j)} = \cG_{\textnormal{boot}}^{(\ell,j)}$.

On the other hand, in our proposed SCV splitting procedure, $S_{i,\ell}$ and $U_{i,\ell}$ are not independent, so $\Xtrain^{(\ell, j)}$ and $\Xtest^{(\ell, j)}$ have different distributions. Thus, as $m$ gets large enough, $\phi_{\text{KS}}(\Xtrain^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell})$ still equals $0$, but $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell})$ does not. As we mentioned in the main paper, $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell})$ remains large because the distributions of $\Xtest^{(\ell, j)}$ and $\Xtrain^{(\ell, j)}$ are different. Formally, with $m$ large enough, $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \cG_{\textnormal{boot}}^{(\ell,j)} ; h_{\ell}) = \phi_{\text{KS}}(\Xtest^{(\ell, j)}, \Xtrain^{(\ell,j)} ; h_{\ell}) > 0$. Only if the model can generate the target distribution will $\phi_{\text{KS}}(\Xtest^{(\ell, j)}, \Xgen^{(\ell,j)} ; h_{\ell})$ be close to $0$. 

Hence, normal cross-validation splitting does not make the KS statistic sensitive to memorization, whereas our proposed SCV splitting procedure does.



\end{proof}