
\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr}

%% Copied from https://www.overleaf.com/learn/how-to/Cross_referencing_with_the_xr_package_in_Overleaf#When_should_I_use_the_xr_package.3F
% \addFileDependency{<file>}: announce <file> (name including extension) as a
% build dependency. It writes "(<file>)" via \typeout, passes the name to
% \@addtofilelist, and writes "No file <file>." when the file does not exist.
% \makeatletter/\makeatother are required because the body uses the internal
% macro \@addtofilelist.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or 
% .pdf file etc and it exists! If it doesn't exist, it will appear 
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles 
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument[<prefix>]{<name>}: calls \externaldocument[<prefix>]{<name>}
% (xr package) and then announces <name>.tex and <name>.aux as build
% dependencies via \addFileDependency. The optional <prefix> defaults to empty.
\newcommand*{\myexternaldocument}[2][]{%
\externaldocument[#1]{#2}%
\addFileDependency{#2.tex}%
\addFileDependency{#2.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument[main-]{main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Fast and Scalable Score-Based Calibration Tests\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<pierreglaser@gmail.com>?Subject=Your UAI 2023 paper}{Pierre~Glaser}{}}
%\author[1]{Harry~Q}
\author[2]{David~Widmann}
\author[3]{Fredrik~Lindsten}
\author[1]{Arthur~Gretton}
% Add affiliations after the authors
\affil[1]{%
    Gatsby Computational Neuroscience Unit\\
    University College London\\
    London, UK
}
\affil[3]{%
    Division of Statistics and Machine Learning\\
    Linköping University\\
    Sweden
}
\affil[2]{%
    Department of Information Technology\\
    Uppsala University \\
    Sweden
}
%% Custom additions
\usepackage{preamble}
\usepackage{chngcntr}
\usepackage{placeins}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\numberwithin{equation}{section}
\counterwithin{figure}{section}

\section{Conditional Goodness-of-Fit: General Operator-Valued Kernel}\label{app-sec:cgof-general-kernel}

Assume that
\begin{itemize}
\item kernel $l \in C^2(\mathcal{Y} \times \mathcal{Y}, \mathbb{R})$,
\item densities $P_{|x} \in C^1(\mathcal{Y}, \mathbb{R})$ for $\mathbb{P}(X)$-almost all $x$, and that
\item $\E_{(x,y) \sim \mathbb{P}(X, Y)} \left\|K_{P_{|x}} \xi_{P_{|x}}(y, \cdot) \right\|_{\mathcal{F}_K} < \infty$.
\end{itemize}
Due to the Bochner integrability of $(x, y) \mapsto K_{P_{|x}} \xi_{P_{|x}}(y, \cdot)$ expectation and inner product commute~\citep[see][Definition~A.5.20]{Steinwart2008SVM}, and hence we have
\begin{equation*}
\begin{split}
    C_{P_{|\cdot}}(\mathbb{P}) &= \left \| \E_{(x, y) \sim \mathbb{P}(X, Y)}\left [ K_{P_{|x}} \xi_{P_{|x}}(y, \cdot) \right ]  \right \|^{2}_{\mathcal  F_{K}} \\
    &= \bigg\langle \E_{(x, y) \sim \mathbb{P}(X, Y)}\left [ K_{P_{|x}} \xi_{P_{|x}}(y, \cdot) \right ], \E_{(x', y') \sim \mathbb{P}(X, Y)}\left [ K_{P_{|x'}} \xi_{P_{|x'}}(y', \cdot) \right ] \bigg\rangle_{\mathcal  F_{K}} \\
    &= \E_{(x, y) \sim \mathbb{P}(X, Y)} \E_{(x', y') \sim \mathbb{P}(X, Y)} \bigg\langle K_{P_{|x}} \xi_{P_{|x}}(y, \cdot), K_{P_{|x'}} \xi_{P_{|x'}}(y', \cdot) \bigg\rangle_{\mathcal  F_{K}} \\
    &= \E_{(x, y) \sim \mathbb{P}(X, Y)} \E_{(x', y') \sim \mathbb{P}(X, Y)} \bigg\langle K^*_{P_{|x'}} K_{P_{|x}} \xi_{P_{|x}}(y, \cdot), \xi_{P_{|x'}}(y', \cdot) \bigg\rangle_{\mathcal  F^{d_y}_l},
\end{split}
\end{equation*}
where $K^*_{P_{|x'}}$ is the adjoint of $K_{P_{|x'}}$.
The reproducing property implies $K^*_{P_{|x'}} K_{P_{|x}} = K(P_{|x}, P_{|x'})$, and therefore we get
\begin{equation*}
\begin{split}
    C_{P_{|\cdot}}(\mathbb{P}) &= \E_{(x, y) \sim \mathbb{P}(X, Y)} \E_{(x', y') \sim \mathbb{P}(X, Y)} \bigg\langle K(P_{|x},P_{|x'}) \xi_{P_{|x}}(y, \cdot), \xi_{P_{|x'}}(y', \cdot) \bigg\rangle_{\mathcal  F^{d_y}_l} \\
    &= \E_{(x, y) \sim \mathbb{P}(X, Y)} \E_{(x', y') \sim \mathbb{P}(X, Y)} H((P_{|x}, y), (P_{|x'}, y'))
\end{split}
\end{equation*}
where
\begin{equation*}
\begin{split}
    H((p, y), (p', y')) &\coloneqq \bigg\langle K(p, p') \xi_{p}(y, \cdot), \xi_{p'}(y', \cdot) \bigg\rangle_{\mathcal  F^{d_y}_l} \\
    &= \bigg\langle K(p, p') \xi_{p}(y, \cdot), l(y', \cdot) \nabla_{y'} \log f_{p'}(y') + \nabla_{y'} l(y', \cdot) \bigg\rangle_{\mathcal  F^{d_y}_l}.
\end{split}
\end{equation*}
For $i \in \{1,\ldots,d_y\}$, let $\operatorname{proj}_i \colon \mathcal{F}_l^{d_y} \to \mathcal{F}_l$ be the projection map to the $i$th subspace of the product space $\mathcal{F}_l^{d_y}$, and similarly let $\iota_i \colon \mathcal{F}_l \to \mathcal{F}_l^{d_y}$ be the embedding of $\mathcal{F}_l$ in the $i$th subspace of $\mathcal{F}_l^{d_y}$ via $x \mapsto (0, \ldots, 0, x, 0, \ldots, 0)$.
Then we can write
\begin{equation*}
\begin{split}
    H((p, y), (p', y')) &= \sum_{i=1}^{d_y} \bigg\langle \operatorname{proj}_i K(p, p') \xi_{p}(y, \cdot), l(y', \cdot) \frac{\partial}{\partial y'_i} \log f_{p'}(y') + \frac{\partial}{\partial y'_i} l(y', \cdot) \bigg\rangle_{\mathcal  F_l} \\
    &= \sum_{i=1}^{d_y} \left[(\operatorname{proj}_i K(p, p') \xi_{p}(y, \cdot))(y') \frac{\partial}{\partial y'_i} \log f_{p'}(y')
    + \frac{\partial}{\partial y'_i} (\operatorname{proj}_i K(p, p') \xi_{p}(y, \cdot))(y')\right].
\end{split}
\end{equation*}
Since $K(p, p') \in \mathcal{L}(\mathcal{F}_l^{d_y})$ is a linear operator, we have
\begin{equation*}
    K(p, p') \xi_p(y, \cdot) = K(p, p') (l(y, \cdot) \nabla_y \log f_p(y)) + K(p, p') \nabla_y l(y, \cdot).
\end{equation*}
For $1 \leq i, j \leq d_y$, define $K_{i,j}(p, p') \colon \mathcal{F}_l \to \mathcal{F}_l$ as the continuous linear operator
\begin{equation*}
    K_{i,j}(p, p') := \operatorname{proj}_i K(p, p') \iota_j.
\end{equation*}
Thus we have
\begin{equation*}
    \operatorname{proj}_i K(p, p') \xi_p(y, \cdot) = \sum_{j=1}^{d_y} \left[\frac{\partial}{\partial y_j} \log f_p(y)\right] K_{i,j}(p, p') l(y, \cdot) + \sum_{j=1}^{d_y} \frac{\partial}{\partial y_j} K_{i,j}(p, p') l(y, \cdot),
\end{equation*}
and therefore
\begin{equation*}
    (\operatorname{proj}_i K(p, p') \xi_p(y, \cdot))(y') =\sum_{j=1}^{d_y}  \left[\frac{\partial}{\partial y_j} \log f_p(y)\right]  (K_{i,j}(p, p') l(y, \cdot))(y') + \sum_{j=1}^{d_y} \frac{\partial}{\partial y_j} (K_{i,j}(p, p') l(y, \cdot))(y').
\end{equation*}
Due to the differentiability of kernel $l$ we can interchange inner product and differentiation~\citep[][Lemma~4.34]{Steinwart2008SVM}, and thus we obtain
\begin{equation*}
\begin{split}
    H((p, y), (p', y')) ={}& \sum_{i,j=1}^{d_y} \left[\frac{\partial}{\partial y_j} \log f_p(y) \right]\left[\frac{\partial}{\partial y'_i} \log f_{p'}(y') \right]
    (K_{i,j}(p, p') l(y, \cdot))(y') \\
    &+ \sum_{i,j=1}^{d_y}\left[ \frac{\partial}{\partial y'_i} \log f_{p'}(y')\right] \frac{\partial}{\partial y_j} (K_{i,j}(p, p') l(y, \cdot))(y') \\
    &+ \sum_{i,j=1}^{d_y} \left[\frac{\partial}{\partial y_j} \log f_p(y) \right] \frac{\partial}{\partial y'_i} (K_{i,j}(p, p') l(y, \cdot))(y') \\
    &+ \sum_{i,j=1}^{d_y} \frac{\partial}{\partial y'_i}  \frac{\partial}{\partial y_j} (K_{i,j}(p, p') l(y, \cdot))(y').
\end{split}
\end{equation*}
Define $A \colon (P_{|\mathcal{X}} \times \mathcal{Y})^2  \to \mathbb{R}^{d_y \times d_y}$ by
\begin{equation*}
    [A((p, y), (p', y'))]_{i,j} := (K_{i,j}(p, p') l(y, \cdot))(y') \qquad (1 \leq i, j \leq d_y).
\end{equation*}
Thus we obtain
\begin{equation}\label{eq:calibration-gof-test-statistics-advanced}
    H((p, y), (p', y')) = (s_{p'}(y') + \nabla_{y'})^{\top} A((p, y), (p', y')) (s_{p}(y) + \nabla_{y}),
\end{equation}
where for $x, x' \in \mathbb{R}^d, M(x, x') \in \mathbb{R}^{d \times d}$ we use the notation
\begin{equation*}
\nabla_x^\top M(x, x') = \begin{bmatrix}
\nabla_x^\top [M(x, x')]_{:,1} & \cdots & \nabla_x^\top [M(x, x')]_{:,d}
\end{bmatrix}
=\begin{bmatrix}
\operatorname{div}_x [M(x, x')]_{:,1} & \cdots & \operatorname{div}_x [M(x, x')]_{:,d}
\end{bmatrix},
\end{equation*}
and similarly
\begin{equation*}
M(x, x') \nabla_{x'} = {\left(\nabla_{x'}^\top M(x, x')^\top\right)}^\top =
\begin{bmatrix}
\operatorname{div}_{x'} [M(x, x')]_{1,:} &
\cdots &
\operatorname{div}_{x'} [M(x, x')]_{d,:}
\end{bmatrix}^\top
\end{equation*}
and
\begin{equation*}
\nabla_x^\top M(x, x') \nabla_{x'} =
\nabla_x^\top (M(x, x') \nabla_{x'}) =
\sum_{i,j=1}^d \frac{\partial^2}{\partial x_i \partial x'_j} {[M(x, x')]}_{i,j}.
\end{equation*}

Thus, given samples $\{(P_{|x^i}, y^i)\}_{i=1}^n \stackrel{\text{i.i.d.}}{\sim} \mathbb{P}(P_{|X}, Y)$, an unbiased estimator of statistic $C_{P_{|\cdot}}(\mathbb{P})$ is
\begin{equation*}
    \widehat{C_{P_{|\cdot}}} = \frac{2}{n(n-1)} \sum_{1 \leq i < j \leq n} H((P_{|x^i}, y^i), (P_{|x^j}, y^j)),
\end{equation*}
where $H$ is given by \cref{eq:calibration-gof-test-statistics-advanced}.

If kernel $K$ is of the form in \cref{main-eq:kernel_identity}, we recover the simpler formula in \cref{main-eq:calibration-cgof-test-statistic}.
In this case $A((p, y), (p', y')) = k(p, p') l(y, y') I_{d_y} \in \mathbb{R}^{d_y \times d_y}$, i.e., $A$ is a scaled identity matrix.

\section{KCCSD as a special case of SKCE}\label{app-sec:kccsd-relation-skce}

We prove the following general lemma that establishes the KCSD as a special case of the MMD.
Then \cref{main-prop:kccsd-relation-skce} follows immediately by considering random variables $Z = P_{|X}$ and $Y$, and models $Q_{|z} = z = P_{|x}$.

\begin{lemma}[KCSD as a special case of the MMD]\label{lemma:kcsd-relation-mmd}
Let $Q_{|z}$ be models of the conditional distributions $\mathbb{P}(Y \in \cdot \,|\, Z = z)$.
Moreover, we assume that
\begin{itemize}
    \item $Q_{|z}$ has a density $f_{Q_{|z}} \in C^1(\mathcal{Y}, \mathbb{R})$ for $\mathbb{P}(Z)$-almost all $z$,
    \item kernel $l \in C^2(\mathcal{Y} \times \mathcal{Y}, \mathbb{R})$,
    \item $\E_{(z,y) \sim \mathbb{P}(Z, Y)} \left\|K_{z} \xi_{Q_{|z}}(y, \cdot) \right\|_{\mathcal{F}_K} < \infty$, and
    \item $\oint_{\partial \mathcal{Y}} l(y, y') f_{Q_{|z}}(y') n(y') \, \mathrm{d}S(y') = 0$ and $\oint_{\partial \mathcal{Y}} \nabla_{y} l(y, y') f_{Q_{|z}}(y') n(y') \, \mathrm{d}S(y') = 0$ for $\mathbb{P}(Z)$-almost all $z$,
\end{itemize}
where $n(y)$ is the unit vector normal to the boundary $\partial \mathcal{Y}$ of $\mathcal{Y}$ at $y \in \partial \mathcal{Y}$.%
\footnote{These assumptions are not restrictive in practice since they are satisfied
if the conditions of \cite[Theorem~1]{jitkrittum2020testing} hold
which are required to ensure that $D_{Q_{|\cdot}}(\mathbb{P}) = 0$ if and only if $Q_{|Z}(\cdot) = \mathbb{P}(Y \in \cdot | Z)$ $\mathbb{P}(Z)$-almost surely.}

Then
\begin{equation*}
    D_{Q_{|\cdot}}(\mathbb{P}) = \operatorname{MMD}_{k_{Q_{|\cdot}}}^2(\mathbb{P}(Z, Y), \mathbb{P}_{Q_{|\cdot}}(Z, Y) )
\end{equation*}
where we define distribution $\mathbb{P}_{Q_{|\cdot}}$ by
\begin{equation*}
    \mathbb{P}_{Q_{|\cdot}}(Z \in A, Y \in B) := \int_A Q_{|z}(Y \in B) \, \mathbb{P}(Z \in \mathrm{d}z)
\end{equation*}
and kernel $k_{Q_{|\cdot}} \colon (\mathcal{Z} \times \mathcal{Y}) \times (\mathcal{Z} \times \mathcal{Y}) \to \mathbb{R}$ as
\begin{equation*}
    k_{Q_{|\cdot}}((z, y), (z', y')) := (s_{Q_{|z'}}(y') + \nabla_{y'})^\mathsf{T} A((z, y), (z', y')) (s_{Q_{|z}}(y) + \nabla_y),
\end{equation*}
using the same notation as in \cref{app-sec:cgof-general-kernel} and similarly defining $A((z, y), (z', y')) \in \mathbb{R}^{d_y \times d_y}$ by
\begin{equation*}
    \left[A((z, y), (z', y'))\right]_{i,j} := (K_{i,j}(z, z') l(y, \cdot))(y') \qquad (1 \leq i, j \leq d_y).
\end{equation*}
If $K$ is of the form $k(\cdot, \cdot) I_{\mathcal{F}_l^{d_y}}$, function $A$ simplifies to
\begin{equation*}
    A((z, y), (z', y')) = k(z, z') l(y, y') I_{d_y}
\end{equation*}
and kernel $k_{Q_{|\cdot}}$ is given by
\begin{multline*}
k_{Q_{|\cdot}}((z, y), (z', y')) \\
= k(z, z') \left[ l(y, y') s_{Q_{|z}}(y)^\mathsf{T} s_{Q_{|z'}}(y') + s_{Q_{|z}}(y)^\mathsf{T}\nabla_{y'} l(y, y') + s_{Q_{|z'}}(y')^\mathsf{T} \nabla_y l(y, y') + \sum_{i=1}^{d_y} \frac{\partial^2}{\partial y_i \partial y'_i} l(y, y')\right].
\end{multline*}
\end{lemma}

\begin{proof}
From a similar calculation as in \cref{app-sec:cgof-general-kernel}~\cite[cf.][Section~A.2]{jitkrittum2020testing} we obtain that
\begin{equation*}
    k_{Q_{|\cdot}}((z, y), (z', y')) = \bigg\langle K_z \xi_{Q_{|z}}(y, \cdot), K_{z'} \xi_{Q_{|z'}}(y', \cdot) \bigg\rangle_{\mathcal{F}_K}.
\end{equation*}

Thus $k_{Q_{|\cdot}}$ is an inner product of the features of $(z, y)$ and $(z', y')$ given by the feature map $(z, y) \mapsto K_z \xi_{Q_{|z}}(y, \cdot) \in \mathcal{F}_K$,
and therefore $k_{Q_{|\cdot}}$ is a positive-definite kernel.
Moreover, from our assumption we obtain
\begin{equation*}
    \E_{(z, y) \sim \mathbb{P}(Z, Y)} {|k_{Q_{|\cdot}}((z, y), (z, y))|}^{1/2} = \E_{(z, y) \sim \mathbb{P}(Z, Y)} \left\|K_{z} \xi_{Q_{|z}}(y, \cdot) \right\|_{\mathcal{F}_K} < \infty.
\end{equation*}
Thus the mean embedding $\mu_{\mathbb{P}(Z, Y)} \in \mathcal{F}_K$ of $\mathbb{P}(Z, Y)$ exists~\citep[][Lemma~3]{gretton2012kernel}.

Due to the Bochner integrability of $(z, y) \mapsto K_{z} \xi_{Q_{|z}}(y, \cdot)$ expectation and inner product commute~\citep[see][Definition~A.5.20]{Steinwart2008SVM}, and hence we have
\begin{equation*}
\begin{split}
\E_{(z, y) \sim \mathbb{P}_{Q_{|\cdot}}(Z, Y)} \E_{(z', y') \sim \mathbb{P}_{Q_{|\cdot}}(Z, Y)} k_{Q_{|\cdot}}((z, y), (z', y'))
&= \left \|\E_{(z, y) \sim \mathbb{P}_{Q_{|\cdot}}(Z, Y)} K_{z} \xi_{Q_{|z}}(y, \cdot) \right\|^2_{\mathcal{F}_K} \\
&= \left \|\E_{z \sim \mathbb{P}(Z)}\E_{y \sim Q_{|z}} K_{z} \xi_{Q_{|z}}(y, \cdot) \right\|^2_{\mathcal{F}_K} \\
&= \left \|\E_{z \sim \mathbb{P}(Z)} K_z \E_{y \sim Q_{|z}} \xi_{Q_{|z}}(y, \cdot) \right\|^2_{\mathcal{F}_K}.
\end{split}
\end{equation*}
Due to the last assumption~\citep[][Lemma~5.1]{Chwialkowski16KGOF} we know that
\begin{equation*}
\E_{y \sim Q_{|z}} \xi_{Q_{|z}}(y, \cdot) = 0,
\end{equation*}
which implies
\begin{equation*}
    \E_{(z, y) \sim \mathbb{P}_{Q_{|\cdot}}(Z, Y)} \E_{(z', y') \sim \mathbb{P}_{Q_{|\cdot}}(Z, Y)} k_{Q_{|\cdot}}((z, y), (z', y')) = 0.
\end{equation*}
Thus the mean embedding $\mu_{\mathbb{P}_{Q_{|\cdot}}(Z, Y)} \in \mathcal{F}_K$ of $\mathbb{P}_{Q_{|\cdot}}(Z, Y)$ exists and satisfies $\|\mu_{\mathbb{P}_{Q_{|\cdot}}(Z, Y)}\|^2_{\mathcal{F}_K} = 0$, and hence $\mu_{\mathbb{P}_{Q_{|\cdot}}(Z, Y)} = 0$.
We obtain~\citep[][Lemma~4]{gretton2012kernel} that
\begin{equation*}
\begin{split}
\operatorname{MMD}^2_{k_{Q_{|\cdot}}}(\mathbb{P}(Z, Y), \mathbb{P}_{Q_{|\cdot}}(Z, Y)) &= \|\mu_{\mathbb{P}(Z, Y)} - \mu_{\mathbb{P}_{Q_{|\cdot}}(Z, Y)}\|^2_{\mathcal{F}_K} \\
&= \|\mu_{\mathbb{P}(Z, Y)}\|^2_{\mathcal{F}_K} \\
&= \E_{(z, y) \sim \mathbb{P}(Z, Y)} \E_{(z', y') \sim \mathbb{P}(Z, Y)} k_{Q_{|\cdot}}((z, y), (z', y')) \\
&= \E_{(z, y) \sim \mathbb{P}(Z, Y)} \E_{(z', y') \sim \mathbb{P}(Z, Y)} \bigg\langle K_z \xi_{Q_{|z}}(y, \cdot), K_{z'} \xi_{Q_{|z'}}(y', \cdot) \bigg\rangle_{\mathcal{F}_K} \\
&= D_{Q_{|\cdot}}(\mathbb{P}),
\end{split}
\end{equation*}
where the last equality follows from \cite[][Section~A.2]{jitkrittum2020testing}.
\end{proof}

\section{Calibration implies expected coverage}

We show that the sense of calibration employed by our tests implies posterior coverage in the sense of \citet{Hermans2021}.
Again, let $P_{|x}(\cdot)$ denote a model of the conditional distribution $\mathbb{P}(Y \in \cdot \mid X = x)$.
Moreover, we assume that $P_{|x}$ has a density $f_{P_{|x}}$ for $\mathbb{P}(X)$-almost every $x$.

For level $1 - \alpha \in [0, 1]$, let $\Theta_{P_{|x}}(1 - \alpha)$ be the highest density region of a probabilistic model $P_{|x}$ with density $f_{P_{|x}}$.
It is defined~\citep[see, e.g.,][]{Hyndman1996} by
\begin{equation*}
    \Theta_{P_{|x}}(1 - \alpha) \coloneqq \left\{ y \colon f_{P_{|x}}(y) \geq c_{P_{|x}}(1 - \alpha) \right\}
\end{equation*}
where
\begin{equation*}
    c_{P_{|x}}(1 - \alpha) := \sup \left\{ c \colon \int_{\left\{\tilde{y} \colon f_{P_{|x}}(\tilde{y}) \geq c \right\}} \, P_{|x}(\mathrm{d}y) \geq 1 - \alpha \right\}.
\end{equation*}
Hence, by definition \citep[see, e.g.,][]{Hermans2021}
\begin{equation*}
    \E_{y \sim P_{|x}} \mathbbm{1}\big\{y \in \Theta_{P_{|x}}(1 - \alpha)\big\}
    =\int_{\Theta_{P_{|x}}(1 - \alpha)} \, P_{|x}(\mathrm{d}y) \geq 1 - \alpha.
\end{equation*}

Assume that model $P_{|\cdot}$ is calibrated.
By definition, it satisfies
\begin{equation*}
    \mathbb{P}(Y \in \cdot \mid P_{|X}) = P_{|X} \qquad \mathbb{P}(X)\text{-almost surely}.
\end{equation*}
Hence, for all $\alpha \in [0,1]$, we obtain
\begin{equation*}
\begin{split}
     \E_{(x, y) \sim \mathbb{P}(X, Y)} \mathbbm{1}\big\{y \in \Theta_{P_{|x}}(1 - \alpha)\big\} 
     &= \E_{(P_{|x}, y) \sim \mathbb{P}(P_{|X}, Y)} \mathbbm{1}\big\{y \in \Theta_{P_{|x}}(1 - \alpha)\big\} \\
     &= \E_{P_{|x} \sim \mathbb{P}(P_{|X})} \E_{y \sim P_{|x}} \mathbbm{1}\big\{y \in \Theta_{P_{|x}}(1 - \alpha)\big\} \\
     &\geq \E_{P_{|x} \sim \mathbb{P}(P_{|X})} \big[1 - \alpha \big] \\
     &= 1 - \alpha.
\end{split}
\end{equation*}
Thus model $P_{|\cdot}$ has expected coverage for all $\alpha \in [0, 1]$.

\section{Diffusion-Limit and Universality}
\subsection{Fisher divergence as a diffusion limit}\label{app-sec:limit-fisher-divergence}

We recall that for a map $ f $ and a measure $ \mu $, the push-forward measure of $ \mu $ by $ f $, denoted $ f_{\#} \mu $,
is the measure on the image space of $ f $ that satisfies, for any measurable function $ g $,
\begin{equation*}
    \int_{  }^{  } g(x) \, f_{\#} \mu(\mathrm{d}x) = \int_{  }^{  } g(f(x)) \, \mu(\mathrm{d}x).
\end{equation*}

To prove the differential inequality linking the MMD and the KGFD, we rely on the following reformulation of the Fokker--Planck equation:
\begin{equation*}
\begin{split}
    \frac{\partial \mu(x, t)}{\partial t} &= \operatorname{div}_x(-\mu(x, t) s_p(x)) + \Delta_x \mu(x, t) \\
					  &= \operatorname{div}_x(-\mu(x, t) s_p(x)) + \operatorname{div}_x \nabla_{ x } \mu(x, t) \\
					  &= \operatorname{div}_x(-\mu(x, t) s_p(x)) + \operatorname{div}_x(\mu(x, t) \nabla_x \log \mu(x, t)) \\
					  &= \operatorname{div}_x(- \mu(x, t)(s_p(x) - \nabla_x \log \mu(x, t))).
\end{split}
\end{equation*}
We remark that since the density $\mu(x, t)$ is twice differentiable in $ x $ and differentiable in $ t $~\citep{johnson2004information}, this equation holds in the strong sense, and not only in the sense of distributions. Because of that, one has
\begin{equation*}
    \partial_t \mu(x, t) = \lim_{ \Delta \to 0 } \frac{\mu(x, t + \Delta) - \mu(x, t)}{\Delta}.
\end{equation*}

Let us consider an RKHS $ \mathcal  H $ with kernel $ k $, and let  $ h \in \mathcal  H $.
Let us define $m_t(x) := m(x, t) := \mu_{\nu,p}(x, t) - \mu_{\nu,q}(x, t) $, and denote by $ \operatorname{MMD}(m_t)$ the function given by
\begin{equation*}
\operatorname{MMD}(m_t) = \left[\iint k(x, y) m_t(x) m_t(y) \, \mathrm{d}x \, \mathrm{d}y\right]^{1/2} = \operatorname{MMD}(\mu_{\nu,p}(\cdot, t), \mu_{\nu,q}(\cdot, t)).
\end{equation*}
To show that $ \lim_{ t  \to 0 } \frac{ \mathrm{d} }{ \mathrm{d}t }\operatorname{MMD}(m_t) = \operatorname{KGFD}(p, q) $,
we first analyze the differential properties of the easier to handle $\operatorname{MMD}^2$ and complete the proof using a chain rule argument.
The first variation (also called the Gateaux derivative) of $m  \mapsto \operatorname{MMD}^2(m)$ is a linear functional
on the space of functions
\begin{equation*}
    \left\{f - g \,\middle|\, f, g \colon \mathcal{X} \times [0, \infty) \to \mathbb{R} \quad \text{with} \quad \forall t \geq 0 \colon \int_{\mathcal{X}} f(x, t) \,\mathrm{d}x = \int_{\mathcal{X}} g(x, t) \,\mathrm{d}x = 1 \right\},
\end{equation*} given by
\begin{equation*}
\frac{ \delta \operatorname{MMD}^2 }{ \delta m} \colon f \mapsto \int 2 k(x, y) m_t(x) f(y) \,\mathrm{d}x \,\mathrm{d}y.
\end{equation*}
Using the chain rule for Gateaux derivatives, we have that
\begin{equation*}
\begin{split}
    \frac{\mathrm{d} \operatorname{MMD}^2(m)}{\mathrm{d} t} &= \frac{ \mathrm{d} \operatorname{MMD}^2}{ \mathrm{d} m}(m) \frac{ \mathrm{d} m}{ \mathrm{d} t}\\ 
	&= \int 2 k(x,y) m_t(x) \frac{\mathrm{d} m}{\mathrm{d} t}(y) \,\mathrm{d}x \,\mathrm{d}y.
\end{split}
\end{equation*}
From the Fokker--Planck equation, we have that
\begin{equation*}
\begin{split}
    \frac{\mathrm{d}m }{\mathrm{d}t} &= \partial_t \mu_{\nu, p} - \partial_t \mu_{\nu, q} \\
				    &= \operatorname{div}_x (\mu_{\nu, p} \nabla_x \log \frac{p}{ \mu_{\nu, p} }) - \operatorname{div}_x (\mu_{\nu, q} \nabla_x \log \frac{q}{ \mu_{\nu, q} }) \\
				    &= \operatorname{div}_x (\nu \nabla_x \log \frac{ p }{ \nu}) - \operatorname{div}_x (\nu \nabla_x \log \frac{ q }{ \nu }) + o(1) \\
                   &= \operatorname{div}_x (\nu \nabla_x \log \frac{p}{q}) + o(1).
\end{split}
\end{equation*}
% Note that $t \longmapsto \mu_{t, p}$ and $t \longmapsto \mu_{t, q}$ are continuous curves under the wasserstein-2 geometry, which metricizes the weak convergence of measure. Thus, expectations under $\mu_{t, p}$ and $\mu_{t, q}$ can be replaced by
% expectations under $\nu$.
% As a consequence, the previous equation simplifies, in the weak sense, to
% \begin{equation*}
% \begin{split}
%     \frac{\mathrm{d}m }{\mathrm{d}t} &= \operatorname{div}_x (\nu \nabla_x \log \frac{ p }{ \nu}) - \operatorname{div}_x (\nu \nabla_x \log \frac{ q }{ \nu }) + o(1) \\
%                     &= \operatorname{div}_x (\nu \nabla_x \log \frac{p}{q}) + o(1)
% \end{split}
% \end{equation*}
% Than as $\mu_{\nu, p}(x, t) = \nu(x) + o(1) $ and $ \mu_{\nu, q}(x, t) = \nu(x) + o(1) $.
%All residuals in the proof are made
%We make the $o(1)$ uniform in $x$ by splitting $\mathcal X = \mathbb R^d$ between a ball $B(0_{\mathbb R^d}, R)$ and its complement, uniformly bounding the residual on $R$. The residual on the complement has a negligible impact
Plugging the last equation in the chain rule, we have:
\begin{equation*}
\begin{split}
	\frac{\mathrm{d} \operatorname{MMD}^2(m)}{\mathrm{d}t}					&= \int_{  }^{  } 2 m_t(x) \operatorname{div}_y \left(\nu(y) \nabla_{ y } \log \frac{ p }{ q }(y)\right) k(x,y) \,\mathrm{d}x \,\mathrm{d}y + o(1) \\
						& = \int_{  }^{  } 2m_t(x) \left \langle \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \,\mathrm{d}x \,\mathrm{d}y + o(1).
\end{split}
\end{equation*}
Similarly, since $ m_0 = \mu_{\nu, p}(\cdot, 0) - \mu_{\nu, q}(\cdot, 0) = \nu - \nu = 0 $, we have $ m_t(x) = t \partial_t m(x, 0) + o_x(t)$.
%The term $o(t)$ can be controlled by splitting $\mathcal X = \mathbb R^d$ into a boub
The calculation follows as:
\begin{equation*}
\begin{split}
    \frac{\mathrm{d} \operatorname{MMD}^2(m)}{ \mathrm{d} t} &= \int 2 t \times \partial_t m(x, 0) \left \langle \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \,\mathrm{d}x \,\mathrm{d}y + o(t) \\
	&= \int 2 t \times \operatorname{div}_x \nu(x) \nabla_{ x } \log \frac{ p }{ q }(x) \left \langle \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \,\mathrm{d}x \,\mathrm{d}y  + o(t) \\
	&= \int 2 t \times \left \langle  \nu(x)\nabla_{ x }\log \frac{ p }{ q }(x), \nabla_{ x }  \left \langle \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \right \rangle \,\mathrm{d}x \,\mathrm{d}y  + o(t)\\
	&= \int 2 t \times \left \langle  \nu(x)\nabla_{ x }\log \frac{ p }{ q }(x), \nabla_{ x }  \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \,\mathrm{d}x \,\mathrm{d}y + o(t).
\end{split}
\end{equation*}
To get rid of the degenerate scaling as $ t  \to 0 $, we now focus on (the
derivative of) $ \sqrt {\operatorname{MMD}^2(m_t)}  $ as $ t  \to 0 $. Notice that
since $ \operatorname{MMD}(m_0) = 0 $, the derivative of $ \sqrt {\operatorname{MMD}^2(m_t)}$
does not exist a priori at $ t=0 $: we consider instead  $ \frac{\mathrm{d}}{\mathrm{d}t } \sqrt {\operatorname{MMD}^2(m_t)} $ for $ t > 0 $, and extend it by continuity by letting $
t  \to 0 $. We have:
\begin{equation*}
\begin{split}
\frac{\mathrm{d}\sqrt {\operatorname{MMD}^2(m_t)}  }{ \mathrm{d}t } = \frac{1}{2 \sqrt {\operatorname{MMD}^2(m_t)} } \frac{ \mathrm{d} \operatorname{MMD}^2(m_t)}{ \mathrm{d} t}.
\end{split}
\end{equation*}
As
\begin{equation*}
\operatorname{MMD}^2(m_t) = \int_{  }^{  } k(x, y) m_t(x) m_t(y) \,\mathrm{d}x \,\mathrm{d}y
\end{equation*}
we obtain through similar calculations that
\begin{equation*}
    \operatorname{MMD}^2(m_t) = \int_{  }^{  } \int_{  }^{  } t^2\left \langle  \nu(x)\nabla_{ x }\log \frac{ p }{ q }(x), \nabla_{ x }  \nabla_{ y }  k(x, y), \nu(y) \nabla_{ y } \log \frac{ p }{ q }(y) \right \rangle \,\mathrm{d}x \,\mathrm{d}y + o(t^2)
\end{equation*}
from which the result follows. Note that the matrix-valued kernel $ (K(x,
y))_{ij} = (\nabla_{x}  \nabla_{y}  k(x, y))_{ij}$ is positive
definite, a result akin to one of \citet{zhou2008derivative} but for the matrix-valued case.
Indeed, for all $ x, y \in
\mathcal  X $, $ z, t \in  \mathbb{R}^d $, 
\begin{equation*}
    z^\top K(x, y) t = \left\langle \sum\limits_{i=1}^{d} z_i \partial_i k(x, \cdot), \sum\limits_{i=1}^{d} t_i \partial_i k(y, \cdot) \right \rangle_{\mathcal H},
\end{equation*}
where $ \partial_i k(x, \cdot) \in \mathcal  H $~\citep{zhou2008derivative}.
In the following, we write $ \phi(x, y) = \sum_{i=1}^{d} y_i \partial_i
k(x, \cdot) $. Now, for all sets $ \{ x^{i} \}_{i=1}^{n}
\subset \mathcal  X  $ and
$\{ y^{i} \}_{i=1}^{n} \subset  \mathbb{R}^d$, we have
\begin{equation*}
\begin{split}
    \sum\limits_{ i, j=1 }^{ n } \left \langle K(x^i, x^j) y^j, y^i \right \rangle_{\mathbb{R}^d}
    &= \sum\limits_{ i, j=1 }^{ n } \left \langle \phi(x^i, y^i), \phi(x^j, y^j)  \right \rangle_{\mathcal  H} \\ 
    &=  \left \langle \sum\limits_{ i=1 }^{  n} \phi(x^i, y^i), \sum\limits_{ i=1 }^{ n } \phi(x^i, y^i) \right \rangle_{\mathcal  H}  \geq  0
\end{split}
\end{equation*}
from which it follows that $ K $ is indeed positive definite~\citep[Theorem
2.1]{micchelli2005learning}.

\subsection{Universality of the Exponentiated-GFD and Exponentiated-KGFD kernels}
To prove the universality of $K_\nu$ and $K_{\nu, K}$ under the assumptions discussed in the related propositions,
we rely on the following theorem~\citep[Theorem 2.2]{christmann2010universal}.
\begin{theorem}
On a compact metric space $(\mathcal Z, d_{\mathcal Z})$ and for a continuous
and injective map $\phi \colon \mathcal Z \to H$,  where $H$ is a separable
Hilbert space, the kernel
$K(z, z') = e^{-\gamma \| \phi(z) - \phi(z')\|^2_H}$ is universal.
\end{theorem}
We first focus on the universality of $K_\nu$.
We set as our goal to apply that theorem to our setting, in which $\mathcal Z := \mathcal P_\mathcal X$ 
is a (sub)set of probability densities, which needs to be associated with a suitably chosen metric
in order to make $\mathcal P_\mathcal X$ compact, and $\phi$ continuous.
As bounded subsets of differentiable densities, whose elements can be framed as elements
of the Sobolev space of first order $\mathcal W^{2, 1}(\nu)$~\citep{taylor1996partial}, are not compact a priori,
we restrict ourselves to twice-differentiable densities with bounded Sobolev norm of second order, i.e., to $\mathcal W^{2,2 }(\nu)$ with norm $\|p\|^2_{\mathcal W^{2, 2}} \coloneqq \|p\|_{\mathcal L_2(\nu)}^2+ \sum_{i=1}^{d} \|\partial_i p\|^2_{\mathcal L_2(\nu)} + \sum_{i, j=1}^{d} \|\partial_i\partial_j p\|^2_{\mathcal{L}_2(\nu)}$.
From the Rellich-Kondrachov theorem~\citep{taylor1996partial}, 
we know that when $\nu$ has compact support, the canonical injection $I \colon \mathcal W^{2,2}(\nu) \to \mathcal W^{2,1}(\nu)$ is a compact operator.
As a consequence, for any bounded subset $A$ of $\mathcal P_{\mathcal X}$ we thus have that $I(A)$ is compact for $\|f\|_{\mathcal W^{2, 1}}^2:=\|f\|_{\mathcal L_2(\nu)}^2+ \sum_{i=1}^{d} \|\partial_i f\|^2_{\mathcal L_2(\nu)} $, which implies that any bounded subset $A$ of $\mathcal P_{\mathcal X}$ is compact for $d(z, z') = \|z - z'\|_{\mathcal W^{2, 1}}$.
To apply the above theorem, it remains to prove the continuity and injectivity of $\phi \colon p \mapsto \nabla \log p$ under this metric (in that case the separable Hilbert space $H$ is set to $\mathcal L_2(\nu)$). And indeed, for such a choice of $d$, $\phi$ and $H$, $\phi$ is continuous. To prove this fact, remark that differentiable densities with full support on $\mathcal X$ are bounded away from $0$, making the map $\phi \colon p \mapsto \nabla \log p = \nabla p / p$ continuous.
Moreover, $\phi$ is injective as $d_{\mathcal W^{2, 1}}(p, q) \coloneqq \|p - q\|_{\mathcal W^{2, 1}} \neq 0$ implies $\|\nabla \log p  - \nabla \log q\|_{\mathcal L_2(\nu)} \neq 0$.
Thus, all conditions of \cite[Theorem 2.2]{christmann2010universal} are satisfied, and the result follows as a consequence.

We now move on to prove the universality of $K_{\nu, K}$.
The proof follows the same reasoning as the proof of the universality of $K_\nu$, the only difference being the fact that the feature map $\tilde \phi$ of $K_{\nu, K}$ is given by $T_{K, \nu} \circ \phi$, where $\phi \colon p \mapsto \nabla \log p$ and $T_{K, \nu} \colon \mathcal{L}(\mathcal{X}, \mathbb{R}^d) \to  \mathcal{H}_{K}$ is given by
\begin{equation*}\label{eq:k-int-op}
T_{K, \nu} \colon f \mapsto \int_{\mathcal{X}} K_x f(x)  \, \nu(\mathrm{d}x).
\end{equation*}
However, if $\nu$ is a probability measure and $K$ is bounded, then $T_{K, \nu}$ is a bounded operator, and thus continuous, making $\tilde \phi$ continuous.
Moreover, if $K$ is characteristic, $T_{K, \nu}$ is injective.
Thus $\tilde \phi$ is injective and continuous, from which the result follows by \cite{christmann2010universal}.


\section{Background on Stein and Fisher divergences} \label{app-sec:background-divergences}

\paragraph{The Fisher Divergence} 
Consider two continuously differentiable densities $p$ and $q$ on $\mathbb{R}^d$.
Then the Fisher divergence~\citep{sriperumbudur2017density,johnson2004information} between $p$ and $q$ is defined as:
\begin{equation*}
\operatorname{FD}(p||q) = \int_{ \mathbb{R}^d }^{  } \left \| \nabla_{  } \log p(x) - \nabla_{  } \log q(x) \right \|_{2}^{2} p(x) \,\mathrm{d}x.
\end{equation*}
We refer to \cite{sriperumbudur2017density} for an overview of the properties of the Fisher divergence, including its relative strength w.r.t.\ other divergences, and other formulations. The Fisher divergence was used for learning statistical models of some training data in \cite{hyvarinen2005estimation, sriperumbudur2017density}, and more recently in \cite{song2019generative}.

\paragraph{Stein Discrepancies} 
Closely related to the Fisher divergence is the family of Stein discrepancies~\citep{anastasiou2022stein}.
Stein discrepancies build upon the concept of Stein operators, which are operators $\mathcal{A}_{\mathbb{P}}$ such that
\begin{equation*} 
\E_{\mathbb{Q}}\left[ \mathcal  A_{\mathbb P} f \right ] = 0 \iff \mathbb{ Q } = \mathbb{ P }
\end{equation*}
for any $ f $ within a set $ \mathcal  G(\mathcal  A_{\mathbb{ P }}) \subset \operatorname{dom}(\mathcal A_{\mathbb P}) $ called the \emph{Stein class}
of $ \mathcal  A_{\mathbb P} $.
Following this definition, the $ \mathcal  A_{\mathbb{ P }} $-Stein discrepancy is defined as
\begin{equation*} 
\operatorname{SD}_{\mathcal  A_{\mathbb{ P }}}(\mathbb{ P }, \mathbb{ Q }) = \sup_{ f \in \mathcal  G(\mathcal  A_{\mathbb{ P }}) }\left \|\E_{ \mathbb{ Q } }  \mathcal  A_{\mathbb{ P }} f \right \|
\end{equation*}
which satisfies by construction the axioms of a \emph{dissimilarity} (or \emph{divergence}) measure between $ \mathbb{ P } $ and $ \mathbb{ Q } $.

\paragraph{Link Between the Fisher divergence and Diffusion Stein Discrepancies}
Perhaps the most famous Stein discrepancy is the one that sets $\mathcal  {A}_{\mathbb{P}}$ to be the infinitesimal generator of the isotropic diffusion process toward $\mathbb{ P } $~\citep{gorham2019measuring}:
\begin{equation*}
    \begin{cases}
    \mathrm{d}X_t &= \nabla_{  } \log p(X_t) \,\mathrm{d}t + \sqrt{2}\,\mathrm{d}W_t \\
    (\mathcal  A_{d, \mathbb{P}}f)(\cdot)  &= \left \langle \nabla_{  }  \log p(\cdot), \nabla_{  }  f   \right \rangle  + \left \langle \nabla_{  } , \nabla_{  } f   \right \rangle 
    \end{cases} 
\end{equation*}
Recalling that $\E_{ \mathbb{   P} }\left [ \mathcal A_{d, \mathbb{ P }} f \right ]  = 0$ for all $f \in \mathcal  G(\mathcal  A_{d, \mathbb{ P }}) $, we obtain the following formulation for the diffusion Stein discrepancy
\begin{equation*}
\begin{split}
    \operatorname{SD}_{\mathcal  A_{d, \mathbb{P}}}(\mathbb{ P }, \mathbb{ Q }) &\coloneqq \sup_{  f } \left \| \E_{  \mathbb{ Q } } \mathcal  A_{d, \mathbb{ P }} f \right \| = \sup_{  f } \left \| \E_{  \mathbb{ Q } } (\nabla_{  }   \log p - \nabla_{  }   \log q)^{\top} \nabla_{  }  f  \right \| \\
							    &= \sup_{g =  \nabla_{  } f} \left \| \E_{  \mathbb{ Q } } (\nabla_{  }   \log p - \nabla_{  }   \log q)^{\top} g  \right \|,
\end{split}
\end{equation*}
highlighting the connection between the Fisher divergence and the diffusion Stein discrepancy.

\paragraph{Link Between the Fisher divergence and the Kernelized Stein Discrepancy}
% The stein class of the diffusion stein discrepancy has to be included in the
% domain $ \mathcal  A_{d, \mathbb{ P }} $.
Given a RKHS $\mathcal{H}$ such that $B_{\mathcal{H}^{\otimes d}}(0_{\mathcal{H}^{\otimes d}}, 1)$ is a Stein class for $\mathcal{A}_{d, \mathbb{P}}$, the kernelized Stein discrepancy~\citep{gorham2017measuring} is given by
\begin{equation*}\label{eq:KSD}
\begin{split}
    \operatorname{KSD}(\mathbb{ P }, \mathbb{ Q }) &\coloneqq \sup_{ h = \nabla_{  } f  \in \mathcal  H^{\otimes d}: \left \|h \right \|_{\mathcal H^{\otimes d}} \leq 1 } \left \| \E_{  \mathbb{ Q } } \left \langle \nabla_{  }   \log p(x) - \nabla_{  }   \log q(x), h(x)\right \rangle \right \|\\
					     &=  \sup_{ h = \nabla_{  } f  \in \mathcal  H^{\otimes d}: \left \|h \right \|_{\mathcal H^{\otimes d}} \leq 1 } \left \langle h, \E_{ \mathbb{ Q } } (\nabla_{  }  \log p(x) - \nabla_{  }  \log q(x)  )k(x, \cdot)\right \rangle_{\mathcal H^{\otimes d}}  \\
					     &=  \left \| \E_{ \mathbb{ Q } } \left \lbrack  (\nabla_{  }  \log p(x) - \nabla_{  }  \log q(x)  ) k(x, \cdot) \right \rbrack  \right \|_{\mathcal  H^{\otimes d}} \\
					     &= \left \| I^\star_{k, \mathbb{ Q }} (\nabla_{  }  \log p - \nabla_{  }  \log q  ) \right \|_{\mathcal  H^{\otimes d}}
\end{split}
\end{equation*}
where $I^\star_{k, \mathbb{ Q }}$ is the adjoint of the canonical injection from $ \mathcal  H^{\otimes d} $ to $ (L^2(\mathbb{ Q }))^{\otimes d} $, also known as the \emph{kernel integral operator}.
This derivation shows that the KSD can be seen as a kernelized version of the Fisher divergence.


\paragraph{Link between MMD and KSD} 
It is possible~\citep{gorham2017measuring} to reframe the KSD as an MMD with a specific kernel.
Indeed, given some base kernel $k(x, y)$, define the following ``Stein'' kernel
\begin{equation*}\label{eq:stein-kernel}
    \tilde{ k }(x, y) = \left \langle \nabla_{  }  \log p(x)k(x, \cdot) + \nabla_{  }  k(x, \cdot), \nabla_{  }  \log p(y)k(y, \cdot) + \nabla_{  }  k(y, \cdot)     \right \rangle_{\mathcal  H^{\otimes d}}
\end{equation*}
which is positive definite as an inner product of a feature map of $x$.
Then $\mathcal  H_{ \tilde{ k}} = \mathcal A_{d, \mathbb{ P }}( \mathcal  H) $ and $ \left \| f \right \|_{ \mathcal  H_{\tilde{k}}} = \left \| \mathcal  A f \right \|_{\mathcal  H_{k}^{\otimes d}} $.
Moreover, we have that $ \E_{ \mathbb{ P } } \, \tilde{ h} = 0$ for all $\tilde{h} \in \mathcal  H_{ \tilde{k}} $.
By the definition of the KSD, we have that
\begin{equation*} 
\begin{split}
\operatorname{KSD}(\mathbb{ P }, \mathbb{ Q }) &= \sup_{ h   \in \mathcal  H^{\otimes d} \colon \left \|h \right \|_{\mathcal H^{\otimes d}} \leq 1 } \left \| \E_{ \mathbb{ Q } } \mathcal  A_{d, \mathbb{ P }} h  \right \|\\
				       &= \sup_{ h \in \mathcal  H^{\otimes d} \colon \left \|h \right \|_{\mathcal H^{\otimes d}} \leq 1 } \left \| \E_{ \mathbb{ Q } } \mathcal  A_{d, \mathbb{ P }} h  - \E_{  \mathbb{ P } } \mathcal  A_{d, \mathbb{ P }} h\right \|_{\mathcal  H} \\
				       &= \sup_{ h \in \mathcal  H_{\tilde{k}} \colon \left \|h \right \|_{\mathcal  H_{\tilde{k}}} \leq 1 } \left \| \E_{ \mathbb{ Q } }   h  - \E_{  \mathbb{ P } } h\right \|_{\mathcal  H_{\tilde{k}}} \\
				       &= \operatorname{MMD}_{\tilde{ k}}(\mathbb{ P }, \mathbb{ Q }).
\end{split}
\end{equation*}

\paragraph{Differential Inequalities between the KL and the Fisher Divergence} 
It is well known~\citep{carrillo2003kinetic} that the KL divergence can be related to the Fisher divergence by considering the evolution of $\operatorname{KL}(\mathbb{ P }_t||\mathbb{ Q })$ when $ \mathbb{ P }_t $ evolves according to the Fokker-Planck equation
\begin{equation}\label{eq:fokker-planck}
\partial_t p_t(x) = \operatorname{div} ( p_t(x) (\nabla_{  } \log p_t(x) - \nabla_{  }  \log q(x)  )), \quad \mathbb P_0=\mathbb P.
\end{equation}
(Two relevant side notes: for any $t\geq 0$, $\mathbb P_t$ is the law at time $t$ of the Markov process $(X_t)_{t\geq 0}$ such that $X_0 \sim \mathbb P$ and undergoing an isotropic diffusion towards $\mathbb Q$. Moreover, \cref{eq:fokker-planck} is also the Wasserstein gradient flow equation of $\operatorname{KL}(\cdot||\mathbb Q)$ starting from $\mathbb  P$.)
Recalling that \cref{eq:fokker-planck} is satisfied in the sense of distributions, and relying on G\^ateaux-derivative formulas for free-energy-type functionals~\citep[see][for more precise statements]{ambrosio2005gradient}, we have:
\begin{equation*}
\begin{split}
    \frac{\mathrm{d}\operatorname{KL}(\mathbb P_t || \mathbb Q) }{\mathrm{d}t} &= \frac{ \partial \operatorname{KL} }{\partial\mathbb{ P } }\bigg\rvert_{\mathbb{ P }_t} \frac{ \mathrm{d} \mathbb{ P }_t }{\mathrm{d}t } \\
				      &= \int_{ }^{  } \left \langle  \nabla_{  } (\log p_t(x) - \log q(x) ), (\nabla_{  }  \log q(x) - \nabla_{  }  \log p_t(x)  )\right \rangle  \,\mathrm{d}\mathbb{ P }_t(x) \\
				      &= - \operatorname{FD}(\mathbb{ P }_t, \mathbb{ Q }).
\end{split}
\end{equation*}

\begin{figure}[htbp]
    \centering%
    \includegraphics[width=.7\textwidth]{figures/fd_ksd_mmd_kl.png} %
    \caption{Relationships between the Fisher divergence, the KL divergence, the MMD, and the KSD~\citep{liu2016short}.}
    \label{fig:link-divergences}%
\end{figure}

\clearpage
\section{Experimental Results}

This section contains visualizations of all experiments discussed in \cref{main-sec:experiments}, including figures contained in the main text.
In all experiments we set the significance level to $\alpha = 0.05$.
Every experiment is repeated for 100 randomly sampled datasets and with 500 bootstrap iterations for estimating the quantile of the test statistic.

We use Gaussian distributions and compare the KCCSD and the SKCE with different combinations of kernels.
For the KCCSD, for Gaussian distributions all considered test statistics can be evaluated exactly.
Alternatively, for the exponentiated (kernelized) Fisher kernel and the exponentiated MMD kernel one can resort to approximations using samples from the base measure.
For the SKCE, however, the test statistic can be evaluated exactly only in special cases such as Gaussian kernels on the target space.
All approximate evaluations are performed with 10 samples.

\subsection{Mean Gaussian Model}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/mgm_kccsd.pdf}
    \caption{False rejection rate of the KCCSD for MGM ($\delta = 0$).}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/mgm_skce.pdf}
    \caption{False rejection rate of the SKCE for MGM ($\delta = 0$).}
    \label{fig:mgm_skce}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/pmgm_kccsd_delta=0.1_shift-dim=-1.pdf}
    \caption{Rejection rate of the KCCSD for MGM ($\delta = 0.1$, $c = \mathbf{1}_d$).}
    \label{fig:pmgm_kccsd_all}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/pmgm_skce_delta=0.1_shift-dim=-1.pdf}
    \caption{Rejection rate of the SKCE for MGM ($\delta = 0.1$, $c = \mathbf{1}_d$).}
    \label{fig:pmgm_skce_all}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/pmgm_kccsd_delta=0.1_shift-dim=1.pdf}
    \caption{Rejection rate of the KCCSD for MGM ($\delta = 0.1$, $c = e_1$).}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/pmgm_skce_delta=0.1_shift-dim=1.pdf}
    \caption{Rejection rate of the SKCE for MGM ($\delta = 0.1$, $c = e_1$).}
    \label{fig:pmgm_skce_first}
\end{figure}

\FloatBarrier
\subsection{Linear Gaussian Model}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/lgm_kccsd.pdf}
    \caption{False rejection rate of the KCCSD for LGM ($\delta = 0$).}
    \label{fig:lgm_kccsd}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/lgm_skce.pdf}
    \caption{False rejection rate of the SKCE for LGM ($\delta = 0$).}
    \label{fig:lg_skce}
\end{figure}

\FloatBarrier
\subsection{Heteroscedastic Gaussian Model}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/hgm_kccsd.pdf}
    \caption{Rejection rate of the KCCSD for HGM ($\delta = 1$).}
    \label{fig:hgm_kccsd}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/hgm_skce.pdf}
    \caption{Rejection rate of the SKCE for HGM ($\delta = 1$).}
    \label{fig:hgm_skce}
\end{figure}

\FloatBarrier
\subsection{Quadratic Gaussian Model}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/qgm_kccsd.pdf}
    \caption{Rejection rate of the KCCSD for QGM ($\delta = 1$).}
    \label{fig:qgm_kccsd}
\end{figure}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\linewidth]{figures/qgm_skce.pdf}
    \caption{Rejection rate of the SKCE for QGM ($\delta = 1$).}
    \label{fig:qgm_skce}
\end{figure}

\FloatBarrier
\bibliography{glaser_556}

\end{document}
