\documentclass[accepted]{uai2023}
\usepackage[american]{babel}
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}


%% packages
%\usepackage{graphicx}
%\usepackage{grffile}
\usepackage{longtable}
%\usepackage{wrapfig}
%\usepackage{rotating}
%\usepackage[normalem]{ulem}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{amsthm,mathtools}
\usepackage{bm}
%\usepackage{caption}
%\usepackage{comment}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{hyperref}
\usepackage{enumitem}
%\usepackage{subfigure}
\usepackage{subcaption}
%% commands

\newcommand*{\tran}{^{\mkern-1.5mu\mathsf{T}}} % transpose
\newcommand{\E}{\mathbb{E}}   % expectation
\newcommand{\F}{\mathcal{F}}  % function class in IPM
\newcommand{\G}{\mathcal{G}_k}% RKHS
\newcommand{\Q}{\mathbb{Q}}   % expectation
\newcommand{\R}{\mathbb{R}}   % reals
\newcommand{\X}{\mathcal{X}}  % domain
\newcommand{\Y}{\mathcal{Y}}  % domain
\newcommand{\ip}[2]{\left\langle{#1}\right\rangle_{#2}} % inner product
\newcommand{\kme}[1]{\mu_{k_{#1}}(\P{#1})} % kernel mean embedding
\newcommand{\norm}[2]{\left\|{#1}\right\|_{#2}} % norm
\newcommand{\nys}{\text{Nys}} % Nyström estimator
\newcommand{\opnorm}[1]{\left\|{#1}\right\|_{\mathrm{op}}} %op-norm
\newcommand{\prodmarginals}{\otimes_{m=1}^M \P_ m} %product of marginals
\newcommand{\tb}{\textbf}    % bold text
\newcommand{\tphs}{\otimes_{m=1}^M \H_{k_m}} % tensor product Hilbert space
\renewcommand{\F}{\mathcal{F}}% unit ball in an RKHS
\renewcommand{\H}{\mathcal{H}} % RKHS
\renewcommand{\O}{\mathcal{O}} % computational complexity
\renewcommand{\P}{\mathbb{P}} % probability measure
\renewcommand{\b}{\mathbf}    % bold maths
\renewcommand{\d}{\mathrm{d}} % dx
\newcommand{\kmeP}{\mu_{k}\left(\tilde \P_{n'}\right)} % Nyström embedding of joint distribution
\newcommand{\kmePm}{\mu_{k_m}\left(\tilde \P_{m,n'}\right)} % Nyström embedding of marginal distribution


%% comments:
\newcommand{\z}{\textcolor{red}} %Zoltan
\newcommand{\f}{\textcolor{orange}} %Florian

%% operators
\DeclareMathOperator{\trace}{tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\median}{median}
%\DeclareMathOperator*\hadamard{\raisebox{-.5mm}{\scalebox{2}{$\circ$}}}
\DeclareMathOperator{\hadamard}{\circ}
\DeclareMathOperator{\HSIC}{HSIC}
\DeclareMathOperator{\MMD}{MMD}

\newcommand{\Psamp}{\mathbb{\hat P}}

\newcommand{\Span}{\mathrm{span}} % linear hull
\usepackage{mathabx} %=> \widebar

\usepackage{xr} 
\externaldocument{kalinke_294}

\newtheorem{definition}{Definition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{remark}{Remark}




\title{Nyström $M$-Hilbert-Schmidt Independence Criterion\\(Supplementary Material)}


\author[1]{Florian~Kalinke}
\author[2]{Zoltán~Szabó}
% Add affiliations after the authors
\affil[1]{Institute for Program Structures and Data Organization, Karlsruhe Institute of Technology, Karlsruhe, Germany}
\affil[2]{
  Department of Statistics, London School of Economics, London, UK
  }
  



  \begin{document}

  \onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\appendix
\section{Appendix}
Section~\ref{sec:external-theorem} recalls the two external theorems and two lemmas that we use. Section~\ref{sec:proofs} contains our proofs.

\subsection{External Theorems and Lemmas}
\label{sec:external-theorem}
In this section, two theorems and two lemmas are recalled for self-completeness. Theorem~\ref{thm:rudi} bounds the error of Nyström mean embeddings \citep[Theorem~4.1]{chatalic22nystrom}, and Theorem~\ref{thm:hoeffding} is a well-known result \citep[Section~5.6, Theorem~A]{serfling80approximation} for bounding the deviation of U-statistics. Lemma~\ref{lemma:conn-u-v} is about the connection between U- and V-statistics. Lemma~\ref{lemma:markov} recalls Markov's inequality.


\begin{theorem}[Bound on mean embeddings]
  \label{thm:rudi} Let $\mathcal X$ be a locally compact second-countable topological space, $X$ a random variable supported on $\mathcal X$ with Borel probability measure $\P {}$, and let $\mathcal H_k$ be a RKHS on $\mathcal X$ with kernel $k : \X \times \X \to \R$, and feature map $\phi_k$. 
  Assume that there exists a constant $K \in (0,\infty)$ such that $\sup_{x \in \mathcal X}\sqrt{k(x,x)} \leq K$. Let $C_k=\E\left[\phi_k(X) \otimes \phi_k(X) \right]$. Furthermore, assume that the data points $\hat \P_{n} = \{x_1,\dots,x_n\}$ are drawn i.i.d.\ from the distribution $\P{}$ and that $n' \leq n$ subsamples $\tilde\P_{n'} = \{\tilde x_1, \dots, \tilde x_{n'}\}$ are drawn uniformly with replacement from the dataset $\hat\P_n$. Then for any $\delta \in (0,1)$ it holds that 
  \begin{align*}
  \norm{\mu_{k}\left(\P{}\right) - \mu_{k}\left(\tilde \P_{n'}\right) }{\H_k} \leq \frac{c_1}{ \sqrt{n}} + \frac{c_2}{n'} + \frac{c_3\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X}\left(\frac{12K^2\log(n'/\delta)}{n'}\right)},
  \end{align*}
  with probability at least $1-\delta$ provided that
  \begin{align*}
    n' \geq \max\left(67,12K^2\opnorm{C_k}^{-1}\right)\log\left(\frac{n'}{\delta}\right),
  \end{align*}
  where $c_1 = 2K\sqrt{2\log(6/\delta)}$, $c_2 = 4\sqrt 3 K \log(12/\delta)$, and $c_3 = 12\sqrt{3\log(12/\delta)}K$.
\end{theorem}

Recall that a U-statistic is the average of a (symmetric) core function $h=h(x_1,\dots,x_m)$ over the observations $X_1,\dots,X_n \sim \P$ ($n\ge m$)  with form
\begin{align}
\label{eq:def-u-stat}
  U_n = U(X_1,\dots,X_n) = \frac{1}{\binom{n}{m}}\sum_c h(X_{i_1},\dots,X_{i_m}),
\end{align}
where $c$ is the set of the $\binom n m $ combinations of $m$ distinct elements $\{i_1,\dots,i_m\}$ from $\{1,\dots,n\}$. $U_n$ is an unbiased estimator of $\theta = \theta(\P) = \E_\P[h(X_1,\dots,X_m)]$. 

\begin{theorem}[Hoeffding's inequality for U-statistics]
  \label{thm:hoeffding}Let $h = h(x_1,\dots,x_m)$ be a core function for $\theta = \theta(\P)=\E_{\P}\left[h(X_1,\dots,X_m)\right]$ with $a \leq h(x_1,\dots,x_m) \leq b$.  Then, for any $u> 0$ and $n\geq m$,
  \begin{align*}
    \P(U_n - \theta \geq u) \leq \exp\left(-\frac{2\lfloor n/m\rfloor u^2}{(b-a)^2}\right).
  \end{align*}
\end{theorem}

Similar to \eqref{eq:def-u-stat} one can consider an alternative (slightly biased) estimator of $\theta$, which is called V-statistic:
\begin{align}
\label{eq:def-v-stat}
  V_n = V(X_1,\dots,X_n) = \frac{1}{{n}^{m}}\sum_{(i_1,\dots,i_m) \in T_m(n)} h(X_{i_1},\dots,X_{i_m}),
\end{align}
where $T_m(n)$ is the $m$-fold Cartesian product of the set $[n]$.

There is a close relation between U- and V-statistics, as it is made explicit by the following lemma  \citep[Lemma,~Section~5.7.3]{serfling80approximation}.

\begin{lemma}[Connection between U- and V-statistics]
\label{lemma:conn-u-v} Let $\P$ be a probability measure on a metric space $\X$. Let $(X_i)_{i\in[n]}\stackrel{\text{i.i.d.}}{\sim} \P$. Let $m$ denote any element of $[n]$. Let $h$ be a core function satisfying 
    $\E\left[|h(X_1,\dots,X_m)|^r\right] < \infty$ with some $r \in \mathbb Z_+$.  Let $U_n$ and $V_n$ denote the U and V-statistic associated to $h$ as defined in \eqref{eq:def-u-stat} and \eqref{eq:def-v-stat}, respectively.
Then it holds that
\begin{align*}
    \E\left[\left|U_n-V_n\right|^r\right] = \O\left(n^{-r}\right).
\end{align*}
\end{lemma}

\begin{lemma}[Markov inequality] \label{lemma:markov}
For a real-valued random variable $X$ with probability distribution $\P$ and $a > 0$, it holds that
\begin{align*}
    \P\left(|X| \ge a \right) \le \frac{\E\left(|X|\right)}{a}.
\end{align*}
\end{lemma}

\subsection{Proofs} \label{sec:proofs}
This section is dedicated to proofs. Lemma~\ref{thm:nystroem-hsic} is derived in Section~\ref{sec:mult-empir-nystr}. Proposition~\ref{thm:error-nystrom-hsic} is proved in Section~\ref{sec:error-nystrom-hsic} relying on two lemmas shown in Section~\ref{sec:technical-lemmas}. Lemma~\ref{lemma:deviation} is proved in Section~\ref{sec:proof-lemma-refl}, with an auxiliary result in Section~\ref{sec:u-stat-deviation}.


\subsubsection{Proof of Lemma \ref{thm:nystroem-hsic}}
\label{sec:mult-empir-nystr}
Let $\kmeP = \sum_{i=1}^{n'} \alpha_k^i \otimes_{m=1}^M \phi_{k_m}(x_m^i) $, and let $ \kmePm = \sum_{i=1}^{n'}\alpha_{k_m}^i\phi_{k_m}(x_m^i) $ for $m\in[M]$. We write
\begin{align*}
  \HSIC_{k,\text N}^2\left(\hat\P_{n}\right) &=\norm{ \kmeP- \otimes_{m=1}^M \kmePm}{\H_k}^2\\ 
                                               &= \underbrace{\norm{\kmeP}{\H_k}^2}_{=:A} -
                                                 2\cdot \underbrace{\left \langle \kmeP , \otimes_{m=1}^M\kmePm \right \rangle_{\H_k}}_{=: C} +
                                                 \underbrace{\norm{\otimes_{m=1}^M\kmePm}{\H_k}^2}_{=: B},
\end{align*}
and continue term-by-term. Using the definition of the tensor product, we have for term $A$ that
\begin{align*}
  A &= \left\langle \kmeP, \kmeP \right\rangle_{\H_k} 
    =  \sum_{i=1}^{n'}\sum_{j=1}^{n'} \alpha_k^i \alpha_k^j \left\langle \otimes_{m=1}^M \phi_{k_m}(x_m^i), \otimes_{m=1}^M \phi_{k_m}(x_m^j)\right\rangle_{\H_k} =   \sum_{i=1}^{n'}\sum_{j=1}^{n'} \alpha_k^i \alpha_k^j \prod_{m=1}^Mk_m(x_m^i,x_m^j) \\
    &= \bm \alpha_k\tran\left( \hadamard_{m=1}^M \mathbf{K}_{k_m}\right) \bm \alpha_k.
\end{align*}
Similarly, we obtain for term $B$ that
\begin{align*}
  B &= \left\langle \otimes_{m=1}^M\kmePm, \otimes_{m=1}^M \kmePm \right\rangle_{\H_k} \\
    &=  \left\langle \otimes_{m=1}^M \sum_{i^{(m)}=1}^{n'}\alpha_{k_m}^{i^{(m)}}\phi_{k_m}\left(x_m^{i^{(m)}}\right), \otimes_{m=1}^M \sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}}\phi_{k_m}\left(x_m^{j^{(m)}}\right) \right\rangle_{\H_k} \\
    &\stackrel{(*)}{=} \prod_{m=1}^M\sum_{i^{(m)}=1}^{n'}\sum_{j^{(m)}=1}^{n'} \alpha_{k_m}^{i^{(m)}}\alpha_{k_m}^{j^{(m)}} k_m\left(x_m^{i^{(m)}},x_m^{j^{(m)}}\right) 
  =  \prod_{m=1}^M \bm \alpha_{k_m}\tran \mathbf{K}_{k_m} \bm\alpha_{k_m},
\end{align*}
where in $(*)$ we used \eqref{eq:tensor:inner-product}, the linearity of the inner product, and the reproducing property.

Last, we express term $C$ as
\begin{align*}
 C &= \left\langle \sum_{i=1}^{n'} \alpha_k^i \otimes_{m=1}^M \phi_{k_m}\left(x_m^i\right) ,\otimes_{m=1}^M \sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}}\phi_{k_m}\left(x_m^{j^{(m)}}\right) \right \rangle_{\H_k} \\
&\stackrel{(a)}{=} \sum_{i=1}^{n'} \alpha_k^i \left\langle  \otimes_{m=1}^M \phi_{k_m}\left(x_m^i\right) ,\otimes_{m=1}^M \sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}}\phi_{k_m}\left(x_m^{j^{(m)}}\right) \right \rangle_{\H_k} \\
&\stackrel{(b)}{=} \sum_{i=1}^{n'}\alpha_k^i \prod_{m\in[M]}  \left\langle   \phi_{k_m}\left(x_m^i\right) , \sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}}\phi_{k_m}\left(x_m^{j^{(m)}}\right) \right \rangle_{\H_{k_m}} \\
&\stackrel{(c)}{=} \sum_{i=1}^{n'}  \alpha_k^i\prod_{m\in[M]} \sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}} \left\langle   \phi_{k_m}\left(x_m^i\right) , \phi_{k_m}\left(x_m^{j^{(m)}}\right) \right \rangle_{\H_{k_m}} \\
&\stackrel{(d)}{=} \sum_{i=1}^{n'} \alpha_k^i\prod_{m\in[M]} \underbrace{\sum_{j^{(m)}=1}^{n'}\alpha_{k_m}^{j^{(m)}}  k_m\left(x_m^i,x_m^{j^{(m)}}\right )}_{\left(\b K_{k_m}\right)_i\bm\alpha_{k_m}} 
   = \bm\alpha_k\tran \left ( \hadamard_{m=1}^M \mathbf{K}_{k_m}\bm\alpha_{k_m} \right),
\end{align*}
where (a) follows from the linearity of the inner product, (b) holds by  \eqref{eq:tensor:inner-product}, (c) is implied by the linearity of the inner product, (d) is valid by the reproducing property, and we refer to the $i$-th row of $\b K_{k_m}$ as $\left(\b K_{k_m}\right)_i$.

Substituting terms $A, B$, and $C$ concludes the proof.

\subsubsection{Two Lemmas to the Proof of Proposition~\ref{thm:error-nystrom-hsic}}
\label{sec:technical-lemmas}

Our main result relies on two lemmas.

\begin{lemma}[Error bound for Nyström mean embedding of tensor product kernel]
  \label{lemma:nystrom-cross-cov}    
  Let  $X=(X_m)_{m=1}^M \in \mathcal X = \times_{m=1}^M \X_m$, $X\sim\P{}\in \mathcal M_1^+(\X{})$, and $(\X_m)_{m\in [M]}$ locally compact, second-countable topological spaces.  Let $k_m: \mathcal X_m \times \mathcal X_m \rightarrow \R$ be a bounded kernel, i.e.\ there exists  $a_{k_m} \in (0,\infty)$ such that $\sup_{x_m\in \X_m}\sqrt{k_m(x_m,x_m)} \leq a_{k_m}$ for $m \in [M]$. Let $a_k = \prod_{m=1}^M a_{k_m}$, $k=\otimes_{m=1}^M k_m$, $\H_k$ the RKHS associated to $k$, $\phi_k = \otimes_{m=1}^M \phi_{k_m}$, $C_k = \E_{}\left[\phi_{k}(X) \otimes \phi_{k}(X) \right]$, $n' \le n$, and $\tilde \P_{n'}$ defined according to \eqref{eq:Nystrom-samples}. Then for any $\delta \in (0,1)$ it holds that 
  \begin{align*}
    \norm{\mu_{k}\left(\P{}\right) - \mu_{k}\left(\tilde \P_{n'}\right)}{\H_k} \leq \frac{c_{k,1}}{ \sqrt{n}} + \frac{c_{k,2}}{n'} + \frac{c_{k,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X}\left(\frac{12a_k^2\log(n'/\delta)}{n'}\right)},
  \end{align*}
   with probability at least $1-\delta$, provided that
  \begin{align*}
    n' \geq \max\left(67,12a_k^2\opnorm{C_k}^{-1}\right)\log\left(\frac{n'}{\delta}\right),
  \end{align*}
  where $c_{k,1}= 2a_k\sqrt{2\log(6/\delta)}$, $c_{k,2}=4\sqrt 3 a_k \log(12/\delta)$, and $c_{k,3}= 12\sqrt{3\log(12/\delta)}a_k$.
\end{lemma}

\begin{proof}
With $\X = \times_{m\in [M]} \X_m$, noticing that $\X$ is locally compact second-countable iff.\ $(\X_m)_{m\in [M]}$ are so \citep[Theorem~16.2(c), Theorem~18.6]{willard70general}, $\H_k = \tphs$, $\phi_k = \otimes_{m=1}^M \phi_{k_m} $, and $\sqrt{k(x,x)}= \prod_{m=1}^M\sqrt{k_m(x_m,x_m)} \leq a_k $, the statement is implied by Theorem~\ref{thm:rudi}.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lemma:decomposition}]
    
To simplify notation, let $\mu_{k_m} = \mu_{k_m}\left(\P_m\right)$, $\tilde \mu_{k_m} = \mu_{k_m}\left(\tilde\P_{m,n'}\right)$, $\H_k = \tphs$, and $d_{k_m} = \norm{\mu_{k_m}-\tilde \mu_{k_m}}{\H_{k_m}}$. The proof proceeds by induction on $M$. For $M=1$, the statement holds with equality, as both sides equal $\norm{\mu_{k_1}\left(\P_1\right) - \mu_{k_1}\left(\tilde\P_{1,n'}\right)}{\H_{k_1}}$. For the induction step,
    we assume that the statement holds for $M-1$ factors, to obtain
    \begin{eqnarray*}
      \lefteqn{\norm{\otimes_{m=1}^M \mu_{k_m} - \otimes_{m=1}^M \tilde\mu_{k_m}}{\H_k} = \big\|\otimes_{m=1}^M \mu_{k_m} - \otimes_{m=1}^{M-1} \mu_{k_m} \otimes \tilde\mu_{k_M} +  \otimes_{m=1}^{M-1} \mu_{k_m} \otimes \tilde\mu_{k_M}  - \otimes_{m=1}^M \tilde\mu_{k_m}\big\|_{\H_k}}  \\
    &&= \left\|\otimes_{m=1}^{M-1} \mu_{k_m} \otimes (\mu_{k_M} - \tilde\mu_{k_M}) \right. + \left. \left(\otimes_{m=1}^{M-1} \mu_{k_m}- \otimes_{m=1}^{M-1} \tilde\mu_{k_m}\right) \otimes \tilde\mu_{k_M} \right\|_{\H_k} \\
    && \stackrel{(a)}{\le} \left\|\otimes_{m=1}^{M-1} \mu_{k_m} \otimes (\mu_{k_M} - \tilde\mu_{k_M}) \right\|_{\H_k} + \left\|\left(\otimes_{m=1}^{M-1} \mu_{k_m}- \otimes_{m=1}^{M-1} \tilde\mu_{k_m}\right) \otimes \tilde\mu_{k_M} \right\|_{\H_k} \\
    &&\stackrel{(b)}{=} \left(\prod_{m \in [M-1]} \left\| \mu_{k_m}\right\|_{\H_{k_m}}\right) d_{k_M} + \left\|\otimes_{m=1}^{M-1} \mu_{k_m}- \otimes_{m=1}^{M-1} \tilde\mu_{k_m} \right\|_{\otimes_{m=1}^{M-1} \H_{k_m}}\left\|\tilde\mu_{k_M} \right\|_{\H_{k_M}} \\
    &&\stackrel{(c)}{\le} d_{k_M}\prod_{m \in [M-1]} a_{k_m}  + \left\|\otimes_{m=1}^{M-1} \mu_{k_m}- \otimes_{m=1}^{M-1} \tilde\mu_{k_m} \right\|_{\otimes_{m=1}^{M-1} \H_{k_m}}\left(a_{k_M} + d_{k_M}\right) \\
    &&\stackrel{(d)}{\le} d_{k_M}\prod_{m \in [M-1]} a_{k_m} + \left\{\prod_{m\in[M-1]}\left(a_{k_m} + d_{k_m}\right) - \prod_{m\in[M-1]}a_{k_m}\right\}\left(a_{k_M} + d_{k_M}\right) \\
    &&= d_{k_M}\prod_{m \in [M-1]} a_{k_m} + \prod_{m\in[M]}\left(a_{k_m} + d_{k_m}\right) - \prod_{m\in[M]}a_{k_m} - d_{k_M}\prod_{m \in [M-1]}a_{k_m} \\
    &&= \prod_{m\in[M]}\left(a_{k_m} + d_{k_m}\right) - \prod_{m\in[M]}a_{k_m},
    \end{eqnarray*}
    where (a) holds by the triangle inequality, (b) is implied by \eqref{eq:tensor:norm} and the definition of $d_{k_M}$, (c) follows from
    \begin{align}
    \left\| \mu_{k_m}\right\|_{\H_{k_m}} & = \left\| \int_{\X_m} k_m(\cdot,x_m) \d \P_m (x_m)\right\|_{\H_{k_m}} \stackrel{(e)}{\le} \int_{\X_m} \underbrace{\left\|k_m(\cdot,x_m) \right\|_{\H_{k_m}}}_{\stackrel{(f)}{=}\sqrt{k_m(x_m,x_m)} \stackrel{(g)}{\le} a_{k_m}} \d \P_m (x_m) \le a_{k_m}, \label{eq:ME:bound}\\
    \norm{\tilde\mu_{k_M}}{\H_{k_M}} & = \norm{\tilde\mu_{k_M} - \mu_{k_M} + \mu_{k_M}}{\H_{k_M}} \stackrel{(h)}{\le} \norm{\tilde\mu_{k_M} - \mu_{k_M}}{\H_{k_M}} + \norm{\mu_{k_M}}{\H_{k_M}} \stackrel{(i)}{\le} d_{k_M} + a_{k_M}, \nonumber
    \end{align}
    (d) is valid by the induction statement holding for $M-1$, 
    (e) is a property of Bochner integrals, (f) is implied by the reproducing property, (g) comes from the definition of $a_{k_m}$, the triangle inequality implies (h), (i) follows from \eqref{eq:ME:bound} and the definition of $d_{k_M}$.
  \end{proof}



\subsubsection{Proof of Proposition \ref{thm:error-nystrom-hsic}}
\label{sec:error-nystrom-hsic}

Let $k = \otimes_{m=1}^M k_m$, and let $\H_k = \otimes_{m=1}^M \H_{k_m}$. We note that $\X=\times_{m \in [M]} \X_m$ is locally compact second-countable as $(\X_m)_{m\in [M]}$ are so \citep[Theorem~16.2(c), Theorem~18.6]{willard70general}.

We decompose the error of the Nyström approximation as 
  \begin{align*}
    \left|\mathrm{HSIC}_k(\P{}) - \HSIC_{k,\text N}\left(\hat \P_{n}\right) \right| 
    &= \left|\norm{\mu_{k}(\P{})- \otimes_{m=1}^M \mu_{k_m}( \P_{m}) }{\H_k} - \norm{ \kmeP- \otimes_{m=1}^M \kmePm}{\H_k} \right| \\
     &\stackrel{(a)}{\le} \norm{\mu_{k}(\P{})- \otimes_{m=1}^M \mu_{k_m}( \P_{m})  - \kmeP +  \otimes_{m=1}^M \kmePm}{\H_k} \\
    &\stackrel{(b)}{\le} \underbrace{\norm{\mu_{k}(\P{})- \kmeP }{\H_{k}}}_{t_1} +\underbrace{\norm{ \otimes_{m=1}^M \mu_{k_m}( \P_{m})  -  \otimes_{m=1}^M \kmePm}{\H_{k}}}_{t_2}, 
  \end{align*}
  where (a) holds by the reverse triangle inequality, and (b) follows from the triangle inequality. 
  
  \tb{First term ($t_1$)}:  One can bound the error of the first term by Lemma~\ref{lemma:nystrom-cross-cov}; in other words, for any $\delta \in (0,1)$ with probability at least $(1-\delta)$ it holds that
  \begin{align*}
      \norm{\mu_{k}(\P{})- \kmeP }{\H_{k}} \le \frac{c_{k,1}}{ \sqrt{n}} + \frac{c_{k,2}}{n'} + \frac{c_{k,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X}\left(\frac{12a_{k}^2\log(n'/\delta)}{n'}\right)}
  \end{align*}
   provided that $n' \geq \max\left(67,12a_{k}^2\opnorm{C_{k}}^{-1}\right)\log\left(\frac{n'}{\delta}\right)$,
  with the constants $c_{k,1} =  2a_{k}\sqrt{2\log(6/\delta)}$, $c_{k,2}=4\sqrt 3 a_{k} \log(12/\delta)$, $c_{k,3} =  12\sqrt{3\log(12/\delta)}a_{k}$.
  
  \tb{Second term} ($t_2$): Applying Lemma~\ref{lemma:decomposition} to the second term gives
  \begin{align*}
     \norm{ \otimes_{m=1}^M \mu_{k_m}( \P_{m})  -  \otimes_{m=1}^M \kmePm}{\H_{k}} &\le  \prod_{m\in[M]}\left(a_{k_m}+\norm{\mu_{k_m}\left(\P_m\right) -  \mu_{k_m}\left(\tilde \P_{m,n'}\right)}{\H_{k_m}}\right) - \prod_{m\in[M]}a_{k_m}.
  \end{align*}
We now bound the error of each of the $M$ factors by Theorem~\ref{thm:rudi}; in particular, for any fixed $m \in [M]$ and any $\delta\in(0,1)$, it holds with probability at least $1-\delta$ that
\begin{align*}
      \norm{\mu_{k_m}\left(\P_m\right) - \kmePm}{\H_{k_m}} &\le \frac{c_{k_m,1}}{ \sqrt{n}} + \frac{c_{k_m,2}}{n'} + \frac{c_{k_m,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X_m}\left(\frac{12a_{k_m}^2\log(n'/\delta)}{n'}\right)}, \text{ hence} \\
      a_{k_m} + \norm{\mu_{k_m}\left(\P_m\right) - \kmePm}{\H_{k_m}} &\le a_{k_m} + \frac{c_{k_m,1}}{ \sqrt{n}} + \frac{c_{k_m,2}}{n'} + \frac{c_{k_m,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X_m}\left(\frac{12a_{k_m}^2\log(n'/\delta)}{n'}\right)},
\end{align*}
and, by the union bound, for any $\delta \in (0,\frac{1}{M})$ it holds with probability at least $1-M\delta$ that their product satisfies
\begin{eqnarray*}
      \lefteqn{\prod_{m\in[M]}\left[a_{k_m} + \norm{\mu_{k_m}\left(\P_m\right) - \kmePm}{\H_{k_m}} \right]}\\
      && \le \prod_{m\in[M]}\Bigg[a_{k_m} + \frac{c_{k_m,1}}{ \sqrt{n}} + \frac{c_{k_m,2}}{n'} + \frac{c_{k_m,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X_m}\left(\frac{12a_{k_m}^2\log(n'/\delta)}{n'}\right)}\Bigg],
\end{eqnarray*}
and hence
\begin{eqnarray*}
      \lefteqn{\prod_{m\in[M]}\left[a_{k_m} + \norm{\mu_{k_m}\left(\P_m\right) - \kmePm}{\H_{k_m}} \right] -\prod_{m\in[M]}a_{k_m}}\\
      && \le \prod_{m\in[M]}\Bigg[a_{k_m} + \frac{c_{k_m,1}}{ \sqrt{n}} + \frac{c_{k_m,2}}{n'} + \frac{c_{k_m,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X_m}\left(\frac{12a_{k_m}^2\log(n'/\delta)}{n'}\right)}\Bigg] -\prod_{m\in[M]}a_{k_m},
\end{eqnarray*}
  provided that $
    n' \geq \max\left(67,12a_{k_m}^2\opnorm{C_{k_m}}^{-1}\right)\log\left(\frac{n'}{\delta}\right)$
    for all $m\in [M]$, 
  with  $C_{k_m} = \E\left[\phi_{k_m}(X_m) \otimes \phi_{k_m}(X_m)\right]$ and constants $c_{k_m,1} =  2a_{k_m}\sqrt{2\log(6/\delta)}$, $c_{k_m,2}=4\sqrt 3 a_{k_m} \log(12/\delta)$, $c_{k_m,3} =  12\sqrt{3\log(12/\delta)}a_{k_m}$, with $m\in[M]$.
  
  Combining the $M+1$ terms by union bound yields the stated result.

\subsubsection{Lemma to the Proof of Lemma~\ref{lemma:deviation}}
\label{sec:u-stat-deviation}
\begin{lemma}[Deviation bound for U-statistics based HSIC estimator]
\label{lemma:deviation-u-stat}It holds that
\begin{align*}
    \left|\HSIC_{k,u}^2\left(\hat\P_n\right)-\HSIC_{k}^2\left(\P\right)\right| = \O_{\text{P}}\left(\frac{1}{\sqrt{n}}\right),
\end{align*}
where $\HSIC_{k,u}^2$ is the U-statistic based estimator of $\HSIC_k^2$.
\end{lemma}
\begin{proof}
We show that \eqref{eq:def-hsic} can be expressed as a sum of U-statistics and then bound the terms individually. First, square \eqref{eq:def-hsic} to obtain
\begin{align*}
  \HSIC_k^2(\P) &= \underbrace{\E_{(x_1,\dots,x_M),(x'_1,\dots,x_M')\sim\P}\left[\prod_{m\in[M]}k_m\left(x_m,x_m'\right)\right]}_A +
                  \underbrace{\E_{x_1,x_1'\sim\P_1,\dots,x_M,x_M'\sim \P_M}\left[\prod_{m\in[M]}k_m\left(x_m,x_m'\right)\right]}_B \\
                &\quad\quad- 2\underbrace{\E_{(x_1,\dots,x_M)\sim\P, x_1'\sim\P_1, \dots, x_M'\sim\P_M}\left[\prod_{m\in[M]}k_m(x_m,x_m')\right]}_C,
\end{align*}
where $A$, $B$, and $C$ can be estimated by U-statistics $A'_n$, $B'_n$, and $C'_n$, respectively. Let $\HSIC_{k,u}^2\left(\hat \P_n\right) = A'_n + B'_n - 2C'_n$, and split $t$ as $\alpha t +\beta t + (1-\alpha-\beta)t$, with $\alpha, \beta > 0$ and $\alpha + \beta < 1$. One obtains
\begin{align*}
  \P\left(\left|\HSIC_k^2(\P) - \HSIC_{k,u}^2\left(\hat\P_n\right)\right | \ge t \right) \leq \P\left(\left|A-A'_n\right| \ge \alpha t\right) +  \P\left(\left|B-B'_n\right| \ge \beta t\right) + \P\left(2\left|C-C'_n\right| \ge (1-\alpha-\beta) t\right).
\end{align*}
Doubling and rewriting Theorem~\ref{thm:hoeffding}, we have that for U-statistics and any $\delta \in (0,1)$
\begin{align*}
  \P\left(\left|U_n-\theta\right| \ge \sqrt{\frac{(b-a)^2\ln(\frac 2 \delta)}{2\lfloor n/m\rfloor}}\right) \leq \delta.
\end{align*}
Now, choosing the $\left(\theta, U_n, u\right)$ triplet to be $\left(A,A_n',\alpha t\right)$, $\left(B,B_n', \beta t \right)$, $\left(C,C_n', \frac{(1-\alpha-\beta)t}{2}\right)$, respectively, setting $m=2M$, and observing that $a \le k(x,y) \leq b$ as $k$ is bounded, we obtain that $|A'_n-A|\sqrt{n}$, $|B'_n-B|\sqrt{n}$, and $|C'_n-C|\sqrt{n}$ are bounded in probability and so is their sum.
\end{proof}
  
  \subsubsection{Proof of Lemma~\ref{lemma:deviation}}
  \label{sec:proof-lemma-refl}

We consider the decomposition
\begin{align}
    \left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_k^2\left(\P\right)\right| \le \underbrace{\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right|}_{t_1} +\underbrace{\left|\HSIC_{k,u}^2\left(\hat\P_n\right) - \HSIC^2_k\left(\P\right)\right|}_{t_2}, \label{eq:decompo-1}
\end{align}
by using the triangle inequality,  where $\HSIC_{k,u}$ is the U-statistic based HSIC estimator.

\tb{Second term ($t_2$)}: Lemma~\ref{lemma:deviation-u-stat} establishes that $t_2=\O_{\text{P}}\left(\frac{1}{\sqrt{n}}\right)$. 

\tb{First term ($t_1$)}: To bound $t_1$, first, by  Markov's inequality (Lemma~\ref{lemma:markov}) observe that 
    \begin{align}
    \P\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| \ge a \right) &\le \underbrace{\frac{\E\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right|\right)}{a}}_{=: \epsilon}, \nonumber\\
    \P\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| \ge \frac{\E\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right|\right)}{\epsilon} \right)&\le \epsilon, \nonumber\\
    \P\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| < \frac{\E\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right|\right)}{\epsilon} \right) &\ge 1- \epsilon, \nonumber\\
    \P\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| < \frac{C}{n\epsilon} \right) &\stackrel{(*)}{\ge} 1- \epsilon, \nonumber\\
    \P\left(\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| \ge \frac{C}{n\epsilon} \right) &\le  \epsilon, \label{eq:HSIC-1/n}
\end{align}
for constant $C >0$ and $n$ large enough, where $(*)$ follows from Lemma~\ref{lemma:conn-u-v} (with $r=1$). \eqref{eq:HSIC-1/n} implies that 
\begin{align*}
    \left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| = \O_{\text{P}}\left(\frac{1}{n}\right).
\end{align*}
\tb{Combining the terms ($t_1 + t_2$)}: Combining the obtained results for the two terms, one gets that
\begin{align}
\left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_k^2\left(\P\right)\right| &\stackrel{\eqref{eq:decompo-1}}{\le}     \left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_{k,u}^2\left(\hat\P_n\right) \right| +\left|\HSIC_{k,u}^2\left(\hat\P_n\right) - \HSIC_k^2\left(\P\right)\right| \nonumber\\
& =  \O_{\text{P}}\left(\frac{1}{n}\right) + \O_{\text{P}}\left(\frac{1}{\sqrt n}\right) = \O_{\text{P}}\left(\frac{1}{\sqrt n}\right). \label{eq:decompo-2}
\end{align}
Hence
\begin{align*}
    \O_{\text{P}}\left(\frac{1}{\sqrt n}\right) & \stackrel{\eqref{eq:decompo-2}}{\ge}  \left|\HSIC_k^2\left(\hat\P_n\right) - \HSIC_k^2\left(\P\right)\right| = \left|\HSIC_k\left(\hat\P_n\right) - \HSIC_k\left(\P\right)\right|
     \Big|\underbrace{\HSIC_k\left(\hat\P_n\right)}_{\stackrel{\eqref{eq:emp-hsic}}{\ge} 0} + \underbrace{\HSIC_k\left(\P\right)}_{\ge 0}\Big| \\
    &\ge \left|\HSIC_k\left(\hat\P_n\right) - \HSIC_k\left(\P\right)\right|\HSIC_k\left(\P\right),
\end{align*}
which, after dividing by the constant $\HSIC_k\left(\P\right)>0$, implies the statement.

\bibliography{BIB/collected_zoltan,BIB/publications}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
