\documentclass[accepted]{uai2023}
\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    
%% packages
%\usepackage{graphicx}
%\usepackage{grffile}
\usepackage{longtable}
%\usepackage{wrapfig}
%\usepackage{rotating}
%\usepackage[normalem]{ulem}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{amsthm,mathtools}
\usepackage{bm}
%\usepackage{caption}
%\usepackage{comment}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{hyperref}
\usepackage{enumitem}
%\usepackage{subfigure}
\usepackage{subcaption}
%% commands

\newcommand*{\tran}{^{\mkern-1.5mu\mathsf{T}}} % transpose
\newcommand{\E}{\mathbb{E}}   % expectation
\newcommand{\F}{\mathcal{F}}  % function class in IPM
\newcommand{\G}{\mathcal{G}_k}% RKHS
\newcommand{\Q}{\mathbb{Q}}   % probability measure
\newcommand{\R}{\mathbb{R}}   % reals
\newcommand{\X}{\mathcal{X}}  % domain
\newcommand{\Y}{\mathcal{Y}}  % domain
\newcommand{\ip}[2]{\left\langle{#1}\right\rangle_{#2}} % inner product
\newcommand{\kme}[1]{\mu_{k_{#1}}(\P{#1})} % kernel mean embedding
\newcommand{\norm}[2]{\left\|{#1}\right\|_{#2}} % norm
\newcommand{\nys}{\text{Nys}} % Nyström estimator
\newcommand{\opnorm}[1]{\left\|{#1}\right\|_{\mathrm{op}}} %op-norm
\newcommand{\prodmarginals}{\otimes_{m=1}^M \P_ m} %product of marginals
\newcommand{\tb}{\textbf}    % bold text
\newcommand{\tphs}{\otimes_{m=1}^M \H_{k_m}} % tensor product Hilbert space
\renewcommand{\F}{\mathcal{F}}% unit ball in an RKHS
\renewcommand{\H}{\mathcal{H}} % RKHS
\renewcommand{\O}{\mathcal{O}} % computational complexity
\renewcommand{\P}{\mathbb{P}} % probability measure
\renewcommand{\b}{\mathbf}    % bold maths
\renewcommand{\d}{\mathrm{d}} % dx
\newcommand{\kmeP}{\mu_{k}\left(\tilde \P_{n'}\right)} % Nyström embedding of joint distribution
\newcommand{\kmePm}{\mu_{k_m}\left(\tilde \P_{m,n'}\right)} % Nyström embedding of marginal distribution


%% comments:
\newcommand{\z}{\textcolor{red}} %Zoltan
\newcommand{\f}{\textcolor{orange}} %Florian

%% operators
\DeclareMathOperator{\trace}{tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\median}{median}
%\DeclareMathOperator*\hadamard{\raisebox{-.5mm}{\scalebox{2}{$\circ$}}}
\DeclareMathOperator{\hadamard}{\circ}
\DeclareMathOperator{\HSIC}{HSIC}
\DeclareMathOperator{\MMD}{MMD}

\newcommand{\Psamp}{\mathbb{\hat P}}

\newcommand{\Span}{\mathrm{span}} % linear hull
\usepackage{mathabx} %=> \widebar

\newtheorem{definition}{Definition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{remark}{Remark}

\title{Nyström $M$-Hilbert-Schmidt Independence Criterion}

\author[1]{Florian~Kalinke}
\author[2]{Zoltán~Szabó}
\affil[1]{Institute for Program Structures and Data Organization, Karlsruhe Institute of Technology, Karlsruhe, Germany}
\affil[2]{Department of Statistics, London School of Economics, London, UK}
  
\begin{document}

\maketitle

\begin{abstract}
Kernel techniques are among the most popular and powerful approaches of data science. Among the key features that make kernels ubiquitous are (i) the number of domains they have been designed for, (ii) the Hilbert structure of the function class associated to kernels facilitating their statistical analysis, and (iii) their ability to represent probability distributions without loss of information. These properties give rise to the immense success of the Hilbert-Schmidt independence criterion (HSIC), which is able to capture joint independence of random variables under mild conditions, and permits closed-form estimators with quadratic computational complexity (w.r.t.\ the sample size). In order to alleviate the quadratic computational bottleneck in large-scale applications, multiple HSIC approximations have been proposed; however, these estimators are restricted to $M=2$ random variables, do not extend naturally to the $M\ge 2$ case, and lack theoretical guarantees. In this work, we propose an alternative Nyström-based HSIC estimator which handles the $M\ge 2$ case, prove its consistency, and  demonstrate its applicability in multiple contexts, including synthetic examples, dependency testing of media annotations, and causal discovery.
\end{abstract}

\section{Introduction}
Kernel methods \citep{aronszajn50theory} have been at the forefront of data science for more than $20$ years \citep{scholkopf02learning,steinwart08support}, and they underpin some of the most powerful and principled machine learning techniques currently known. The key idea of kernels is to map the data into a (possibly infinite-dimensional) feature space in which one computes the inner product implicitly by means of a symmetric, positive definite function, the so-called kernel function.

Kernel functions have been designed for 
strings \citep{watkins99dynamic,lodhi02text} or more generally for sequences \citep{kiraly19kernel}, sets \citep{haussler99convolution, gartner02multi}, rankings \citep{jiao16kendall}, fuzzy domains \citep{guevara17cross} and graphs \citep{borgwardt20graph}, which renders them broadly applicable.
Their extension to the space of probability measures \citep{berlinet04reproducing,smola07hilbert} allows one to represent distributions in a reproducing kernel Hilbert space (RKHS) by the so-called mean embedding. 
Such embeddings form the main building block of maximum mean discrepancy (MMD;  \citet{smola07hilbert,gretton12kernel}), which quantifies the discrepancy of two distributions as the RKHS distance of their respective mean embeddings. 
MMD is (i) a semi-metric on probability measures, (ii) a metric iff the kernel is characteristic \citep{fukumizu08kernel,sriperumbudur10hilbert}, and (iii) an instance of integral probability metrics  (IPM; \citet{muller97integral,zolotarev83probability}) when the underlying function class in the IPM is chosen to be the unit ball in an RKHS. 

Measuring the discrepancy of a joint distribution to the product of its marginals by MMD gives rise to the Hilbert-Schmidt independence criterion (HSIC; \citet{gretton05measuring}). 
HSIC was shown to be equivalent \citep{sejdinovic13equivalence} to distance covariance \citep{szekely07measuring,szekely09brownian,lyons13distance}; \citet{sheng23distance} have recently proved a similar result for the conditional case. 
HSIC is known to capture the independence of $M=2$ random variables with characteristic $(k_m)_{m=1}^2$ kernels (on the respective domains) as proved by \citet{lyons13distance}; for more than two components ($M>2$; \citet{quadrianto09kernelized,sejdinovic13kernel,pfister18kernel}) universality \citep{steinwart01influence, micchelli06universal} of $(k_m)_{m=1}^M$-s is sufficient  \citep{szabo18characteristic2}. 
HSIC has been deployed successfully in a wide range of domains including independence testing \citep{gretton08kernel,pfister18kernel,albert22adaptive},  feature selection \citep{camps10remote,song12feature,wang22rank} with applications in biomarker detection \citep{gonzalez19block} and wind power prediction \citep{bouche23wind}, clustering \citep{song07dependence,gonzalez19block}, and causal discovery \citep{mooij16distinguishing,pfister18kernel,chakraborty19distance,scholkopf21causal}.

Various estimators for HSIC and other dependence measures exist in the literature, out of which we summarize the most closely related ones to our work in Table~\ref{tab:comparison}. 
The classical V-statistic based HSIC estimator (V-HSIC; \citet{gretton05measuring,quadrianto09kernelized,pfister18kernel}) is powerful but its runtime increases quadratically with the number of samples, which limits its applicability in large-scale settings. 
To tackle this severe computational bottleneck, approximations of HSIC (N-HSIC, RFF-HSIC) have been proposed \citep{zhang18large}, relying on the Nyström \citep{williams01using} and the random Fourier feature (RFF; \citet{rahimi07random}) method, respectively. 
However, these  estimators (i) are limited to two components, (ii) their extension to more than two components is not straightforward, and (iii) they lack theoretical guarantees. 
The RFF-based approach is further restricted to finite-dimensional Euclidean domains and to translation-invariant kernels. 
The normalized finite set independence criterion (NFSIC; \citet{jitkrittum17adaptive2}) replaces the RKHS norm of HSIC with an $L_2$ one which allows the construction of linear-time estimators. 
However, NFSIC is also limited to two components, requires $\R^d$-valued input, and analytic kernels \citep{chwialkowski15fast}. 
Novel complementary approaches are the kernel partial correlation coefficient (KPCC; \citealt{huang22kernel}), and tests based on incomplete U-statistics \citep{schrab22incompleteu}. One drawback of KPCC is its cubic runtime complexity w.r.t.\ the sample size when applied to kernel-enriched domains. \citet{schrab22incompleteu}'s approach can run in linear time, but it is limited to $M=2$ components. We note that all approaches require choosing an appropriate kernel: Here, one can optimize over various parametric families of kernels for increasing a proxy of test power in case of MMD \citep{jitkrittum16interpretable,liu20learning}, and in case of HSIC \citep{jitkrittum17adaptive2}. One can also design (almost) minimax-optimal MMD-based two-sample tests  using spectral regularization \citep{hagrass22spectral}.

\begin{table*}
  \centering
  \caption{Comparison of kernel independence measures: $n$ -- number of samples, $M$ -- number of components, $n'$ -- number of Nyström samples, $s$ -- number of random Fourier features, $d$ -- data dimensionality.}
  \label{tab:comparison}
  \begin{tabular}{lllll}
    \toprule
    Independence Measure                            & Runtime Complexity                 & $M$     & Domain & Admissible Kernels    \\
    \midrule
    V-HSIC \citep{pfister18kernel}       & $\O\left(Mn^2\right) $             & $M\ge2$ & any    & universal             \\
    NFSIC \citep{jitkrittum17adaptive2} & $\O\left(n\right)$                                &  $M=2$       & $\R^d$       & analytic, characteristic              \\
    
    N-HSIC \citep{zhang18large}                & $\O\left({n'}^3 + n{n'}^2\right) $ & $M=2$   & any    & characteristic        \\
    RFF-HSIC \citep{zhang18large}                    & $\O\left(s^2n\right) $                                & $M=2$   & $\R^d$ & translation-invariant, characteristic \\
    KPCC \citep{huang22kernel}   & $\O\left(n^3\right)$               & $M=2$   & any    & characteristic        \\
    \textbf{Nyström $M$-HSIC} (N-MHSIC)                     & $\O\left(M{n'}^3+Mn'n\right) $      & $M\ge2$ & any    & universal             \\
    \bottomrule
  \end{tabular}
\end{table*}

The restriction of existing HSIC approximations to two components is a severe limitation in recent applications like causal discovery which require independence tests capable of handling more than two components. Furthermore, the emergence of large-scale data sets necessitates algorithms that scale well in the sample size. To alleviate these bottlenecks, we make the following \tb{contributions}. 
\begin{enumerate}[labelindent=0em,leftmargin=1.2em,topsep=0cm,partopsep=0cm,parsep=0cm,itemsep=2mm]
    \item We propose Nyström $M$-HSIC, an efficient HSIC estimator, which can handle more than two components and has runtime $\O\left(M{n'}^3+Mn'n\right)$, where $n$ denotes the number of samples, $n' \ll n$ stands for the number of Nyström points, and $M$ is the number of random variables whose independence is measured.
    \item We provide theoretical guarantees for Nyström $M$-HSIC: we prove that our estimator converges with rate $\O\left(n^{-1/2}\right)$ for $n'\sim \sqrt{n}$, which matches the convergence of the quadratic-time estimator.
    \item We perform an extensive suite of experiments to demonstrate the efficiency of Nyström $M$-HSIC. These applications include dependency testing of media annotations and causal discovery. In the former, we achieve similar runtime and power as existing HSIC approximations. The latter requires testing joint independence of more than two components, which is beyond the capabilities of existing HSIC accelerations. Here, the proposed algorithm achieves the same performance as the quadratic-time HSIC estimator V-HSIC with a significantly reduced runtime.
\end{enumerate}

The paper is structured as follows. Our notations are introduced in Section~\ref{sec:notations}. The existing Nyström-based HSIC approximation for two components is reviewed in Section~\ref{sec:estimation-hsic}. Our proposed method, which is capable of handling $M\ge 2$ components, is presented in Section~\ref{sec:prop-hsic-appr} together with its theoretical guarantees. In Section~\ref{sec:experiments} we demonstrate the applicability of Nyström $M$-HSIC. All the proofs of our results are available in the supplementary material.

\section{Notations} \label{sec:notations}
This section is dedicated to definitions 
and to the introduction of our target quantity, the Hilbert-Schmidt independence criterion (HSIC). 
In particular, we introduce the \tb{notations} $[M]$, $\langle\b v, \b w\rangle$, $\norm{\b v}{2}$, $\circ_{m\in [M]} \b A_m$, $\trace(\b A)$, $\b A^{-1}$, $\b A^{-}$, $\b A\tran$, $\norm{\b A}{\text{F}}$, $\b 1_d$, $\b I_d$, $\Span$, $\mathcal M_1^+(\X{})$, $\H_k$, 
$\mu_k$, $\mathrm{MMD}_k$, $\otimes_{m=1}^M k_m$, $\otimes_{m=1}^M\P_ m$, $\mathrm{HSIC}_{\otimes_{m=1}^M k_m}$, $C_X$, $A^{-1}$, $\opnorm{A}$, $\trace(A)$, $\mathcal N_{X}(\lambda)$, $\O_{\text{P}}\left(r_n\right)$.

For a positive integer $M$, $[M] := \{1,\dots,M\}$. The Euclidean inner product of vectors $\b v, \b w \in \R^d$ is denoted by $\langle \b v, \b w \rangle $; the Euclidean norm is $\norm{\b v}{2}:= \sqrt{\langle\b v, \b v\rangle}$. The Hadamard product of matrices $\b A_m \in \R^{d_1 \times d_2}$ of equal size ($m\in [M]$) is $\circ_{m\in [M]} \b A_m := \left[\prod_{m\in [M]} (\b A_m)_{i,j}\right]_{i\in [d_1], j\in [d_2]}$. Matrix multiplication takes precedence over the Hadamard one.  For a matrix $\b A \in \R^{d\times d}$, $\trace(\b{A}):=\sum_{i \in [d]}A_{i,i}$ denotes its trace, $\b A^{-1}$ is its inverse (assuming that $\b A$ is non-singular), and $\b A^-$ is its Moore–Penrose inverse. The transpose of a matrix $\b A\in \R^{d_1 \times d_2}$ is denoted by $\b A\tran$. The Frobenius norm of a matrix $\b A \in \R^{d_1\times d_2}$ is  $\norm{\b A}{\text{F}}:=\sqrt{\sum_{i\in [d_1],j\in [d_2]} (A_{i,j})^2}$. The $d$-dimensional vector of ones is $\b 1_d$. The $d\times d$-sized identity matrix is denoted by $\b I_d$. For a set $S$ in a vector space, $\Span(S)$ denotes the linear hull of $S$. Let $(\X{},\tau_{\X{}})$ be a topological space, and $\mathcal B(\tau_{\X{}})$ the Borel sigma-algebra induced by the topology $\tau_{\X{}}$. All probability measures in the manuscript are meant with respect to the measurable space $\left(\X{},\mathcal B(\tau_{\X{}})\right)$, and the set of such measures is denoted by $\mathcal M_1^+(\X{})$. 
The RKHS $\H_k$ on $\X{}$ associated with a kernel $k : \X{} \times \X{} \to \R$ is the Hilbert space of functions $h : \X{} \to \R$ such that $k(\cdot, x) \in \H_k$ and $\ip{h,k(\cdot,x)}{\H_k} = h(x)$ for all $x \in \X{}$ and $h\in \H_k$.\footnote{$k(\cdot,x)$ stands for $x' \in \X{} \mapsto k(x',x) \in \R$ with $x \in \X{}$ fixed.} Kernels are assumed to be bounded (in other words, there exists $B\in \R$ such that $\sup_{x,x'\in \X{}}k(x,x')\le B$) and measurable, and $\H_k$ is assumed to be separable throughout the paper.\footnote{The separability of $\H_k$ can be guaranteed on a separable topological space $\X{}$ by  taking a continuous kernel $k$ \citep[Lemma 4.33]{steinwart08support}.} The function defined by $\phi_k(x) := k(\cdot, x)$ is the canonical feature map; with this feature map $k(x,x') = \ip{k(\cdot,x),k(\cdot,x')}{\H_k} = \ip{\phi_k(x),\phi_k(x')}{\H_k}$ for all $x, x' \in  \X{}$.
A kernel $k : \R^d \times \R^d \to \R$ is called translation-invariant if there exists a function $\kappa : \R^d\to\R$ such that $k(\b x,\b x') = \kappa(\b x-\b x')$ for all $\b x,\b x' \in \R^d$. 
The mean embedding  $\mu_k$ of a probability measure $\P{} \in \mathcal M_1^+(\X{})$ is
$\kme {} := \int_{\X {}} \phi_k(x) \d\P{}(x)$, where the integral is meant in Bochner's sense. The resulting (semi-)\-metric  is called maximum mean discrepancy (MMD):
\begin{align*}
\mathrm{MMD}_k(\P, \Q) &:= \norm{\mu_k(\P{})-\mu_k(\mathbb Q)}{\H_k},
\end{align*}
for $\P{},\Q \in \mathcal M_1^+(\X{})$. The injectivity of the mean embedding $\mu_k$ is equivalent to $\mathrm{MMD}_k$ being a metric; in this case the kernel $k$ is called characteristic. Let $X = (X_m)_{m=1}^M$ denote a random variable with distribution $\P{} \in \mathcal M_1^+(\X{})$ on the product space  $\X{} = \times_{m=1}^M \X_m$, where $\X_{m}$ is enriched with kernel $k_m : \X_{m} \times \X_{m} \to \R$. The distribution of the $m$-th marginal $X_m$ of $X$ is denoted by $\P_ m \in \mathcal M_1^+(\X_m)$; the product of these $M$ marginals is $\otimes_{m=1}^M\P_ m \in \mathcal M_1^+(\X{})$.
 The tensor product of the kernels~$(k_m)_{m=1}^M$
\begin{align*}
\otimes_{m=1}^M k_m\left((x_m)_{m=1}^M,(x'_m)_{m=1}^M\right) := \prod_{m\in[M]} k_m(x_m,x_m'), 
\end{align*}
with $x_m,x_m' \in \X_m$ ($m\in [M]$), is also a kernel; we will use the shorthand $k=\otimes_{m=1}^M k_m$. The associated RKHS has a simple structure $\H_{k} = \tphs$ \citep{berlinet04reproducing} with the r.h.s.\ denoting the tensor product of the RKHSs $(\H_{k_m})_{m=1}^M$. Indeed, for $h_m \in \H_{k_m}$, the multi-linear operator $\otimes_{m=1}^M h_m \in \tphs$ acts as $\otimes_{m=1}^Mh_m(v_1,\dots,v_M)  = \prod_{m\in [M]}\ip{h_m,v_m}{\H_{k_m}}$, where $h_m,v_m \in \H_{k_m}$. The space $\tphs$ is the closure of 
the linear combinations of such $\otimes_{m=1}^M h_m$-s: 
\begin{align*}
\tphs = \widebar{\Span}\left(\otimes_{m=1}^M h_m\, :\, h_m\in \H_{k_m}, m\in [M]\right),
\end{align*}
where the closure is meant w.r.t.\ the (linear extension of the) inner product defined as 
\begin{align}
    \MoveEqLeft \left\langle \otimes_{m=1}^M a_m, \otimes_{m=1}^M b_m \right\rangle_{\tphs} \hspace{-0.1cm} := \nonumber \\
    & \prod_{m\in [M]} \hspace{-0.1cm} \left\langle a_m,b_m\right\rangle_{\H_{k_m}},\quad a_m,b_m\in \H_{k_m}. \label{eq:tensor:inner-product} 
\end{align}
Specifically,  \eqref{eq:tensor:inner-product} implies that 
\begin{align}
\left\| \otimes_{m=1}^M a_m\right\|_{\tphs} & = \prod_{m\in [M]} \left\| a_m\right\|_{\H_{k_m}}. \label{eq:tensor:norm}
\end{align}
One can define an independence measure, the so-called Hilbert-Schmidt independence criterion based on $k$ as 
\begin{align}
  \label{eq:def-hsic}
  \mathrm{HSIC}_k(\P{}) := \mathrm{MMD}_{k}\left(\P{},\prodmarginals\right) = \norm{C_X}{\H_k},
\end{align}
where $C_X := \mu_{k}(\P{}) - \mu_{k}\left(\prodmarginals\right)$ is the centered cross-covariance operator.

Let $A : \H_k \to \H_k$ be a bounded linear operator. Its inverse  (provided that it exists) $A^{-1}: \H_k \to \H_k$ is also bounded linear. The operator norm of $A$ is defined as $\opnorm{A} := \sup_{\norm{h}{\H_k}=1} \norm{Ah}{\H_k}$. As $\H_k$ is separable, it has a countable orthonormal basis $(e_j)_{j\in J}$. $A$ is called trace-class if $\sum_{j\in J}\ip{(A^*A)^{\frac{1}{2}} e_j, e_j}{\H_k}<\infty$ where $(\cdot)^*$ denotes the adjoint, and in this case $\trace(A):= \sum_{j\in J}\ip{A e_j, e_j}{\H_k} < \infty$ is called the trace of $A$.
For $\P{} \in  \mathcal M_1^+(\X{})$, kernel $k:\X{}\times \X{} \rightarrow \R$ and $\lambda >0$, the  uncentered covariance operator  is $\mu_{k\otimes k}(\P{}):=\int_{\X} k(\cdot, x) \otimes k(\cdot,x) \d \P(x)$ and its regularized variant is $\mu_{k\otimes k,\lambda}(\P{}) := \mu_{k\otimes k}(\P{}) + \lambda I$, respectively, where $I$ denotes the identity operator. Let $\mathcal N_x(\lambda)= \ip{\phi_k(x),\mu^{-1}_{k\otimes k,\lambda}(\P{})\phi_k(x)}{\H_{k}}$. The effective dimension of $X \sim \P{}$ is defined as $\mathcal N_{X}(\lambda) := \E_{x\sim \P{}}\left[\mathcal N_{x}(\lambda)\right] = \trace\left(\mu_{k\otimes k}(\P{})\mu_{k\otimes k,\lambda}^{-1}(\P{})\right)$. For a sequence of $r_n>0$-s and a sequence of real-valued random variables $X_n$, $X_n = \O_{\text{P}}(r_n)$ denotes that $\frac{X_n}{r_n}$ is bounded in probability.

\section{Existing HSIC Estimators}
\label{sec:estimation-hsic}

We recall the existing HSIC estimator V-HSIC in Section~\ref{sec:class-hsic-estim}, and its Nyström approximation for two components in Section~\ref{sec:nystr-estim-m=2}. We present our proposed Nyström approximation for more than two components in Section~\ref{sec:prop-hsic-appr}.

\subsection{Classical HSIC Estimator (V-HSIC)} \label{sec:class-hsic-estim}

Given an i.i.d.\ sample of $M$-tuples of size $n$
\begin{align}
\Psamp_{n} := \left\{\left(x_1^1, \dots, x_M^1\right), \dots, \left(x_1^n, \dots, x_M^n\right)\right\} \subset \X{} \label{eq:sample-of-m-tuplets}
\end{align}
drawn from $\P{}$, the corresponding  empirical estimate of the squared HSIC, obtained by replacing the population means with the sample means, gives rise to the V-statistic based estimator
\begin{eqnarray}
 \lefteqn{0\le\HSIC_{k}^2\left(\hat \P_n\right) := \frac {1}{n^2}\ \bm 1_n\tran\left(\hadamard_{m\in[M]}\mathbf{K}_{k_m}\right) \bm 1_n} \label{eq:emp-hsic}\\
 && \hspace{-0.6cm}+ \frac{1 }{n^{2M}} \prod_{m\in[M]}\bm 1_n\tran \mathbf{K}_{k_m} \bm 1_n - \frac {2}{n^{M+1}}\bm 1_n\tran \left( \hadamard_{m\in[M]}\mathbf{K}_{k_m} \bm 1_n\right) \nonumber
 \end{eqnarray}
with Gram matrices
\begin{align}
\mathbf{K}_{k_m} = \left[k_m\left(x_m^i,x_m^j\right)\right]_{i,j \in [n]} \in \R^{n\times n},\label{eq:gram-matrix}
\end{align}
which can be computed in $\O\left(n^2M\right)$ time.\footnote{$\HSIC_{k}^2(\hat \P_n)$ denotes the application of $\HSIC_k^2$ to the empirical measure $\hat \P_n$. $\HSIC^2_{k,\text N_0}(\hat \P_n)$ and $\HSIC_{k,\text N}^2(\Psamp_{n})$ indicate dependence on $\hat \P_n$. Similarly, $\mu_{\ell}(\hat \Q_{n})$ stands for application, $\mu_\ell(\tilde \Q_{n'})$, $\mu_{k_m}(\tilde \P_{m,n'})$ and $\mu_{k}(\tilde \P_{n'})$ indicate dependence on the argument. \label{footnote:app-vs-dependence}}
This prohibitive runtime inspired the development of HSIC approximations \citep{zhang18large} using the Nyström method and random Fourier features. We review the Nyström-based construction in Section~\ref{sec:nystr-estim-m=2} and explain why the technique is restricted to $M=2$ components, before presenting our alternative approximation scheme of HSIC in Section~\ref{sec:prop-hsic-appr} which is capable of handling $M\geq2$ components.

\subsection{Nyström Method} \label{sec:nystr-estim-m=2}
In this section, we recall the existing Nyström approximation, which can handle $M=2$ components.

The expression \eqref{eq:emp-hsic} can be rewritten \citep{gretton05measuring} for $M=2$ components as 
\begin{align}
  \label{eq:hsic-two-components}
  \HSIC^2_{k}\left(\Psamp_{n}\right) = \frac{1 }{n^2}\trace\left( \mathbf{H  K}_{k_1} \mathbf{ HK}_{k_2} \right),
\end{align}
with the centering matrix $ \mathbf{H} =  \mathbf{I}_n - \frac 1 n \bm 1_n \bm 1_n \tran \in \R^{n\times n}$, Gram matrices $\mathbf{K}_{k_1},\,\mathbf{K}_{k_2}$ defined in \eqref{eq:gram-matrix}, and sample $\Psamp_n := \left\{(x_1^1,x_2^1),\dots,(x_1^n,x_2^n)\right\}$ as in \eqref{eq:sample-of-m-tuplets} with $M=2$. The naive computation of \eqref{eq:hsic-two-components} costs $\O\left(n^3\right)$. However, noticing that $\trace(\b A\tran \b B) = \sum_{i,j\in[n]}  A_{i,j } B_{i,j}$, the computational complexity reduces to $\O\left(n^2\right)$. The quadratic complexity can be reduced by the Nyström approximation\textsuperscript{\ref{footnote:app-vs-dependence}} \citep{zhang18large} 
\begin{align}
\label{eq:hsic-ny-plugin}
\begin{split}
  \HSIC^2_{k,\text N_0}\left(\hat \P_n\right) &= \frac{1}{n^2}\trace\left(\mathbf{HK}^{\nys}_{k_1}\mathbf{HK}^{\nys}_{k_2}\right) \\
  &\stackrel{(*)}{=} \frac {1}{n^2}\norm{\left(\mathbf{H}\phi_{k_1}^{\nys}\right)\tran \mathbf{H}\phi_{k_2}^{\nys}}{\text F}^2,
\end{split}
\end{align}
which we detail in the following. The Nyström approximation relies on a subsample of size $n' \leq n$ of $\Psamp_n$, which we denote by $\tilde\P_{n'} :=  \left\{\big(\tilde x_1^1, \tilde x_2^1\big), \dots,  \big(\tilde x_1^{n'}, \tilde x_2^{n'}\big)\right\}$; the tilde indicates a relabeling. The subsample allows one to define three matrices
\begin{align}
  \label{eq:matrix-definitions}
  \begin{split}
      \mathbf{K}_{k_m,n'n'} &= \left[k_m\left(\tilde x_m^i,\tilde x_m^j\right)\right]_{i,j\in[n']} \in \R^{n'\times n'}, \\
  \mathbf{K}_{k_m,nn} &= \mathbf{K}_{k_m} \in \R^{n\times n}, \\
  \mathbf{K}_{k_m,n'n} &= \left[k_m(\tilde x_m^i, x_m^j)\right]_{i\in[n'],j\in[n]} \in \R^{n'\times n}, 
  \end{split}
\end{align}
where $m\in [2]$ and $\b K_{k_m}$ is defined in \eqref{eq:gram-matrix}, and let $\mathbf{K}_{k_m,nn'} = \mathbf{K}_{k_m,n'n}\tran \in \R^{n\times n'}$. The matrices  $\b K_{k_m}^{\nys}$ $(m \in [2])$ as used in \eqref{eq:hsic-ny-plugin} are 
\begin{align*}
  \mathbf{K}_{k_m}^{\nys} &:= \mathbf{K}_{k_m,nn'}\mathbf{K}_{k_m,n'n'}^{-1}\mathbf{K}_{k_m,n'n}
  \\ &= \underbrace{\mathbf{K}_{k_m,nn'}\mathbf{K}_{k_m,n'n'}^{-\frac 1 2 }}_{=:\phi_{k_m}^{\nys} \in \R^{n\times n'}} \big(\underbrace{\mathbf{K}_{k_m,nn'}\mathbf{K}_{k_m,n'n'}^{-\frac 1 2 }}_{\phi_{k_m}^{\nys}}\big)\tran  \in \R^{n\times n}, 
\end{align*}
provided that the inverse $\mathbf{K}_{k_m,n'n'}^{-1}$ exists.
In \eqref{eq:hsic-ny-plugin} the r.h.s.\ of $(*)$ has a computational complexity of $\O\left({n'}^3 + n{n'}^2\right)$,\footnote{This follows from the complexity of $\O\left({n'}^3\right)$ of inverting an $n' \times n'$ matrix and the complexity of multiplying both feature representations \citep{zhang18large}.} which is smaller than $\O\left(n^2\right)$ of \eqref{eq:hsic-two-components},  provided that $n' < \sqrt n$; this speeds up the computation. Step $(*)$ relies on the  cyclic invariance property of the trace, and the idempotence of $\b H$ (in other words, $\b H \b H = \b H$), limiting the above derivation to $M=2$ components; the approach does not extend naturally to the case of $M>2$.


\section{Proposed HSIC Estimator}
\label{sec:prop-hsic-appr}
We now elaborate the proposed Nyström HSIC approximation for $M\geq 2$ components. 


Recall that the centered cross-covariance operator takes the form
\begin{align}
  C_X &= \mu_{k}(\P{}) - \mu_{k}\left(\prodmarginals\right)  \nonumber \\ 
  &
  = \mu_k(\P{}) - \otimes_{m=1}^M\mu_{k_m}\left(\P_m\right). \label{eq:centered-cross-cov}
\end{align}
There are $M+1$ expectations in this expression; we  estimate these mean embeddings separately. This conceptually simple construction is, to the best of our knowledge, the first that handles $M\ge 2$ components, and it allows one to leverage recent bounds on mean estimators (Lemma~\ref{lemma:nystrom-mean-embedding}).
We first detail the general Nyström method for approximating expectations  $\int_{\Y{}}\phi_\ell(y)\d\Q{} (y) $ associated to a kernel $\ell : \Y \times \Y \to \R$ and probability distribution $\mathbb Q \in \mathcal M_1^+(\Y{})$. One can then choose
\begin{align}
\begin{split}
    (\Y, \ell, \Q) &= (\X, k, \P{}), \text{ and }\\ (\Y, \ell, \Q) &= (\X_m, k_m, \P_{m}), \quad m\in[M],
\end{split}
\label{eq:triplets}
\end{align}
to achieve our goal.

Let $\tilde \Q_{n'} = \left\{\tilde y^1,\dots,\tilde y^{n'}\right\}$ be a subsample (with replacement) of $\hat \Q_n = \left\{y^1,\dots,y^n\right\} \stackrel{\text{i.i.d.}}{\sim} \Q$ referred to as Nyström points; the tilde again indicates relabeling. 
The usual estimator of the mean embedding replaces the population mean with its empirical counterpart over $n$ samples\textsuperscript{\ref{footnote:app-vs-dependence}}
\begin{align*}
  \mu_{\ell}(\Q) = \int_{\Y{}}\phi_\ell(y)\d\Q{} (y) 
   \approx \frac 1 n \sum_{i\in[n]} \phi_\ell(y^i) =  \mu_{\ell}(\hat \Q_{n}).
\end{align*}
Instead, the Nyström approximation uses a weighted sum with weights  $\alpha_i \in \R$ ($i \in [n']$): given $n'$ Nyström points, the estimator takes the form\textsuperscript{\ref{footnote:app-vs-dependence}}
\begin{align*}
  \mu_{\ell}(\Q{})   &\approx \sum_{i\in[n' ]}\alpha_i \phi_\ell(\tilde y^i) = \mu_{\ell}\left(\tilde \Q_{n'}\right) \in  \mathcal H_\ell^{\nys},
\end{align*}
where $\H_\ell^{\nys} := \Span\left(\phi_\ell\big(\tilde y^i\big)\,:\,i\in [n']\right) \subset \mathcal H_\ell$.
The coefficients $\bm \alpha_{\ell} = (\alpha_{\ell}^1,\dots,\alpha_{\ell}^{n'})\tran \in \R^{n'}$  are obtained by the minimum norm solution of 
\begin{align}
 \min_{\bm\alpha_\ell \in\R^{n'}}\norm{\mu_{\ell}\left({\hat \Q_n}\right) -\sum_{i\in[n' ]}\alpha_\ell^i \phi_\ell\left(\tilde y^i\right)}{\H_\ell}^2. \label{eq:optim-prob}
\end{align}
The following lemma  describes the solution of \eqref{eq:optim-prob}.


\begin{lemma}[Nyström mean embedding, \citet{chatalic22nystrom}] 
  \label{lemma:nystrom-mean-embedding}
For a kernel $\ell$ with corresponding feature map $\phi_\ell$, an i.i.d.\  sample $\hat\Q_n$ of distribution $\Q{}$, and a subsample $\tilde \Q_{n'}$ of $\hat \Q_n$, the Nyström estimate of $\mu_{\ell}(\Q{})$ is given by
  \begin{align}
    \mu_{\ell}\left(\tilde \Q_{n'}\right) &= \sum_{i \in[n']}\alpha_\ell^i\phi_\ell\left(\tilde y^i\right), \nonumber\\
    \bm \alpha_\ell &= \frac 1 n \left(\mathbf{ K}_{\ell, n'n'}\right)^{-}\mathbf{K}_{\ell, n'n}\bm 1_n  \label{eq:alpha-k},
  \end{align}
  with Gram matrix  $\mathbf{K}_{\ell,n'n'} = \left[\ell(\tilde y^i,\tilde y^j)\right]_{i,j\in[n']} \in \R^{n'\times n'}$, and  $\mathbf{K}_{\ell,n'n} = \left[ \ell(\tilde y^i, y^j) \right]_{i\in[n'],j\in[n]} \in \R^{n'\times n}$.
\end{lemma}

Let
\begin{align}
\tilde\P_{n'} = \left\{\Big(\tilde x_1^1,\dots,\tilde x_M^1\Big),\dots,\Big(\tilde x_1^{n'},\dots,\tilde x_M^{n'}\Big)\right\} \label{eq:Nystrom-samples}
\end{align}
be a subsample (with replacement) of
$
\Psamp_n = \left\{\left(x_1^1,\dots,x_M^1\right),\dots,\left(x_1^n,\dots,x_M^n\right)\right\}
$ defined in \eqref{eq:sample-of-m-tuplets}, and
\begin{align} \label{eq:tildeP:m,n'}
\tilde\P_{m,n'} = \left\{\tilde x_m^1,\dots,\tilde x_m^{n'}\right \}
\end{align}
be the corresponding subsample of the $m$-th marginal ($m\in [M]$).
Using our choice \eqref{eq:triplets} with Lemma~\ref{lemma:nystrom-mean-embedding}, the estimators for the embeddings of marginal distributions take the form\textsuperscript{\ref{footnote:app-vs-dependence}}
\begin{align}
    \mu_{k_m}\left(\tilde \P_{m,n'}\right) &= \sum_{i \in [n']}\alpha_{k_m}^i \phi_{k_m}\left(\tilde x_m^i\right), \nonumber\\
  \bm \alpha_{k_m} &= \frac 1 n \left( \mathbf{K}_{k_m,n'n'}\right)^{-} \mathbf{K}_{k_m,n'n} \bm 1_n,  \label{eq:alpha-marginal}
\end{align}
and the estimator of the mean embedding of the joint distribution is\textsuperscript{\ref{footnote:app-vs-dependence}}
\begin{align}
  \mu_{k}\left(\tilde \P_{n'}\right) &= \sum_{i \in [n']} \alpha_k^i \otimes_{m=1}^M \phi_{k_m}\left(\tilde x_m^i\right), \nonumber
    \\\bm \alpha_{k} &=  \frac 1 n \left(  \mathbf{K}_{k,n'n'} \right)^{-}\left( \mathbf{K}_{k,n'n}\right) \bm 1_n \nonumber  
  \\
  & \stackrel{(*)}{=}  \frac 1 n \overbrace{\Big(\underbrace{\hadamard_{m\in[M]} \mathbf{K}_{k_m,n'n'}}_{(a)}\Big)^{-}}^{(c)} \times \nonumber \\ &\quad\hspace{.55cm} \Big(\underbrace{\hadamard_{m\in[M]}  \mathbf{K}_{k_m,n'n}}_{(b)}\Big) \bm 1_n \label{eq:alpha-prod},
\end{align}
where $(*)$ holds as for the Gram matrix $\b K_{k,n'n'}$ associated with the product kernel $k = \otimes_{m\in [M]}k_m$ one has 
\begin{align*}
  \mathbf{K}_{k,n'n'} &= \left[ k\left((\tilde x_1^i,\dots,\tilde x_M^i),(\tilde x_1^j,\dots,\tilde x_M^j)\right)\right]_{i,j\in[n']} \\ &= \left[\prod_{m\in[M]}k_m(\tilde x_m^i,\tilde x_m^j)\right]_{i,j\in[n']} \hspace{-0.6cm}= \hadamard_{m\in[M]}\mathbf{K}_{k_m,n'n'}, 
\end{align*}
and similarly $\mathbf{K}_{k,n'n} = \hadamard_{m\in[M]}\mathbf{K}_{k_m,n'n}$, with $\mathbf{K}_{k_m,n'n'}$ and $\mathbf{K}_{k_m,n'n}$ defined in \eqref{eq:matrix-definitions}.

Combining the $M+1$ Nyström estimators in \eqref{eq:alpha-marginal} and in \eqref{eq:alpha-prod} gives rise to the overall Nyström HSIC estimator, which is elaborated in the following lemma.

\begin{lemma}[Computation of Nyström $M$-HSIC]
  \label{thm:nystroem-hsic}
  The Nyström estimator for HSIC  can be expressed as\textsuperscript{\ref{footnote:app-vs-dependence}}
  \begin{eqnarray}
    \label{eq:nyström-hsic}
    \lefteqn{\HSIC_{k,\text N}^2\left(\Psamp_{n}\right) = \bm \alpha_k\tran\left( \hadamard_{m\in[M]} \mathbf{K}_{k_m,n'n'}\right) \bm \alpha_k } \\
                                               && \hspace{-0.8cm} + \hspace{-0.2cm}\prod_{m\in[M]} \bm \alpha_{k_m}\tran  \mathbf{K}_{k_m,n'n'} \bm\alpha_{k_m} 
                                                \hspace{-0.1cm} - 2 \bm\alpha_k\tran \left ( \hadamard_{m\in[M]}  \mathbf{K}_{k_m,n'n'}\bm\alpha_{k_m} \right), \nonumber
  \end{eqnarray}
  where $\bm \alpha_{k_m}$ and $\bm \alpha_{k}$ are defined in \eqref{eq:alpha-marginal} and \eqref{eq:alpha-prod}, respectively, $\mathbf{K}_{k_m,n'n'}$ is defined in \eqref{eq:matrix-definitions}, and $\text{N}$ in the subscript of the estimator refers to Nyström. Note that \eqref{eq:nyström-hsic} depends on $\hat \P_{n}$ as one must solve \eqref{eq:optim-prob}.
\end{lemma}

\begin{remark}~
  \label{remark:nystroem-hsic}
  \begin{itemize}[labelindent=0em,leftmargin=0.85em,topsep=0cm,partopsep=0cm,parsep=0cm,itemsep=2mm]
  \item \textbf{Uniform weights, no subsampling.} The estimator \eqref{eq:nyström-hsic} gives back \eqref{eq:emp-hsic} when $\bm \alpha_k :=  \bm \alpha_{k_m} := \frac 1 n \bm 1_n$ for all $m\in[M]$, and when there is no subsampling applied.
  
  \item \textbf{Runtime complexity.}   In order to determine the computational complexity of \eqref{eq:nyström-hsic} one has to find that of \eqref{eq:alpha-prod}; that of \eqref{eq:alpha-marginal} follows by choosing $M=1$ in \eqref{eq:alpha-prod}.
    $(a)$ and $(b)$ in \eqref{eq:alpha-prod} are Hadamard products; hence their computational complexity is $\O\left(M{n'}^2\right)$ and $\O\left(Mnn'\right)$, respectively. $(c)$ in \eqref{eq:alpha-prod} is the Moore--Penrose inverse of an $n' \times n'$ matrix; thus its complexity is $\O\left({n'}^3\right)$. Hence, the computation of $\bm \alpha_k$ costs $\O\big(M{n'}^2 + {n'}^3+Mn'n \big)$, and that of $(\bm \alpha_{k_m})_{m=1}^M$ is  $\O\big({n'}^2 + {n'}^3+n'n \big)$ for each $m\in[M]$.
    In \eqref{eq:nyström-hsic} each term can be computed in $\O\left(M {n'}^2\right)$. Overall the Nyström $M$-HSIC estimator has complexity  $\O\big(M{n'}^2 + M{n'}^3+Mn'n \big) = \O\big(M{n'}^3+Mn'n \big)$.
    
  \item \textbf{Difference compared to the estimator by \citet{zhang18large}.} For $M=2$, \eqref{eq:nyström-hsic} reduces to
    \begin{eqnarray}
      \label{eq:ours-two-components}
      \lefteqn{
      \HSIC_{k,\text N}^2\left( \Psamp_{n}\right) = \bm\alpha_k\tran\left(\circ_{i\in[2]}\mathbf{K}_{k_i,n'n'}\right) \bm\alpha_k  } \\
 &&\hspace{-0.5cm}+ \prod_{i\in[2]}\bm\alpha_{k_i}\tran \mathbf{K}_{k_i,n'n'} \bm\alpha_{k_i}  -2 \bm\alpha_k\tran\left(\circ_{i\in[2]} \mathbf{K}_{k_i,n'n'}\bm\alpha_{k_i}\right). \nonumber
    \end{eqnarray}
    Using the equivalence of \eqref{eq:emp-hsic} and  \eqref{eq:hsic-two-components} in case $M=2$ gives
    \begin{eqnarray*}
      \lefteqn {\trace\left(\mathbf{HK}_{k_1}\mathbf{HK
      }_{k_2}\right) = \frac{1}{n^2}\bm1_n\tran\left(\mathbf{K}_{k_1} \circ \mathbf{K}_{k_2}\right)\bm 1_n} \\
      &&\hspace{-0.5cm}+ \frac{1 }{n^4}\prod_{i\in[2]}\bm 1_n\tran \mathbf{K}_{k_i} \bm 1_n - \frac{2}{n^3}\bm 1_n \tran\left(\mathbf{K}_{k_1}\bm1_n\circ \mathbf{K}_{k_2}\bm 1_n\right),
    \end{eqnarray*}
    hence \eqref{eq:hsic-ny-plugin} becomes
    \begin{eqnarray}
      \label{eq:zhang-extended-form}
      \lefteqn{\HSIC_{k,\text N_0}^2\left( \Psamp_{n}\right)  = \frac{1}{n^2}\bm1_n\tran\left(\mathbf{K}_{k_1}^{\nys} \circ \mathbf{K}_{k_2}^{\nys}\right)\bm 1_n} \\ 
      &&\hspace{-0.5cm}+ \frac{1 }{n^4} \prod_{i\in [2]}\bm 1_n\tran \mathbf{K}_{k_i}^{\nys} \bm 1_n - \frac{2}{n^3}\bm 1_n \tran\left(\mathbf{K}_{k_1}^{\nys}\bm1_n\circ \mathbf{K}_{k_2}^{\nys}\bm 1_n\right).\nonumber
    \end{eqnarray}
    The estimators \eqref{eq:ours-two-components} and \eqref{eq:zhang-extended-form} are identical if $\bm \alpha_k =  \bm \alpha_{k_m} = \frac 1 n \bm 1_n$ for all $m\in[M]$ and when there is no subsampling; in the general case they do not coincide. In \eqref{eq:hsic-ny-plugin} the dominant term in the complexity is $\left(n'\right)^2 n$ (since $n' < n$); this reduces to $n'n$ in our proposed estimator \eqref{eq:nyström-hsic}.

  \end{itemize}
\end{remark}

Key to showing the  consistency of the proposed Nyström $M$-HSIC estimator \eqref{eq:nyström-hsic} (Proposition~\ref{thm:error-nystrom-hsic})
is our next lemma, which describes how the Nyström approximation error of the mean embeddings of the components ($d_{k_m}$ below) can be propagated through tensor products.

\begin{lemma}[Error propagation on tensor products]
  \label{lemma:decomposition}
  Let $X=(X_m)_{m=1}^M  \in \X = \times_{m=1}^M \X_m$, $k_m: \X_m \times \X_m \rightarrow \R$ bounded kernels ($\exists a_{k_m} \in (0,\infty)$ such that $\sup_{x_m\in \X_m}\sqrt{k_m(x_m,x_m)} \leq a_{k_m}$, $m\in [M]$), $k=\otimes_{m=1}^M k_m$, $\H_k$ the RKHS associated to $k$,  $X\sim\P{}\in \mathcal M_1^+(\X{})$, $\P_m$ the $m$-th marginal of $\P$ ($m\in [M]$), $n'\le n$, and $\tilde\P_{m,n'}$ defined according to \eqref{eq:tildeP:m,n'}.
   Then
    \begin{eqnarray*}
      \lefteqn{\norm{\otimes_{m=1}^M \mu_{k_m}\left(\P_m\right) - \otimes_{m=1}^M \mu_{k_m}\left(\tilde\P_{m,n'}\right)}{\H_k} \le}\\
      &&\le \prod_{m\in[M]}\left(a_{k_m}+d_{k_m}\right) - \prod_{m\in[M]}a_{k_m},
    \end{eqnarray*}
    where $d_{k_m} = \norm{\mu_{k_m}\left(\P_m\right) -  \mu_{k_m}\left(\tilde \P_{m,n'}\right)}{\H_{k_m}}$.
  \end{lemma}
  Our resulting Nyström $M$-HSIC performance guarantee is as follows.
  \begin{proposition}[Error bound for Nyström $M$-HSIC] \label{thm:error-nystrom-hsic}
   Let $X=(X_m)_{m=1}^M \in \mathcal X = \times_{m=1}^M \X_m$, $ X \sim\P{}\in \mathcal M_1^+(\X{})$,  $(\mathcal X_m)_{m\in [M]}$ locally compact, second-countable topological spaces, $k_m: \mathcal X_m \times \mathcal X_m \rightarrow \R$ bounded kernels, i.e., $\exists a_{k_m}\in (0,\infty)$ such that $\sup_{x_m\in \X_m}\sqrt{k_m(x_m,x_m)} \leq a_{k_m}$ for all $m\in [M]$, $k=\otimes_{m\in [M]}k_m$, $a_k = \prod_{m=1}^M a_{k_m}$,  $\phi_{k_m}(x_m)=k_m(\cdot,x_m)$ for all $x_m \in \X_m$, $\phi_k = \otimes_{m=1}^M \phi_{k_m}$, $C_k = \E_{}\left[\phi_{k}(X) \otimes \phi_{k}(X) \right]$, $C_{k_m} = \E_{}\left[\phi_{k_m}(X_m) \otimes \phi_{k_m}(X_m) \right]$, the number of Nyström points $n' \le n$,  $\hat\P_{n}$ defined according to \eqref{eq:sample-of-m-tuplets}.
  Then, for any $\delta \in \left(0, \frac{1}{M+1}\right)$  
  \begin{eqnarray*}
    \lefteqn{\left|\mathrm{HSIC}_k(\P{}) - \HSIC_{k,\text N}\left(\hat \P_{n}\right) \right| \leq \underbrace{\frac{c_{k,1}}{ \sqrt{n}}}_{t_{k,1}} + \underbrace{\frac{c_{k,2}}{n'}}_{t_{k,2}} + }\\
    &&+ \underbrace{\frac{c_{k,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X}\left(\frac{12a_k^2\log(n'/\delta)}{n'}\right)}}_{t_{k,3}} +\\
    &&+\prod_{m\in[M]}\Bigg[a_{k_m} + \underbrace{\frac{c_{k_m,1}}{ \sqrt{n}}}_{t_{k_m,1}} + \underbrace{\frac{c_{k_m,2}}{n'}}_{t_{k_m,2}} + \\
    && + \underbrace{\frac{c_{k_m,3}\sqrt{\log(n'/\delta)}}{n'}\sqrt{\mathcal N_{X_m}\left(\frac{12a_{k_m}^2\log(n'/\delta)}{n'}\right)}}_{t_{k_m,3}}\Bigg]\\
    &&-\prod_{m\in[M]}a_{k_m}
  \end{eqnarray*}
 holds  with probability at least $1-(M+1)\delta$, provided that
  \begin{align*}
    n' \geq \max_{m\in[M]}\left(67,12a_k^2\opnorm{C_k}^{-1},12a_{k_m}^2\opnorm{C_{k_m}}^{-1}\right)\log\frac{n'}{\delta},
  \end{align*}
  where $c_{k,1}= 2a_k\sqrt{2\log(6/\delta)}$, $c_{k,2}=4\sqrt 3 a_k \log(12/\delta)$, $c_{k,3}= 12\sqrt{3\log(12/\delta)}a_k$, $c_{k_m,1} =  2a_{k_m}\sqrt{2\log(6/\delta)}$, $c_{k_m,2}=4\sqrt 3 a_{k_m} \log(12/\delta)$, $c_{k_m,3} =  12\sqrt{3\log(12/\delta)}a_{k_m}$ for $m \in [M]$.
\end{proposition}

As a baseline, to interpret the result (see the second bullet point in Remark~\ref{remark:main-prop-remarks}), one could consider the V-statistic based HSIC estimator \eqref{eq:emp-hsic} for $M\ge 2$, which according to our following lemma has a convergence rate of $\O_{\text{P}}\left(\frac {1}{\sqrt n}\right)$.

\begin{lemma}[Deviation bound for V-statistic based HSIC estimator]
  \label{lemma:deviation}
  Let $\HSIC_{k}(\hat \P_n)$ be as in \eqref{eq:emp-hsic} on a metric space $\X=\times_{m=1}^M\X_m$, and $\HSIC_k\left(\P\right) > 0$. Then
  \begin{align*}
    \left|\HSIC_k\left(\P\right)-\HSIC_{k}\left(\hat \P_n\right)\right| = \O_{\text{P}}\left(\frac{1}{\sqrt n} \right).
  \end{align*}  
\end{lemma}

\begin{remark}~ \label{remark:main-prop-remarks}
  \begin{itemize}[labelindent=0em,leftmargin=0.85em,topsep=0cm,partopsep=0cm,parsep=0cm,itemsep=2mm]
  \item From the terms $t_{k,1}, t_{k,2}, t_{k_m,1}, t_{k_m,2}, m\in[M]$ it follows that for $n' < \sqrt n$ the respective second term dominates, thus increasing the error; for $n' > \sqrt n$ the respective first term dominates and the computational complexity increases. The effective dimension $\left(t_{k,3}, t_{k_m,3}\right)$ controls the trade-off between the two terms and can be related \citep{chatalic22nystrom} to the decay of the eigenvalues of the respective covariance operator. A convergence rate of $n^{-1/2}$ for the sums $t_{k,1}+t_{k,2}+t_{k,3}$ and $t_{k_m,1}+t_{k_m,2}+t_{k_m,3}$ can be achieved if
    \begin{itemize}[labelindent=0em,leftmargin=1.2em,topsep=0cm,partopsep=0cm,parsep=0cm,itemsep=2mm]
    \item  $\max_{m\in[M]}\left(\mathcal N_X(\lambda),\mathcal N_{X_m}(\lambda)\right) \le c\lambda^{-\gamma}$ for some $c > 0$ and $\gamma \in (0,1]$ with $n' = n^{1/(2-\gamma)}\log(n/\delta)$, or
    \item  $\max_{m\in[M]}\left(\mathcal N_X(\lambda),\mathcal N_{X_m}(\lambda)\right) \le \log(1+c/\lambda)/\beta$ for some $c>0$, $\beta >0$, and  $n' = \sqrt n \log\left(\sqrt n \max\limits_{m\in[M]}\left(\frac 1 \delta, \frac{c}{6a_k^2}, \frac{c}{6a_{k_m}^2}\right)\right)$.
    \end{itemize}
    This rate of convergence propagates through the product.
   
  \item Lemma~\ref{lemma:deviation} establishes that the V-statistic based estimator of HSIC converges with rate $n^{-1/2}$. Recalling the last line of Table~\ref{tab:comparison}, setting $n' = o\left(n^{2/3}\right)$, the proposed estimator yields an asymptotic speedup over V-HSIC. Hence, setting $n' = \tilde \O\left(\sqrt n\right)$ allows one to obtain the same rate of convergence while decreasing runtime. Assumption $\HSIC_k\left(\P\right) > 0$ in Lemma~\ref{lemma:deviation} protects one from attaining a convergence rate of $n^{-1}$ of $\HSIC^2_k\left(\hat \P_n\right)$.
  \end{itemize}  
\end{remark}

\section{Experiments}
\label{sec:experiments}

In this section, we demonstrate the efficiency of the proposed method (N-MHSIC) 
against the baselines NFSIC, RFF-HSIC, N-HSIC and the quadratic-time V-statistic based HSIC estimator (V-HSIC) in the context of independence testing. Hence, the null  hypothesis $H_0$ is that the joint distribution factorizes to the product of the marginals, the alternative $H_1$ is that this is not the case. The experiments study both synthetic (Section~\ref{subsec:toy-problems}) and real-world (Section~\ref{subsec:real-problems}) examples, in terms of power and runtime.\footnote{The code of our experiments is available at \url{https://github.com/FlopsKa/nystroem-mhsic}.} 

We use the Gaussian kernel 
\begin{align*}
k_m(\b x_m,\b x_m') = \exp\left(-\gamma_{k_m} \norm{\b x_m-\b x_m'}{2}^2\right)
\end{align*}
for all experiments, with $\gamma_{k_m}$ chosen according to the median heuristic. For a fair comparison of the test power, we approximate the null distribution of each test statistic by the permutation approach with $250$ samples.  We then perform a one-sided test with an acceptance region of $95\%$ ($\alpha=0.05$), which we repeat, for all power experiments, on $100$ independent draws of the data; the runtime results include these. We set each algorithm's parameters as recommended by the respective authors: For NFSIC, we set the number of test locations $J =5 $; the number of Fourier features (RFF-HSIC) and Nyström samples (N-HSIC) is set to $ \sqrt n$. The number of Nyström samples of N-MHSIC is indicated within the experiment description. The opaque area in the figures indicates the $0.95$-quantile obtained over $5$ runs. All experiments were performed on a PC with Ubuntu 20.04, 124GB RAM, and 32 cores with 2GHz each. 

\subsection{Synthetic Data}
\label{subsec:toy-problems}

We examine three toy problems in the following, illustrating runtime and statistical power.

\paragraph{Comparison of HSIC approximations under $H_0$.} \label{sec:toy-comparison}
First, for $M=2$ components, we compare our proposed method to the existing accelerated HSIC estimators (N-HSIC, RFF-HSIC) on independent data to assess convergence w.r.t.\ runtime.  Specifically, we set $X_1, X_2 \stackrel{\text{i.i.d.}}{\sim} \mathcal N(0,1)$. The theoretical value of HSIC is thus zero. 
Figure~\ref{fig:runtime_indep_data} shows the estimates for sample sizes from 100 to 1000; the number of Nyström samples for N-MHSIC is set to $n' = 2\sqrt n$. All approaches converge to zero, with N-MHSIC converging a bit slower than the existing HSIC approximations. However, we note that the gap is on the order of $10^{-3}$ so it is close to the theoretical value also for small sample sizes. The runtime scales as predicted by the complexity analysis, with the proposed approach running faster than both N-HSIC and RFF-HSIC starting from $n=500$ samples.

\begin{figure}
  \centering
  \includegraphics[width=\linewidth]{figures/indep_runtime.pdf}
  \caption{Estimation accuracy for $M=2$ components; the theoretical HSIC value is zero.}
  \label{fig:runtime_indep_data}
\end{figure}

\paragraph{Dependent Data ($H_1$ holds).} \label{sec:toy-strong-dependence}
To evaluate the statistical power on $M=2$ components, we set $X_1 \sim \mathcal N(0,1)$, $X_2=X_1 +\epsilon$, and $\epsilon \sim \mathcal N(0,1)$, with $n'$ set as before. Figure~\ref{fig:power} shows that N-MHSIC achieves a power of one for $n\approx 100$ and that it is slightly worse than the existing HSIC approximations for small sample sizes. V-HSIC has the highest power but also the highest runtime. Even though NFSIC has linear runtime complexity, it is slower than all other statistics on small sample sizes.

\begin{figure}
  \centering
  \includegraphics[width=\linewidth]{figures/power.pdf}
  \caption{Power on dependent data. Runtime on log scale.}
  \label{fig:power}
\end{figure}


\paragraph{Causal Discovery.} \label{seq:toy-causal-discovery} The experiments until now considered $M=2$ components. However, N-MHSIC allows for handling $M\geq 2$ components and thus can estimate the directed acyclic graph (DAG) governing causality if one assumes an additive noise model.

Specifically, we sample from the structural equations
    $X_i = \sum_{j\in\mathrm{PA}_i}f^{i,j}\left(X_j\right)+\epsilon_i$  for $i \in [M]$,
of a randomly selected fully connected DAG with four nodes ($M=4$), of which there are $24$.
In the equation, $\mathrm{PA}_i$ denotes the parents of $i$ in the associated DAG, and the $\epsilon_i$ are normally distributed and jointly independent, with a variance sampled independently from the uniform distribution $\mathcal U\left(1,\sqrt 2\right)$. 

To now test whether a particular DAG fits the data,
\cite{pfister18kernel} propose to use generalized additive model regression to find the residuals when regressing each node onto all its parents and to reject the DAG if the residuals are not jointly independent. If these are independent, we accept the causal structure. In this application, one is only interested in the relative $p$-values when performing the procedure for all possible DAGs with the correct number of nodes.

V-HSIC has the best performance in \citet{pfister18kernel}, so we only compare against V-HSIC; it is also the only other approach which allows testing joint independence of more than two components. Figure~\ref{fig:sim-dag} shows how often N-MHSIC and V-HSIC identify the correct DAG in 100 samples. V-HSIC has higher power than N-MHSIC and more often identifies the correct DAG for small sample sizes. However, as the r.h.s.\ of Figure~\ref{fig:sim-dag} shows, the proposed algorithm runs even for $n' = 8\sqrt n$ and $n=1500$ twice as fast as V-HSIC while producing the same result quality. Due to their different runtime complexities, the gap in runtime widens further with increasing sample size.

\begin{figure}
  \centering
  \includegraphics[width=0.99\linewidth]{figures/dag_sim.pdf}
  \caption{Ratio of correctly identified DAGs with $4$ nodes.}
  \label{fig:sim-dag}
\end{figure}

\subsection{Real-World Data} \label{subsec:real-problems}
This section is dedicated to benchmarks on real-world data.

\textbf{Million Song Data.} The Million Song Data \citep{bertin11million} contains approximately 500,000 songs. Each has 90 features ($X$) together with its year of release, which ranges from 1922 to 2011 ($Y$). The algorithms must detect the dependence between the features and the year of release. To approximate the power, we draw $100$ independent samples of the whole data set. Figure~\ref{fig:power-msd} shows the results, for level $\alpha=0.01$; the different ranges of $n$ highlight the asymptotic runtime gains. In contrast to a similar experiment of \cite{jitkrittum17adaptive2}, we use a permutation approach for all two-sample tests and increase the number of Nyström samples (random Fourier features) as a function of $n$, obtaining higher power throughout. The problem is sufficiently challenging, so that we set the number of Nyström samples to $8 \sqrt n$ for N-MHSIC. V-HSIC and NFSIC achieve maximum power from $n=500$. N-MHSIC features similar runtime and power as the existing HSIC approximations N-HSIC and RFF-HSIC but can handle more than two components. The runtime plot illustrates that the lower asymptotic complexity of N-MHSIC compared to V-HSIC also holds in practice.

\begin{figure}
  \centering
  \includegraphics[width=\linewidth]{figures/msd.pdf}
  \caption{Test power vs.\ runtime on the Million Song Data.}
  \label{fig:power-msd}
\end{figure}

\paragraph{Weather Causal Discovery.} \label{sec:weather-causal} Here, we aim to infer the correct causality DAG from real-world data, namely the data set of \cite{mooij16distinguishing} which contains $349$ measurements consisting of altitude, temperature and sunshine. The goal is to infer the most plausible DAG with three nodes $(M=3)$ out of the $25$ possible DAGs ($3^3-2 = 25$; two graphs have a cycle). We assume the structural equations discussed before. Figure~\ref{fig:real-world-dag} shows the $p$-values with the estimated DAG (with index $25$) having the largest $p$-value. Again, we compare our results to V-HSIC  and find that both successfully identify the most plausible DAG \citep{pfister18kernel}.

These experiments demonstrate the efficiency of the proposed Nyström $M$-HSIC method.
\begin{figure}
    \centering
    \begin{subfigure}{0.59\linewidth}
         \centering
         \includegraphics[width=\linewidth]{figures/caus_real_world}
        \end{subfigure}%
     ~
     \begin{subfigure}[c]{0.39\linewidth}
       \centering
       \includegraphics[width=\linewidth]{figures/dag}
    \end{subfigure}
    \caption{Testing for joint independence on the residuals of DAGs with three nodes (left) and the DAG with the largest $p$-value (right). The $p$-values agree on DAGs $1$ to $24$.}
    \label{fig:real-world-dag}
\end{figure}

\begin{acknowledgements}
This work was supported by the German Research Foundation
(DFG) Research Training Group GRK 2153: Energy Status Data --
Informatics Methods for its Collection, Analysis and Exploitation. The major part of this work was carried out while Florian Kalinke was a research associate at the Department of Statistics, London School of Economics.
\end{acknowledgements}

%\small
%\bibliographystyle{plain}
%\bibliography{BIB/curated,BIB/collected_zoltan,BIB/publications}
\bibliography{BIB/collected_zoltan,BIB/publications}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
