\section{Preliminaries}\label{sec:preli}
 

The goal of this section is to introduce the basic definitions and lemmas that will be used to prove our main result. 
We first introduce a collection of subsets called geometric weight levels. 

\begin{definition}[Geometric Weight Levels]\label{def:level}
Fix $R \in\mathbb{N}_+$ and $q\in\R^d$. 
We define
%\begin{align*}
$
    w_i:=f(x_i,q)
$. 
%\end{align*}
For any fixed $r\in[R]:=\{1,2,\cdots,R\}$, we define  
$
% \begin{align*}
    L_{r}:=\{x_{i}\in X ~|~ w_{i}\in(2^{-r},2^{-r+1}]\}.
% \end{align*}
$
We define the corresponding distance levels as
% \begin{align*}
    $z_r:=\max\{z ~|~ f(z)\in(2^{-r},2^{-r+1}]\}$,
% \end{align*}
where $f(z):=f(x,q)$ for $z=\|x-q\|_2$. 
In addition, we define 
% \begin{align*}
$L_{R+1}:=X \setminus \bigcup_{ r \in [R] } L_{r}$.
% \end{align*}
\end{definition}

 

Geometric weight levels can be visualized as a sequence of circular rings centered at query $q$. The contribution of each level to kernel density at $q$ is mainly determined by the corresponding distance level.
Next, we introduce the importance sampling technique to accelerate the query procedure.

\begin{definition}[Importance Sampling]\label{def:importance_sampling} 

Let $\{x_1, \dots, x_n\} \subset \mathbb{R}^d$ be a given set of data points. Suppose each point $x_i$ is sampled independently with probability $p_i > 0$. The importance sampling estimator for a quantity of interest is given by: 
\begin{align*} 
    T := \sum_{i=1}^{n} \frac{\chi_i}{p_i} x_i,
\end{align*} 
where $\chi_{i}=1$ if point $x_{i}$ gets sampled and recovered in the phase corresponding to its weight level, and $\chi_{i}=0$ otherwise.
\end{definition}

 

To apply importance sampling, we need to evaluate the contribution of each point. We sample each point that has a high contribution with a high probability. 
A natural difficulty arises: during preprocessing, we have no access to the query, so we cannot calculate distances directly. Locality Sensitive Hashing is a practical tool to address this problem. 

\begin{definition}[Locality Sensitive Hashing \cite{im98}]\label{def:LSH_family}
A family $\mathcal{H}$ is called $(p_\mathrm{near},p_\mathrm{far},z,c)$-sensitive  where $p_\mathrm{near},p_\mathrm{far}\in [0,1],z \in\mathbb{R},c\geq 1$, if for any $x,q\in\mathbb{R}^d$, $\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\leq z]\geq p_\mathrm{near}$, and $\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\geq cz]\leq p_{\mathrm{far}}$.

\end{definition}

 

The next lemma shows the existence of the LSH family and its evaluation time.

\begin{lemma}[Lemma 3.2 on page 6 of~\cite{ai06}]\label{lem:p}
Let $(x, q) \in \mathbb{R}^d \times \mathbb{R}^d$. Define 
\begin{align*}
    p_\mathrm{near}:=p_1(z):=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2 \leq z]
\end{align*}
and
\begin{align*}
    p_\mathrm{far}:=p_2(z,c):=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2 \geq cz].
\end{align*}
Then, for any fixed $z > 0$, there exists 
a hash family $\mathcal{H}$ satisfying
\begin{align*}
    \rho :=\frac{\log {1}/{p_\mathrm{near}}}{\log {1}/{p_\mathrm{far}}}\leq \frac{1}{c^2}+O(\frac{\log t}{t^\frac{1}{2}}),
\end{align*}
for any $c \geq 1, t > 0$, where $p_\mathrm{near}\geq e^{-O(\sqrt{t})}$ and it requires $dt^{O(t)}$ time for every evaluation.

\end{lemma}

\begin{remark}\label{rmk:p_near}
We set $t=\log^{\frac{2}{3}}n$, which results in $n^{o(1)}$ evaluation time and $\rho = \frac{1}{c^2}+o(1)$. Note that if $c=O(\log^\frac{1}{7}n)$, then 
% \begin{align*}
   $ \frac{1}{\frac{1}{c^2}+O(\log t/t^\frac{1}{2})}=c^2(1-o(1))$ \footnote{The above three $o(1)$ can be $\frac{\log \log^\frac{2}{3}n}{\log ^\frac{1}{3}n}, \frac{\log \log^\frac{2}{3}n}{\log ^\frac{1}{3}n}, \frac{\log \log^\frac{2}{3}n}{\log ^\frac{1}{21}n}$ respectively.}.
% \end{align*}

\end{remark}
 
Next, we assign the LSH family to each geometric weight level (Definition~\ref{def:level}) and show how well these families can distinguish points from different levels.

\begin{lemma}[Probability bound for separating points in different level sets, informal version of Lemma~\ref{lem:LSH_formal}]\label{lem:LSH}
 
Given kernel function $f$ and $r \in [R]$, let $L_r$ be the weight level set and $z_r$ be the corresponding distance level (Definition~\ref{def:level}). For any query $q\in\R^d$, any integer pair  $(i, r) \in [R+1] \times [R]$, satisfying $i>r$, let $x \in L_r$ and $x' \in L_i $. Let $c_{i,r}:=\min\{\frac{z_{i-1}}{z_r},\log^{1/7}n\}$. We set up Andoni-Indyk LSH family (Definition~\ref{def:LSH_family}) $\mathcal{H}$ with near distance $z_r$ and define
\begin{align*}
    p_{\mathrm{near},r}:=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\leq z_r]
    % p_{\mathrm{far},r}&~:=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\geq cz]
\end{align*}
and
\begin{align*}
    % p_{\mathrm{near},r}&~:=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\leq z]
    p_{\mathrm{far},r}:=\Pr_{h\sim\mathcal{H}}[h(x)=h(q)~|~\| x-q\|_2\geq c_{i,r}z_r].
\end{align*}
Then, for any $k\geq 1$, we have $\Pr_{h^*\sim \mathcal{H}^k}[h^*(x)=h^*(q)]\geq p_{\mathrm{near},r}^k$
and $\Pr_{h^*\sim \mathcal{H}^k}[h^*(x')=h^*(q)]\leq p_{\mathrm{near},r}^{kc_{i,r}^2(1-o(1))}$.
 
\end{lemma}

This lemma suggests that we can apply LSH several times to separate points in different level sets. It is useful for recovering points in a specific level set when estimating the ``importance'' of a point based on its distance from the query point. We will discuss more in Section~\ref{sec:data}. 
We use a definition of the cost of a kernel similar to the one in~\cite{ckns20}.  
\begin{definition}[Kernel cost]\label{def:cost_K}
Let $f$ be a kernel with geometric weight levels $L_r$'s and distance levels $z_r$'s as defined in Definition~\ref{def:level}. For any $r\in[R]$, we define the kernel cost of $f$ for $L_r$ as  
\begin{align*}
    \cost(f,r):=\exp_2(\max\limits_{i\in\{r+1,\cdots,R+1\}}\lceil\frac{i-r}{c_{i,r}^2(1-o(1))}\rceil),
\end{align*}
where 
\begin{align*}
c_{i,r}:=\min\{\frac{z_{i-1}}{z_{r}},\log^\frac{1}{7}n\}.
\end{align*}
Then we define the general cost of a kernel $f$ as
\begin{align*}
    \cost(f):=\max_{r \in [R]}\cost(f,r).
\end{align*}
\end{definition}
Note that when $f$ is the Gaussian kernel, $\cost(f)$ is $(\frac{1}{f_{\mathsf{KDE}}})^{(1+o(1))\frac{1}{4}}$~\cite{ckns20}.

