% \documentclass{uai2022} % for initial submission

\documentclass[accepted]{uai2022}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{float} % added by Yu Chen
% \usepackage{subcaption} % added by Yu Chen

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

% If you use natbib package, activate the following three lines:
% \usepackage[round]{natbib}
% \usepackage{natbib}
% \renewcommand{\bibname}{References}
% \renewcommand{\bibsection}{\subsubsection*{References}}

% use Times
\usepackage{times}
% For figures
\usepackage{graphicx} % more modern
%\usepackage{epsfig} % less modern
\usepackage{subfigure}

% % For citations
% \usepackage{natbib}

% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{hyperref}

\renewcommand{\theHalgorithm}{\arabic{algorithm}}

% Fixed-size font switches: \fontsize{<size>}{<baselineskip>}\selectfont, with
% the baseline skip set equal to the font size (both in pt).
% The `%' after each opening brace stops the end of line from becoming a space
% token at the start of the macro body; without it every use of the switch in
% running text typesets a stray interword space.
\newcommand{\csize}{%
\fontsize{8}{8}\selectfont
}

\newcommand{\csizenine}{%
\fontsize{9}{9}\selectfont
}

\newcommand{\csizenineplus}{%
\fontsize{9.5}{9.5}\selectfont
}

\newcommand{\csizeten}{%
\fontsize{10}{10}\selectfont
}

\newcommand{\tabsize}{%
\fontsize{7}{7}\selectfont
}

% Typesets an algorithmic comment as "% <text>" in italics, using the
% \csizenine font switch. The size change is confined to a group so it does
% not leak into the rest of the algorithm line.
% Line ends inside the definition are commented out so that the only space
% produced is the explicit `\ ' between the percent sign and the comment text.
\renewcommand\algorithmiccomment[1]{%
  {%
    \csizenine
    \textit{\%\ #1}%
  }%
}

% \frenchspacing

% \ug{<text>}: review annotation, typesets <text> in magenta.
% Uses \textcolor (as the sibling \s macro does) so that no space token from
% the definition ends up in front of the coloured text.
\newcommand{\ug}[1]{\textcolor{magenta}{#1}}
\usepackage{url}  %Required
\frenchspacing  %Required
% \usepackage{amsmath}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{epstopdf}
\usepackage{lipsum}
\usepackage{color}
\usepackage[normalem]{ulem}

% Calligraphic shorthands: \cX expands to \mathcal{X}, wrapped in an extra
% brace group. \bP is the exception in this list: it is bold (\mathbf{P}).
\newcommand{\cA}{{\mathcal{A}}}
\newcommand{\cB}{{\mathcal{B}}}
\newcommand{\cC}{{\mathcal{C}}}
\newcommand{\cD}{{\mathcal{D}}}
\newcommand{\cG}{{\mathcal{G}}}
\newcommand{\cI}{{\mathcal{I}}}
\newcommand{\cN}{{\mathcal{N}}}
\newcommand{\cM}{{\mathcal{M}}}
\newcommand{\cO}{{\mathcal{O}}}
\newcommand{\cP}{{\mathcal{P}}}
\newcommand{\bP}{{\mathbf{P}}}
\newcommand{\cR}{{\mathcal{R}}}
\newcommand{\cS}{{\mathcal{S}}}
\newcommand{\cH}{{\mathcal{H}}}
\newcommand{\cK}{{\mathcal{K}}}
\newcommand{\cT}{{\mathcal{T}}}
\newcommand{\cU}{{\mathcal{U}}}
\newcommand{\cV}{{\mathcal{V}}}
\newcommand{\cY}{{\mathcal{Y}}}
\newcommand{\cZ}{{\mathcal{Z}}}
% Minus sign with a negative thin space (\!) on each side; presumably meant
% as a compact set-difference symbol (going by the macro name).
\newcommand{\newsetminus}{{\!-\!}}
% \cV, the tight minus above, then \cA.
\newcommand{\cVmA}{{\cV\newsetminus\cA}}
\newcommand{\cX}{{\mathcal{X}}}
% \cs is a plain letter s; \cVms is \cV, an ordinary minus sign, then \cs.
\newcommand{\cs}{s}
\newcommand{\cVms}{{\cV-\cs}}

% Bold upright (\mathbf) lowercase letters.
\newcommand{\ba}{{\mathbf{a}}}
\newcommand{\bb}{{\mathbf{b}}}
\newcommand{\bu}{{\mathbf{u}}}
\newcommand{\bx}{{\mathbf{x}}}
% \resid is an alias for \cR.
\newcommand{\resid}{\cR}

% Bold upright "NP".
\newcommand{\NP}{{\mathbf{NP}}}

% \DeclareMathOperator{\MIF}{MI} 

% \bs{<symbol>}: wrapper around \boldsymbol.
\newcommand{\bs}[1]{\boldsymbol{#1}}

% \mb{<symbol>}: wrapper around \mathbf.
\newcommand{\mb}[1]{\mathbf{#1}}

% \cM with superscript h and subscript k.
\newcommand{\mhk}{\cM^h_k}

% Cross-reference helpers. Each takes a label and prints a fixed prefix,
% a non-breaking space (~), and \ref{<label>}.
% Note that \eqnref uses plain \ref, so the equation number is printed
% without parentheses.
\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\eqnref}[1]{Eq.~\ref{#1}}
\newcommand{\secref}[1]{Sec.~\ref{#1}}
\newcommand{\appref}[1]{Appendix~\ref{#1}}
\newcommand{\prcref}[1]{Procedure~\ref{#1}}
\newcommand{\assmref}[1]{Assumption~\ref{#1}}
\newcommand{\crlref}[1]{Corollary~\ref{#1}}
\newcommand{\algoref}[1]{Alg.~\ref{#1}}
\newcommand{\prpref}[1]{Proposition~\ref{#1}}
\newcommand{\cnjref}[1]{Conjecture~\ref{#1}}
\newcommand{\axmref}[1]{Axiom~\ref{#1}}
\newcommand{\lmaref}[1]{Lemma~\ref{#1}}

% Theorem-like environments.
% `lemma' and `theorem' each have their own counter.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
% The environments below are numbered on the `lemma' counter (optional
% argument [lemma]), so they share one running sequence with lemmas.
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{procedure}[lemma]{Procedure}
\newtheorem{assumption}[lemma]{Assumption}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{conclusion}[lemma]{Conclusion}
\newtheorem{proposition}[lemma]{Proposition}
\newtheorem{conjecture}[lemma]{Conjecture}
\newtheorem{axiom}[lemma]{Axiom}
\newtheorem{algo}[lemma]{Algorithm}
% `definition' and `remark' are numbered independently of everything above.
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}


% Additions suggested by Sahil: \s{<text>} typesets <text> in magenta.
\newcommand{\s}[1]{\textcolor{magenta}{#1}}

% % Deletions suggested by Sahil (disabled)
% \newcommand{\sd}[1]{\textcolor{orange}{#1}}

% To-do notes: \todo{<text>} prints "Sahil's todo: <text>" in blue.
\newcommand{\todo}[1]{\textcolor{blue}{Sahil's todo: #1}}

% \newcommand{\uai}[1]{\textcolor{brown}{#1}}

% Abbreviation for transfer entropy: \te expands to "TE" followed by a
% space, \tes to "TE" with no trailing space.
\newcommand{\te}{TE }
\newcommand{\tes}{TE}

% Light-gray background colour used by \algshade.
\definecolor{shadecolor}{gray}{0.95}
% \algshade{<content>}: sets <content> in a \linewidth-wide \parbox on a
% shaded background. The box is shifted left by \fboxsep so that the text
% inside lines up with the surrounding text despite the \colorbox padding.
% Every line end inside the definition is commented out: otherwise each one
% becomes a space token, which shifts the box to the right and adds stray
% spaces inside the coloured area on both sides of the \parbox.
\newcommand{\algshade}[1]{%
    \hspace*{-\fboxsep}%
    %\vspace*{-\fboxsep}
    \colorbox{shadecolor}{%
        \parbox{\linewidth}{#1}%
    }%
}

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}

% \twocolumn[

% %% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Estimating Transfer Entropy under Long Ranged Dependencies (Supplementary material)}

% Add authors
% {\href{mailto:<sahil.garg.cs@gmail.com>?Subject=Your UAI 2022 paper}

\author[1,*]{Sahil Garg}
\author[2]{Umang Gupta}
\author[1,$\dagger$]{Yu Chen}
\author[1,$\dagger$]{Syamantak Datta Gupta}
\author[1,$\dagger$]{Yeshaya Adler}
\author[1]{Anderson Schneider}
\author[1]{Yuriy Nevmyvaka}

\affil[1]{%
    Department of Machine Learning Research\\
    Morgan Stanley\\
    New York, NY, USA\\
}

\affil[2]{%
    Department of Computer Science\\
    University of Southern California\\
    Los Angeles, CA, USA
}
    
\affil[*]{Corresponding Author: sahil.garg.cs@gmail.com, sahil.garg@morganstanley.com}

\affil[$\dagger$]{Equal contributions}

\begin{document}

\onecolumn

\maketitle

\appendix

\section{Theoretical Proofs}

% consistency
\subsection{Consistency of Regularization with Hash Codes}\label{subsec:consistency}
% notation
Below, we present a proof of Thm.~\ref{thm:consistency}. We can see LSH-based data generation as sampling from a data-driven histogram. Using this insight, results from \citet{lugosi1996consistency}, and a proof technique similar to that of \citet{rothfuss2019conditional}, we demonstrate the consistency of our sampling approach. We use this approach to estimate entropy, i.e., $\cH(\bs{\cY}_t|\bs{\cY}_{t-1})$.

% \ug{No need to restate the theorem here, right? Also messes up the theorem label. so removing it}

% I think, restate the thoorem here, no worries. Appendix will be separate file anyways.

% Ok 

\begin{theorem}
\label{thm:consistency}
% \textbf{(Theorem~\ref{thm:consistency} restated)}
Suppose that $\frac{2^H}{n} \to 0$ and $\frac{tH\log n}{n} \to 0$ as $n \to \infty$, and that the input space, i.e., the set of $\bs{y} \in \mathbb R^{t}$, is bounded. Consider any function $f\colon \mathbb R^{t} \to (0, \infty)$ with $\log f$ having finite second-order moment w.r.t.\ $p$ and $g_{n,H}$. Then
% 
\begin{align*}
% 
\lim_{n\rightarrow \infty}
\left| \mathbb E_p [-\log f(\bs y)]-\mathbb E_{g_{n,H}} [-\log f(\bs y )] \right| = 0. %
\end{align*}
% 
\end{theorem}

\begin{proof}

% More generally, consider $f:\mathbb R^m \times \Theta \rightarrow \mathbb (0, \infty) $ and  


Let $l(\bs{\theta}) = -\mathbb E_{\bs{y}\sim p} \log f(\bs y)$ and $l_n^{(g, H)}(\bs \theta) = -\mathbb E_{\bs{y} \sim g_{n,H}} \log f(\bs{y})$. Here $g_{n,H}(\bs y)$ denotes the perturbed distribution obtained by using an $H$-dimensional hashcode function and $n$ samples. Consider,


% \begin{theorem}\label{thm:consistency}
% Suppose $n^{-1}\log n \rightarrow 0$, ${n^{-1}md\log n }\rightarrow 0$, the input space, i.e., x is bounded and $f$ has finite second order moments w.r.t to $p$ and $g_{n,d}$, then  $\lim_{n\rightarrow \infty}|l(\theta)-l_{n}^{(g,d)}(\theta)| \rightarrow 0$.   
% \end{theorem}


\begin{align*}
|l(\bs \theta) - l_{n}^{(g,H)}(\bs \theta)|^2 
    &= \left|- \mathbb E_{\bs{y} \sim p} \log f(\bs{y})   + \mathbb E_{\bs{y}\sim g_{n,H}} \log f(\bs{y})\right|^2 \\
    &= \left( \mathbb E_{\bs{y} \sim p} \log f(\bs{y})   -\mathbb E_{\bs{y} \sim g_{n,H}} \log f(\bs{y})\right)^2 \\
    & = \left(\int d\bs{y} \log f(\bs{y}) \left(p(\bs{y}) - g_{n,H}(\bs{y})\right)\right)^2 \\
    % 
    % 
    % & = \left(\int dx \left(\log f(x) (p(x) - g_{n,d}(x))\right)\right)^2 \\
    % & \leq \left(\int dx |\log f(x)||p(x) - g_{n,d}(x)|\right)^2 \\
    % 
    % 
    & \leq \left(\int d\bs{y} \left (\left|\log f(\bs{y})\right|\left|p(\bs{y}) - g_{n,H}(\bs y)\right|^{0.5} \right) \left|p(\bs{y}) - g_{n,H}(\bs{y})\right|^{0.5}\right)^2\\
\intertext{Using the Cauchy--Schwarz inequality, $\left(\int d\bs{y}\, a(\bs{y}) b(\bs{y})\right)^2 \leq \int d\bs{y}\, |a(\bs{y})|^2  \int d\bs{y}\, |b(\bs{y})|^2$,}
    & \leq  \left(\int d\bs{y} \left|\log f(\bs{y})\right|^2\left|p(\bs{y}) - g_{n,H}(\bs{y})\right| \right)\left(\int d\bs{y} \left|p(\bs{y}) - g_{n,H}(\bs{y})\right|\right)\\
\end{align*}

The first term will be bounded by $\mathbb E_p \log^2 f + \mathbb E_g \log^2 f$ which is some finite quantity by assumption. To bound the second term, we use Thm.\ 1 of  \citet{lugosi1996consistency}, which is restated below.


\begin{theorem}[Theorem 1 of \cite{lugosi1996consistency}]
% 
Let $\bs{y}^{(1)}, \bs{y}^{(2)}, \ldots$ be IID random vectors in $\mathbb R^t$ and $\bs{y}^{(i)} \sim p(\bs{y})$. Let $\Pi = \{\pi_1, \pi_2 \ldots\}$ be a fixed partitioning scheme for $\mathbb R^t$ and let $\mathcal A_n$ be the collection of all possible partitions associated with the rule $\pi_n$, and $\pi_n[\bs{y}]$ denotes the partition in which $\bs{y}$ lies. Define $m(\mathcal A)=\max_{\pi\in \mathcal A}|\pi|$ as the maximum size of partition, $\Delta^*(\mathcal A, n)$ is the maximum number of distinct partitions of any $n$ points in $\mathbb R^t$ induced by $\mathcal A$. If as $n \rightarrow  \infty$,
\begin{itemize}
    \item $n^{-1}m(\mathcal A_n) \rightarrow 0 $
    \item $n^{-1}\log \Delta^*(\mathcal A_n, n) \rightarrow 0 $
    \item $P(\{ \bs{y} : \operatorname{diam}(\pi_n[\bs{y}])>\gamma\}) \rightarrow 0 $ for every $\gamma > 0$, i.e., the size of each bin with significant probability mass goes to zero.
\end{itemize}% (a) n

Then the histogram density estimates, $g_{n,H}$ are strongly consistent in $L_1$, i.e., with probability 1
\[\int |p(\bs{y}) - g_{n,H}(\bs{y})|\,d\bs{y} \rightarrow 0.\]
% 
\end{theorem}

Since we use an $H$-dimensional binary hashcode in this work, $m(\mathcal A_n)\leq 2^H$. Each bit of the hashcode function can be seen as partitioning the input space (or a transformed input space) by a hyperplane. We know that $n$ points can be split by hyperplanes in $n^{t^*}$ ways~\citep{cover1965geometrical}, where $t^*$ is the effective input dimension. Assuming each hash function is linear, there are only $n^t$ possible ways in which $n$ points can be split by a single hash function. Thus, $\Delta^*(\mathcal A_n, n) \leq (n^{t})^{H}$, so that $n^{-1}\log \Delta^*(\mathcal A_n, n) \leq n^{-1}tH\log n$. We can see that the first two conditions are satisfied due to the assumptions $\frac{2^H}{n} \to 0$ and $\frac{tH\log n}{n} \to 0$.
    
Since the input space is bounded and $H$ can be increased as $n$ increases, the diameter of the bins with significant probability mass shrinks and eventually tends to $0$. Hence,
\[\int |p(\bs{y}) - g_{n,H}(\bs{y})|\,d\bs{y} \rightarrow 0.\]
% 
% , for our setting. 
% 
\end{proof}

% \textbf{Remark:} Thm.~\ref{thm:consistency} demonstrates that distributions $ p(x)$ and $g_{n,d}(x)$ will lead to same results in expectation as $n\rightarrow \infty$. We can generated a large number of samples from $g_{n,d}$ and hence due to central limit theorem, the empirical estimate will converge to the expectation. If $f$ is bounded, i.e., $f\in [0, F_{max}]$, we can use finite sample bounds such as Chernoff to get sample complexity for high confidence bounds similar to~\citet{mcallester2020formal}.

\subsection{Variance of the Estimator}

\begin{theorem}
% 
% \textbf{Thoerem~\ref{thm:confidence} restated}
% 
Consider a data distribution $p$ and a conditional model distribution $q$ such that $-\log q(\bs y_t|(.)) \in [-Q, Q]$. Let $\mathcal {\hat T}^q_{X\rightarrow Y}$ denote the $n$-sample estimate of transfer entropy. Then, with probability at least $1-\delta$ ($\delta>0$), we have
\begin{align}
\left| { \mathcal {\hat T}^q_{X\rightarrow Y}} - \mathcal T^q_{X\rightarrow Y}  \right| \leq 2 Q \sqrt {\frac {2}{n}\ln \frac 4 \delta}
\end{align}
\end{theorem}

\begin{proof}
To estimate transfer entropy, we use the difference of the empirical estimates of two conditional entropy terms, i.e.,
$$\mathcal T^q_{X\rightarrow Y}  = \cH(\cY_t|\bs{\cY}_{t-1}) - \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1})
$$
Now, we can write the error as: 
\begin{align*}
\left| { \mathcal {\hat T}^q_{X\rightarrow Y}} - \mathcal T^q_{X\rightarrow Y}  \right| &= \left| \left(\cH(\cY_t|\bs{\cY}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1})\right) - \left(\cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) \right)\right|\\
& \leq \left| \cH(\cY_t|\bs{\cY}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1})\right| + \left|\cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) \right|
\end{align*}
We will bound the probability of error in the transfer entropy estimate by bounding the error of both terms on the R.H.S.\ of the above expression using additive Chernoff bounds (similar to~\cite{mcallester2020formal}).
We can write, 
\begin{align}
   & P\left(\left| { \mathcal {\hat T}^q_{X\rightarrow Y}} - \mathcal T^q_{X\rightarrow Y}  \right|\leq \epsilon \right) \nonumber\\
    & \geq \sum_{e=0}^{\epsilon} P\left(\left| \cH(\cY_t|\bs{\cY}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1})\right|\leq e\right) P \left(\left|\cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) \right|\leq \epsilon - e\right) \nonumber \\
    & \geq P\left(\left| \cH(\cY_t|\bs{\cY}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1})\right|\leq \epsilon/2\right) P \left(\left|\cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) \right|\leq \epsilon/2 \right) \label{eq:te_error_prob_bound}
\end{align}
Consider, 
$\hat \cH(\cY_t|\bs{\cY}_{t-1}) = -\frac 1 n \sum_{i=1}^n  \log  q(y_t^{(i)}|\bs{y}_{t-1}^{(i)}) $, and $\mathbb E \ \hat \cH(\cY_t|\bs{\cY}_{t-1}) = \cH(\cY_t|\bs{\cY}_{t-1}) $. Since $-\log q \in [-Q,Q]$, using additive Chernoff bounds, we have
\begin{align*}
P\left(\left| \cH(\cY_t|\bs{\cY}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1})\right|\geq \epsilon/2\right)  &\leq 2e^{-\frac{2n(\epsilon/2)^2}{(2Q)^2}} = 2e^{-\frac{n\epsilon^2}{8Q^2}} 
\intertext{and similarly,}
P\left(\left| \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1}) - \hat \cH(\cY_t|\bs{\cY}_{t-1}, \bs{\cX}_{t-1})\right|\geq \epsilon/2\right)  &\leq  2e^{-\frac{n\epsilon^2}{8Q^2}} 
\intertext{Substituting in~\eqref{eq:te_error_prob_bound}, we have }
P\left(\left| { \mathcal {\hat T}^q_{X\rightarrow Y}} - \mathcal T^q_{X\rightarrow Y}  \right|\leq \epsilon \right) &\geq  \left(1-2e^{-\frac{n\epsilon^2}{8Q^2}}\right)^2
\intertext{or,}
P\left(\left| { \mathcal {\hat T}^q_{X\rightarrow Y}} - \mathcal T^q_{X\rightarrow Y}  \right|\geq \epsilon \right) &\leq  4e^{-\frac{n\epsilon^2}{8Q^2}}
\end{align*}
Setting $4e^{-\frac{n\epsilon^2}{8Q^2}} = \delta$, we get $\epsilon =2 Q \sqrt {\frac {2}{n}\ln \frac 4 \delta}$.
\end{proof}
\textbf{Remark:} Using the above results, it becomes straightforward to bound the mean squared error (or variance) of the estimator, as below:

\begin{align*}
   \mathbb E  \left( \mathcal {\hat T}^q_{X\rightarrow Y} - \mathcal T^q_{X\rightarrow Y}\right)^2 &\leq \left(2 Q \sqrt {\frac {2}{n}\ln \frac 4 \delta}\right)^2 P\left( \left|\mathcal {\hat T}^q_{X\rightarrow Y} - \mathcal T^q_{X\rightarrow Y}\right |\leq  2 Q \sqrt {\frac {2}{n}\ln \frac 4 \delta}\right) + \\
   & P\left( \left|\mathcal {\hat T}^q_{X\rightarrow Y} - \mathcal T^q_{X\rightarrow Y}\right |\geq  2 Q \sqrt {\frac {2}{n}\ln \frac 4 \delta}\right) (Q - (-Q))^2\\
   & = (1-\delta)\left (\frac {8Q^2}{n} \ln \frac 4 \delta\right) + 4Q^2 \delta \\ 
   & =  4Q^2 \left((1-\delta)\frac 2 n \ln \frac 4 \delta +\delta\right  )
\end{align*}

\section{More Algorithmic Details}
% 
% There is a vast literature on locality sensitive hashing algorithms~\citep{indyk1998approximate,kulis2009kernelized,joly2011random,grauman2013learning,zhao2014locality,wang2017survey,garg2019kernelized,garg2019nearly}. 
% 
In this section, we introduce more details about unsupervised learning of hash functions.
The main idea behind locality sensitive hashing~(LSH) is to have a set of hash functions, $\bs{h}(.) = \{ h_1(.), \cdots, h_H(.) \}$, with each one, $h_l(.)$, randomly splitting the input space into two parts; $h_l(\bs{y}) \in \{ 0, 1 \}$. Despite the randomness of an individual hash function, putting multiple hash functions together ensures that the data points belonging to the same hashcode bin are similar to each other~\citep{indyk1998approximate,zhao2014locality,wang2017survey}.
    
In a data-driven LSH approach to learning a hash function $h_l(.)$, a subset of data points $\bs{Y}^{(l)}$ is subsampled from the superset of data points, $\bs{Y} = \{ \bs{y}^{(1)}, \cdots, \bs{y}^{(n)} \}$, for which the hash functions~(hashcodes) are optimized. A binary split of $\bs{Y}^{(l)}$ into two subsets can be generalized as a split of the entire input space~(so for $\bs{Y}$) through a binary classification model, and that model is the resulting hash function $h_l(.)$~\citep{joly2011random,kulis2012kernelized,garg2019kernelized}. \citet{garg2019nearly} proposed to find the optimal split of $\bs{Y}^{(l)}$ such that it generalizes across both training and test data~(nearly-unsupervised setting). They also proposed to sample the subset $\bs{Y}^{(l)}$ locally from a hashcode bin so as to capture more fine-grained differences between data points within the local neighborhoods.
% ~(corresponding to the hashcode bin).
% This approach corresponds to what they refer as a local hash function.
        
Here we extend their approach from a nearly-unsupervised setting to an unsupervised learning setting. We learn hash functions in a greedy fashion, with pseudocode shown in \algoref{alg:lsh}. There are two key steps in the greedy approach for optimizing $h_l(.)$ (see \figref{fig:hash_opt} for a visual illustration). First, using the $l-1$ hash functions optimized so far, we obtain a hashcode-based binning of all the data points in $\bs{Y}$, and then select a hashcode bin from within which to sample $\bs{Y}^{(l)}$. If the maximal size of a bin is above a certain threshold, it suffices to just select the bin with the maximum size. Otherwise we \emph{select the hashcode bin with the highest entropy} of data points, as shown below,
% 
% so as to sample $\bs{Y}^{l}$ from within the selected bin. The selection of the highest entropy bin can be formulated as,
% 
\begin{align}
% & \arg\!\max_{\bs{c}}
\max_{\bs{c}}
\cH(\bs{\cY}|\bs{\cC}=\bs{c}),
% 
\end{align}
% 
wherein $\cH(\bs{\cY}|\bs{\cC}=\bs{c})$ is the entropy of $\bs{\cY}$ within a particular hashcode bin $\bs{c}$; the entropy term can be computed in terms of nearest neighbor distances within the bin itself using the Kozachenko--Leonenko~(KL) estimator due to \cite{Kozachenko1987statistical}.\footnote{The KL estimator should be accurate for empirical estimates of entropy within local neighborhoods, even in high dimensions, since a small number of nearest neighbors, $k$, is optimal in such a case.
}
    
\begin{algorithm}[t!]
\caption{Unsupervised Learning of LSH functions}
% 
% \csizenine
% 
\begin{algorithmic}[1]
% 
\REQUIRE $\bs{Y} = ( \bs{y}^{(1)}, \cdots, \bs{y}^{(n)} ), H$.
% 
% \COMMENT{Greedy step for optimizing hash function, $h_l(.)$}
% 
% \STATE $\bs{C} \gets \{ \}$
% 
\FOR{$l=1 \to H$}
% 
\algshade{
% 
\STATE $\bs{Y}^{(l)} \gets$ sampleFromHashcodeBinOfHighEntropy$\left( \bs{Y}, \bs{C} \right)$
% 
\STATE $h_l(.) \!\gets\!$ optimizeSplit( $\bs{Y}^{(l)}, \bs{Y}, \bs{C}$)
% 
}
% 
\STATE $\bs{C}(:, l) \gets h_l( \bs{Y} )$\\
% 
\ENDFOR
% 
\STATE \textbf{Return} $h_1(.), \cdots, h_H(.), \bs{C}$.
% 
\end{algorithmic}
\label{alg:lsh}
\end{algorithm}
    
Having sampled $\bs{Y}^{(l)}$ from a hashcode bin with the highest entropy~(\figref{fig:select_cluster}), the second step is to optimize its split. In \figref{fig:partition_samples}, a split of the sampled points corresponds to a split of the entire 2-D input space (dashed line). Between the two choices of splits, the one with the vertical dashed line~(navy colored) is optimal since it leads to better splits of the other hash bins~(including the selected bin itself). Mathematically, we characterize this criterion of splitting the hashcode bins as,

\begin{align}
\max_{h_l(.)}
\cH(h_l(\bs{y})|h_1(\bs{y}), \cdots, h_{l-1}(\bs{y})).
\end{align}

It ensures that the hash function $h_l(.)$ is disentangled~(not redundant) w.r.t. the hash functions optimized previously. It is cheap to compute empirically, by simply counting the fraction of ones in the output of $h_l(.)$ for each of the bins. We refine this criterion further for a hashcode bin that is of a relatively small size~(number of data points in it) and has high entropy of $h_l(.)$~(less-biased proportion of ones).
% 
% For any such cluster bin, we want to quantify the exact measure of divergence between the two partitions of the hashcode bin, emerging from $h_l(.)$; 
% 
For any such hashcode bin $\bs{c}$, we propose to maximize the KL-divergence between the data distribution from the two partitions of the bin, emerging from $h_l(.)$:

\begin{align}
% 
\max_{h_l(.)} \cD^c_{KL}(p(\bs{y}|h_l=0)||p(\bs{y}|h_l=1)).
% 
\end{align}
    
We empirically estimate $\cD^c_{KL}(.)$ from the ratio of the nearest neighbor distances within and across the two partitions of the bin~\citep{zhao2020analysis}.
    
We eliminate a large fraction of candidates for a split of $\bs{Y}^{(l)}$ through constraints based on info-theoretic clustering along with a divide and conquer procedure. The algorithm is parallelizable, and is overall compute-efficient.
% 
% \footnote{Code will be released upon publication.}
% 
While the proposed LSH algorithm has various applications, we evaluate it only for the purpose of robust estimation of transfer entropy.

\section{More Details of Neuroscience Data}
% 
\label{sec:appendix_neuro}
    
The experiments used multiple high-density extracellular electrophysiology probes to simultaneously record spiking activity from a wide variety of areas in the mouse brain, ranging from subcortical regions, such as the thalamus, to multiple visual cortices, such as the primary visual cortex (V1), lateral medial visual area (LM), rostrolateral visual area (RL), anterolateral visual area (AL), anteromedial visual area (AM), etc.
The neural activities were recorded while the animals were head-fixed and were passively presented with visual stimuli. The details of the experimental setup can be found in \citep{siegle2021survey, visualcoding2020}.
We used animal with session-id \texttt{798911424}.
One experiment contained a mixture of many stimulus types, such as natural movies, flashes, Gabor filters, drifting gratings, etc. We selected recording trials of drifting gratings as they could strongly elicit neural responses. The visual stimulus of each trial lasted for two seconds with one-second rest in between without any visual stimulus.
We randomly subsampled 100 trials without replacement from 13 conditions of drifting gratings with 15 repeated trials each, i.e., from 195 trials in total.
The condition ids are: \texttt{275}, \texttt{268}, \texttt{270}, \texttt{284}, \texttt{274}, \texttt{249}, \texttt{261}, \texttt{278}, \texttt{280}, \texttt{256}, \texttt{260}, \texttt{257}, \texttt{281}.
The first 500 ms of each trial after stimulus onset was extracted, as the early visual activities mainly involved feedforward interactions, thus could better reflect the hierarchy of the visual system.
The number of neurons in visual cortical areas recorded by one probe was roughly around 100.
The raw data was composed of sequences of action potential timestamps or counting processes of each neuron. We time-binned the timestamps with 0.1 ms resolution, then averaged the time series across all neurons for each brain region.
The time lag of every TE estimator is 20 ms, i.e., 40 time steps of 0.5 ms each.

\begin{figure}[!t]
\centering
% 
\subfigure[Select highest entropy bin.]{
\includegraphics[
width=0.48\columnwidth]{high_entropy_cluster.png}
\label{fig:select_cluster}
}
% 
\subfigure[Optimize split.]{
\includegraphics[
width=0.48\columnwidth]{hash_func_choices.png}
\label{fig:partition_samples}
}
% 
\caption{This figure illustrates the learning of a hash function. In \ref{fig:select_cluster} and \ref{fig:partition_samples}, black lines refer to hash functions learned previously. The intersections of the lines correspond to hashcode bins, with data points~(red dots) dispersed across these bins. If data points are highly dispersed within a bin, as is the case for the yellow points, it is a good candidate for sampling data points from it, so as to learn a new hash function which would split the selected bin to reduce the dispersion within it, and potentially split the other bins as well. Mathematically, dispersion of data in high dimensions can be characterized as the entropy of the data distribution; see the highest entropy bin in \ref{fig:select_cluster}. In \ref{fig:partition_samples}, there are multiple ways of splitting the high entropy cluster~(dashed lines), and the optimal choice is the one which splits the other bins as well~(navy colored vertical dashed line).}
% 
\label{fig:hash_opt}
\end{figure}

% Here are more details of the main Fig. \ref{fig:expr_neuro}. The hierarchical order of the brain regions, from low to high, is V1, LM, RL, AL, and AM \citep{harris2019hierarchical, siegle2021survey}. The columns indicate the source regions and the rows indicate the target regions. We sort the regions according to the hierarchical order from left to right and top to bottom. 
% If $\mathcal{T}_{\text{A}\to\text{B}} > 0$ only if the hierarchical order of A is greater than B, then positive TE values will stay under the diagonal.
% Fig. \ref{fig:expr_neuro}(a) shows the TE between regions. A larger value means the source region contributes more significantly to the target region's entropy, which implies the direction of information flow and the source region has an impact on the activity of the downstream target region. All large values concentrate in the lower left triangle, which means the low-order regions impact the high-order regions, thus the conclusions agree with the hierarchical order found by other anatomical or functional methods.
% Importantly, the TE values in each row has a decreasing trend, so for one target region, the lower-order area contributes more than the higher-order.
% However, interestingly, the TEs in columns do not have an obvious monotonic pattern. For example, the V1 column is not in decreasing order from top to the bottom, which means V1 does not influence more on the higher-order region.
% $\mathcal{T}_{\text{V1}\to\text{RL}} > \mathcal{T}_{\text{V1}\to\text{AM}}$ but AM has higher hierarchical order than RL. This observation also applies to LM and RL. 
% It is worth noting that one area having higher hierarchical order than another does not necessarily indicate non-zero or large TE. For example, AL's order is larger than RL, but $\mathcal{T}_{\text{RL}\to\text{AL}} = 0$.
% The top right triangle matrix has almost all zeros except for two small positive values, $\mathcal{T}_{\text{AL}\to\text{LM}}$ and $\mathcal{T}_{\text{AM}\to\text{RL}}$. We purposely selected the early visual activities to eliminate the influences of the following feedback interactions. This is why the results do not show strong interaction from high-order regions to low-order regions. But this does not fully remove the effects. Feedback information flow $\text{AL}\to\text{LM}$ and $\text{AM}\to\text{RL}$ may occur earlier than 500 ms.
% If we enlarge the time window from 500 ms to 2000 ms to cover the entire trial window, the upper triangle matrix will show more positive values (figures or results not shown). As the current neuroscience research is still not clear when and how the feedback signal intertwines with the feedforward signal, we bypass the analysis of these complicated patterns, but our method suggests the existence of strong feedback loops, and it is interesting to notice that the feedback propagation could start from high-order regions.

% In contrast, other methods do not properly reveal the hierarchical relationships among the visual areas. 
% The TE values of the kNN method in Fig. \ref{fig:expr_neuro}(b) show an almost symmetric matrix, that hardly provides any directional information between brain regions. This issue is also observed in Fig. \ref{fig:expr_syn}, where the kNN method yields almost the same positive value no matter the true value is close to zero or one.
% The Copent method in Fig. \ref{fig:expr_neuro}(c) falsely indicates that most higher-order regions impact V1's activity, the lowest order area. It also ignores the interactions between LM, RL, and AL, where most corresponding TE values are nearly zeros. The ITENE method in Fig. \ref{fig:expr_neuro}(d) also suffers from this issue showing many opposite hierarchical relations between visual areas where many large values concentrate in the top right triangle matrix instead of the bottom left matrix.

\bibliography{references}

\end{document}
