% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xr}
\usepackage{multirow}
\usepackage{siunitx}
\usepackage{adjustbox}
\usepackage{multirow}
\usepackage{hyperref,xspace}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{color}
\usepackage{bbm}
\usepackage{mathtools}
\usepackage{framed}
\usepackage{graphicx}
\usepackage[boxed]{algorithm2e}
\usepackage{bbm}
\usepackage{balance}
\usepackage{verbatim}
\usepackage{paralist,enumitem}
\usepackage{thmtools, thm-restate}

\usepackage{float}
\newtheorem{thm}{Theorem}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{prop}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{note}[theorem]{Note}
\newtheorem{obs}[theorem]{Observation}
\newtheorem{rem}[theorem]{Remark}
\newtheorem{cor}[theorem]{Corollary}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\MSE}{\mathrm{MSE}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\CBE}{\mathrm{CBE}}
\newcommand{\N}{\mathrm{N}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\sign}{\mathrm{sign}}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\O}{\tilde{O}}
\newcommand{\RMSE}{\mathrm{RMSE}}
\newcommand{\MLE}{\mathrm{MLE}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\fa}{\lambda}
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\Sign}{h}
\newcommand{\Signn}{h'}
\newcommand{\SRP}{\mathrm{SRP}}
\newcommand{\CSSRP}{\mathrm{CSSRP}}
\newcommand{\CSSRPL}{\mathrm{CSSRP-L}}
\newcommand{\SB}{\mathrm{SuperBit}}
\newcommand{\CS}{\textsc{Count-Sketch Sign-Random-Projection}}
\newcommand{\CSL}{\textsc{Count Sketch Signed Random Projection-L}}
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}

\newcommand{\thetaxe}{\theta_{\vec{x},\vec{e}}}
\newcommand{\thetaye}{\theta_{\vec{y},\vec{e}}}


\title{Improving Sign-Random-Projection via Count Sketch}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<s19021@students.iitmandi.ac.in>?Subject=Your UAI 2022 paper}{Punit Pankaj Dubey}{}}
\author[1]{Bhisham Dev Verma}
\author[1]{Rameshwar Pratap}
\author[2]{Keegan Kang}

% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
   Indian Institute of Technology Mandi, H.P., India
}
\affil[2]{%
    % Second Affiliation\\
   Bucknell University, Lewisburg, Pennsylvania, USA
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%   }
  
  \begin{document}
\maketitle
\begin{abstract}
  Computing the angular similarity between pairs of vectors is a core part of various machine learning algorithms. The seminal work of Charikar~\citep{simhash} (\textit{a.k.a.} Sign-Random-Projection $(\SRP)$ or SimHash) provides an unbiased estimate for the same. However, $\SRP$ suffers from the following limitations: \begin{inparaenum}[(i)] \item  large variance in the similarity estimation, and \item high running time while computing the sketch. \end{inparaenum} There are improved variants that address these limitations. However, each of them is known to improve on only one of these aspects; for example, \citet{CBE} suggest a faster algorithm, whereas \citet{superbit,MLE} provide estimates with a smaller variance. In this work, we propose a sketching algorithm that addresses both aspects in one algorithm -- a faster algorithm along with a smaller variance in the similarity estimation. Moreover, our algorithm is space-efficient as well. We present a rigorous theoretical analysis of our proposal and complement it via experiments on synthetic and real-world datasets. 
\end{abstract}
\section{Introduction}\label{sec:intro}

High-dimensional datasets are ubiquitous in many real-life applications. Performing analytics on such datasets is tedious and, at times, impossible due to the \textit{curse of dimensionality}. Dimensionality reduction or sketching algorithms are probabilistic techniques that compress a high-dimensional dataset into low dimensions while preserving pairwise similarity measures. Examples include the JL lemma~\citep{JL83} and its improved variants~\citep{achlioptas2001database,DBLP:conf/kdd/2006, dasgupta2010sparse,10.1145/2559902} for real-valued vectors and pairwise Euclidean distance, and Minhash~\citep{BroderCFM98} and its improved variants~\citep{b-bit,OPH,DOPH} for sets and pairwise Jaccard similarity. {Feature Hashing~\citep{WeinbergerDLSA09} and its improved variant~\citep{verma2022variance} preserve the pairwise inner product for real-valued vectors. FSketch~\citep{bera2021dimensionality} and a duo of Cabin and Cham~\citep{verma2022efficient} preserve pairwise Hamming distance for categorical vectors. For binary vectors, BDR~\citep{pratap2018efficient}, BCS~\citep{pratap2018efficientbcs}, and BinSketch~\citep{pratap2019efficient} preserve the inner product, Hamming distance, cosine, and Jaccard similarity.}

In this work, we focus on the sketching algorithm for real-valued data that approximates pairwise cosine similarity. The seminal work due to Charikar~\citep{simhash} suggests an algorithm for this task, which has been extensively used in applications such as near-duplicate detection~\citep{ndwc} and spam-email detection~\citep{sed}.  This algorithm compresses large-dimensional datasets into low-dimensional binary vectors such that the Hamming distance between the sketched vectors gives an unbiased estimate of the pairwise cosine similarity. Let $\vec{a}, \vec{b}\in \R^{D}$ be such that the angle between them is $\theta_{(\vec{a}, \vec{b})}$, and  let $\mathcal{H}=\{\xi^{(i)}(\cdot)\}_{i\geq 1}$ denote the family of hash functions defined as follows:
\begin{align}
 \xi^{(i)}(\vec{a})=\begin{cases}
    1, & \text{if } \langle \vec{a}, \vec{r}_i\rangle \geq 0,\\
    0, & \text{otherwise},
  \end{cases} \label{eq:eqsimhash}
\end{align}
where $\vec{r}_i=\langle r_{i1}, \ldots, r_{ij}, \ldots, r_{iD} \rangle \in \R^{D}$ such that $r_{ij}\sim \mathcal{N}(0, 1).$ Repeating the step stated in Equation~\eqref{eq:eqsimhash} $K$ times, and concatenating the corresponding hash values gives a $K$ dimensional binary vector corresponding to the input vector.  Let $X$ be the estimator random variable for the estimate of cosine similarity by $\SRP$ defined as:
\begin{align*} 
    &X = \frac{\pi}{K} \sum_{i=1}^K X^{(i)}, 
     \text{~where~}
     X^{(i)}= \mathbbm{1}_{\xi^{(i)}(\vec{a})\neq \xi^{(i)}(\vec{b})}.\\
     &\E[X] =\theta_{(\vec{a},\vec{b})}.
\end{align*}
\begin{align*}
    &\Var[X] =\frac{\theta_{(\vec{a},\vec{b})}\left(\pi-\theta_{(\vec{a}, \vec{b})}\right)}{K}.\numberthis\label{eq:var_simhash}
\end{align*}
$\SRP$ can also be seen as a multiplication of the projection matrix (of dimension $K\times D$) with input vectors. Thus, the running time and space required (by the projection matrix) to compute the sketch per data point  is $O(DK)$. This highlights the following limitations of $\SRP$:

\begin{inparaenum}[(i)]
\item higher running time and space requirement, especially when the data dimension $D$ is large,  and \item high variance, when sketch dimension $K$ is smaller, and also when pairwise angle of data points is close to $\pi/2$. 
\end{inparaenum}
\begin{figure}
   \centering
  \includegraphics[scale=0.36]{RMSEALL.pdf}
  \caption{Comparison  based on RMSE for angular similarity estimation on a pair of points. Original dimension of points is $10^{4}$. A smaller RMSE indicates better performance.}
  \label{fig:RMSEALL}
\end{figure}

\textbf{Previously known improved variants of $\SRP$:}
There are several results that address some of these limitations of $\SRP$. The result of \citet{CBE,CBE_JMLR} (\textit{a.k.a.} $\CBE$) gives a faster and space-efficient algorithm for the task, but its variance remains the same as that of $\SRP$, whereas the methods proposed by \citet{MLE} $(\MLE)$ and \citet{superbit} $(\SB)$ reduce the variance, but at the cost of a higher running time than $\SRP$. We present a detailed discussion in Section~\ref{sec:related_work}. 

To the best of our knowledge, there is no work that addresses all  the limitations of $\SRP$  in the same sketching algorithm. In this work,  we  propose one such algorithm.  Our key contributions are  summarized  as follows:\\~\\
$\bullet$ Our first algorithm is \textsc{\CS} $(\CSSRP)$ that compresses high dimensional points into low dimensional binary vectors and closely approximates pairwise cosine similarity, offer a faster running time and simultaneously provide a significantly smaller variance than $\SRP$. At a high level, the $\CSSRP$ is inspired from \textsc{Count Sketch}~\citep{CHARIKAR20043} algorithm, where we first  apply the \textsc{Count Sketch} algorithm on the input vectors and then compute the sign of the resultant sketch vector. The similarity estimation step remains exactly the same as $\SRP$ (Definition~\ref{def:our_sketch}). Note that our algorithm can be seen as projecting the input vector on a $K\times D$ projection matrix (whose each column has exactly one non-zero entry at the index randomly sampled from $\{1, \ldots, K\}$, and takes value between $\{\pm 1\}$ with probability $1/2$), and computing the sign of the resultant $K$-dimensional vector. However, the mentioned improvement of $\CSSRP$ holds when the sketch dimension $K=o(D)$ (Theorems~\ref{thm:prove_expt_angle}, \ref{thm:prove_var_angle}). 

$\bullet$ To alleviate the limitation of $\CSSRP$ mentioned above, we propose another sketching algorithm, namely $\CSL$ $(\CSSRPL)$ (Definition~\ref{def:improved_estimator}). The basic difference between $\CSSRPL$ and $\CSSRP$ is in the process of generating the random projection matrix: each column of the projection matrix for $\CSSRPL$ has exactly $l$ non-zero values (randomly sampled from $\{\pm 1\}$ with probability $1/2$) at randomly chosen positions, where $l\ll K$. $\CSSRPL$ offers significant variance reduction even for large $K$, where $K=o(lD)$ (Theorems~\ref{thm:cssrpl_esitmation}, \ref{thm:prove_var_angle_l}). We summarise a quick comparison between $\CSSRP$ and $\CSSRPL$ using the standard RMSE metric in Figure~\ref{fig:RMSEALL}. It is evident that for small values of $K$, $\CSSRP$ has a smaller RMSE. However, at higher values of $K$, its RMSE starts increasing, which is remedied by $\CSSRPL$ even for very small values of $l$, say $2$ or $3$. Furthermore, both proposals are space-efficient and require $O(D)$ and $O(lD)$ space for projection matrices, for $\CSSRP$ and  $\CSSRPL$, respectively, which is significantly less than that required by most baseline algorithms.

$\bullet$ We present our theoretical analysis in Section~\ref{sec:analysis}, and complement it via experiments (in Section~\ref{sec:experiments}) on synthetic and real-world datasets, on metrics such as running time, similarity search, and variance analysis via box-plot. We observed a significant speedup (up to $3896\times$) in running time, while simultaneously obtaining a better performance on the remaining experiments. Our observation is that for small values of the sketch dimension $K$, $\CSSRP$ offers both significant speedup and smaller variance, whereas for large values of $K$, $\CSSRPL$ performs similarly even for small values of $l \in \{2, 3, 5\}$. We summarise a tabular comparison among the baselines on asymptotic sketching time, space complexity,  and variance in Table~\ref{Varinace-Table}.

\begin{table*}[ht]
\begin{center}
\scalebox{0.9}{
\begin{tabular}{|l|l|l|l|}
\hline
% \textbf{Algorithm}  &\textbf{Sketching-Time}&\textbf{Space-Complexity} &\textbf{Variance}  \\
\textbf{Algorithm}  &\textbf{Sketching}&\textbf{Space} &\textbf{Variance}  \\
\textbf{}  &\textbf{Time}&\textbf{Complexity} &   \\
\hline
$\SRP$~\citep{simhash}&$O(DK)$&$O(DK)$ & $\theta (\pi-\theta)/K$\\
\hline
$\CBE$~\citep{CBE}&$O(D\log D)$ &$O(D)$ &$\theta (\pi-\theta)/K$\\
\hline
$\SB$~\citep{superbit} &$O(DK^2)$ &$O(DK)$&$({\pi^2}/{K^2})\cdot\left(K\left({\theta}/{\pi}\right)+K(K-1)\left({\theta}/{\pi}\right)\times p_{21}\right)-{\theta}^2$\\
\hline
$\MLE$ ~\citep{MLE}& $O(DK)$& $O(DK)$ &$(2\pi/K) \cdot\left( \frac{1}{\theta +\thetaxe -\thetaye} + \frac{1}{\thetaxe + \thetaye -\theta} + \frac{1}{2\pi - \thetaxe-\thetaye - \theta} + \frac{1}{\theta + \thetaye - \thetaxe}  \right)$ \\
\hline
$\CSSRP$ (this work)&$O(D)$&$O(D)$&$({\pi^2}/{K^2}) \cdot \left(K\left({\theta}/{\pi}\right)+K(K-1)\left({\theta}/{\pi}\right)\times \eta\right)-{\theta}^2$\\
\hline
$\CSSRPL$ (this work) &$O(lD)$&$O(lD)$&$ ({\pi^2}/{K^2}) \cdot \left(K\left({\theta}/{\pi}\right)+K(K-1)\left({\theta}/{\pi}\right)\times \eta_l\right)-{\theta}^2$\\
\hline
\end{tabular}}
\end{center}
\caption{Comparison among the baselines on asymptotic sketching time, space complexity,  and variance. \citep{MLE} conditioned the estimate on a weighted vector $\vec{e}$, hence variance includes the angle formed between the vector pairs and $\vec{e}$. Note  that for $\vec{a}, \vec{b} \in \R^D$, in SuperBit,  $p_{21}$ is defined as $\Pr[ \xi^{(k_2)}(\vec{a}) \neq  \xi^{(k_2)}(\vec{b}) | \xi^{(k_1)}(\vec{a}) \neq  \xi^{(k_1)}(\vec{b})]$, where  $ \xi^{(k)}(\cdot)$ is the hash function used in $\SRP$ (Equation~\eqref{eq:eqsimhash}) \textit{s.t.} the rows of the matrix $R$ are orthonormal to each other.  $\eta$ and $\eta_l$ are defined in Theorems~\ref{thm:prove_var_angle} and~\ref{thm:prove_var_angle_l}, respectively.
} \label{Varinace-Table}
\end{table*}
\section{Related works}\label{sec:related_work}
Our work focuses on computing fast and accurate pairwise cosine similarity between input vectors, which has been extensively studied; we summarize some of the related works below.

 \textbf{CBE:} \citet{CBE} proposed a faster algorithm to compute pairwise cosine similarity. Their algorithm employs a special kind of matrix called a circulant matrix, which consists of a random vector $\vec{r} = (r_0,\dots, r_i,\dots,r_{D-1})$, where $r_i\sim \mathcal{N}(0,1)$, and $D-1$ vectors obtained by applying circular shifts to $\vec{r}$. Their projection matrix is the matrix obtained via multiplication of the circulant matrix and a random diagonal matrix, whose entries are in $\{-1, +1\}$ with probability $1/2$. This projection matrix enables the use of the Fast Fourier transform, which reduces the sketching time to $O(D\log D)$. Moreover, if implemented carefully, the space complexity of the algorithm is $O(D)$.  However, its variance remains the same as that of $\SRP$. In comparison, our proposal is not only faster both asymptotically and empirically \textit{w.r.t.} $\CBE$ but also simultaneously offers a smaller variance.% in the similarity estimation.
 
\textbf{SuperBit:}~\citet{superbit} proposed an algorithm that offers smaller variance than $\SRP$. Their main idea is to use a projection matrix that consists of orthogonal vectors obtained via the Gram--Schmidt process in $O(DK^2)$ time, which makes its running time high. In comparison, our proposal is much faster both asymptotically and empirically (speedup up to $3896\times$, see Table~\ref{speedup-table} and Figure~\ref{fig:Time}), and simultaneously space-efficient as well. However, the variance expressions of both methods look similar.  

\noindent \textbf{MLE:}~\citep{MLE} suggest employing \textit{maximum-likelihood-estimation} technique on top of the sketch obtained from $\SRP$. Inspired by~\cite{DBLP:conf/colt/2006}, their techniques include formulating the similarity estimation problem into computing the real roots of a cubic polynomial. In comparison,  our proposal is both faster (asymptotically and empirically) as well as space efficient.

In order to understand the comparison among the baselines on their theoretical variances, we plot their respective expressions stated in Table~\ref{Varinace-Table}. To do so, we generate several data pairs of dimension $10^4$ such that their pairwise angles are between $30^{\circ}$ and $150^{\circ}$. We summarise it via a scatter plot in Figure~\ref{fig:theoritical_variance}.  It is evident that the variances of $\SRP$ and $\CBE$ remain the highest among all, followed by $\MLE$. Further, the variances of $\SB$ and our proposals $\CSSRP$ and $\CSSRPL$ remain the lowest, and are comparable with each other.  

Our proposals are based on correlated hash functions. We note that such hash functions have been explored earlier to get an accurate estimation for random projection and angular kernel estimation~\citep{choromanski2017unreasonable}. Also, the \textsc{Count Sketch} projection matrix used in $\CSSRP$ has been used earlier to get a faster algorithm for tasks such as low-rank approximation and regression~\citep{DBLP:conf/stoc/ClarksonW13,clarkson2017low}. Furthermore, our proposal $\CSSRPL$ uses a projection matrix whose entries are sampled from a sparse Bernoulli distribution. We note that such projection matrices have been used in the context of random projection~\citep{DBLP:conf/kdd/2006,dasgupta2010sparse,10.1145/2559902} to get a faster algorithm. 

{In contrast to the use of the correlated hash functions for variance reduction, statistical techniques such as the control variate trick~\citep{Lavenberg} and the maximum likelihood estimation method~\citep{murphy2012machine}, have been also employed to improve the estimates of different sketching algorithms like AMS sketch~\citep{pratap2021improving}, Count-Sketch and Count Min Sketch~\citep{pratap2021variance}, Random Projections~\citep{kang2021improving}, and Feature Hashing~\citep{verma2022variance}.}
\begin{figure}
   \centering
  \includegraphics[scale=0.5]{Theoretical_var.pdf}
  \caption{Illustration of theoretical variances of the baselines.}
  \label{fig:theoritical_variance}
\end{figure}
\section{Background}
\label{sec:background}
\begin{table}[ht!]
\begin{center}
\begin{adjustbox}{width=\columnwidth,center}
\begin{tabular}{|c|l|}
\hline
\multicolumn{2}{|c|}{\textbf{Notations}} \\
\hline
$\vec{a}, \vec{b} \in \R^D$     &Input vectors \\
\hline
$a_i$                           & $i$-th feature of $\vec{a}$ \\
\hline
$\vec{\alpha}, \vec{\beta} \in \R^K$ &Sketch vectors \\
\hline
$D$ &Original dimension \\
\hline
$K$ &Sketch dimension\\
\hline
$R$ &Projection matrix\\
\hline 
$\theta_{(\vec{a}, \vec{b})}$ &Angle between $\vec{a}$ and $\vec{b}$\\
\hline 
$\vec{r}_k$ &$k$-th row of projection matrix\\
\hline
$r_{kj}= s(j) \mathbbm{1}_{kj}$ &$(k,j)$-th  index of projection matrix\\
\hline 
\end{tabular}
\end{adjustbox}
\end{center}
\label{sample-table}
\end{table}
\begin{definition}[Count-Sketch~\citep{count_sketch,WeinbergerDLSA09}]\label{def:count_sketch}
Let $\vec{\alpha} = (\alpha_1,\ldots, \alpha_k, \ldots, \alpha_K) \in \mathbb{R}^{K}$ be  the  sketch of input vector $\vec{a}\in \mathbb{R}^{D}$, obtained from  Count-sketch algorithm. Then, the $k$-th feature of  $\vec{\alpha}$ 
\begin{align}
    \alpha_{k} &= \sum_{j=1}^D a_{j} s(j) \mathbbm{1}_{kj},
\end{align}
where $s:[D]\mapsto \{-1, +1\}$, and $g:[D]\mapsto [K]$ are hash functions from $2$-universal hash families,  and $\mathbbm{1}_{kj}$ is indicator  of the event $g(j) = k$.
\end{definition}

\textsc{Count-Sketch} operation can also be represented as a matrix projection. Let  $R$ be a random matrix \text{such that~}$r_{kj} = s(j)\cdot \mathbbm{1}_{kj},~\text{ for all } k \in [K],~ j\in[D]$.
\begin{align*}
    &R = \begin{bmatrix} 
   \vec{r}_1 \\
   \vdots\\
   \vec{r}_k \\
   \vdots \\
   \vec{r}_K
    \end{bmatrix}_{K\times D},\textit{~where } \vec{r}_k= (r_{k1}, \ldots, r_{kj}, \ldots, r_{kD}),\notag\\
\end{align*}
Therefore, $\vec{\alpha}= R \vec{a}^{T}.$
\begin{figure}[ht!]
   \centering
  \includegraphics[scale=0.7]{linearmap.pdf}
  \caption{Count-Sketch as matrix projection.}
  \label{fig:linear}
\end{figure}
\section{Improving SRP using Count-Sketch}\label{sec:analysis}
At a high level our proposal is computing the sketch of input vectors using Count-sketch (see Definition~\ref{def:count_sketch}) and taking the sign of the resultant vector. We state it as follows.
\begin{definition}[\CS-$\CSSRP$]\label{def:our_sketch}
We denote our proposal as a hash function $h(\cdot)$ that takes a vector $\vec{a} \in \mathbb{R}^{D}$ as input, first compresses it (say, to a vector $\vec{\alpha} \in \mathbb{R}^{K}$) using Count-sketch (Definition~\ref{def:count_sketch}), and then computes the sign of each component of the compressed vector:
\begin{align*}
    \Sign(\vec{a}) %&= sign(\vec{\alpha})\notag\\
    &= \left(\Sign^{(1)}(\vec{a}), \ldots, \Sign^{(k)}(\vec{a}), \ldots, \Sign^{(K)}(\vec{a}) \right).
\end{align*}
where $\Sign^{(k)}(\vec{a}) = \sign(\alpha_{k})$, and $\sign(\alpha_{k})$ returns $1$ if $\alpha_{k} \geq 0$ and $0$ otherwise.
\end{definition}

In what follows, we prove that our proposal gives an unbiased estimate of the pairwise cosine similarity, and further show that the variance of our estimate is smaller than that of $\SRP$.

Our proof technique relies on showing that the projection matrix (see Figure~\ref{fig:linear}) corresponding to the \textsc{Count-Sketch} algorithm approximates a sparse Bernoulli distribution, and on further showing that the features of the sketch vector obtained from \textsc{Count-Sketch} asymptotically converge to the Gaussian distribution when the sketch dimension $K=o(D)$. 
% However, we show this when the sketch dimension $K=o(D)$. 


Note that the pairwise angular similarity is only meaningful if all dimensions of the data are more or less equally important; otherwise, the exceptionally large entries will dominate. Therefore, our assumption is that the fourth moment of the input vectors is bounded i.e. $\E[a_{i}^4] < \infty$, $\E[b_{i}^4] < \infty$ and $\E[a_{i}^2 b_{i}^2] < \infty$, for  $\vec{a},~ \vec{b} \in \mathbb{R}^{D}$ (as discussed  in Sections $4, 5$ of~\citep{DBLP:conf/kdd/2006}). However, the proof of asymptotic normality and analyzing its rate of convergence only require a bounded third moment or even a much weaker condition.

{We adopt the following two lemmas from \citet{DBLP:conf/kdd/2006} to support our proofs. All our results hold asymptotically as $D \rightarrow \infty$.} 

\begin{restatable}[Adapted from Lemma $4$ of~\cite{DBLP:conf/kdd/2006}]{lemma}{vsrpLi}\label{lem:v_normal}
%\begin{lemma}\label{lem:v_normal}[Adapted from Lemma $4$ of~\cite{DBLP:conf/kdd/2006}]
Let $\vec{r} = (r_1, \ldots, r_{j}, \ldots, r_{D}) \in \mathbb R^D$ s.t.
\begin{align}
r_j \sim \left\{ \begin{array}{r l}
1 & ~\text{with probability } \frac{1}{2K} \\
0 & ~\text{with probability } \frac{K-1}{K} \\
-1 & ~\text{with probability } \frac{1}{2K} \\
\end{array}\right.
\end{align}
and $\vec{a}  \in \mathbb R^D$. Denote $\alpha = \sum_{j=1}^D r_ja_j = \langle \vec{r}, \vec{a} \rangle$. Then if $D\rightarrow \infty$ and $K=o(D)$, we have 
$
\alpha \xRightarrow[]{\mathcal{L}} \mathcal{N}\left(0,\frac{||\vec{a}||^2}{K} \right)
$
with the rate of convergence
\begin{align}
|F_{\alpha}(y)-\Phi(y)|&\leq 0.8\sqrt{K}\frac{\sum_{i=1}^D|a_{i}|^3}{(\sum_{i=1}^D a_{i}^2)^{3/2}} \notag\\
&= 0.8\sqrt{\frac{K}{D}}\frac{\mathbb E[|a_{i}|^3]}{(\mathbb E [a_{i}^2])^{3/2}}\rightarrow 0,
\end{align}
where $\xRightarrow[]{\mathcal{L}}$ denotes ``convergence in distribution'', $F_{\alpha}(y)$ is the empirical cumulative distribution function of $\alpha$, and $\Phi(y)$ is the CDF of  $\mathcal{N}\left(0,\frac{||\vec{a}||^2}{K} \right)$.
%\end{lemma}
 \end{restatable}
\begin{lemma}\label{lem:bv_normal}
Let $\vec{r} \in \mathbb R^D$ with the probability distribution in Lemma~\ref{lem:v_normal}, and  $\vec{a}, \vec{b} \in \mathbb R^D$. Suppose we denote $ \alpha  = \sum_{j=1}^D r_ja_j = \langle \vec{r}, \vec{a} \rangle$, and $\beta = \sum_{j=1}^D r_jb_j = \langle \vec{r}, \vec{b} \rangle$.
As $D \rightarrow \infty$, we have
{\footnotesize
\begin{align*}
\sqrt{K}\left[\begin{array}{c c}
\|\vec{a}\|^2 & \vec{a}\vec{b}^T \\
\vec{a}\vec{b}^T & \|\vec{b}\|^2\\
\end{array} \right]^{-\frac{1}{2}}\left(\begin{array}{c}
\alpha \\
\beta\\
\end{array}\right) \xRightarrow[]{\mathcal{L}} \mathcal{N}\left( \left(\begin{array}{c}
0 \\
0
\end{array}\right), \left(\begin{array}{c c}
1 & 0 \\
0 &1
\end{array}\right) \right),
\end{align*}
}
with $\E\left[ \left\|  \sign(\alpha) - \sign(\beta) \right\|_1\right] = \frac{\theta_{(\vec{a}, \vec{b})}}{\pi}$.
\end{lemma}
With the help of Lemmas \ref{lem:v_normal} and \ref{lem:bv_normal}, in the following we show that $\CSSRP$  gives an unbiased estimate of pairwise cosine similarity.
\begin{theorem}
 \label{thm:prove_expt_angle}
 Let $\vec{a}, \vec{b}\in \R^D$, and $\Sign(\vec{a})$, $\Sign(\vec{b})$ be their $K$-dimensional binary vector obtained via our proposal (Definition~\ref{def:our_sketch}). If $K=o(D)$, then as $D \rightarrow \infty$  we have the following
  \begin{align}
     \E\left[\frac{\pi}{K}|| \Sign(\vec{a})- \Sign(\vec{b})||_1\right]=\theta_{(\vec{a}, \vec{b})}.
 \end{align}
 \end{theorem}
\begin{proof}
We first consider each row $\vec{r}_k, 1 \leq k \leq K$ of the random matrix in Figure~\ref{fig:linear}. The goal is to find the distribution of each $\vec{r}_k$, and hence compute
\begin{align*}
\mathbb E\left[ \sum_{k=1}^K | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b}) | \right] &  = \sum_{k=1}^K \mathbb E\left[ | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b}) |  \right].
\end{align*}
Suppose we denote $Z_k \coloneqq | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b})|$. While the $Z_k$ are not independent due to our construction of $R$, let us briefly consider how each $\vec{r}_k$ is distributed.

When $k = 1$, we have that each entry in $\vec{r}_1$ comes from a Sparse Bernoulli distribution with 
\begin{align*}
r_{1j} \sim \left\{ \begin{array}{r l}
1 & \text{with probability } \frac{1}{2K} \\
0 & \text{with probability } \frac{K-1}{K} \\
-1 & \text{with probability } \frac{1}{2K}
\end{array}\right.
\end{align*} 
where $\mathbb E[r_{1j}] = 0$, with $\text{Var}[r_{1j}] = \frac{1}{K}$. Here, we note that each entry in $\vec{r}_1$ is i.i.d.

We can also compute the moment generating function of each $r_{1j}$ and get
\begin{align}
\mathbb E\left[e^{sr_{1j}}\right] = \frac{K-1}{K} + \frac{\exp\{s\} + \exp\{-s\}}{2K}.
\end{align}
Now let us consider the case $k =2$, and compute the moment generating function for each $r_{2j}$. By using the Law of Total Expectation, we have
\begin{align*}
\mathbb E\left[e^{sr_{2j}}\right]&= \mathbb E\left[e^{sr_{2j}}~|~r_{1j} = 0\right] \mathbb P\left[r_{1j} = 0\right]\notag\\
&\; \quad+ \mathbb E\left[e^{sr_{2j}}~|~r_{1j} = 1\right] \mathbb P\left[r_{1j} = 1\right]\notag\\
&\;\quad+ \mathbb E\left[e^{sr_{2j}}~|~r_{1j} = -1\right] \mathbb P\left[r_{1j} = -1\right] \\
\ & = \left( \frac{\exp\{s\} + \exp\{-s\} }{2(K-1)} + \frac{K-2}{K-1}\right)\frac{K-1}{K}  \notag\\
&\qquad + \frac{1}{2K } + \frac{1}{2K} \\
\ & = \frac{\exp\{s\} + \exp\{-s\} }{2K} + \frac{K-2}{K} + \frac{1}{K} \\
\ & = \frac{\exp\{s\} + \exp\{-s\} }{2K} + \frac{K-1}{K}, \numberthis
\end{align*}
which is the same moment generating function as the sparse Bernoulli distribution. 

Moreover, we also note that the elements of $\vec{r}_2$ are i.i.d., i.e., each $r_{2i}$ is independent of $r_{2j}$ for $i \neq j$ (albeit dependent on ${r}_{1i}$). Now, consider $\vec{r}_k$, $2 < k \leq K$, and each of its entries ${r}_{kj}$.
By Law of Total Expectation, and conditioning on previous vectors:
\begin{align*}
&\mathbb E\left[e^{sr_{kj}}\right] = \mathbb E\left[e^{sr_{kj}}~|~\text{all zeros for $r_{k'j}, k' < k$}\right]  \\
\ &\qquad \times \mathbb P\left[ \text{all zeros for $r_{k'j}, k' < k$}\right] \notag \\
\ & ~ + \mathbb E\left[e^{sr_{kj}}~|~\text{$1$ appears for exactly one $r_{k'j}$, $k'< k$}\right]  \notag \\
 \ & \qquad \times \mathbb P\left[\text{$1$ appears for exactly one $r_{k'j}$, $k'< k$}\right] \notag \\
\ &~ + \mathbb E\left[e^{sr_{kj}}~|~\text{$-1$ appears for exactly one $r_{k'j}$, $k'<k$}\right]  \notag \\
\ &\qquad \times \mathbb P\left[\text{$-1$ appears for exactly one $r_{k'j}$, $k'<k$}\right] \notag \\
\ & = \left( \frac{K-k}{K-k+1} + \frac{\exp\{s\} + \exp\{-s\}}{2(K-k+1)}  \right) \frac{K-k+1}{K} \notag \\
 & \qquad + \frac{k-1}{2K} + \frac{k-1}{2K} \notag \\
\ & = \frac{K-k}{K} + \frac{\exp\{s\} + \exp\{-s\}}{2K} + \frac{k-1}{K}  \\
\ & = \frac{\exp\{s\} + \exp\{-s\} }{2K} + \frac{K-1}{K}, \numberthis
\end{align*}

which gives the same moment generating function as the sparse Bernoulli distribution.\\
Now, we can use Lemma~\ref{lem:v_normal} to show that $\alpha_{k} = \langle \vec{r}_k, \vec{a} \rangle$  and $\beta_{k} = \langle \vec{r}_{k}, \vec{b} \rangle$ converge in distribution to $\mathcal{N}\left(0,\frac{||\vec{a}||^2}{K} \right)$ and $\mathcal{N}\left(0,\frac{||\vec{b}||^2}{K} \right)$, respectively, as $D$ grows large. Moreover, by Lemma~\ref{lem:bv_normal}, we see that $\mathbb E\left[ | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b}) | \right] =\mathbb E\left[ | \sign(\alpha_{k}) - \sign(\beta_{k}) | \right] =  \frac{\theta_{(\vec{a},\vec{b})}}{\pi}$ for each $1 \leq k \leq K$.\\
Hence we must have that $\mathbb E\left[ \sum_{k=1}^K | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b}) | \right] = K\frac{\theta_{(\vec{a},\vec{b})}}{\pi}$, and on rearranging, we have 
\begin{align}
\mathbb E\left[ \frac{\pi}{K}\sum_{k=1}^K | \Sign^{(k)}(\vec{a}) - \Sign^{(k)}(\vec{b}) | \right] = \theta_{(\vec{a},\vec{b})}
\end{align}
which is what we wanted to show.
\end{proof}

We  give a bound on the variance of $\CSSRP$. We defer its proof to the appendix due to space limit. 
\begin{restatable}{theorem}{varCSSRP}\label{thm:prove_var_angle}
%\begin{theorem}\label{thm:prove_var_angle}
 Let $\vec{a}, \vec{b}\in \R^D$, and $\Sign(\vec{a})$, $\Sign(\vec{b})$ be their $K$-dimensional binary vector obtained via our proposal (Definition~\ref{def:our_sketch}). If $K=o(D)$, then  as $D \rightarrow \infty$ we have the following
  \begin{align*}
      &\Var\left[\frac{\pi}{K}||  h(\vec{a})-  h(\vec{b})||_1\right] \notag \\
      &=\frac{\pi^2}{K^2}\left(\frac{K\theta_{(\vec{a}, \vec{b})}}{\pi}+K(K-1)\frac{\theta_{(\vec{a}, \vec{b})}}{\pi}\times \eta \right)-\theta_{(\vec{a}, \vec{b})}^2. 
 \end{align*}
 where, $k_1 \neq k_2$, $k_1, k_2 \in[K]$, and \\
 $\eta=\Pr\left[ \left(\Sign^{(k_2)}(\vec{a}) \neq \Sign^{(k_2)}(\vec{b}) \right) | \left(\Sign^{(k_1)}(\vec{a}) \neq \Sign^{(k_1)}(\vec{b}) \right) \right]$.
 \end{restatable}
 
\begin{remark}\label{rem:CSSRP_var_red}
Recall that the variance of $\SRP$ is 
\begin{align}
\frac{\pi^2}{K^2}\left(\frac{K\theta_{(\vec{a}, \vec{b})}}{\pi}+K(K-1)\left(\frac{\theta_{(\vec{a}, \vec{b})}}{\pi}\right)^2 \right)-\theta_{(\vec{a}, \vec{b})}^2. \notag
\end{align}
We remark that the variance of $\CSSRP$ stated in Theorem~\ref{thm:prove_var_angle} is smaller than that of $\SRP$ because $\eta \leq \frac{\theta}{\pi}$. We validate this empirically by plotting $\eta$ for several values of $\theta$ and summarise it in Figure~\ref{fig:eta_simulation}. We notice that  $\eta$ always remains smaller than $\frac{\theta}{\pi}$, and leads to variance reduction as also supported in Figure~\ref{fig:theoritical_variance}. 
\end{remark}
 \begin{figure*}[ht!]
   \centering
  \includegraphics[width=0.9\linewidth]{eta_estimate.pdf}
  \caption{Empirical estimation of $\eta$ and $\eta_l$ via synthetically generated data points  for various pairwise angles $\theta$, and reduced dimensions $K$.}
  \label{fig:eta_simulation}
\end{figure*}

\subsection{Another improved estimator - $\CSSRPL$: } \label{l-nonzeros}
 We note that the results stated in Theorems~\ref{thm:prove_expt_angle} and~\ref{thm:prove_var_angle} hold when $K=o(D)$. We wish to show that our results hold for higher values of $K$ as well. Our sketching algorithm $\CSSRPL$, stated below, achieves this. 

\begin{definition}[$\CSSRPL$]\label{def:improved_estimator}
Let $R'$ be a $K \times D$ projection matrix such that each column of $R'$ has exactly $l$ non-zero entries. The positions of these $l$ entries are sampled uniformly at random, and each of them takes the value $+1$ or $-1$ with probability $1/2$ each.
 \begin{align} \label{eq:R_l}
     R' &= \begin{bmatrix}
     \vec{r}'_{1} \\
     \vdots\\
     \vec{r}'_{k}\\
     \vdots\\
     \vec{r}'_{K}
     \end{bmatrix}_{K\times D}.
 \end{align}
We denote our proposal $\CSSRPL$ as a hash function $\Signn(\cdot)$ that takes a vector $\vec{a} \in \mathbb{R}^{D}$ as input, first compresses it into a vector $\vec{\alpha}' \in \mathbb{R}^K$ by projecting it using the matrix $R'$ (i.e., $\vec{\alpha}' = R' \vec{a}^T$), and then computes the sign of each component of the compressed vector
  \begin{align*}
     \Signn(\vec{a})
     &= \left(\Signn^{(1)}(\vec{a}) , \ldots, \Signn^{(k)}(\vec{a}) , \ldots, \Signn^{(K)}(\vec{a}) \right).
 \end{align*}
 where  $\Signn^{(k)}(\vec{a}) = sign(\alpha'_{k})$, $sign(\alpha'_{k})$ returns $1$ if $ \alpha'_{k} \geq 0$, and $0$ otherwise.
 \end{definition}

In the following theorem, we show that our proposal gives an unbiased estimate of pairwise angular similarity. Its proof is built on similar lines to the proof of Theorem~\ref{thm:prove_expt_angle}. We defer it to the appendix. 
\begin{restatable}{theorem}{CSSRPLUnbaisedEstimation}\label{thm:cssrpl_esitmation}
 Let $\vec{a}, \vec{b}\in \R^D$, and $\Signn(\vec{a})$, $\Signn(\vec{b})$ be their $K$-dimensional binary vector obtained via our improved estimator proposal (stated in Definition~\ref{def:improved_estimator}). If $K=o(lD)$, then as $D \rightarrow \infty$ we have the following
  \begin{align}
     \E\left[\frac{\pi}{K}|| \Signn(\vec{a})- \Signn(\vec{b})||_1\right]=\theta_{(\vec{a}, \vec{b})}.
 \end{align}
 \end{restatable}

We give a bound on the variance of our $\CSSRPL$ estimator; its proof is analogous to that of Theorem~\ref{thm:prove_var_angle}.
\begin{theorem}\label{thm:prove_var_angle_l}
 Let $\vec{a}, \vec{b}\in \R^D$, and $\Signn(\vec{a})$, $\Signn(\vec{b})$ be their $K$-dimensional binary vector obtained via our improved estimator (Definition~\ref{def:improved_estimator}). If $K=o(lD)$, then  as $D \rightarrow \infty$ we have the following
  \begin{align*}
      &\Var\left[\frac{\pi}{K}||  \Signn(\vec{a})-  \Signn(\vec{b})||_1\right] \notag \\
      &=\frac{\pi^2}{K^2}\left(\frac{K\theta_{(\vec{a}, \vec{b})}}{\pi}+K(K-1)\frac{\theta_{(\vec{a}, \vec{b})}}{\pi}\times \eta_{l} \right)-\theta_{(\vec{a}, \vec{b})}^2
 \end{align*}
 where, $k_1 \neq k_2$, $k_1, k_2 \in[K]$, and \\
 $\eta_l=\Pr\left[ \left(\Signn^{(k_2)}(\vec{a}) \neq \Signn^{(k_2)}(\vec{b}) \right) | \left(\Signn^{(k_1)}(\vec{a}) \neq \Signn^{(k_1)}(\vec{b}) \right) \right]$. 
 %Further, the variance of our estimator is smaller than that of $\SRP$. 
\end{theorem}

\begin{remark}
Similar to Remark~\ref{rem:CSSRP_var_red}, the variance of $\CSSRPL$ (Theorem~\ref{thm:prove_var_angle_l}) is smaller than that of $\SRP$ as $\eta_l \leq \frac{\theta}{\pi}$. Its numerical simulation is mentioned in Figure~\ref{fig:eta_simulation}.
Further, when $l$ is equal to $K$, the rows of the matrix $R'$ defined in Equation~\eqref{eq:R_l} become independent, and our proposal $\CSSRPL$ (Definition~\ref{def:improved_estimator}) becomes equivalent to $\SRP$.  \end{remark}

\section{Experiments}\label{sec:experiments}
\begin{figure*}[ht!]
   \centering
  \includegraphics[width=0.92\linewidth]{Time.pdf}
  \caption{Comparison among the baselines on average running time  which consist of both dimensionality reduction time as well as pairwise similarity computation time for a pair. Note that $\CSSRPL-2$ denotes $\CSSRPL$ algorithm with $l=2$, and so on.}
  \label{fig:Time}
\end{figure*}

\begin{table*}
\begin{center}
\scalebox{0.82}{
\begin{tabular}{|l|l|l|l|l|}
\hline
\textbf{Estimators}  & \textbf{Dimension-}$\mathbf{10^5}$ &  \textbf{Dimension-}$\mathbf{5\times10^5}$ &  \textbf{Dimension-}$\mathbf{10^6}$&  \textbf{Dimension-}$\mathbf{10^7}$\\
\hline
$\SRP$~\citep{CHARIKAR20043} &$\mathbf{7.32}\times$&$\mathbf{8.41}\times$&$\mathbf{8.44}\times$&$\mathbf{8.65}\times$\\
\hline
$\CBE$~\citep{CBE} &$\mathbf{1.45}\times$&$\mathbf{1.70}\times$&$\mathbf{1.811}\times$&$\mathbf{2.074}\times$\\
\hline
$\MLE$~\citep{MLE} &$\mathbf{11.80}\times$&$\mathbf{9.79}\times$&$\mathbf{9.32}\times$&$\mathbf{9.24}\times$\\
\hline
$\SB$~\citep{superbit} &$\mathbf{2541.35}\times$&$\mathbf{3820.91}\times$&$\mathbf{3826.56}\times$&$\mathbf{3896.71}\times$\\
\hline
$\CSSRPL-2$ \text{(this work)} &$1.09\times$&$1.12\times$&$ 1.15\times$&$ 1.20\times$\\
\hline
$\CSSRPL-5$ \text{(this work)} &$1.22\times$&$1.25\times$&$1.29\times$&$1.39\times$\\
\hline
$\CSSRPL-10$ \text{(this work)} &$1.31\times$&$1.35\times$&$1.40\times$&$1.55\times$\\
\hline
\end{tabular}}
\end{center}
\caption{Numerical speedup of $\CSSRP$ \textit{w.r.t.} other baselines on  a fixed reduced dimension $1000$.} \label{speedup-table}
\end{table*}
\noindent \textbf{Hardware description:}
We conducted our experiments on a machine with the following configuration:
 CPU: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz (8 CPUs); Memory: 8GB; OS: Windows 10; Model: MSI GL62M 7RDX.
 
We use synthetic and real-world datasets for our experiments. In the synthetic datasets, the value of each feature is randomly sampled from $[0,1]$. A description of the real-world datasets is given in Table~\ref{tab:dataset}.

\begin{table}[H]
\caption{Description of real-world datasets.}\label{tab:dataset}
\begin{center}
\scalebox{0.82}{
\begin{tabular}{|l|l|l|}
\hline
\textbf{Dataset}  & \textbf{$\#$ of points} &  \textbf{Dimension}\\
\hline
Gisette~\citep*{UCI} &$13,500$&$5,000$\\
\hline
Arcene~\citep*{UCI} & $900$&$10,000$\\
\hline
Gene RNA-Seq~\citep{UCI}&$801$&$20,531$\\
\hline
PEMS-SF~\citep{Dua:2019} & $440$&$138,672$\\
\hline
\end{tabular}}
\end{center}
\end{table}

\subsection{Baselines and end tasks}
We compare the performance of our proposals Count-Sketch-Signed Random Projection $(\CSSRP)$ and Count-Sketch-Signed Random Projection-L $(\CSSRPL)$ with that of Signed Random Projection $(\SRP)$~\citep{simhash}, Circulant Binary Embedding $(\CBE)$~\citep{CBE}, Maximum Likelihood Estimation $(\MLE)$~\citep{MLE}, and Super-Bit LSH $(\SB)$~\citep{superbit}. Note that the MLE estimator requires an extra vector for similarity computation, and we use the first principal component vector for this purpose, as mentioned in~\citep{MLE}. We use the following metrics for evaluation: \begin{inparaenum}[(i)]
\item running time to generate the sketch,  \item variance analysis via box-plot, \item similarity search.
\end{inparaenum} 

\begin{figure*}[ht!]
   \centering
  \includegraphics[width=0.93\linewidth]{Accuracy.pdf}
  \caption{Comparison among the baselines on the task of Top-$k$ similarity search. A higher value of accuracy indicates a  better performance.}
  \label{fig:Accuracy}
 \end{figure*}
 
  
\begin{figure*}[ht!]
   \centering
  \includegraphics[width=0.98\linewidth]{box_plot.pdf}
  \caption{Comparison among baselines on the task of variance analysis via box plot. The sampled pairs are at angles $\ang{60}$ and $\ang{150}$, respectively. A smaller interquartile range is an indicator of lower variance. The dotted line represents the actual angle in degrees.
}
  \label{fig:box1}
\end{figure*}
\subsection{Running Time:}
\noindent\textbf{Experimental setting:} We aim to compare the running time of all the baselines. {To do so, we generate high dimensional synthetic datasets of dimensions ranging from $10^5$ to $10^7$.} We compress the datasets for different values of reduced dimension using various baselines, and record the sum of  sketching time and pairwise similarity computation time. Note that the sketching approach of $\MLE$ remains same as that of $\SRP$, however its similarity estimation step is different, and involves computing the root of a cubic polynomial. Therefore to have a fair comparison among all the baselines, we included both sketching time as well as similarity computation time. We compute average running time required by a pair of points, over various reduced dimensions, and summarise it in Figure \ref{fig:Time}. We also note the corresponding speedup obtained via our proposal $\CSSRP$ \textit{w.r.t.} baselines, and  report it in Table~\ref{speedup-table}.


\noindent\textbf{Insight:} We observed that $\CSSRP$ is much faster than all the baselines, with a significant numerical speedup (up to $3800\times$). We would like to highlight that our $\CSSRP$ is also faster (speedup of $1.45\times$ to $2\times$) than $\CBE$~\citep{CBE}, which is a faster variant of $\SRP$. Further, the running time of our other proposal $\CSSRPL$ remains comparable to that of $\CSSRP$. Note that the $\SB$ method remains the slowest among all the baselines; this is due to the step of generating orthonormal vectors (via the \textit{Gram-Schmidt} orthogonalization process) required for the projection matrix.

\subsection{Similarity Search:} 
\noindent\textbf{Experimental setting:} In this experiment, our aim is to check whether the proximity between points is maintained after dimensionality reduction. We describe our experimental setting as follows.
 We split the dataset randomly into two parts, $90\%$ and $10\%$ -- we refer to the former as the training partition and to the latter as the query partition. For each point in the query partition, we record the top-$k$ similar points (under cosine similarity) from the training partition for the uncompressed datasets. We denote this set by $S$. We compress the dataset (both query and training partitions) using several baselines on various reduced dimensions, and record the top-$k$ similar points (on the sketch) of the query points from the sketch of the training partition. We denote this set by $S^{'}$. We use two evaluation metrics -- \textit{recall} $:= |S\cap S^{'}|/|S|$, and \textit{accuracy} $:= |S \cap S^{'}|/|S \cup S^{'}|$. We compute them for all the points in the query partition, and record their average. We summarize our findings for accuracy in Figure~\ref{fig:Accuracy} and for recall in the appendix. 

\textbf{Insight:} We observed that at small reduced dimensions (shown in the first rows of the respective plots), for both accuracy and recall, our $\CSSRP$ estimator performed significantly better than the baselines. However, as the reduced dimension increases, the performance of $\CSSRP$ decreases slightly (shown in the second rows of the respective plots); this is circumvented by our other proposal $\CSSRPL$, whose performance remains at least in the top two. 

\subsection{Variance analysis via Box-Plot:}
\textbf{Experimental setting:} In this experiment, our aim is to compare the variances of the baselines via box plots. To do so, we generate a synthetic dataset in $10000$ dimensions, and randomly sample a pair of points from it. We compress this pair and compute the estimated similarity using all the baselines. We repeat this step $500$ times independently, and use the respective estimates to generate the box plot. We summarise our findings in Figure~\ref{fig:box1}.


\noindent\textbf{Insight:} We observe that at a small reduced dimension, the variance of our $\CSSRP$ estimator is lower than the variance of the other baselines. However, at higher reduced dimensions, the variance of $\CSSRP$ is slightly worse than that of the remaining baselines. This problem is tackled by our other proposal $\CSSRPL$, which offers smaller variance than the baselines, even at higher values of the reduced dimension.

\section{Conclusion}\label{sec:conclusion}
 We consider dimensionality reduction for real-valued data that approximates cosine similarity. The classical algorithm for this task, $\SRP$~\citep{simhash}, suffers from high variance, running time, and space complexity involved in the similarity computation. Popular improvements such as~\citep{MLE,CBE,superbit} address only one or two of these aspects. We present algorithms ($\CSSRP$ and $\CSSRPL$) that address all these limitations. When the sketch dimension $K=o(D)$, our proposal $\CSSRP$ offers a faster and space-efficient algorithm along with a smaller variance. However, for large $K$, the guarantee of $\CSSRP$ does not hold. Our other proposal, $\CSSRPL$, addresses this by offering a faster and space-efficient algorithm with smaller variance when $K$ is large. We give a theoretical analysis of our proposals and complement it via empirical simulations. We observe a speedup of several orders of magnitude (even \textit{w.r.t.} faster variants of $\SRP$~\citep{CBE}) and simultaneously accurate performance on end tasks, \textit{w.r.t.} baselines. 
Finally, we could only empirically show that our proposals have smaller variance \textit{w.r.t.} the baselines. Giving its mathematical proof still remains an open question of the work.

% \begin{acknowledgements} % will be removed in pdf for initial submission,
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

\bibliography{dubey_233}
\end{document}
