\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsthm}
\usepackage{amsfonts}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}

\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\title{Split, Count, and Share: A Differentially Private \\ Set Intersection Cardinality Estimation Protocol}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<michael.purcell1@anu.edu.au>?Subject=Your UAI 2023 paper}{Michael Purcell}{}}
\author[1]{Yang Li}
\author[1]{Kee Siong Ng}
% Add affiliations after the authors
\affil[1]{%
    School of Computing\\
    Australian National University\\
    Canberra, Australian Capital Territory, Australia
}

\begin{document}
\maketitle

\begin{abstract}
  We describe a simple two-party protocol in which each party contributes a set as input. The output of the protocol is an estimate of the cardinality of the intersection of the two input sets. We show that our protocol is efficient and secure. We show that the space complexity and communication complexity are constant, the time complexity for each party is proportional to the size of their input set, and that our protocol is differentially private. We also analyze the distribution of the output of the protocol, deriving both its asymptotic distribution and finite-sample bounds on its tail probabilities. These analyses show that, when the input sets are large, our protocol produces accurate set intersection cardinality estimates. We claim that our protocol is an attractive alternative to traditional private set intersection cardinality (PSI-CA) protocols when the input sets are large, exact precision is not required, and differential privacy on its own can provide sufficient protection to the underlying sensitive data.
\end{abstract}

\section{Introduction}
Secure multiparty computation (SMC) protocols allow multiple parties, each of whom holds some input to a given function, to jointly evaluate that function without sharing their inputs with one another \citep{goldreich1998secure}.
Unfortunately, SMC protocols are expensive \citep{damgaard2010perfectly} and do not provide any protection for sensitive information that may be revealed by their output. 


For example, suppose that the nature of some business is such that its customers would prefer that their association with that business be kept confidential. Further, suppose that some other organization was allowed to use an SMC protocol to find out how many customers the two businesses have in common. Such a protocol is known as a private set intersection cardinality (PSI-CA) protocol. While a PSI-CA protocol would protect any customers who were associated with only one of the two parties, it would necessarily compromise the privacy of anyone associated with both parties.

Preventing this kind of privacy breach requires a different notion of privacy. Traditional SMC protocols can be problematic because their outputs are exact.
Attackers are able to use such protocols to make inferences about the data with extremely high confidence \citep{dinur2003revealing}. 
If the output of an SMC protocol was inexact, perhaps because the exact output was perturbed with some kind of noise, then an attacker would necessarily be less confident in any inferences that they made based on those outputs.
This is the idea that underpins the concept of differential privacy (DP) \citep{dwork2006calibrating}. DP ensures that the output of a function will not reveal whether a record was used to compute that output and provides a degree of plausible deniability to entities that contribute sensitive data to a data set.

In this paper, we will describe the \emph{Split, Count, and Share protocol}, a differentially private alternative to PSI-CA. Our protocol relies on differential privacy to protect all elements of both parties' sets. By doing so, we sacrifice precision; our protocol only produces accurate cardinality estimates when the input sets are large.

Also, our protocol provides only a differential privacy guarantee. It does not provide any security guarantee whatsoever in the sense of traditional SMC protocols \citep{cramer2015secure,evans2018pragmatic,lindell2020secure}. 
In exchange, however, we are able to dramatically reduce the time, space, and communication complexity of our protocol compared with PSI-CA protocols.

We will describe our protocol in detail, derive parameter values sufficient to guarantee $(\epsilon, \delta)$-differential privacy, and prove rigorous bounds on the difference between the estimates produced by our protocol and the actual cardinality of the intersection of the two input sets. We will conclude with a discussion that shows that our protocol is practically useful when both parties have large input sets.

\section{Related Work}

Traditionally, private set intersection (PSI) has been a topic of interest primarily in the field of secure multiparty computing. As such, much of the literature concerning PSI addresses questions of what can be achieved by mutually distrustful parties who are unwilling to reveal any information about their inputs to one another. More precisely, an SMC protocol is considered secure if the participants learn no more than what they could learn in an ideal world, where a trusted curator is present to ensure all participants' inputs are kept secret from each other and a correctly computed output is returned to the participants \citep{goldreich1998secure}.

Despite the fact that Split, Count, and Share provides no security guarantee in the sense of traditional SMC protocols, it nevertheless occupies a similar niche to existing PSI and PSI-CA protocols.  So, we will focus our survey of related work on existing SMC protocols and emphasize the fact that because we provide a different kind of security guarantee, it is difficult to make direct comparisons between the performance of our protocol and the performance of those mentioned below.

\subsection{Exact PSI} 

A wide variety of cryptographic primitives have been proposed as components on which private set intersection (PSI) protocols can be built.  Among these, Diffie-Hellman (DH) key exchange and oblivious transfer (OT) extension are the most common.
Generally, DH-based protocols have lower communication complexity but greater time complexity than OT-extension based protocols, which dominate the field for larger sets.
But the unique characteristics of DH-based protocols seem to increase speed for PSI on small sets, as shown in recent work by \citet{rosulek2021compact}.


The state-of-the-art OT-extension based PSI protocols in the semi-honest setting are the computation time optimized protocol by \citet{kolesnikov2016efficient}, communication time optimized protocol by \citet{pinkas2019spot} and an efficient balanced protocol by \citet{chase2020private}. 
Although beyond the scope of this work, \citet{pinkas2020psi}, \citet{rindal2021vole} and \citet{garimella2021oblivious} describe some recent efficient PSI developments in the malicious model. Some of these protocols, when analyzed under the semi-honest model, are almost as efficient as those mentioned above. Furthermore, the authors have done some thorough theoretical and experimental comparisons with selected state-of-the-art protocols to demonstrate the efficiency of their protocols.

PSI-CA protocols compute only the size of the intersection of the input sets rather than the intersection itself.
Many such protocols have been proposed, including those that work by modifying an underlying PSI protocol
\citep{freedman2004efficient,freedman2016efficient}
and those that work by post-processing the output of a circuit-based PSI protocol \citep{pinkas2019efficient}.
In either case, the most efficient PSI-CA protocols to date
\citep {cristofaro2012fast,freedman2016efficient,ion2020deploying,debnath2021secure,trieu2022multiparty}
have linear computation and communication costs in the input set sizes. There are also some recently developed efficient application-driven PSI-CA protocols \citep{dittmer2020function,duong2020catalic,trieu2020epione} and efficient multiparty private set intersection protocols \citep{chandran2021efficient}.

\subsection{Approximate PSI}


Dramatic efficiency gains can be had by approximating the cardinality of the intersection of two input sets rather than computing it exactly. Doing so, however, inevitably comes at the cost of decreased precision. Some early works on PSI-CA approximation are \citet{freedman2004efficient} and \citet{egert2015privately}.
Recently, several works including
\citet{dong18Approximating}, \citet{sparka2018p2kmv}, and \citet{hu2021make}
have proposed using sketches to compute intersection cardinality estimates in less than linear time.

The most efficient PSI-CA approximation protocol that we are aware of \citep{dong18Approximating} realizes logarithmic computation and communication time complexity in the largest possible cardinality value. Moreover, the protocol's approximation error can be tuned to adjust the accuracy and efficiency tradeoff. To illustrate the superior efficiency of this protocol for large sets, the authors compared it with an efficient exact PSI-CA protocol \citep{cristofaro2012fast} and an early approximation protocol \citep{egert2015privately}. The experimental results show that for set size  $10^6$, \cite{cristofaro2012fast} and \cite{egert2015privately} (at $1\%$ error rate) have computation time 3507.38 and 488.48 seconds respectively, whilst the FM sketch based protocol (at $1\%$ error rate) has computation time only 2.97 seconds. For more experimental results, see \cite{dong18Approximating}.


\subsection{DP PSI}

As a popular privacy-enhancing technique that addresses data contributors' membership privacy concerns, differential privacy (DP) \citep{dwork2006calibrating}
%sees its applications in SMC too.
has a long history of application in SMC protocols.
The early work of \citet{beimel2008distributed} studied the feasibility of using DP to increase the efficiency of secure function evaluation protocols. More recent work by \citet{groce2019cheaper} also demonstrated the effectiveness of DP in reducing standard PSI running costs. The use of DP in this work is to replace bin padding, which plays a significant role in hiding the actual sizes of the bins that contain hashed elements.

Another recent combination of DP and PSI is described in \cite{kacsmar2020differentially}, which proposed differentially private mechanisms for both PSI and PSI-CA, for the imbalanced database setting, where the server holds a much larger database than the client. By using homomorphic encryption, the proposed mechanisms provide a stronger protection to the data, in the sense that the server learns nothing about the client's data due to encryption and the client learns a DP-protected set intersection (cardinality). The communication complexity of the proposed mechanisms is optimal at $O(m)$, where $m$ is the smaller database size, thanks to the use of homomorphic encryption. The computation time is $O(m+n)$, which does not take into account ciphertext size expansion due to encryption.

\section{Split, Count, and Share}
Suppose Alice holds the set $A \subset S$ and Bob holds the set $B \subset S$. Alice would like to estimate $|A \cap B|$, Bob is willing to work with Alice to help her do so, but neither party is willing to reveal the elements of their set to the other. To simplify the subsequent analysis, we will assume that Alice and Bob are willing to share the cardinality of their sets.% with each other.

Also, suppose that for $1 \leq i \leq r$ Alice and Bob can randomly partition their sets into two subsets $A_{i,0},A_{i,1} \subset A$ and $B_{i,0},B_{i,1} \subset B$
in such a way that if $x \in A \cap B$ then Alice and Bob will put $x$ into the same set; i.e. Alice puts $x$ in $A_{i,j}$ if and only if Bob puts $x$ in $B_{i,j}$.  
We will call each such partitioning a \emph{round} and assume that the splitting decisions in each round are independent of all other rounds.

Finally, after each round both parties count how many of their set elements were put into each subset. That is, for all $i$ Alice computes ${V_i = |A_{i,1}|}$ and Bob computes $W_i = |B_{i,1}|$. Recall that Alice and Bob make the same splitting decisions for all $x \in A \cap B$. If $|A \cap B|$ is large relative to $|A|$ and $|B|$ then $V_i$ and $W_i$ will be strongly correlated.  Conversely, if $|A \cap B|$ is small relative to $|A|$ and $|B|$ then $V_i$ and $W_i$ will be weakly correlated. So, the sample correlation between $V_i$ and $W_i$ can be used to estimate $|A \cap B|$.

Notice that if Bob publishes his vector of counts, then Alice could use that information to make inferences about Bob's set. If Bob uses a differentially private release mechanism to perturb his vector of counts, he could publish the perturbed counts without revealing which elements comprise his set $B$. Alice could then compute the correlation between her vector of counts and Bob's perturbed vector of counts to derive an estimate of $|A \cap B|$.

In principle, Bob could use any differentially private mechanism to perturb his counts. The binomial mechanism (see Appendix \ref{appendix:differential_privacy}), however, is a particularly appealing choice for this application.  Notice that Bob's unperturbed counts will be binomially distributed. If Bob uses the binomial mechanism to perturb his vector of counts, then his perturbed counts will be binomially distributed as well. Furthermore, Bob can generate his perturbed counts by simply augmenting his set with an appropriate number of dummy elements.

\subsection{Description} \label{subsection:description}
A more precise description of the Split, Count, and Share protocol is as follows (see Section \ref{subsection:scs_security} for a derivation of the value of $n_{\epsilon, \delta}(r)$):
\begin{description}
    \item[(Negotiate)] Alice and Bob agree on:
    \begin{enumerate}
        \item a number of rounds to perform $r \in \mathbb{N}$,
        \item differential privacy parameters $(\epsilon, \delta)$,
        \item $r$ independent random oracles $\{\mathcal{E}_i\}_{i=1}^r$ where we have $\mathcal{E}_i: \mathbb{Z} \rightarrow \{0,1\}$ for all $1 \leq i \leq r$.
    \end{enumerate}
    \item[Split] Alice and Bob use independent random oracles to partition their sets.
    \begin{enumerate}
        \item For $1 \leq i \leq r$ and $j \in \{0,1\}$, Alice computes $$A_{i,j} = \{x \in A \colon \mathcal{E}_i(x) = j\}.$$
        \item For $1 \leq i \leq r$ and $j \in \{0,1\}$, Bob computes $$B_{i,j} = \{x \in B \colon \mathcal{E}_i(x) = j\}.$$
    \end{enumerate}
    \item[Count] Alice and Bob count the number of elements in each of their split-sets.
    \begin{enumerate}
        \item For $1 \leq i \leq r$, Alice computes $$V_i = |A_{i,1}|.$$
        \item For $1 \leq i \leq r$, Bob computes $$W_i = |B_{i,1}| + \text{Binomial}(n_{\epsilon, \delta}(r), 1/2).$$
    \end{enumerate}
    \item[Share] Bob shares his (perturbed) counts with Alice.
    \begin{enumerate}
        \item Bob sends $\{W_i\}_{i=1}^r$ to Alice.
    \end{enumerate}
    \item[(Estimate)] Alice estimates $|A \cap B|$.
    \begin{enumerate}
        \item Alice computes% $\widehat{|A \cap B|} = (4/r)\sum_{i=1}^r\left(V_i - \mathbf{E}[V_i]\right)\left(W_i - \mathbf{E}[W_i]\right)$.
        \begin{equation} \label{equation:scs_estimate}
            \widehat{|A \cap B|} = \frac{4}{r}\sum_{i=1}^r\left(V_i - \mu_V\right)\left(W_i - \mu_W\right),
        \end{equation}
        where $\mu_V = |A|/2$ and $\mu_W = (|B| + n_{\epsilon, \delta}(r)) / 2$.
    \end{enumerate}
\end{description}

Crucially, Bob receives no output from the protocol. Alice should not share $\widehat{|A \cap B|}$ with Bob.  Because Bob knows the values of his perturbed counts that Alice used to compute $\widehat{|A \cap B|}$, he could use the value of Alice's estimate to make inferences about her set.

If Bob wants to estimate $|A \cap B|$, then Alice should instead use the differentially private mechanism to perturb her counts and send those perturbed counts to Bob. Bob can then use his (unperturbed) vector of counts and Alice's perturbed vector of counts to compute his estimate. This is conceptually equivalent to Alice and Bob running the protocol a second time, but with their roles reversed.

% =========================================================================
% Notation
% =========================================================================
\subsection{Motivation}\label{subsection: notation}
To motivate our choice of estimator as described by Equation \eqref{equation:scs_estimate}, we first need to establish some notation. Notice that for $1 \leq i \leq r$ we have 
\begin{equation*}
    \begin{split}
        V_i &= X_i + Z_i \\
        W_i &= Y_i + Z_i + N_i\\
    \end{split}
\end{equation*}
where $X_i$, $Y_i$, $Z_i$, and $N_i$ are independent binomial random variables with ${X_i \sim \mathcal{B}(|A| - |A \cap B|, 1/2)}$, ${Y_i \sim \mathcal{B}(|B| - |A \cap B|, 1/2)}$, ${Z_i \sim \mathcal{B}(|A \cap B|, 1/2)}$, and ${N_i \sim \mathcal{B}(n_{\epsilon, \delta}(r), 1/2)}$.

So, if $\mu_V = \mathbf{E}[V_i]$ and $\mu_W = \mathbf{E}[W_i]$ then
\begin{equation*}
    \begin{split}
        \mu_V &= |A|/2 \\
        \mu_W &= \left(|B| + n_{\epsilon, \delta}(r)\right)/2.
    \end{split}
\end{equation*}
Similarly, if $\sigma_V^2 = \text{Var}(V_i)$ and $\sigma_W^2 = \text{Var}(W_i)$ then
\begin{equation*}
    \begin{split}
        \sigma_V^2 &= |A|/4 \\
        \sigma_W^2 &= \left(|B| + n_{\epsilon, \delta}(r)\right)/4.
    \end{split}
\end{equation*}
Notice that $\mu_V$, $\mu_W$, $\sigma_V^2$, and $\sigma_W^2$ are defined in terms of the known quantities $|A|$, $|B|$, and $n_{\epsilon, \delta}(r)$.

If we let $\sigma_{VW} = \text{Cov}(V_i, W_i)$, then we have
\begin{equation}\label{equation:covariance}
    \sigma_{VW} = |A \cap B|/4.
\end{equation}
Because $\sigma_{VW}$ depends on the unknown quantity $|A \cap B|$, we cannot use it directly. We can, however, estimate $\sigma_{VW}$ via the sample covariance $\hat{\sigma}_{VW}$ where
\begin{equation}\label{equation:covariance_estimator}
    \hat{\sigma}_{VW} = \frac{1}{r} \sum_{i=1}^r (V_i - \mu_V)(W_i - \mu_W).
\end{equation}
Together, \eqref{equation:covariance} and \eqref{equation:covariance_estimator} suggest that $4\hat{\sigma}_{VW}$ is a reasonable estimator of $|A \cap B|$.

\subsection{Additional Notation}
In what follows, it will be convenient to work with the correlations rather than covariances. Observe that if we let $\rho_{\epsilon, \delta}(r)$ be the correlation between $V_i$ and $W_i$, then we have
\begin{equation*}
    \rho_{\epsilon, \delta}(r) = \frac{\sigma_{VW}}{\sigma_V \sigma_W} = \frac{|A \cap B|}{\sqrt{|A|\left(|B| + n_{\epsilon, \delta}(r)\right)}}.
\end{equation*}
Because $\rho_{\epsilon, \delta}(r)$ depends on the unknown quantity $|A \cap B|$, we cannot use it directly. We can, however, estimate $\rho_{\epsilon, \delta}(r)$ via the sample correlation $\hat{\rho}$. If we let $\tilde{V}_i = (V_i - \mu_V) / \sigma_V$ and $\tilde{W}_i = (W_i - \mu_W) / \sigma_W$ then we have
\begin{equation*}
\begin{split}
\mathbf{E}\left[\tilde{V}_i\tilde{W}_i\right] &= \rho_{\epsilon, \delta}(r), \\
\text{Var}\left(\tilde{V}_i\tilde{W}_i\right) &= 1 + \frac{|A \cap B|^2 - 2|A \cap B|}{|A|\left(|B| + n_{\epsilon, \delta}(r)\right)},
\end{split}
\end{equation*}
and
\begin{equation}\label{equation:rho_hat}
        \hat{\rho} = \frac{\hat{\sigma}_{VW}}{\sigma_V\sigma_W} = \frac{1}{r} \sum_{i=1}^r \tilde{V}_i\tilde{W}_i.
\end{equation}



% =========================================================================
% Complexity
% =========================================================================
\subsection{Complexity}\label{subsection:complexity}
To carry out the Split, Count, and Share protocol, Bob must compute $\mathcal{E}_i(b)$ for all $1 \leq i \leq r$ and $b \in B$. He must then generate $r$ binomial random variables to perturb each element in his count vector. So, the total time complexity of the protocol for Bob is $O(r|B|)$. Alice must compute $\mathcal{E}_i(a)$ for all $1 \leq i \leq r$ and $a \in A$. She must then compute the correlation between the two vectors of counts. So, the time complexity of the protocol for Alice is $O(r|A|)$.

Observe that the Split, Count, and Share protocol is a streaming protocol. That is, Alice and Bob do not need to store the outputs of $\mathcal{E}_i$. Indeed, they do not even need to store the elements of their sets. Instead, they can each maintain a set of $r$ accumulators. Alice can take a single pass through her set, incrementing her $i$th accumulator whenever $\mathcal{E}_i(a) = 1$. After she does so, Alice's $i$th accumulator will contain the value of $|A_{i,1}|$. Similarly, Bob can take a single pass through his set, incrementing his $i$th accumulator whenever $\mathcal{E}_i(b) = 1$. After he does so, Bob's $i$th accumulator will contain the value of $|B_{i,1}|$.

As such, the space complexity of the protocol is determined by the space required to store the two vectors of counts.  If we assume that both parties will use a sixty-four bit integer to store each count, then the space complexity of the protocol is $O(r)$. If $A$ or $B$ is small, then this complexity can be reduced by using fewer than sixty-four bits for each counter.  In this case, the space complexity of the protocol is $O(r \log_2(|A|) + r \log_2(|B|))$.

The communication complexity of the protocol is determined by the amount of data that Bob must send to Alice when he sends her his vector of perturbed counts. As such, the communication complexity of the protocol is $O(r)$.

% =========================================================================
% Security
% =========================================================================
\subsection{Security}\label{subsection:scs_security}
The security of the Split, Count, and Share protocol is entirely dependent on the noise that Bob adds to his counts before sharing them with Alice. As mentioned above, Bob will use the binomial mechanism to perturb his vector of counts. The binomial mechanism is characterized by two parameters, $n$ and $p$.  We will restrict our attention to the case where $p = 1/2$. We will let $n_{\epsilon, \delta}(r)$ be the smallest value of $n$ that provides $(\epsilon, \delta)$-differential privacy for the $r$-round version of the Split, Count, and Share protocol.

The privacy guarantees provided by many differentially private release mechanisms depend on the sensitivity of the input query. Precisely how this sensitivity is measured depends on the release mechanism.
As discussed in \citet{agarwal2018}, the privacy guarantee provided by the binomial mechanism depends on three sensitivity parameters, $\Delta_1$, $\Delta_2$, and $\Delta_{\infty}$. The values of these parameters for the Split, Count, and Share protocol are given by the following lemma.

\begin{lemma}\label{lemma:sensitivity_values}
    If $f_r$ is the function that computes the vector of counts for the $r$-round Split, Count, and Share protocol, that is $f_r(B) = (|B_{1,1}|, |B_{2,1}|, \ldots, |B_{r,1}|)$, then $\Delta_1 f_r = r$, $\Delta_2 f_r = \sqrt{r}$, and $\Delta_{\infty} f_r = 1$.
\end{lemma}

\begin{proof}
    We have $f_r = (f_{r,1}, f_{r,2}, \ldots, f_{r,r})$ where the coordinate functions $\{f_{r,i}\}_{i=1}^r$ are independent counting queries. Because $f_{r,i}$ is real valued, $\Delta_p f_{r,i} = \Delta f_{r,i}$ for all $p$. Furthermore because $f_{r,i}$ is a counting query we have $\Delta f_{r,i} = 1$ for all $1 \leq i \leq r$. Therefore
    \begin{align*}
        \Delta_1 f_r &= \sum_{i=1}^r |\Delta f_{r,i}| = r, \\
        \Delta_2 f_r &= \left(\sum_{i=1}^r \left(\Delta f_{r,i}\right)^2\right)^{1/2} = \sqrt{r}, \\ 
        \Delta_{\infty} f_r &= \max_{1 \leq i \leq r} |\Delta f_{r,i}| = 1. \qedhere
    \end{align*}
\end{proof}

Armed with the values of the relevant sensitivity parameters, we can use the following theorem to determine the value of $n_{\epsilon, \delta}(r)$ required to ensure that the binomial mechanism is $(\epsilon, \delta)$-differentially private.  

\begin{theorem}\label{theorem: binomial parameter value}
    Suppose that $f_r$ is the function that computes counts for $r$ rounds of the Split, Count, and Share protocol and that $\delta > 0$. Let
    \begin{equation*}
    \begin{split}
        \phi_{\delta}(r) &= \sqrt{8r \log\left(\frac{1.25}{\delta}\right)} \\
        \psi_{\delta, 1}(r) &= \frac{4r}{3(1-\delta/10)} \\
        \psi_{\delta, 2}(r) &= \frac{10\sqrt{r\log\left(10/\delta\right)}}{(1-\delta/10)} \\
        \psi_{\delta, \infty}(r) &= \frac{8}{3} \left( \log\left(\frac{1.25}{\delta}\right) + \log\left(\frac{20r}{\delta}\right)\log\left(\frac{10}{\delta}\right)\right).
    \end{split}
    \end{equation*}
    Furthermore let $\psi_\delta(r) = \psi_{\delta, 1}(r) + \psi_{\delta, 2}(r) + \psi_{\delta, \infty}(r)$ and
    \begin{equation*}
        n_{\epsilon, \delta}^{\prime}(r) = \left(\frac{\phi_\delta(r) + \sqrt{\phi_{\delta}(r)^2 + 4\psi_{\delta}(r)\epsilon}}{2\epsilon}\right)^2.
    \end{equation*}
    If we have
    \begin{equation} \label{equation: messy binomial parameter}
        n_{\epsilon, \delta}(r) \geq \max\left(n_{\epsilon, \delta}^{\prime}(r), \ 92\log\left(\frac{10r}{\delta}\right), \ 8 \right)
    \end{equation}
    then the mechanism used to compute Bob's counts for the $r$-round Split, Count, and Share protocol, that is $\mathcal{M}(B, f_r(\cdot); n_{\epsilon, \delta}(r))$, is $(\epsilon, \delta)$-differentially private.
\end{theorem}
\begin{proof}
    If $\Delta_1f_r$, $\Delta_2f_r$, and $\Delta_{\infty}f_r$ are as in Lemma \ref{lemma:sensitivity_values}, then Corollary \ref{corollary: binomial mechanism} (see Appendix \ref{appendix:differential_privacy}) implies the result.
\end{proof}

Observe that the value of $n_{\epsilon, \delta}(r)$ grows with $r$. That is, as the number of rounds $r$ increases, so too does the amount of noise required to ensure a given level of differential privacy.  Conceptually, this is because during each round, the noisy count that Bob shares with Alice reveals some information about his set.  Increasing the amount of noise that Bob adds during each round ensures that the total amount of information that he reveals to Alice is limited. The following theorem describes the growth rate of $n_{\epsilon, \delta}(r)$.

\begin{theorem} \label{theorem: dp n limit}
    If $\delta$, $\phi_{\delta}(r)$, $\psi_{\delta}(r)$ are as in Theorem \ref{theorem: binomial parameter value} and $n_{\epsilon, \delta}(r)$ is the smallest value that satisfies \eqref{equation: messy binomial parameter}, then for all $\epsilon > 0$ we have
    \begin{equation*}
        \lim_{r \rightarrow \infty} \frac{n_{\epsilon, \delta}(r)}{r} = C_{\epsilon, \delta}
    \end{equation*}
    where
    \begin{equation*}
        C_{\epsilon, \delta} = \left(\frac{\phi_{\delta}(1) + \sqrt{\phi_{\delta}(1)^2 + \frac{16\epsilon}{3(1-\delta/10)}}}{2\epsilon}\right)^2.
    \end{equation*}
\end{theorem}
\begin{proof}
    Notice that we have
    \begin{align*}
        \lim_{r \rightarrow \infty} \frac{\phi_\delta(r)}{\sqrt{r}} &= \sqrt{8\log\left(\frac{1.25}{\delta}\right)} \\
        \lim_{r \rightarrow \infty} \frac{\psi_\delta(r)}{r} &= \frac{4}{3(1-\delta/10)}.
    \end{align*}
    Therefore the result follows from Theorem \ref{theorem: binomial parameter value}.
\end{proof}

% =========================================================================
% Utility
% =========================================================================
\subsection{Utility} \label{section:utility}
Having determined the amount of noise that Bob needs to introduce to ensure that the $r$-round Split, Count, and Share protocol is $(\epsilon, \delta)$-differentially private, it is natural to ask how accurately Alice can estimate $|A \cap B|$ using only her vector of counts and Bob's vector of perturbed counts. This will depend on a variety of factors including: the values of $\epsilon$ and $\delta$, the number of rounds performed $r$, the size of Alice's set $|A|$, and the size of Bob's set $|B|$.

Broadly speaking, Alice's accuracy improves as $r$, $\epsilon$, and $\delta$ increase. Crucially, while increasing $|A|$ and $|B|$ worsens Alice's absolute accuracy (i.e. doing so increases the absolute magnitude of her approximation errors), it improves her relative accuracy. So, if Alice is interested in the relative magnitude of her approximation errors, then the Split, Count, and Share protocol will provide better utility as the input sets $A$ and $B$ get larger.

At first glance, Equation \eqref{equation:rho_hat} appears to suggest that we can simply invoke the Strong Law of Large Numbers to analyze the performance of $\hat{\rho}$ as an estimator of $\rho_{\epsilon, \delta}(r)$. Unfortunately, because $n_{\epsilon, \delta}(r) = \Theta(r)$ (see Theorem \ref{theorem: dp n limit}), we have $\lim_{r \rightarrow \infty} \rho_{\epsilon, \delta}(r) = 0$. So, the situation is a bit more complicated and will require more careful analysis.

In what follows we will prove bounds on the probability that $\widehat{|A \cap B|}$ differs from $|A \cap B|$ by arbitrary threshold values.  To do so, we will first characterize how well the sample correlation $\hat{\rho}$ approximates the true correlation $\rho_{\epsilon, \delta}(r)$. Because we have $\widehat{|A \cap B|} = \hat{\rho}\sqrt{|A|(|B| + n_{\epsilon, \delta}(r))}$ and $|A \cap B| = \rho_{\epsilon, \delta}(r)\sqrt{|A|(|B| + n_{\epsilon, \delta}(r))}$, we can ``lift'' those bounds to describe how well $\widehat{|A \cap B|}$ approximates $|A \cap B|$.
Our first result shows that Alice's errors are approximately normally distributed with mean zero and variance $\nu_{\epsilon, \delta}(r) / r$.

\begin{theorem}\label{theorem:asymptotic_error}
Let $\Phi$ be the cumulative distribution function (CDF) of a standard normal random variable, that is $\Phi(x) = \mathbf{P}\{\mathcal{N}(0,1) \leq x\}$. For all $t \geq 0$ we have 
    \begin{equation*}
        \lim_{r \rightarrow \infty} \mathbf{P}\left\{|\hat{\rho} - \rho_{\epsilon, \delta}(r)| \geq t\sqrt{\frac{\nu_{\epsilon, \delta}(r)}{r}}\right\} = 2\Phi(-t),
    \end{equation*}
    where
    \begin{equation*}
        \nu_{\epsilon, \delta}(r) = 1 + \frac{|A \cap B|^2 - 2|A \cap B|}{|A|(|B| + n_{\epsilon, \delta}(r))}.
    \end{equation*}
\end{theorem}
\begin{proof}
Notice that if $S_r = \sum_{i=1}^r \tilde{V}_i\tilde{W}_i$, then we have
\begin{equation*}
    \begin{split}
    &\mathbf{P}\left\{\left|\hat{\rho} - \rho_{\epsilon, \delta}(r)\right| \geq t\sqrt{\frac{\nu_{\epsilon, \delta}(r)}{r}}\right\} \\
    &\qquad\qquad\qquad\qquad = \mathbf{P}\left\{\left|\frac{S_r - r\rho_{\epsilon, \delta}(r)}{\sqrt{r\nu_{\epsilon, \delta}(r)}}\right| \geq t\right\}.
    \end{split}
\end{equation*}

Recall, $\mathbf{E}\left[\tilde{V}_i\tilde{W}_i\right] = \rho_{\epsilon, \delta}(r)$ and $\text{Var}\left(\tilde{V}_i\tilde{W}_i\right) = \nu_{\epsilon, \delta}(r)$. Therefore, because $\tilde{V}_i$ and $\tilde{W}_i$ are bounded, we have $\mathbf{E}\left[\left|\tilde{V}_i\tilde{W}_i - \rho_{\epsilon, \delta}(r)\right|^3\right] < \infty$ and the result follows from the Berry--Esseen Theorem (Theorem \ref{theorem: berry esseen}).
\end{proof}

Notice that the deviations described by Theorem \ref{theorem:asymptotic_error} are absolute errors rather than relative errors.  This is significant because $\lim_{r \rightarrow \infty} \rho_{\epsilon, \delta}(r) = 0$. So, when we multiply by $\sqrt{|A|(|B| + n_{\epsilon, \delta}(r))}$, we will find that the accuracy of Alice's cardinality estimates will depend on $|A|$ and $|B|$.

\begin{corollary}\label{corollary:asymptotic_error}
    Let $\widehat{|A \cap B|} = 4\hat{\sigma}_{VW}$. For all $t \geq 0$ we have
    \begin{equation*}
        \lim_{r \to \infty}\mathbf{P}\left\{\left\lvert \widehat{|A \cap B|} - |A \cap B| \right\rvert \geq t\sqrt{|A|C_{\epsilon, \delta}}\right\} = 2\Phi(-t),
    \end{equation*}
    where $\Phi(x) = \mathbf{P}\{\mathcal{N}(0, 1) \leq x\}$ is the cumulative distribution function of a standard normal random variable.
\end{corollary}
\begin{proof}
    Observe that
    \begin{equation*}
        |\hat{\rho} - \rho_{\epsilon, \delta}(r)| = \left|\frac{4\hat{\sigma}_{VW} - |A\cap B|}{4\sigma_V\sigma_W}\right|.
    \end{equation*}
    So, we have
    \begin{equation*}
    \begin{split}
    &\mathbf{P}\left\{\left|\hat{\rho} - \rho_{\epsilon, \delta}(r)\right| \geq t\sqrt{\frac{\nu_{\epsilon, \delta}(r)}{r}}\right\} \\ &\qquad = \mathbf{P}\left\{\left|\frac{\widehat{|A \cap B|} - |A \cap B|}{4\sigma_V\sigma_W}\right| \geq t\sqrt{\frac{\nu_{\epsilon, \delta}(r)}{r}}\right\}.
    \end{split}
    \end{equation*}
    Furthermore, because $\lim_{r \rightarrow \infty} \nu_{\epsilon, \delta}(r) = 1$, Theorem \ref{theorem: dp n limit} implies that
    \begin{equation*}
        \lim_{r \rightarrow \infty} 4t\sigma_V\sigma_W\sqrt{\frac{\nu_{\epsilon, \delta}(r)}{r}} = t\sqrt{|A|C_{\epsilon, \delta}}.
    \end{equation*}
    Therefore, Theorem \ref{theorem:asymptotic_error} implies the result.
\end{proof}

Corollary \ref{corollary:asymptotic_error} shows that for large $r$, Alice's absolute errors will generally be on the order of the square root of the size of her set $A$. As such, the larger $A$ is, the smaller her relative errors will be. This fact is the basis for our claim that the Split, Count, and Share protocol is particularly well suited for settings where both Alice's and Bob's input sets are large.  Notice also that Corollary \ref{corollary:asymptotic_error} implies that there is a law of diminishing returns as $r$ increases. As we approach the asymptotic regime, the marginal cost for increasing $r$ remains constant while the marginal utility gain for doing so steadily decays.

Our last result is a concentration inequality that shows that the sample correlation will be close to the true correlation with high probability. This result is a finite-sample bound. That is, it is a statement that applies for all values of $r$ rather than a statement that applies only in the limit as $r$ diverges. 
\begin{theorem}\label{theorem: correlation bound}
For all $t \geq 0$ we have
\begin{equation*}
    \mathbf{P}\left\{\left| \hat{\rho} - \rho_{\epsilon, \delta}(r) \right| \geq t \right\} \leq 2\exp\left(\frac{-rt^2}{6 + 4t}\right).
\end{equation*}
\end{theorem}
\begin{proof}
    This follows from Bernstein's Inequality, Khintchine's Inequality, and the Legendre duplication formula. See Appendix \ref{appendix: concentration inequalities} for details. In particular, this result is a direct consequence of Theorem \ref{theorem: binomial correlation concentration inequality}.
\end{proof}

As with Theorem \ref{theorem:asymptotic_error}, we can ``lift'' Theorem \ref{theorem: correlation bound} to make precise statements about the accuracy of Alice's cardinality estimates. Because this operation is so similar to that demonstrated in the preceding discussion, and because the formulae involved are significantly more complicated, we state the following result without proof.

\begin{corollary}
    For all $\gamma \geq 0$ we have
    \begin{equation*}
        \lim_{r \rightarrow \infty} \mathbf{P}\left\{\big\lvert \widehat{|A \cap B|} - |A \cap B| \big\rvert \geq \gamma\sqrt{|A|} \right\} \leq U_{\epsilon, \delta}(\gamma),
    \end{equation*}
    where
    \begin{equation*}
    U_{\epsilon, \delta}(\gamma) = 2\exp\left(\frac{-\gamma^2}{6C_{\epsilon, \delta}}\right).
    \end{equation*}
\end{corollary}

% =========================================================================
% Discussion
% =========================================================================
\section{Discussion}\label{section:discussion}
To use the Split, Count, and Share protocol in practice, Alice and Bob must agree on a suitable set of parameter values. We can subdivide these parameters into two types. The first describes how Alice and Bob will make their splitting decisions in each round of the protocol.  The second describes the privacy guarantees that the protocol will provide.

To reason about the first type of parameter, recall that Alice and Bob need to agree on a collection of $r$ independent random oracles to use as splitting functions. Cryptographically secure hash functions are a natural class of functions to use to implement these random oracles in practice.
Furthermore, if $\mathcal{E}$ is a hash function with a digest length of $r$ bits,
%Given such a function, if $r$ is equal to its output size
then Alice and Bob can compute all of the splitting decisions for a given input with a single function evaluation. So, setting $\mathcal{E} = \text{SHA3-512}$ and $r = 512$ may be reasonable defaults.

To reason about the second type of parameters, recall that only Alice will receive the output of the protocol. So, Bob is more concerned with privacy than utility. He needs the values of the privacy parameters $(\epsilon, \delta)$ to be small enough to guarantee sufficient protection for his data. Alice is more concerned with utility than privacy. She needs the values of the privacy parameters $(\epsilon, \delta)$ to be large enough to guarantee that her estimates will be sufficiently accurate.

As a general rule \citep{mcsherry2017}, $\delta$ should be chosen to be negligible relative to $1/|B|$.  As such, a cryptographically small value such as $\delta = 2^{-128}$ may be a reasonable default. It is less clear what a reasonable default value for $\epsilon$ might be.  Appropriate values for $\epsilon$ depend on how much privacy loss Bob is willing to tolerate and how often he expects to participate in the protocol. Because the utility guarantees for the protocol are given in terms of the relative standard error of Alice's estimates, the choice of $\epsilon$ also depends on the size of her set. 


Figure \ref{figure:standard_error_curves} depicts the relationship between the value of $\epsilon$ and the approximate standard error of Alice's cardinality estimates. Here, we let $r = 512$, $\delta = 2^{-128}$, $|A| = |B|$, and $|A \cap B| = |A| / 2$. Each curve in the graph describes this relationship for a different value of $|A|$.  Observe that the standard error decreases as $\epsilon$ and $|A|$ increase.  Furthermore, notice that we have $\lim_{|A| \rightarrow \infty} 4\sigma_V\sigma_W / |A| = 1$ and $\lim_{|A| \rightarrow \infty} \nu_{\epsilon, \delta}(r) = 1.25$. So, as $|A|$ increases, the relative standard error converges to $\sqrt{1.25 / 512} \approx 0.05$. 

So, we see that if both of their sets are large, then Alice and Bob can use the Split, Count, and Share protocol to compute fairly accurate estimates of how many elements their sets have in common. For relatively small values of $\epsilon$, say $\epsilon = 0.05$, Alice can compute cardinality estimates that are accurate to within $0.1 \cdot |A|$ approximately 96\% of the time. Doing so requires only that both parties hash their set elements and then update each of $r=512$ counters after computing each hash. 

\begin{figure}
\centering
\includegraphics[width=\columnwidth]{epsilon_vs_standard_error2.png}
\caption{Standard Error Curves.} \label{figure:standard_error_curves}
\end{figure}


\subsection{Discrete Gaussian Noise}
One idiosyncrasy of the Split, Count, and Share protocol as described above is the distribution of the noise that Bob uses to ensure differential privacy. As described, Bob uses the binomial mechanism to perturb his counts. This allowed us to compute rigorous finite-sample tail bounds on the estimate errors that the protocol will produce. This, in turn, allows us to make strong statements about the kinds of utility guarantees the protocol can provide. There are, however, alternative noise distributions that Bob could use.

In particular, he could use the discrete Gaussian mechanism \citep{canonne2020discrete}. That is, Bob could use a release mechanism that perturbs his counts with noise drawn from a discrete Gaussian distribution. This is appealing because the discrete Gaussian mechanism has been shown to outperform the binomial mechanism. In this case, by ``outperform'' we mean that if parameters are chosen such that the two mechanisms provide equivalent privacy guarantees, then the variance of the noise produced by the discrete Gaussian mechanism is smaller than that produced by the binomial mechanism. 

Indeed, Figure \ref{figure:dgm_vs_bm} depicts the performance of two versions of the Split, Count, and Share protocol, one using the discrete Gaussian mechanism and one using the binomial mechanism. As in Figure \ref{figure:standard_error_curves}, we let $r = 512$, $\delta = 2^{-128}$, $|A| = |B|$, and $|A \cap B| = |A| / 2$. Each curve in the graph depicts the absolute difference, averaged over one thousand experiments, between the set cardinality estimate produced by the protocol and the true value of $|A \cap B|$. For each value of $|A|$ we have a curve describing the performance of the binomial mechanism, depicted with a solid line, and a curve describing the performance of the discrete Gaussian mechanism, depicted with a dashed line. In all cases we see that the discrete Gaussian mechanism outperforms the binomial mechanism. This difference is fairly small, however, and both versions exhibit similar qualitative behavior.

Unfortunately, the output distribution of Bob's perturbed counts when he uses the discrete Gaussian mechanism is complicated. As such, the utility guarantees that we provide above no longer apply. Given the empirical evidence presented here, however, it might be reasonable to assume that these utility guarantees are conservative. So, if Alice and Bob are interested in maximizing utility subject to some preexisting privacy constraints, then they may be best served by using the version of the protocol in which Bob uses the discrete Gaussian mechanism to perturb his counts. 

\begin{figure}
\centering
\includegraphics[width=\columnwidth]{binomial_vs_discrete_gaussian.png}
\caption{Binomial vs Discrete Gaussian Noise.} \label{figure:dgm_vs_bm}
\end{figure}

% =========================================================================
% Future Work
% =========================================================================
\section{Future Work}\label{section:future_work}
The results described above suggest several natural avenues for future research. Most obvious among these is to explore alternative protocols that could provide better utility than the Split, Count, and Share protocol.  The na\"ive approach of combining a traditional SMC protocol to compute the exact set intersection cardinality and a simple differentially private release mechanism is one such alternative that is optimal with respect to the utility that it provides. Unfortunately, traditional PSI-CA protocols are expensive and thus impractical to use when Alice and Bob's sets are large. So, any alternative protocols must be efficient in terms of time and space complexity to accommodate large inputs.

Notice that the cosine of the angle between the characteristic vectors of the sets $A$ and $B$ is equal to the correlation between Alice and Bob's vectors of counts in the Split, Count, and Share protocol. Furthermore, in \citet{charikar2002}, the author describes how SimHash can be used to estimate the size of the intersection of two sets by estimating the angle between their characteristic vectors. So, SimHash could be used as an explicit basis for other differentially private set intersection cardinality estimation protocols. Perhaps other set similarity estimation algorithms, e.g. MinHash \citep{broder1997}, could be used in similar ways as well.

\appendix


% =========================================================================
% Differential Privacy
% =========================================================================

\section{Differential Privacy} \label{appendix:differential_privacy}
\begin{definition}[\citet{dwork2006calibrating}]
    A randomized algorithm $\mathcal{M}$ with domain $\mathbb{N}^{|\chi|}$ is $(\epsilon, \delta)$-differentially private if for all $S \subset \text{Range}(\mathcal{M})$  and for all $x, y \in \mathbb{N}^{|\chi|}$ with $\lVert x - y \rVert_1 \leq 1$ we have
    \begin{equation*}
        \mathbf{P}\{\mathcal{M}(x) \in S\} \leq e^{\epsilon} \ \mathbf{P}\{\mathcal{M}(y) \in S\} + \delta.
    \end{equation*}
    If $\delta = 0$ we say that $\mathcal{M}$ is $\epsilon$-differentially private.
\end{definition}

\begin{definition}[\citet{dwork2006calibrating}]
    Let $f: \mathbb{N}^{|\chi|} \rightarrow \mathbb{R}^d$ be an arbitrary $d$-dimensional function. The $\ell_p$ sensitivity of $f$ is $\Delta_p f = \max \{\lVert f(x) - f(y) \rVert_p \colon x, y \in \mathbb{N}^{|\chi|}, \lVert x - y\rVert_1 \leq 1\}$.
    % \max_{x \sim y} \lVert f(x) - f(y)\rVert_p$.
\end{definition}


\begin{definition}[\citet{agarwal2018}]\label{definition:binomial_mechanism}
    Suppose that we have $f: \mathbb{N}^{|\chi|} \rightarrow \mathbb{Z}^r$. The binomial mechanism, denoted $\mathcal{M}_B(x, f(\cdot); n)$, adds noise $N_i \sim \text{Binomial}(n, 1/2)$ to each of the $r$ components of the output of $f$. That is,
    \begin{equation*}
        \mathcal{M}_B(x, f(\cdot); n) = f(x) + (N_1, N_2, \ldots, N_r),
    \end{equation*}
    where $\{N_i\}_{i=1}^r$ are independent random variables.
\end{definition}

\begin{theorem}[\citet{agarwal2018}]\label{theorem: binomial mechanism}
    Let $f: \mathbb{N}^{|\chi|} \rightarrow \mathbb{Z}^r$ and $\delta > 0$. Let $\Delta_i = \Delta_i f$ for $i \in \{1,2,\infty\}$ and let
    \begin{align*}
        \phi &= \Delta_2 \cdot \sqrt{8 \log\left(\frac{1.25}{\delta}\right)} \\
        \psi_1 &= \Delta_1 \cdot \frac{4}{3(1-\delta/10)} \\
        \psi_2 &= \Delta_2 \cdot \frac{10\sqrt{\log\left(10/\delta\right)}}{(1-\delta/10)} \\
        \psi_{\infty} &= \Delta_\infty \cdot  \frac{8}{3} \left( \log\left(\frac{1.25}{\delta}\right) +  \log\left(\frac{20r}{\delta}\right)\log\left(\frac{10}{\delta}\right)\right).
    \end{align*}
    Finally, let $\psi = \psi_1 + \psi_2 + \psi_{\infty}$.
    
    If $n \geq \max(92\log(10r/\delta), 8\Delta_{\infty})$ then the binomial mechanism  $\mathcal{M}_B(x, f(\cdot); n)$ is $(\epsilon, \delta)$-differentially private for
    \begin{equation*}
        \epsilon \geq \frac{\phi}{\sqrt{n}} + \frac{\psi}{n}.
    \end{equation*}
\end{theorem}
\begin{proof}
    See Appendix B of \cite{agarwal2018}.
\end{proof}

\begin{corollary}[\citet{agarwal2018}]\label{corollary: binomial mechanism}
    Suppose that $f$, $\delta$, $\phi$, and $\psi$ are as in Theorem \ref{theorem: binomial mechanism} and $\epsilon > 0$. If
    \begin{equation*}
        n^{\prime} = \left(\frac{\phi + \sqrt{\phi^2 + 4\psi\epsilon}}{2\epsilon}\right)^2
    \end{equation*}
    then for every 
    \begin{equation}\label{equation:epsilon_vs_n_appendix}
        n \geq \max\left(n^{\prime}, \ 92\log\left(\frac{10r}{\delta}\right), \ 8\Delta_{\infty}(f) \right)
    \end{equation}
    the binomial mechanism $\mathcal{M}_B(x, f(\cdot); n)$ is $(\epsilon, \delta)$-differentially private.
\end{corollary}
\begin{proof}
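This follows from an application of the quadratic formula to determine the smallest value of $n$ for which the condition $\epsilon \geq \phi/\sqrt{n} + \psi/n$ of Theorem \ref{theorem: binomial mechanism} holds; the remaining terms in \eqref{equation:epsilon_vs_n_appendix} are the lower bounds on $n$ required by that theorem.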
\end{proof}


% =========================================================================
% Concentration Inequalities
% =========================================================================

\section{Concentration Inequalities} \label{appendix: concentration inequalities}

\begin{theorem}[Berry-Esseen Theorem]\label{theorem: berry esseen}
Let $\{X_i\}_{i=1}^r$ be a sequence of independent and identically distributed random variables with $\mathbf{E}[X_i] = 0$, $\text{Var}(X_i) = 1$, and $\mathbf{E}[|X_i|^3] = \beta_3 < \infty$.  Let $S_r = \sum_{i=1}^r X_i$ and let $F_r$ denote the cumulative distribution function of $S_r / \sqrt{r}$. That is, $F_r(x) = \mathbf{P}\{S_r \leq x\sqrt{r}\}$. Let $\Phi$ denote the cumulative distribution function of a standard normal random variable.  That is, $\Phi(x) = \mathbf{P}\{\mathcal{N}(0,1) \leq x\}$. Then there exists a finite positive absolute constant $C_0$ such that
\begin{equation*}
    \sup_{x \in \mathbb{R}} \left|F_r(x) - \Phi(x)\right| \leq \frac{C_0\beta_3}{\sqrt{r}}.
\end{equation*}
\end{theorem}
\begin{proof}
    See \cite{berry1941}, \cite{shevtsova2011absolute}.
\end{proof}


\begin{theorem}\label{theorem: binomial correlation concentration inequality}
    Suppose that $\{(X_i, Y_i)\}_{i=1}^n$ are independent random vectors with $X_i \sim \text{Binomial}(n_X, 1/2)$ and ${Y_i \sim \text{Binomial}(n_Y, 1/2)}$ for all $1 \leq i \leq n$. If we let $\tilde{X}_i = (X_i - \mu_X)/\sigma_X$ and $\tilde{Y}_i = (Y_i - \mu_Y)/\sigma_Y$ then 
    \begin{equation*}
        \mathbf{P}\left\{\frac{1}{n}\left| \sum_{i=1}^n \left(\tilde{X}_i \tilde{Y}_i - \mathbf{E}[\tilde{X}_i \tilde{Y}_i]\right) \right| \geq t \right\} \leq 2\exp\left(\frac{-nt^2}{6 + 4t}\right).
    \end{equation*}
\end{theorem}
\begin{proof}   
    Bernstein's Inequality %(Theorem \ref{theorem: bernstein inequality})
    implies that it suffices to show that for all $p \geq 2$ we have
    \begin{equation*}
        \mathbf{E}\Big[\big\lvert \tilde{X}_i \tilde{Y}_i\big\rvert^{p}\Big] \leq \frac{3}{2}p!2^{p-2}.
    \end{equation*}
    To that end, observe that for all $1 \leq i \leq n$, if $p \geq 2$ we have
    \begin{align}
        \mathbf{E}\Big[\big\lvert \tilde{X}_i \tilde{Y}_i\big\rvert^{p}\Big] &\leq \sqrt{\mathbf{E}\left[\tilde{X}_i^{2p}\right]}\sqrt{\mathbf{E}\left[\tilde{Y}_i^{2p}\right]} \label{equation:cauchy_schwartz} \\
        &\leq \frac{2^p}{\sqrt{\pi}}\Gamma\left(p + 1/2\right) \label{equation:khintchine} \\
        &= \frac{2^p}{\sqrt{\pi}}\left(\frac{\sqrt{\pi}\Gamma(2p+1)}{2^{2p}\Gamma(p+1)}\right) \label{equation:legendre} \\
        &= \frac{(2p)!}{2^p p!} \nonumber,
    \end{align}
    where \eqref{equation:cauchy_schwartz} is an application of the Cauchy--Schwarz inequality, \eqref{equation:khintchine} is an application of Khintchine's inequality, %(Theorem \ref{theorem: khintchine inequality}),
    and \eqref{equation:legendre} is an application of the Legendre duplication formula.
    
    It remains to show that for all $p \geq 2$ we have
    \begin{equation*}%\label{equation:induction_inequality}
        \frac{(2p)!}{2^p p!} \leq \frac{3}{2}p! 2^{p-2}.
    \end{equation*}
    Let $f(p) = \frac{(2p)!}{2^pp!}$ and $g(p) = \frac{3}{2}p!2^{p-2}$ and observe that we have $f(p+1) = (2p+1)f(p) < (2p+2)f(p)$ and $g(p+1) = (2p+2)g(p)$. Notice that $f(2) = g(2) = 3$. Therefore, if $f(q) \leq g(q)$ for all $q \in \{2, 3, \ldots, p\}$, then $f(p+1) \leq g(p+1)$ and the result follows by induction.  
\end{proof}

\bibliography{purcell_82}
\end{document}
