%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\usepackage{xcolor}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage[edges]{forest}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amsthm}

% Starred \DeclareMathOperator: limits are set below the operator in
% display math.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
% \blfootnote{<text>}: footnote without a visible mark.
% \thefootnote is emptied locally (inside \begingroup...\endgroup) and the
% footnote counter is stepped back by one afterwards, so the numbering of
% regular footnotes is not affected.
\newcommand\blfootnote[1]{%
	\begingroup
	\renewcommand\thefootnote{}\footnote{#1}%
	\addtocounter{footnote}{-1}%
	\endgroup
}

% Short-hands used in math mode.
% NOTE(review): \R, \p and \q are defined again further down in this
% preamble with \global\long\def, which overwrites these definitions without
% warning; in particular \R ends up as calligraphic R, not \mathbb{R}.
\newcommand{\expm}{\exp_\mu}
\newcommand{\R}{\mathbb{R}}
\newcommand{\p}{\mathbf{p}}
\newcommand{\q}{\mathbf{q}}

% proofof{<what>}: unnumbered proof block headed "Proof of <what>:" and
% closed by the \qed symbol followed by a paragraph break.
 \newenvironment{proofof}[1]{\noindent{\bf Proof of #1:}}{$\qed$\par}

\usepackage[T1]{fontenc}

\usepackage{comment}
%\input{preamble}
%%%%%%%

% Theorem-like environments.
% thm, claim, prop, lem, cor, defn and fact share the counter of thm;
% assump, rem, obs and question each have their own counter.
% The printed names come from the \...name macros supplied via
% \providecommand further down. All \theoremstyle lines are commented out,
% so no style is selected explicitly here.
%\theoremstyle{plain}
\newtheorem{thm}{\protect\theoremname}
%\theoremstyle{plain}
\newtheorem{claim}[thm]{\protect\claimname}
%\theoremstyle{plain}
\newtheorem{prop}[thm]{\protect\propositionname}
%\theoremstyle{plain}
\newtheorem{lem}[thm]{\protect\lemmaname}
%\theoremstyle{plain}
\newtheorem{cor}[thm]{\protect\corollaryname}
%\theoremstyle{definition}
\newtheorem{defn}[thm]{\protect\definitionname}
%\theoremstyle{definition}
\newtheorem{assump}{\protect\assumptionname}
%\theoremstyle{definition}
\newtheorem{rem}{\protect\remarkname}
%\theoremstyle{plain}
\newtheorem{fact}[thm]{\protect\factname}
%\theoremstyle{definition}
\newtheorem{obs}{\protect\obsname}
%\theoremstyle{definition}
\newtheorem{question}{\protect\questionname}

\makeatother

%\usepackage{babel}

% Printed names of the theorem-like environments declared above.
% \providecommand only defines a name that is not already defined, so an
% existing definition (e.g. from a language package) takes precedence.
\providecommand{\claimname}{Claim}
\providecommand{\lemmaname}{Lemma}
\providecommand{\propositionname}{Proposition}
\providecommand{\theoremname}{Theorem}
\providecommand{\corollaryname}{Corollary}
\providecommand{\definitionname}{Definition}
\providecommand{\assumptionname}{Assumption}
\providecommand{\remarkname}{Remark}
\providecommand{\factname}{Fact}
\providecommand{\obsname}{Observation}
\providecommand{\questionname}{Question}

% Number sets in blackboard bold.
\global\long\def\RR{\mathbb{R}}
\global\long\def\CC{\mathbb{C}}
\global\long\def\ZZ{\mathbb{Z}}

% Calligraphic letters. \mathcal{...} replaces the obsolete {\cal ...} font
% switch; the outer group is kept so each macro still expands to one atom.
% NOTE(review): \R is already defined with \newcommand earlier in this
% preamble (as \mathbb{R}); the \def below overwrites it without warning.
% NOTE(review): \H is also the name of LaTeX's long-Hungarian-umlaut text
% accent; after this line \H{o} in running text or in the bibliography no
% longer produces the accent. Confirm that this accent is not needed.
\global\long\def\F{{\mathcal{F}}}
\global\long\def\R{{\mathcal{R}}}
\global\long\def\H{{\mathcal{H}}}
\global\long\def\X{{\mathcal{X}}}
\global\long\def\Y{{\mathcal{Y}}}
% Bold lower-case vector shorthands (\x -> bold x, etc.).
% The one-argument variants \et, \xt, \yt, \rt typeset "<vector>(<arg>)", and
% \ef, \xf, \yf, \rf are the same with \cdot as the argument.
% \w is defined twice below with the same body; \q and \p repeat the
% \newcommand definitions made earlier in this preamble.
% NOTE(review): \d, \b, \u, \t and \c (and \a) are also names of LaTeX
% text-accent/kernel commands. These global \def's overwrite them, so e.g.
% \c{c} or \u{g} in running text or in bibliography entries will no longer
% produce accents. The \v line is already commented out, presumably for this
% reason (the author block uses \v{c}). Confirm none of the overwritten
% accents are needed.
\global\long\def\e{{\mathbf{e}}}
\global\long\def\et#1{{\e(#1)}}
\global\long\def\ef{{\mathbf{\et{\cdot}}}}
\global\long\def\x{{\mathbf{x}}}
\global\long\def\w{{\mathbf{w}}}
\global\long\def\q{{\mathbf{q}}}
\global\long\def\xt#1{{\x(#1)}}
\global\long\def\xf{{\mathbf{\xt{\cdot}}}}
\global\long\def\d{{\mathbf{d}}}
\global\long\def\a{{\mathbf{a}}}
\global\long\def\b{{\mathbf{b}}}
\global\long\def\u{{\mathbf{u}}}
\global\long\def\t{{\mathbf{t}}}
\global\long\def\y{{\mathbf{y}}}
\global\long\def\w{{\mathbf{w}}}
\global\long\def\yt#1{{\y(#1)}}
\global\long\def\yf{{\mathbf{\yt{\cdot}}}}
\global\long\def\z{{\mathbf{z}}}
%\global\long\def\v{{\mathbf{v}}}
\global\long\def\h{{\mathbf{h}}}
\global\long\def\s{{\mathbf{s}}}
\global\long\def\c{{\mathbf{c}}}
\global\long\def\p{{\mathbf{p}}}
\global\long\def\f{{\mathbf{f}}}
\global\long\def\rb{{\mathbf{r}}}
\global\long\def\rt#1{{\rb(#1)}}
\global\long\def\rf{{\mathbf{\rt{\cdot}}}}
% Matrix shorthands: \mat{X} typesets X via \bm{\mathrm{X}} inside
% \ensuremath, so it can be used in text or math mode. \matN ... \matM spell
% the same construction out; \matR ... \matH go through \mat.
% NOTE(review): \bm is provided by the bm package, which is not loaded in
% the visible part of this preamble. Confirm that the class (or a later
% \usepackage) provides it, otherwise the first use of a \mat... macro fails.
\global\long\def\mat#1{{\ensuremath{\bm{\mathrm{#1}}}}}
\global\long\def\matN{\ensuremath{{\bm{\mathrm{N}}}}}
\global\long\def\matX{\ensuremath{{\bm{\mathrm{X}}}}}
\global\long\def\matA{\ensuremath{{\bm{\mathrm{A}}}}}
\global\long\def\matB{\ensuremath{{\bm{\mathrm{B}}}}}
\global\long\def\matC{\ensuremath{{\bm{\mathrm{C}}}}}
\global\long\def\matD{\ensuremath{{\bm{\mathrm{D}}}}}
\global\long\def\matP{\ensuremath{{\bm{\mathrm{P}}}}}
\global\long\def\matU{\ensuremath{{\bm{\mathrm{U}}}}}
\global\long\def\matM{\ensuremath{{\bm{\mathrm{M}}}}}
\global\long\def\matR{\mat R}
\global\long\def\matS{\mat S}
\global\long\def\matSigma{\mat \Sigma}
\global\long\def\matPhi{\mat \Phi}
\global\long\def\matY{\mat Y}
\global\long\def\matI{\mat I}
\global\long\def\matJ{\mat J}
\global\long\def\matZ{\mat Z}
\global\long\def\matV{\mat V}
\global\long\def\matL{\mat L}
\global\long\def\matQ{\mat Q}
\global\long\def\matK{\mat K}
\global\long\def\matH{\mat H}
% Norm helpers. Naming scheme: a trailing "S" adds a square, a leading "L"
% uses \left...\right delimiters, and the middle part selects the subscript
% (T: 2, Inf: \infty, F: F, U/A/B: the matrix \matU/\matA/\mat B, UI/AI: its
% inverse). \XNorm and \XNormS take the subscript as a second argument.
\global\long\def\TNormS#1{\|#1\|_{2}^{2}}
\global\long\def\LTNormS#1{\left\|#1\right\|_{2}^{2}}
\global\long\def\TNorm#1{\|#1\|_{2}}
\global\long\def\LTNorm#1{\left\|#1\right\|_{2}}
\global\long\def\InfNorm#1{\|#1\|_{\infty}}
\global\long\def\FNorm#1{\|#1\|_{F}}
\global\long\def\FNormS#1{\|#1\|^{2}_{F}}
\global\long\def\UNorm#1{\|#1\|_{\matU}}
\global\long\def\UNormS#1{\|#1\|_{\matU}^{2}}
\global\long\def\UINormS#1{\|#1\|_{\matU^{-1}}^{2}}
\global\long\def\ANorm#1{\|#1\|_{\matA}}
\global\long\def\XNorm#1#2{\|#1\|_{#2}}
\global\long\def\XNormS#1#2{\|#1\|^2_{#2}}
\global\long\def\BNorm#1{\|#1\|_{\mat B}}
\global\long\def\ANormS#1{\|#1\|_{\matA}^{2}}
\global\long\def\AINormS#1{\|#1\|_{\matA^{-1}}^{2}}
% Small-caps "T" and "+": presumably transpose and pseudo-inverse
% superscripts -- confirm at the use sites.
\global\long\def\T{\textsc{T}}
\global\long\def\pinv{\textsc{+}}
% Expectation, with an optional subscript given as first argument of \ExpectC.
\global\long\def\Expect#1{{\mathbb{E}}\left[#1\right]}
\global\long\def\ExpectC#1#2{{\mathbb{E}}_{#1}\left[#2\right]}
% Inner product <#1,#2> with subscript #3, and a squared round-bracket variant.
\global\long\def\dotprod#1#2#3{\langle#1,#2\rangle_{#3}}
\global\long\def\dotprodsqr#1#2#3{(#1,#2)_{#3}^{2}}
% Named operators applied to one argument. All use auto-sized \left( \right)
% except \rank, which uses plain parentheses; \range and \spn set the name
% with the old-style {\bf ...} switch, the others with \mathrm.
\global\long\def\Trace#1{\mathrm{Tr}\left(#1\right)}
\global\long\def\range#1{{\bf Range}\left(#1\right)}
\global\long\def\spn#1{{\bf Span}\left(#1\right)}
\global\long\def\rank#1{\mathrm{rank}(#1)}
\global\long\def\diag#1{{\mathrm{diag}}\left(#1\right)}
\global\long\def\sinc#1{\mathrm{sinc}\left(#1\right)}
\global\long\def\nnz#1{\mathrm{nnz}\left(#1\right)}
\global\long\def\vol#1{\mathrm{vol}\left(#1\right)}
\global\long\def\erfc#1{\mathrm{erfc}\left(#1\right)}
\global\long\def\sign#1{\mathrm{sign}\left(#1\right)}
\global\long\def\poly#1{\mathrm{poly}\left(#1\right)}
% Argument-less symbols: upright "rect", \equiv as the definition sign,
% "*" as the conjugation mark, and \wh as an alias of \widehat.
\newcommand*{\rect}{\mathrm{rect}}
\global\def\eqdef{\equiv}
\global\long\def\conj{*}
\newcommand{\wh}{\widehat}

% \norm{x}: plain double-bar norm without subscript.
% \Fc: calligraphic F (the macro \F defined earlier in this preamble is
% calligraphic F as well).
\newcommand{\norm}[1]{\|#1\|}
\newcommand{\Fc}{\mathcal{F}}

%\newcommand{\Chris}{\bf \color{black} Chris: }

% Generic bold wrappers: \bs uses \boldsymbol, \bv uses \mathbf.
\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\bv}[1]{\mathbf{#1}}

% Bold Greek vectors built on \bs.
\global\long\def\valpha{{\bs{\alpha}}}
\global\long\def\veta{{\bs{\eta}}}
\global\long\def\vxi{{\bs{\xi}}}

% \rad expands to the letter R.
\global\long\def\rad{R}



%%%%%%%


% \wt: alias of \widetilde (used for the noisy counts in the proofs below).
\newcommand{\wt}{\widetilde}
%\include{LSH}
% \muu{x}: \exp_\mu applied to x with auto-sized parentheses.
\newcommand{\muu}[1]{\exp_{\mu}\left({#1}\right)}
% \KD: small-caps "KD"; in this file it is used with a dataset subscript,
% e.g. \KD_{P}(x).
\newcommand{\KD}{\textsc{KD}}


%----Helper code for dealing with external references----
% (by cyberSingularity at http://tex.stackexchange.com/a/69832/226)

\usepackage{xr}
%\makeatletter

%\newcommand*{\addFileDependency}[1]{% argument=file name and extension
%\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or 
% .pdf file etc and it exists! If it doesn't exist, it will appear 
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles 
% --- although not really necessary and latexmk doesn't use this
%
%\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
%\IfFileExists{#1}{}{\typeout{No file #1.}}
%}\makeatother

%\newcommand*{\myexternaldocument}[1]{%
%\externaldocument{#1}%
%\addFileDependency{#1.tex}%
%\addFileDependency{#1.aux}%
%}
%------------End of helper code--------------

% put all the external documents here!
%\myexternaldocument{kreacic_397}

\externaldocument{kreacic_397}


\newcommand{\swap}[3][-]{#3#1#2} % just an example
\title{Differentially Private Synthetic Data Using KD-Trees \\(Supplementary Material)}



% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<eleonora.kreacic@jpmchase.com>?Subject=Your UAI 2023 paper}{Eleonora~Krea\v{c}i\'{c}}}
\author[1]{Navid~Nouri}
\author[1]{Vamsi~K.~Potluru}
\author[1]{Tucker~Balch}
\author[1]{Manuela~Veloso}
% Add affiliations after the authors
\affil[1]{
    JP Morgan AI Research
}



  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
%This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 
%Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 
%\appendix\label{app:supplementary}
%\section{Supplementary material}\label{app:supplementary}

\appendix
\section{Some auxiliary results}

\begin{rem}\label{fact:tailbound4laplace}
     For $X\sim \textsc{Lap}(2/\epsilon)$, we have
     \begin{align*}
         \Pr[X \ge \alpha] = \frac{1}{2}e^{-\frac{\alpha}{2/\epsilon}}.
     \end{align*}
\end{rem}

\begin{rem} \label{rem laplace tail log}
    As a consequence of Remark \ref{fact:tailbound4laplace}, for $X\sim \textsc{Lap}(2/\epsilon)$, we have
     \begin{align*}
         \Pr\left[|X| \ge \frac{4C\log n}{\epsilon}\right] = n^{-2C}.
     \end{align*}
\end{rem}


\begin{rem}
     Let $X\sim \textsc{Lap}(2/\epsilon)$, then we have
     \begin{align*}
         \mathbf{E}[|X|] = \frac{2}{\epsilon}.
     \end{align*}
\end{rem}


\begin{rem}[\cite{Laurent2000AdaptiveEO}]
     Let $Y:=\sum_{k=1}^{d} Z_k^2$, where $Z_k\sim \mathcal{N}(0,1)$ are i.i.d. random variables. Then
     \begin{align*}
         \Pr\left[Y \ge d+2\sqrt{dx}+2x \right] \le e^{-x}, \forall x>0
     \end{align*}
\end{rem}
\begin{cor}
Let $Y:=\sum_{k=1}^{d} Z_k^2$, where $Z_k\sim \mathcal{N}(0,\sigma^2)$ are i.i.d. random variables. Then
     \begin{align*}
         \Pr\left[Y/\sigma^2 \ge  2d+3x \right] \le \Pr\left[Y/\sigma^2 \ge  d+2\sqrt{dx}+2x \right] \le e^{-x}, \forall x>0
     \end{align*}
\end{cor}
\begin{cor}\label{cor:tailboundschisquared}
For any $X\sim \mathcal{N}(\mathbf{c},\sigma^2I)$, we have
\begin{align*}
    \Pr\left[||X-\mathbf{c}||_2 \ge \sigma \sqrt{2d+3x}\right]\le e^{-x} 
\end{align*}
\end{cor}


\begin{lem}[Chernoff bounds]\label{lem:chernoff}
Let $X_1,X_2,\ldots,X_n$ be independent binary random variables. Define $Y:=\sum_{i=1}^{n}X_i$ and $\mu := \mathbb{E}[Y]$. Then, for any %$\Delta>0$ and 
$\delta>0$, we have
\begin{align*}
    \Pr[|Y-\mu|>\delta\mu] \le 2 \exp(-\delta^2\mu/4).
\end{align*}
\end{lem}

\section{Data independent approach: $t=0$ case}\label{Appendix: data indep no filter}


\begin{lem}[Perturbation bound for Gaussian kernel] \label{lem:rounding}
For Gaussian kernel given by $K(\mathbf{x},\mathbf{y})=e^{-\frac{||\mathbf{x}-\mathbf{y}||_2^2}{2}}$, $\mathbf{x},\mathbf{y}\in \mathbb{R}^d$, if $\mathbf{x}\in \mathbb{R}^d$ and $\mathbf{x'}\in \mathbb{R}^d$ are such that $||\mathbf{x}-\mathbf{x'}||_2\le \alpha$, then:
\begin{align*}
    \max_{\mathbf{y}\in \mathbb{R}^d} |K(\mathbf{x},\mathbf{y})-K(\mathbf{x'},\mathbf{y})| \le \min(1,\frac{\alpha}{\sqrt{e}})
\end{align*}
\end{lem}
\begin{proof}
First, by the triangle inequality and the assumption that $||\mathbf{x}-\mathbf{x'}||_2\le \alpha$, we have
\begin{align}\label{eq:triangleineq1}
    ||\mathbf{x}-\mathbf{y}||_2 - ||\mathbf{x'}-\mathbf{y}||_2 \le ||\mathbf{x}-\mathbf{x'}||_2 \le \alpha. 
\end{align}
For $f(x) = e^{-\frac{x^2}{2}}$, we have
\begin{align}\label{eq:maxderivative}
    \max_{x\in \mathbb{R}} |f'(x)| = \frac{1}{\sqrt{e}},
\end{align}
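and thus, by the mean value theorem applied to $f$ together with \eqref{eq:triangleineq1} (which also holds with the roles of $\mathbf{x}$ and $\mathbf{x'}$ exchanged),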
\begin{align*}
    \max_{\mathbf{y}\in \mathbb{R}^d}|K(\mathbf{x},\mathbf{y})-K(\mathbf{x'},\mathbf{y})|\le \frac{\alpha}{\sqrt{e}}.
\end{align*}
It remains to note that $0\leq K(\mathbf{x},\mathbf{y})\leq 1$ and thus also $\max_{\mathbf{y}\in \mathbb{R}^d}|K(\mathbf{x},\mathbf{y})-K(\mathbf{x'},\mathbf{y})|\le 1$.
\end{proof}

\begin{lem}[Error analysis of rounding to centers]\label{claim:rounding-centers}
Let $P\subset \mathbb{R}^d$ be the input dataset and let $P':=\{(c_1,v_1),\ldots,(c_J,v_J)\}$, where vector of $J$ point counts $\mathbf{v} \in \mathbb{R}^J$, and centers of $J$ bins $c_{1},\dots, c_{J}$ are defined in lines~\ref{line:v} and \ref{line:c} of Algorithm~\ref{alg:rounding-new}, respectively. Then for the KDE metric between $P'$ and $P$ we have
\begin{align}\label{eq:tripart1}
    \sup_{x\in \mathbb{R}^d}|\KD_{P'}(x) -\KD_{P}(x)|\le \min\left(\frac{w\sqrt{d}}{2\sqrt{e}},1\right).
\end{align}
\end{lem}
\begin{proof}%{Lemma~\ref{claim:rounding-centers}}
For any point $x\in \mathbb{R}^d$ that belongs to a bin with a center $c\in \mathbb{R}^d$ we have
\begin{align*}
    ||x-c||_2\le \frac{w\sqrt{d}}{2}.
\end{align*}
Plugging the above bound in Lemma~\ref{lem:rounding} completes the proof.
\end{proof}

\begin{lem}\label{cor:tailboundslaplace}
If total number of bins $J$ in Algorithm \ref{alg:rounding-new} is such that $J\leq n^{C}$, then for each bin $i\in[J]$, the noise term $\mathbf{w}_i=\mathbf{\wt{v}}_i-\mathbf{v}_i$ added in step \ref{line:laplace-noise} is such that
\begin{align*}
         |\mathbf{w}_i| \le \frac{4C\log n}{\epsilon},
     \end{align*}
     with probability at least $1-\frac{1}{n^C}$.
\end{lem}
\begin{proof}
    This is a consequence of Remark \ref{rem laplace tail log} and union bound argument.
\end{proof}
\begin{lem}[Bound on the total noisy count]\label{cor:v-tilde}
For $J$-dimensional vector of noisy point counts $\mathbf{\wt{v}}$ defined in line \ref{line:laplace-noise} of Algorithm \ref{alg:rounding-new}, with probability at least $1-\frac{1}{n^C}$ we have
\begin{align*}
    n - \frac{4CJ\log n}{\epsilon} \le |\mathbf{\wt{v}}|\le n+\frac{4CJ\log n}{\epsilon}.
\end{align*}
\end{lem}

\begin{proof}
    This is a consequence of Lemma \ref{cor:tailboundslaplace} and the fact that initial size of dataset is $n$.
\end{proof}

\begin{lem}[Error analysis of noise addition]\label{lem:data_ind_no_filter_Laplace_contribution}
Let $P':=\{(c_1,v_1),\ldots,(c_J,v_J)\}$, where vector of $J$ point counts $\mathbf{v} \in \mathbb{R}^J$, and centers of $J$ bins $c_{1},\dots, c_{J}$ are defined in lines~\ref{line:v} and \ref{line:c} of Algorithm~\ref{alg:rounding-new}, respectively. Let $Q:=\left\{(c_1,\wt{\mathbf{v}}_1),(c_2,\wt{\mathbf{v}}_2),\ldots,(c_J,\wt{\mathbf{v}}_J)\right\}$ be the noisy output of the algorithm. Then with probability at least $1-\frac{1}{n^{C}}$ we have
\begin{align}
     \sup_{x\in\mathbb{R}^d}|\KD_Q(x)-\KD_{P'}(x)| \le \frac{8CJ\log n}{\epsilon n - 4CJ\log n}.%\label{eq:Q2Pprime}
\end{align}
\end{lem}

\begin{proof}
For any $x\in \mathbb{R}^d$ we have
\begin{align*}
    |\KD_Q(x)-\KD_{P'}(x)| &= \left|\frac{1}{|\wt{\mathbf{v}}|}\sum_{i=1}^{J}\wt{\mathbf{v}}_i K(c_i,x) - \frac{1}{|{\mathbf{v}}|}\sum_{i=1}^{J}{\mathbf{v}}_i K(c_i,x)\right| \\
    % &= \frac{1}{|{\mathbf{{v}}}|}\left|\sum_{i=1}^{J}\frac{|{\mathbf{{v}}}|}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i K(c_i,q) - \sum_{i=1}^{J} {\mathbf{v}}_i K(c_i,q)\right|\\
    &=\frac{1}{n}\left|\sum_{i=1}^{J}K(c_i,x)\left(\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i\right)  \right| %&&\text{since $|\mathbf{v}|=n$}
    \\
     &\le \frac{1}{n}\sum_{i=1}^{J}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i \right| %&&\text{since $K(c_i,x) \le 1$},\\
    \end{align*}
where the second equality is the consequence of the fact that the point count in the original dataset is $n$, i.e. $|\mathbf{v}|=n$, and the inequality follows from $K(c_i,x) \le 1$. Let $\mathbf{w}_i$ denote the noise added to the $i$th bin's point count in step \ref{line:laplace-noise} of Algorithm \ref{alg:rounding-new}, so that $\mathbf{w}_i=\mathbf{\wt{v}}_i-\mathbf{v}_i$. Then we have
    \begin{align}
     |\KD_Q(x)-\KD_{P'}(x)| 
    %  &\le \frac{1}{n}\sum_{i=1}^{J}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\left(\mathbf{v}_i+\mathbf{w}_i\right)  - {\mathbf{v}}_i \right|\\
     &\le \frac{1}{n}\sum_{i=1}^{J}\left|\mathbf{v}_i\left(\frac{n}{|{\mathbf{\wt{v}}}|}-1\right)  + \frac{n}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right|\nonumber\\
     &\le \frac{1}{n}\sum_{i=1}^{J}\left|\mathbf{v}_i\left(\frac{n}{|{\mathbf{\wt{v}}}|}-1\right) \right| +\frac{1}{n}\sum_{i=1}^{J}\left| \frac{n}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right|\\%\text{by triangle inequality}\nonumber\\
     &\le \left|\frac{n}{|{\mathbf{\wt{v}}}|}-1 \right| +\sum_{i=1}^{J}\left| \frac{1}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right| &&%\text{since $|\mathbf{v}|=n$},\nonumber
     \end{align}
     where the second inequality is the triangle inequality and the last one follows as $|\mathbf{v}|=n$.
     Since $\mathbf{w}_i\sim\textsc{Lap}(2/\epsilon)$, by Lemma~\ref{cor:v-tilde} we have
     \begin{align}
     |\KD_Q(x)-\KD_{P'}(x)| &\le \left|\frac{n}{n - \frac{4CJ\log n}{\epsilon}}-1 \right| +\frac{1}{n - \frac{4CJ\log n}{\epsilon}}\sum_{i=1}^{J}\left| {\mathbf{w}}_i \right| \nonumber\\
     &\le \frac{n}{n - \frac{4CJ\log n}{\epsilon}}-1  +\frac{1}{n - \frac{4CJ\log n}{\epsilon}}\frac{4CJ\log n}{\epsilon}\\%&&\text{by Corollary~\ref{cor:tailboundslaplace}}\nonumber\\
    %  &\le \frac{4CJ\log n}{\epsilon n - 4CJ\log n}  +\frac{4CJ\log n}{\epsilon n - 4CJ\log n}\\
    &= \frac{8CJ\log n}{\epsilon n - 4CJ\log n}.\label{eq:Q2Pprime}
\end{align}
with probability $1-\frac{1}{n^{C}}$. The second inequality is the consequence of Lemma \ref{cor:tailboundslaplace}.
\end{proof}

\begin{proofof}{Theorem \ref{thm:general-1}}
    This is a consequence of Lemma \ref{claim:rounding-centers} and Lemma \ref{lem:data_ind_no_filter_Laplace_contribution} for $C$ such that $\delta = \frac{1}{n^{C}}$. Triangle inequality completes the proof.
\end{proofof}

\section{Data independent approach: $t>0$ case}\label{Appendix: data indep with filtering}


\begin{lem}[Algorithm~\ref{alg:rounding-new} filters out all $t/2$-light bins]\label{claim:lightbinssurviving}
If for $t=\frac{8C\log n}{\epsilon}$ and the total number of bins $J$ we have $J\leq n^{C}$ for some constant $C$, then with probability at least $1-\frac{1}{2}n^{-C}$ all $t/2$-light bins will be filtered out by step \ref{line:filtering} of Algorithm \ref{alg:rounding-new}.
\end{lem}

\begin{proof}
For $t=\frac{8C\log n}{\epsilon}$, since we are adding $\text{Lap}(2/\epsilon)$ noise the probability of a bin with point count less than $t/2$ having noisy point count more than $t$ is upper bounded by $\frac{1}{2} n^{-2C}$ (see Remark \ref{rem laplace tail log}).

Union bound over $J\leq n^C$ bins completes the proof.
\end{proof}
\begin{lem}[Algorithm~\ref{alg:rounding-new} does not filter any $3t/2$-heavy bins]\label{claim:heavybinsnotsurvive} 
If for $t=\frac{8C\log n}{\epsilon}$ and the total number of bins $J$ we have $J\leq n^{C}$ for some constant $C$, then with probability at least $1-\frac{1}{2}n^{-C}$ no $3t/2$-heavy bin gets filtered out by step \ref{line:filtering} of Algorithm \ref{alg:rounding-new}.
\end{lem}

\begin{proof}
For $t=\frac{8C\log n}{\epsilon}$, since we are adding $\text{Lap}(2/\epsilon)$ noise the probability of a bin with point count at least $3t/2$ having noisy point count less than $t$ is upper bounded by $\frac{1}{2} n^{-2C}$ (see Remark \ref{rem laplace tail log}). Union bound argument over $J<n^{C}$ bins completes the proof.
\end{proof}


\begin{lem}[Noisy point counts]\label{claim:v-tilde-alg-filtering}
If for $t=\frac{8C\log n}{\epsilon}$ and the total number of bins $J$ we have $J\leq n^{C}$ for some constant $C$, then for $J$-dimensional vector of noisy point counts $\mathbf{\wt{v}}$ defined in line \ref{line:laplace-noise} of 
Algorithm~\ref{alg:rounding-new}, with probability at least $1-\frac{1}{n^C}$ we have
\begin{align*}
    n -m- \frac{4CM\log n}{\epsilon} \le |\mathbf{\wt{v}}|\le n+\frac{4CM\log n}{\epsilon},
\end{align*}
where $M$ and $m$ denote the total number of $t/2$-heavy bins and the total number of points in $3t/2$-light bins, respectively.
\end{lem}

\begin{proof}
Let $F:=\{i: \mathbf{v}_i>0, \mathbf{\wt{v}}_i = 0\}$, $Z:=\{i: \mathbf{v}_i=0, \mathbf{\wt{v}}_i = 0\}$ and $H:=\{i: \mathbf{\wt{v}}_i>0\}$ denote the set of non empty bins that are filtered out, the set of empty bins that are filtered out and the set of bins that survive filtering, respectively. Note that every bin belongs to one of the three sets i.e. $[J] = F \cup Z\cup H$. We have
\begin{align*}
    |\mathbf{\wt{v}}| &= \sum_{i = 1}^{J} \mathbf{\wt{v}}_i\\
    &= \sum_{i\in F}\mathbf{\wt{v}}_i +\sum_{i\in Z}\mathbf{\wt{v}}_i +\sum_{i\in H}\mathbf{\wt{v}}_i \\
    &= \sum_{i\in H}\mathbf{\wt{v}}_i  \\
    &\le |\mathbf{v}| + \left|H\right|\cdot \frac{4C\log n}{\epsilon}\\
    &\le n + \frac{4CM\log n}{\epsilon},
\end{align*}
where the third equality follows by definition of $F$ and $Z$, and the first inequality is the consequence of Lemma \ref{cor:tailboundslaplace}. The last inequality follows from $|\mathbf{v}|=n$ and the consequence of Lemma \ref{claim:lightbinssurviving} which gives that with probability at least $1-\frac{1}{2}n^{-C}$ any bin that survives filtering is $t/2$-heavy i.e. $|H|\le M$.
On the other hand, we also have
\begin{align*}
    |\mathbf{\wt{v}}| &= \sum_{i = 1}^{J} \mathbf{\wt{v}}_i\\
    &= \sum_{i\in H}\mathbf{\wt{v}}_i \\
    &\ge \sum_{i\in H}\mathbf{v}_i - |H|\cdot\frac{4C\log n}{\epsilon}\\
    &\ge \sum_{i\in H}\mathbf{v}_i - \frac{4CM\log n}{\epsilon}\\
    &\ge n-m - \frac{4CM\log n}{\epsilon}
\end{align*}
where again second equality comes from the definition of $F$ and $Z$, and the first inequality is the consequence of Lemma \ref{cor:tailboundslaplace} and the second inequality follows from $|H|\le M$ as above. Finally, the last inequality is the consequence of Lemma \ref{claim:heavybinsnotsurvive} which gives us that with probability at least $1-\frac{1}{2}n^{-C}$ any bin that gets filtered out is $3t/2$-light and so the total number of filtered out points is upper bounded by $m$. This means that the total number of points in bins that survive filtering is at least $n-m$ i.e. $\sum_{i\in H} \mathbf{v}_i \ge |\mathbf{v}|- m \ge n - m$. Union bound argument completes the proof.
\end{proof}

Now, we analyze the error between the kernel densities induced by $Q$ and $P'$.

\begin{lem}\label{lem appendix Q P' t>0}
     Let $P':=\{(c_1,v_1),\ldots,(c_J,v_J)\}$, where $v$, $c$ are defined as in lines ~\ref{line:v} and \ref{line:c} of Algorithm~\ref{alg:rounding-new}. For $t=\frac{8C\log n}{\epsilon}$ and the total number of bins $J\leq n^{C}$, with probability $1-n^{-C}$ we have

     \begin{align}
     \sup_{x\in\mathbb{R}^{d}}|\KD_Q(x)-\KD_{P'}(x)| \leq \frac{\epsilon m + 8CM\log n}{\epsilon n - \epsilon m - 4CM\log n}+\frac{m}{n}.
\end{align}
\end{lem}

\begin{proof}
Let $H:=\{i: \mathbf{\wt{v}}_i>0\}$ denote the bins that survive filtering step. We have
\begin{align*}
    |\KD_Q(x)-\KD_{P'}(x)| &= \left|\frac{1}{|\wt{\mathbf{v}}|}\sum_{i=1}^{J}\wt{\mathbf{v}}_i K(c_i,x) - \frac{1}{|{\mathbf{v}}|}\sum_{i=1}^{J}{\mathbf{v}}_i K(c_i,x)\right| \\
    % &= \frac{1}{|{\mathbf{{v}}}|}\left|\sum_{i=1}^{J}\frac{|{\mathbf{{v}}}|}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i K(c_i,q) - \sum_{i=1}^{J} {\mathbf{v}}_i K(c_i,q)\right|\\
    &=\frac{1}{n}\left|\sum_{i=1}^{J}K(c_i,x)\left(\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i\right)  \right|\\
     &\le \frac{1}{n}\sum_{i=1}^{J}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i \right|\\
     &\le \frac{1}{n}\left(\sum_{i\in H}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i \right| + \sum_{i\not\in H} |\mathbf{v}_i|\right)\\
     &\le\frac{1}{n}\left(\sum_{i\in H}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i \right| + m\right)
    \end{align*}
where the second equality holds since $|\mathbf{v}|=n$, and the first inequality is the consequence of $K(c_i,x)\leq 1$. The second inequality follows by the definition of $H$, and the final one is the consequence of Lemma \ref{claim:heavybinsnotsurvive}, as with probability at least $1-\frac{1}{2}n^{-C}$ any bin that is filtered out must be $3t/2$-light. Let $\mathbf{w}_i=\mathbf{\wt{v}}_i-\mathbf{v}_i$, then $\mathbf{w}_i\sim\textsc{Lap}(2/\epsilon)$. Then we further have
    \begin{align}
     |\KD_Q(x)-\KD_{P'}(x)| 
     &\le\frac{1}{n}\left(\sum_{i\in H}\left|\frac{n}{|{\mathbf{\wt{v}}}|}\wt{\mathbf{v}}_i  - {\mathbf{v}}_i \right| + m\right)\\
     &\le \frac{1}{n}\left(\sum_{i\in H}\left|\mathbf{v}_i\left(\frac{n}{|{\mathbf{\wt{v}}}|}-1\right)  + \frac{n}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right|+m\right)\nonumber\\
     &\le \frac{1}{n}\left(\sum_{i\in  H}\left|\mathbf{v}_i\left(\frac{n}{|{\mathbf{\wt{v}}}|}-1\right) \right| +\sum_{i\in H}\left| \frac{n}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right|+m\right)\nonumber\\
     &\le \left|\frac{n}{|{\mathbf{\wt{v}}}|}-1 \right| +\sum_{i\in H}\left| \frac{1}{|{\mathbf{\wt{v}}}|}{\mathbf{w}}_i \right|+\frac{m}{n}\nonumber
     \end{align}
where the third inequality is the triangle inequality and the last one follows since $|\mathbf{v}|=n$. Let $n':=n -m- \frac{4CM\log n}{\epsilon}$. By Lemma \ref{claim:v-tilde-alg-filtering} with probability $1-n^{-C}$ we have
     \begin{align}
     |\KD_Q(x)-\KD_{P'}(x)| &\le \left|\frac{n}{n'}-1 \right| +\frac{1}{n'}\sum_{i\in H}\left| {\mathbf{w}}_i \right| + \frac{m}{n} \nonumber\\
     &\le \frac{n}{n'}-1  +\frac{4CM\log n}{n'\epsilon}+\frac{m}{n}\nonumber\\
    %  &\le \frac{4CJ\log n}{\epsilon n - 4CJ\log n}  +\frac{4CJ\log n}{\epsilon n - 4CJ\log n}\\
    &= \frac{\epsilon m + 8CM\log n}{\epsilon n - \epsilon m - 4CM\log n}+\frac{m}{n}.\label{eq:Q2Pprime-alg-filtering}
\end{align}
where the second inequality holds by Lemma \ref{cor:tailboundslaplace} together with $|H|\le M$.
\end{proof}


\begin{proofof}{Theorem \ref{thm:general-2}}

As a consequence of triangle inequality and Lemmas \ref{claim:rounding-centers} and \ref{lem appendix Q P' t>0}, for $C$ such that $\delta = n^{-C}$ we have
\begin{align*}
\sup_{x\in\mathbb{R}^{d}}|\KD_P(x) - \KD_Q(x)|&\le \frac{\epsilon m + 8CM\log n}{\epsilon n - \epsilon m - 4CM\log n} +\frac{m}{n}+ \frac{w\sqrt{d}}{2\sqrt{e}}\\
    &= \frac{\epsilon m + 8M\log \frac{1}{\delta}}{\epsilon n - \epsilon m - 4M\log \frac{1}{\delta}} +\frac{m}{n}+\frac{w\sqrt{d}}{2\sqrt{e}}.
\end{align*}
This completes the proof.
\end{proofof}

\section{Special case: Original dataset from mixture of Gaussians}\label{Appendix Gausssian}

\begin{lem}\label{lem aux furtherst heavy gaussian}
    If $r>0$, $C>0$ are such that %$\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r/2)^2}{2\sigma^2}}> \frac{2C\log n}{\epsilon}$
    $\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r/2)^2}{2\sigma^2}}> \frac{2C\log n}{\epsilon}$,
    then $r\leq 3\sigma\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)}$.
\end{lem}

\begin{proof}
    The condition of the lemma translates to

\begin{align*}
    r &\le 2\sqrt{2}\sigma\sqrt{\log\left(\frac{\epsilon n}{2C\log n} \cdot \left(\frac{w}{\sigma\sqrt{2\pi}}\right)^d \right)}\\
    &= 2\sqrt{2}\sigma \sqrt{\log n + \log \epsilon - \log\log n -\log 2C + d\log \left(\frac{w}{\sigma\sqrt{2\pi}}\right)} \\
    &\le 3\sigma\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)},
    % &\ge \sigma \sqrt{\frac{\log n}{2}}.
\end{align*}
where the last inequality uses $2\sqrt{2}\le 3$ and $\epsilon\le 2C\log n$.
\end{proof}

\begin{lem}\label{lem aux light gaussian}
  If $r>0$, $C>0$ are such that $\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{r^2}{\sigma^2}}< \frac{16C\log n}{\epsilon}$, then we have
  $r\geq \sigma\sqrt{\log (\epsilon n) - \log(16 C \log n)+ d\log\left(\frac{w}{\sigma \sqrt{2\pi}}\right)}$
\end{lem}

\begin{proof}
Condition on $r$ translates to
\begin{align*}
    r 
     &\ge \sqrt{-\sigma^2\log\left(\frac{16C\log n}{\epsilon n} \cdot \left(\frac{\sigma\sqrt{2\pi}}{w}\right)^d \right) }\\
    &\ge\sigma\sqrt{\log\left(\frac{\epsilon n}{16C\log n} \cdot \left(\frac{w}{\sigma\sqrt{2\pi}}\right)^d \right)}\\
    &= \sigma \sqrt{\log (\epsilon n) - \log(16C\log n) + d\log \left(\frac{w}{\sigma\sqrt{2\pi}}\right)}.
\end{align*}
\end{proof}



\begin{defn}
For a bin (hypercube) $B$ and a point $y\in \mathbb{R}^d$, we define their distance as the $\ell_2$ distance of the 
center of $B$ to $y$.
\end{defn}
\begin{lem}[Upper bound on $M$]     
     For a dataset coming from a multivariate Gaussian with variance $\sigma^2I$ in $\mathbb{R}^d$  there are at most $$\left(\frac{6\sigma}{w}\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)} + 2\right)^d$$
     $t/2$-heavy bins with arbitrarily high probability.
\end{lem}
\begin{proof}

Without loss of generality we assume that the mean of the distribution is the origin. Let $B$ be a bin at distance $r$ from the origin, and $x\in\mathbb{R}^d$ be a point inside $B$. For $X$ from multivariate Gaussian we have

\begin{align*}
         %X\sim N(0,\sigma^2I) \rightarrow 
f(X=x)\le\frac{1}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r - \frac{w\sqrt{d}}{2})^2}{2\sigma^2}}.
\end{align*}

Hence, the expected number of points within $B$ is upper bounded by $\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r - \frac{w\sqrt{d}}{2})^2}{2\sigma^2}}$. For $r$ such that $w\sqrt{d}\le r$, this is further upper bounded by $\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r/2)^2}{2\sigma^2}}$.

For simplicity of notation, let us introduce $\mu(r) = \frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r/2)^2}{2\sigma^2}}$. If we assume that $r$ is large enough so that %$\mu(r) \le \frac{2C\log n}{\epsilon}$, 
$\mu(r) \le {2C\log n}$,
Chernoff bounds (see Lemma~\ref{lem:chernoff}) give us
\begin{align*}
    \Pr\left[| B|\ge 
    4C\log n\right]&\le 2e^{-\mu}\\
    &\le \frac{1}{n^{2C}},
\end{align*}
where $|B|$ denotes the number of the points from the dataset within $B$. Thus if $r$ is such that $\mu(r)\le 2C\log n$, then bins at distance at least $r$ from the origin are $t/2$-light with probability at least $1-\frac{1}{n^{2C}}$. Equivalently this means that all $t/2$-heavy bins are at distance at most $r$ from the origin. 
 
By Lemma \ref{lem aux furtherst heavy gaussian} the furthest bin that can be $t/2$-heavy is at distance at most $$3\sigma\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)}$$
from the origin.
It remains to note that the area of diameter $\alpha$ is covered by at most $\alpha/w+2$ bins of width $w$ on a single axis. Thus, the number of $t/2$-heavy bins is bounded by
$$\left(\frac{6\sigma}{w}\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)} + 2\right)^d$$ 
with probability at least $1-\frac{1}{n^{2C}}$. 
\end{proof}

\begin{lem}[Upper bound on $m$]
For a dataset coming from a multivariate Gaussian with variance $\sigma^2I$ in $\mathbb{R}^d$, when binning is done with widths such that $8\le \log (\frac{w}{\sigma\sqrt{2\pi}})$, there are at most $n^{2/3+o(1)}$ points in
$3t/2$-light bins with arbitrarily high probability.
\end{lem}

\begin{proof}
Without loss of generality we assume that the mean of the distribution is the origin. Let $B$ be a bin at distance $r$ from the origin, and $x\in\mathbb{R}^d$ be a point inside $B$. For $X$ from multivariate Gaussian we have

\begin{align*}
         %X\sim N(0,\sigma^2I) \rightarrow 
         \frac{1}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r + \frac{w\sqrt{d}}{2})^2}{2\sigma^2}}\le f(X=x).
\end{align*}
Hence, the expected number of points within $B$ is lower bounded by $\frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{(r + \frac{w\sqrt{d}}{2})^2}{2\sigma^2}}$. For $r$ such that $\frac{w\sqrt{d}}{2}\le (\sqrt{2}-1)r$ we further have the lower bound of $\mathbb{E} [\vert B \vert]\geq \frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{2r^2}{2\sigma^2}}$, where $\vert B \vert$ denotes the number of points from the dataset within $B$.

For simplicity of notation let $\mu(r) = \frac{n w^d}{(2\pi\sigma^2)^{d/2}}e^{-\frac{r^2}{\sigma^2}}$. For $r$ such that $\mu(r) \ge \frac{16C\log n}{\epsilon}$, 
%$\mu(r) \ge {16C\log n}$, 
by Chernoff bounds (see Lemma~\ref{lem:chernoff}) we have

\begin{align*}   \Pr\left[|B|\le \frac{{12C\log n}}{\epsilon} \right]
    &\le \frac{1}{n^{2C}}.
\end{align*}
Thus if $r$ is such that $\mu(r)\ge \frac{16C\log n}{\epsilon}$ 
%$\mu(r)\ge {16C\log n}$
then bins at distance at most $r$ are $3t/2$-heavy with probability at least $1-\frac{1}{n^{2C}}$. In other words, all $3t/2$-light bins are at distance at least $r$.
By Lemma \ref{lem aux light gaussian}, the smallest $r$ such that all $3t/2$-light bins are at distance at least $r$ is % OLD $\frac{1}{2}\sigma\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)}$.
$\sigma\sqrt{\log (\epsilon n) - \log(16 C \log n)+ d\log\left(\frac{w}{\sigma \sqrt{2\pi}}\right)}$.


From Corollary~\ref{cor:tailboundschisquared} it follows that the probability of a point taking distance at least %$\frac{1}{2}\sigma\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)}$ 
$\sigma\sqrt{\log (\epsilon n) - \log(16 C \log n)+ d\log\left(\frac{w}{\sigma \sqrt{2\pi}}\right)}$
from the cluster center is


\begin{align*}
    \Pr\left[X \ge \sigma \sqrt{2d + 3 \left(\frac{\log (\epsilon n) -\log(16C \log n)+ d \log  (\frac{w}{\sigma\sqrt{2\pi}})-2d}{3}\right)}\right] &\le \exp\left({-\frac{\log (\epsilon n) -\log(16C \log n)+ d (\log  (\frac{w}{\sigma\sqrt{2\pi}})-2)}{3}}\right)\\
    & \leq \frac{(16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}}{(\epsilon n)^{1/3}}
\end{align*}

Let $\mathcal{C}'$ be the set of points %in $\mathcal{C}$ 
that are at least %$\sigma \sqrt{2d + 3 \left(\frac{\frac{1}{4}\log n + \frac{1}{4}d \log  (w/\sigma)-2d}{3}\right)}$ 
$\sigma \sqrt{2d + 3 \left(\frac{\log (\epsilon n) -\log(16C \log n)+ d \log  (\frac{w}{\sigma\sqrt{2\pi}})-2d}{3}\right)}$
far from the mean of the distribution, i.e., the origin. Any point in $3t/2$-light bins belongs to $\mathcal{C}'$ with probability at least $1-\frac{1}{n^{2C}}$. Chernoff bound (see Lemma~\ref{lem:chernoff}) gives us
\begin{align*}
    |\mathcal{C}'|\le \epsilon^{-1/3} n^{2/3}{(16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}}
\end{align*}
with high probability. This completes the proof.
\end{proof}

\begin{proofof}{Theorem \ref{thm one gaussian tradeoff}}
From the above analysis we have that the number of $t/2$-heavy bins is bounded by
$$M = \left(\frac{6\sigma}{w}\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)} + 2\right)^d$$ 
with high probability.
We also have that the total number of points in $3t/2$-light bins is upper bounded by %$m=n^{11/12}+o(1)$. 
$m = \epsilon^{-1/3} n^{2/3}{(16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}}$. Thus  we have
\begin{align}
M &= \left(\frac{6\sigma}{w}\sqrt{\log n + d \log  \left(\frac{w}{\sigma\sqrt{2\pi}}\right)} + 2\right)^d \\
& \leq \left(\frac{6\sigma}{w}\sqrt{2}\sqrt{\log n} +2\right)^d\\
& \leq \left(\frac{12\sigma}{w}\right)^d (\log n)^{d/2}
\end{align}
where the first inequality follows by $n\geq \left(\frac{w}{\sigma \sqrt{2\pi}}\right)^d$ and the second also follows for large $n$. We have
\begin{align}
    \epsilon m + 8M\log \frac{1}{\delta} &\leq \epsilon m +8 \log \left(\frac{1}{\delta}\right) \cdot \left(\frac{12\sigma}{w}\right)^d (\log n)^{d/2}\\
    & \leq \frac{1}{2}\epsilon n
\end{align}
where the second inequality follows from condition $\frac{n}{(\log n)^{d/2}}>16\log \left(\frac{1}{\delta}\right)\cdot \left(\frac{12\sigma}{w}\right)^d$.
Thus it remains to apply Theorem \ref{thm:general-2} and we get

\begin{align}
\frac{\epsilon m + 8M\log \frac{1}{\delta}}{\epsilon n - \epsilon m - 4M\log \frac{1}{\delta}} +\frac{m}{n} + \frac{w\sqrt{d}}{2\sqrt{e}} &\leq \frac{\epsilon(\epsilon^{-1/3} n^{2/3}{(16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}})+8 \log \left(\frac{1}{\delta}\right) \cdot \left(\frac{12\sigma}{w}\right)^d (\log n)^{d/2}}{\frac{1}{2}\epsilon n}\\
&+\frac{\epsilon^{-1/3} n^{2/3}{(16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}}}{n}+ \frac{w\sqrt{d}}{2\sqrt{e}}\\
& \leq \frac{3 (16 C\log n)^{1/3}\cdot e^{-\frac{d}{3}(\log \frac{w}{\sigma \sqrt{2 \pi}}-2)}}{(\epsilon n)^{1/3}}+\frac{16 \log \left(\frac{1}{\delta}\right)\cdot \left(\frac{12 \sigma}{w}\right)^d (\log n)^{d/2}}{\epsilon n}+\frac{w\sqrt{d}}{2\sqrt{e}}
\end{align}
\end{proofof}

\section{Data dependent algorithm}

\begin{figure}[h]
    \centering
\includegraphics[width=0.3\textwidth]{figures/step1.pdf}\hfill
\includegraphics[width=0.3\textwidth]{figures/step2.pdf}\hfill
\includegraphics[width=0.3\textwidth]{figures/binning_example.pdf}
    \caption{Example of stages of data dependent partitioning of the dataset in $\mathbb{R}^2$.}
    \label{fig:binning}
\end{figure}

\subsection{Implicit sampling for data independent Algorithm \ref{alg:rounding-new}}\label{appendix secti implicit data indep}

For the data independent algorithm, an implicit implementation of empty-bin sampling is straightforward. Instead of storing $(\frac{R}{w})^{d}$ bin centers, it is enough to store only those corresponding to non empty bins, and sample empty centers via independent uniform random sampling of each coordinate from the set of possible values (and rejection if gluing them together gives a center corresponding to a non empty bin). Note that this requires storing $O(\frac{R}{w}\cdot d)$ values for non empty bins instead of $O((\frac{R}{w})^{d})$.

\section{Implicit sampling of empty bins} \label{appendix sect implicit}

\begin{figure}
\centering
\scalebox{0.5}{
\begin{forest}
for tree={
    grow=south,
    circle, draw, minimum size=3ex, inner sep=0.75pt,
    s sep=2mm
        }
[
    [
        [, edge=dashed,fill=gray
        [,no edge, draw=none]
        [,no edge, draw=none]
        ]
        [ 
            [ 
            [
            [,fill=black[,no edge, draw=none]
        [,no edge, draw=none]]
            [, edge=dashed,fill=gray
            [,no edge, draw=none
            ]
        [,no edge, draw=none]]
            ]
            [, edge=dashed,fill=gray
            [,no edge, draw=none
            ]
        [,no edge, draw=none]]
            ]
            [, edge=dashed,fill=gray
            [,no edge, draw=none]
        [,no edge, draw=none]
        ]
        ]
    ]
    [
        [
            [, edge=dashed,fill=gray
            [,no edge, draw=none]
        [,no edge, draw=none]
        ]
            [,fill=black
            [,no edge, draw=none]
        [,no edge, draw=none]] 
        ]
        [, edge=dashed,fill=gray
        [,no edge, draw=none]
        [,no edge, draw=none]
        ]
    ]
]
\end{forest}}
\vspace{-0.5cm}
\caption{Tree with $h = 2$ and $h' = 5$. Black nodes are non-empty bins, gray nodes are empty bins we need to sample.}
\end{figure}

\begin{proofof}{Lemma \ref{lem:sampling explicit equivalent implicit}}
Equivalence of Algorithm \ref{alg:explicit-empty-bins} and Algorithm \ref{alg:implicit-empty-bins} is a consequence of the independence of the Bernoulli indicators (as the Laplace noise variables are independent for different bins) and the fact that a Binomial random variable can be represented as a sum of independent Bernoullis.
\end{proofof}


\begin{lem} \label{lem total number empty bins}
For a dataset of size $n$, if the decision tree is data independent up to depth level $h$ and data dependent in the remaining part, then the number of empty bins is upper bounded by
\begin{align*}
    2^h + n(h'-h)
\end{align*}
where $h'$ denotes the total number of levels. 
\end{lem}
\begin{proof}
We need to upper bound the number of leaves for such a binary tree. Since the binary tree is complete up to depth $h$, we have at most $2^{h}$ leaves at level $h$. Furthermore, for any non empty bin we can have at most $h'-h$ empty bins between depth $h+1$ and $h'$. Thus, we have at most $n(h'-h)$ empty bins between depth $h+1$ and $h'$.
\end{proof}

\begin{proofof}{Lemma \ref{lem:empty bin not split}}
For a single empty bin, by Remark \ref{fact:tailbound4laplace} we have 
\begin{align*}
         \Pr[\text{Lap}(\frac{2(h'-h)}{\epsilon}) \ge \tau] = \frac{1}{2}e^{-\frac{\tau}{2(h'-h)/\epsilon}}.
     \end{align*}
Thus for $$\tau = \frac{2(h'-h)}{\epsilon}\log \left(\frac{1}{\delta}\cdot\left(2^h + n(h'-h)\right)\right)$$ the right hand side is less than $\frac{\delta}{2^h + n(h'-h)}$. As a consequence of Lemma \ref{lem total number empty bins} there are at most $2^h + n(h'-h)$ empty bins and thus a union bound argument completes the proof.
\end{proofof}

\paragraph{More implementation details:} Our algorithm for implicitly sampling the empty bins proceeds as follows: in the recursion tree, from left to right, our algorithm first finds common ancestor for any two consecutive non-empty bins, say $i$'th and $i+1$'th and calls it \emph{$i$'th common ancestor}. Then, it calculates the number of empty bins between $i$'th non-empty bin and $i$'th common ancestor. And similarly, it calculates the number of empty bins between $i+1$'th non-empty bin and $i$'th common ancestor. Note that by the above-mentioned binomial distribution implicit sampling argument, one can apply a binomial distribution sampling technique to each of these numbers, and it is not hard to locate the sampled empty bins. Finally, we need to apply an implicit sampling technique to empty bins at level $h$. This is done similarly to the rejection sampling technique we mentioned in implicit implementation of data independent algorithm.  

\section{ Experiments}\label{appendix experiments}
\subsection{Experimental setting for comparison with \cite{DBLP:conf/icml/BalogTS18}} \label{appendix_balog_exp_setting}

For both dimension $2$ and $5$, the dataset consists of $n = 100{,}000$ samples from
a multivariate mixture of Gaussians. The mixture has $10$ components, with mixing weights proportional to $(1, 1/2,\dots, 1/10)$, and their means are chosen from spherical Gaussian with mean $[100,\dots,100]$ and covariance $200I$.  Each point is simulated by first
sampling the mixture component, and then sampling from a spherical Gaussian centered at the mean of the chosen mixture component and with covariance $30I$. For accuracy of the comparison, we did not re-sample the dataset and used the exact version in the code of \cite{DBLP:conf/icml/BalogTS18}.

\subsection{Dependency on dataset size}\label{appendix: dep on size}
Intuitively, it is easier to hide the contribution of an individual in a large set, compared to a small set, in kernel density. We show this empirically using three datasets with different dataset sizes, but the same underlying mixture of Gaussians parameters as the 5 dimensional datasets in the experiments section, see Figure~\ref{Fig:dep-N}. This experiment also shows that our algorithm is able to produce synthetic datasets with better minimum error when the size of the dataset, $N$, is larger. This dependency in $N$ was also evident in our theoretical results in Theorem~\ref{thm:general-2} and Theorem~\ref{thm one gaussian tradeoff}.
\begin{figure}[ht!]
    \centering
    \includegraphics[scale=0.33]{figures/dep_N.pdf}
    \caption{Using three 5 dimensional datasets with 1k, 10k and 100k data points, yet with the same underlying distribution parameters, we show that larger dataset size, $N$, naturally results in a better performance. Moreover, for larger $N$, our algorithm is capable of achieving better minimum error.}
    \label{Fig:dep-N}
\end{figure}

\subsection{Dependency on variance}\label{appendix: dep on variance}

Next, we show the effect of $\sigma$ (see Theorem~\ref{thm one gaussian tradeoff} for the definition of $\sigma$) in the mixture of Gaussians datasets. We consider two datasets with underlying mixture of Gaussians distributions in dimension $10$, with $\sigma = 30$ and $\sigma = 3$ for each cluster. As expected, the simulation results presented in Figure~\ref{fig:d10-sigma} confirm that our algorithm performs better in the setting where clusters are more concentrated around a center, i.e., small $\sigma$ case.

\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.33\textwidth]{figures/d10.pdf}
    \caption{Performance comparison of our data dependent algorithm on 10-dimensional datasets with underlying mixture of Gaussians distribution with $\sigma = 3$ and $\sigma = 30$. Note that our algorithm performs better with smaller $\sigma$ as predicted by our theory.}
    \label{fig:d10-sigma}
\end{figure}


\subsection{Binary classification}\label{appendix:binary_classification}

We use a dataset from a Kaggle competition \url{https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud} with information of credit card transactions which were either
fraudulent or not, which was also used in \cite{DBLP:conf/aistats/HarderAP21}. This dataset has 31 categories, 30 numerical features and a binary label. Similarly to \cite{DBLP:conf/aistats/HarderAP21}, we use all but the first feature (Time).

For both our data dependent algorithm and DP-MERF \citep{DBLP:conf/aistats/HarderAP21}, we use $80\%$ of input data for synthetic data generation, for various privacy budgets $\epsilon$. Synthetic data is then used to train $12$ classifiers (see Table \ref{tab:classification_data_dep}), which are tested on the remaining $20\%$ of input data.

For training DP-MERF synthesizers, we set parameters as in \cite{DBLP:conf/aistats/HarderAP21}, i.e. number of epochs $4000$, number of Fourier features $5000$, mini-batch size $0.5$, undersampling rate $0.005$. For our data dependent algorithm, we use undersampling rate of $0.005$, and set the number of data independent levels to be equal to $30$ and maximal number of levels to $60$.

As comparison metrics, we use ROC (area under the receiver operating curve). Table \ref{tab:classification_data_dep} shows average ROC for our data dependent algorithm over $20$ repetitions for each classifier, as well as average ROC over the classifiers. Table \ref{tab:classification_dp_merf} shows average ROC over classifiers for DP-MERF with $5$ repetitions for each classifier. 

Although our data dependent algorithm does not outperform DP-MERF, its performance degrades slower for increasing privacy.
\begin{table}[hb]
    \centering
    \caption{Data dependent algorithm: ROC for various levels of privacy. Average over 20 repetitions.}\label{tab:classification_data_dep}
    \begin{tabular}{ c c c c c }
      \toprule % from booktabs package
      &\bfseries $\epsilon=10$ & \bfseries $\epsilon=1$ & \bfseries $\epsilon=0.1$ & \bfseries $\epsilon=0.01$\\
      \midrule % from booktabs package
      %\bfseries Logistic Regression & 0.12345\\
      \bfseries Logistic Regression & 0.705 & 0.545 &0.481 &0.527  \\ 
  \bfseries Gaussian Naive Bayes & 0.562 & 0.563 & 0.479 & 0.547  \\
  \bfseries Bernoulli Naive Bayes & 0.495 & 0.564 & 0.497 & 0.521  \\
\bfseries  Linear SVM & 0.758 & 0.524 & 0.508 & 0.546 \\
\bfseries  Decision Tree &  0.676 & 0.611 & 0.519 & 0.532  \\ 
\bfseries LDA &  0.518 & 0.580 & 0.480 & 0.542  \\
\bfseries Ada Boost &  0.632 & 0.572 & 0.485 & 0.521  \\
\bfseries Bagging &  0.673 & 0.579 & 0.518 & 0.508  \\
\bfseries Random Forest & 0.663 & 0.594 & 0.530 & 0.543  \\
\bfseries GBM &  0.631 & 0.582 & 0.521 & 0.523 \\
\bfseries Multi-layer perceptron & 0.625 & 0.553 & 0.486 & 0.525  \\
\bfseries XGBoost &  0.588 & 0.598 & 0.527 & 0.478\\
\bfseries Average &   0.627 & 0.572 & 0.503 & 0.526 \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\begin{table}[hb]\label{table appendix DP MERF}
    \centering
    \caption{DP-MERF: ROC for various levels of privacy. Average over 5 repetitions.}\label{tab:classification_dp_merf}
    \begin{tabular}{ c c c c }
      \toprule % from booktabs package
      &\bfseries $\epsilon=10$ & \bfseries $\epsilon=1$ & \bfseries $\epsilon=0.1$ \\
      \midrule % from booktabs package
      %\bfseries Logistic Regression & 0.12345\\
      \bfseries Average &   0.880 & 0.792 & 0.564 \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\clearpage

\section{Approximating Gaussian Distribution by Mixture of Uniforms}\label{appendix:proxy_mixture_uniforms}

\subsection{MMD}
Let us assume that the data is arising from a Gaussian distribution and we are estimating with the samples $\{z_i\}_1^n$. The maximum mean discrepancy (MMD) between the population $P$ and the samples is given by: 
\begin{equation}
\mathrm{MMD}_{u}^{2}(\mathcal{N}_{d},Q_{n})=E_{x,x'\sim\mathcal{N}_{d}}[k(x,x')]-\frac{2}{n}\sum_{i=1}^{n}E_{x\sim\mathcal{N}_{d}}[k(x,z_{i})]+\frac{1}{n(n-1)}\sum_{i=1}^{n}\sum_{j\neq i}^{n}k(z_{i},z_{j}).\label{eq:mmdu}
\end{equation}
The expectations in the expression above can be computed analytically
to yield the formula~\cite{https://doi.org/10.1002/sta4.329}:

\[
\mathrm{MMD}_{u}^{2}(\mathcal{N}_{d},Q_{n})=\left(\frac{\gamma^{2}}{2+\gamma^{2}}\right)^{d/2}-\frac{2}{n}\left(\frac{\gamma^{2}}{1+\gamma^{2}}\right)^{d/2}\sum_{i=1}^{n}e^{-\frac{\Vert z_{i}\Vert^{2}}{2(1+\gamma^{2})}}+\frac{1}{n(n-1)}\sum_{i=1}^{n}\sum_{j\neq i}^{n}e^{-\frac{\Vert z_{i}-z_{j}\Vert^{2}}{2\gamma^{2}}}.
\]

\subsection{Widths of bins}
Let us assume that we have $2k+1$ boxes to approximate the Gaussian distribution where we assume an odd number of boxes to apply symmetry arguments. 
The distributions are given by:
\begin{align}
    P(x) &= \frac{1}{\sqrt{2\pi}} e^{-x^2/2}, \\
    Q(x) &= w_0 I_0 + \sum_{i=1}^k w_i I_i + \sum_{i=1}^k w_{-i} I_{-i},
\end{align}
where $I_i = I[(2i-1)C \le x < (2i+1)C]$ and $\sum_{i=-k}^{k} w_i = 1$.
The KL divergence between distributions is given by:
\begin{align}
    D_{KL}(P||Q) := \int_{-\infty}^{\infty} p(x) \log\frac{p(x)}{q(x)} \, dx
\end{align}
and in particular for our setting is given by:
\begin{align}
   D_{KL}(Q||P) = \sum_{i=-k}^k w_i \log\frac{w_i}{2C} + \frac{1}{2} \log(2\pi) + \sum_{i=-k}^k \frac{w_i C^2}{6} (12i^2+1)  
\end{align}
Using the KL bounds for measuring divergence, we are able to obtain the following weights and size of the boxes. Each box is of size given by $2C$ and placed at location $2Ci$ for $i\in[-k,k]$. 
Applying the method of Lagrange multipliers, we can obtain
the optimal box weights for a mixture of uniform distributions with respect to the Gaussian distribution:
\begin{align}
w_0 &= \frac{1}{1 + 2\sum_{i=1}^k e^{-2i^2C^2}} \\
w_i &= w_0 e^{-2i^2 C^2} \qquad \forall i \in [-k, k] \textrm{ and } i\ne 0
\end{align}


\bibliography{kreacic_397}
\end{document}