\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units

\usepackage{microtype}
\usepackage{graphicx}

% \usepackage{subfigure}
\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{multicol}
% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
\usepackage{hyperref}
\usepackage{times}
\usepackage{appendix}
\usepackage{amsmath}
\usepackage{amsfonts}  
%\usepackage{lmodern}
\usepackage{algorithmic,algorithm}

\usepackage{booktabs}
\usepackage{nicefrac} 
\usepackage{microtype}
\usepackage{xcolor}
\usepackage{multirow}
\usepackage{dsfont}
\usepackage{mathtools,enumitem}
%\usepackage{caption} 
%\captionsetup[table]{skip=0.5pt}

\usepackage{comment}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments (amsthm).
% `theorem` is numbered within each section; every other environment below
% is declared with the optional [theorem] argument and therefore shares the
% same counter, giving one running number sequence per section.
% "plain" style: used for formal statements (theorem, proposition, lemma, corollary).
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% "definition" style: used for definitions and assumptions.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% "remark" style: used for remarks.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% Upright "arg max" operator with a thin space between the two words.
% The starred declaration makes sub/superscripts behave as limits
% (set underneath the operator in display style), as for \max.
\DeclareMathOperator*{\argmax}{arg\,max}

% Notation shorthands, intended for use in math mode.
% NOTE(review): all of these are defined with \def, which replaces an
% existing command of the same name without any warning. \P and \dim are
% standard LaTeX commands (and \th presumably is as well), so their original
% meanings are no longer available after this point -- confirm this is intended.
%
% Blackboard-bold symbols.
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\P{\mathbb{P}}
% Upright (\mathrm) names. These are plain roman text, not declared math
% operators. \half is the exception in this group: it expands to the
% fraction 1/2.
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
\def\half{\frac{1}{2}}
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\df{\mathrm{df}}
\def\dim{\mathrm{dim}}
\def\col{\mathrm{col}}
\def\row{\mathrm{row}}
\def\nul{\mathrm{null}}
\def\rank{\mathrm{rank}}
\def\nuli{\mathrm{nullity}}
\def\spa{\mathrm{span}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\diag{\mathrm{diag}}
\def\aff{\mathrm{aff}}
\def\conv{\mathrm{conv}}
% Accented symbols: h<name> = \hat (or \widehat for \hR), t<name> = \tilde.
% \btheta is the exception in this group: it is a bold theta, not an accent.
% \hlambda, \heta and \hR wrap their expansion in an extra brace group.
\def\hy{\hat{y}}
\def\ty{\tilde{y}}
\def\hbeta{\hat{\beta}}
\def\tbeta{\tilde{\beta}}
\def\htheta{\hat{\theta}}
\def\btheta{\boldsymbol{\theta}}
\def\halpha{\hat{\alpha}}
\def\hf{\hat{f}}
\def\hmu{\hat{\mu}}
\def\hlambda{{\hat{\lambda}}}
\def\heta{{\hat{\eta}}}
\def\hR{{\widehat{R}}}
% Calligraphic capitals: \c<Letter> = \mathcal{<Letter>}.
% Note that \cR is calligraphic R, distinct from the blackboard-bold \R above.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cK{\mathcal{K}}
\def\cH{\mathcal{H}}
\def\cI{\mathcal{I}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cR{\mathcal{R}}
\def\cP{\mathcal{P}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cW{\mathcal{W}}
\def\cX{\mathcal{X}}
\def\cY{\mathcal{Y}}
\def\cZ{\mathcal{Z}}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \externaldocument{uai2023-main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments in
% reverse order, joined by the optional separator (default "-").
% For example, \swap{x}{y} gives "y-x" and \swap[/]{x}{y} gives "y/x".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Supplementary Material for: ``Private Prediction Strikes Back!'' Private Kernelized Nearest Neighbors with Individual R\'{e}nyi Filter}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Omitted proofs and algorithm in Section 3}
\begin{theorem}[Restatement of Theorem 3.1]
Algorithm 3 satisfies $(\alpha, B\alpha)$-RDP for all $\alpha\geq 1$.
\end{theorem}

\begin{proof}

The privacy analysis relies on  individual RDP (Definition 2.4), which quantifies the maximum impact of adding or deleting a specific individual from any potential dataset to the prediction outcome, measured in terms of R\'enyi divergence.

We first demonstrate that only the selected neighbors have to account for their individual privacy loss. 
The decision rule for ``being selected'' is
based on a comparison between the kernel weight and a data-independent threshold $\tau$, which is not influenced by any other private data points. Therefore, ``unselected'' neighbors do not incur any individual privacy loss. 

For each selected neighbor $(x_i, y_i)$ at time $t$, its individual privacy analysis is broken down into two parts: the first part is the release of the number of neighbors $|\cN_t|$, and the second part is the release of its label associated with the kernel weight.

Note that adding or removing one selected neighbor would only change $|\cN_t|$ by $1$, thus the individual RDP of releasing $|\cN_t|$ at order $\alpha$ satisfies $\frac{\alpha}{2\sigma_1^2}$-RDP for all selected data. 
We next analyze the individual RDP of releasing the label.
Fix a selected neighbor $z=(x_i, y_i)$. For every possible set of selected neighbors $\cN_t=(z_1, ..., z_m)$ that includes $z$, it holds that
\begin{align*}
D_\alpha^{\leftrightarrow}\bigg(\big(\sum_{j\in \cN_t} \kappa(x_j, q_t)\cdot y_j\big)+\cN(0, \sigma_2^2K_t \mathbb{I}_c)||\big(\sum_{j \in \cN_t \setminus z} \kappa(x_j, q_t)\cdot y_j\big)+\cN(0, \sigma_2^2K_t \mathbb{I}_c) \bigg)\leq \frac{\kappa(x_i, q_t)^2\alpha }{2\sigma_2^2 K_t}
\end{align*}
by the definition of individual RDP.

Finally, the ``delete'' step in the algorithm ensures that the privacy loss for each private data point $(x_i, y_i)$ is bounded by a fixed value $B$, i.e., $\sum_{j=1}^t \left((g_i+\frac{1}{2\sigma_1^2})\cdot \mathbb{I}[(x_i, y_i) \in \cN_j]\right)\leq B$, where $g_i$ is the contribution evaluated at step $j$ and $\frac{1}{2\sigma_1^2}$ is the cost of releasing the noisy count $K_j$.
According to the fully adaptive composition theorem of individual RDP (Theorem 2.5),
by ensuring the sum is less than or equal to $B$ for all time steps $t$ and for all data points $(x_i, y_i)$, the algorithm satisfies $(\alpha, \alpha\cdot B)$-RDP.
\end{proof}

We show the full algorithm of Ind-KNN-Hash in Algorithm~\ref{alg:ind_kNN_hashing}.



\begin{algorithm}[t]
\caption{Ind-KNN-Hash}
\label{alg:ind_kNN_hashing}
\begin{algorithmic}[1]
\STATE{\textbf{Input}: Dataset $S\in (\cX \times \cY)^n$, number of hash tables $L$ and the width parameter $b$,
the kernel function $\kappa(\cdot, \cdot)$, the minimum kernel weight threshold $\tau$, sequence of queries $q_1, ..., q_T$, the noise scales $\sigma_1$ and $\sigma_2$, and the individual budget $B$. }
\STATE{Initialize individual budget $z_i = B, \forall i \in [n]$.}
\STATE{Construct an LSH family: $\cF =(f_1, ..., f_L)$, where $f_\ell: \R^d \to \{0, 1\}^b$.}
\FOR{$t=1$ to $T$}
\STATE{Retrieve the hash set: $\cF(q_t)$.}
\STATE{Update the active set $S=\{(x_i, y_i)|z_i >0, (x_i, y_i) \in \cF(q_t)\}$.}
% \STATE{$a_t$ = Algorithm~\ref{alg:noisy_label}($S, \kappa(\cdot, \cdot), q_t, \sigma_1, \sigma_2$).}
\STATE{The selected neighbors: $\cN_t := \{(x_i, y_i) \in S \mid \kappa(x_i, q_t) \geq \tau\}$.}
\STATE{Drop $(x_i, y_i)$ from $\cN_t$ if $z_i \leq \frac{1}{2\sigma_1^2}$.}
\STATE{Release $|\cN_t|$: $K_t:=|\cN_t| + \cN(0, \sigma_1^2).$}
\FOR{$(x_i, y_i)\in \cN_t$}
\STATE{Update $z_i$ after releasing $K_t$: $z_i=z_i - \frac{1}{2\sigma_1^2}$. }
\STATE{Evaluate individual ``contribution'': $g_i = \min \left(\frac{\kappa(x_i, q_t)^2}{2\sigma_2^2\cdot K_t}, \sigma_2\sqrt{2 K_t z_i}\right)$.}
\STATE{Update $z_i$ after releasing label: $z_i = z_i - g_i$.}
%\STATE{Drop $(x_i, y_i)$ from the active set $S$ if $z_i\leq0$.}
\ENDFOR
\STATE{Compute $a_t = \argmax_{j \in [c]}\big(\sum_{i\in \cN_t}\kappa(x_i, q_t)\cdot y_i +\cN(\boldsymbol{0}, \sigma_2^2\cdot K_t\mathbb{I}_c)\big)_j$.}
\ENDFOR
\STATE{\textbf{Return} $(a_1, ..., a_T)$}
\end{algorithmic}
\end{algorithm}

\section{More experiments}

\textbf{Ablation study on the threshold $\tau$.}
The minimum kernel weight threshold $\tau$ determines the number of neighbors selected for each query-response pair. We conduct an ablation study to investigate the relation between the optimal $\tau$ and the privacy level. Table~\ref{tab: best_hyper_indknn} provides the set of hyper-parameters of Ind-KNN that results in the best utility. Our findings show that the optimal choice of $\tau$ increases as $\epsilon$ increases across four datasets and two kernel methods. We conjecture this is because when $\epsilon$ is small, the added noise requires a larger margin among the top-k votes to determine the correct output, thus requiring a smaller $\tau$. In contrast, when $\epsilon$ is large, the smaller noise scale enables the algorithm to pick a set of more selective neighbors, thus resulting in a larger $\tau$.



In Table~\ref{tab: cifar10}, we provide an ablation study of two variants of Ind-KNN under $(2.0, 10^{-5})$-DP. We instantiate all Ind-KNN variants with cosine kernels, and the hashing variant uses $12$ hash tables with the width parameter $b=8$. When answering $400$ queries, we find that using hashing decreases the accuracy from $84.3\%$ to $83.5\%$, which still matches the accuracy of $83.4\%$ from Linear NoisySGD. Moreover, exploiting released predictions enables Ind-KNN to have better accuracy when answering both $400$ and $800$ queries. 
\begin{table}[ht]
\centering
\setlength{\tabcolsep}{3pt}
\caption{Median accuracy on CIFAR-10 under $(2.0, 10^{-5})$-DP. We report the accuracy (\%) of 400 and 800 test queries.}
\begin{tabular}{lcc}
    \toprule
    Methods & Acc ($400$) & Acc ($800$)  \\
    \midrule
     Linear NoisySGD      & $83.4$  & $83.4$  \\ 
     Private kNN &$82.8$ &$81.5$\\
    \midrule
    Ind-KNN       &  $84.3$ &  $83.1$ \\
    Ind-KNN-Hash  &    $83.5$     &  $81.8$\\
    Ind-KNN + Reuse predictions& $85.0$     & $83.5$ \\
    \bottomrule
\end{tabular}
\label{tab: cifar10}
\end{table}

\begin{table}[ht]
\centering
\caption{The range of hyper-parameters for Private kNN.}
\begin{tabular}{c|cccc}
    \toprule
   Hyper-parameters & CIFAR-10& Fashion MNIST & AG News & DBpedia \\
    \hline
 sampling ratio $p$ & \multicolumn{4}{c}{$\{0.1, 0.2\}$}\\
      \hline
 number of neighbors $K$ & $\{100, 200, 300, 400\}$ & $\{100, ..., 500\}$ & $\{100, ..., 600\}$& $\{100, ..., 600\}$\\
    \hline
\end{tabular}
\label{tab: hyper_private_knn}
\end{table}
\section{Experimental details}
In this section, we present the implementation details of Ind-KNN and Private kNN.

\textbf{Hyper-parameter search of Ind-KNN.}
%We set the individual RDP budget $B$ such that using RDP to DP conversion on $(\alpha, B\alpha)$-RDP satisfies the predefined privacy budget $(\epsilon, \delta)$. 
The noise scale $\sigma_1$ is set to be $\sqrt{\frac{T}{6B}}$ to use roughly half of the individual RDP budget $B$ for each data point being selected at every query. Further reducing $\sigma_1$ does not result in significant improvement. To prevent overflow in $\frac{\kappa(x_i, q_t)^2}{2\sigma_2^2K_t}$ due to random noise, $K_t$ is set to $\max(K_t,30)$ for all experiments. 
For Ind-KNN using a cosine kernel, we fine-tune the noise scale $\sigma_2$ and the threshold $\tau$ on the validation set. To reduce the computational cost of searching all possible $(\sigma_2, \tau)$-pairs, we first estimate the optimal threshold $\tau$ by running a non-private Ind-KNN on the validation set to collect individual kernel weights and sweep through $\tau \in \{0.05, 0.1, ..., 0.95\}$. With $\tau^*$ in hand, we perform a second round of hyper-parameter search for the optimal $(\sigma_2, \tau)$ pair under different privacy levels, where $\tau$ ranges between $[\tau^*-0.05, \tau^*+0.05]$. The table below records the range of $(\sigma_2,\tau)$ pairs considered in the second-round search.

We present the range of hyper-parameters search for  Private kNN in Table~\ref{tab: hyper_private_knn} and the best hyper-parameter sets we use in Table~\ref{tab: best_hyper_indknn}.

\textbf{Feature preprocessing.}
As we mentioned in the experiment section, we use a pre-trained ResNet50 model on ImageNet for feature extraction in image classification tasks. Then we perform L2 normalization on the extracted features as a preprocessing step. As for text classification, the extracted features from sentence-transformer are already normalized. In this case, we don't need to apply any additional preprocessing steps.
% how to pre-process features? 
% the hyper-parameter ranges for all mechanisms.
% the best hyper-parameters.
% ablation study on tau




\begin{table}[ht]
\centering
%\setlength{\tabcolsep}{3pt}
\caption{Set of hyper-parameters of Ind-KNN resulting in the best utility for a set of privacy budgets used in Sec 4.2.}
\resizebox{1.0\columnwidth}{!}{
\begin{tabular}{c|c|cccc}
    \toprule
 Methods & Datasets &  $\epsilon=0.5$& $\epsilon=1.0$ & $\epsilon=1.5$  & $\epsilon=2.0$\\
    \hline
\multirow{4}{*}{Cosine kernel ($\sigma_2, \tau$) }& CIFAR-10 & $(\sigma_2=0.7,\tau=0.50)$ &$(\sigma_2=0.4, \tau=0.50)$ &$(\sigma_2=0.3, \tau=0.52)$ &$(\sigma_2=0.2, \tau=0.53)$\\
& Fashion MNIST&($\sigma_2=1.3, \tau=0.6$) &$(\sigma_2=0.6,\tau=0.6)$ & ($\sigma_2=0.3, \tau=0.6$)& $(\sigma_2=0.3, \tau=0.6)$\\
& AG News& ($\sigma_2=0.6, \tau=0.35$) &($\sigma_2=0.4, \tau=0.36$) & $(\sigma_2=0.25, \tau=0.37)$&$(\sigma_2=0.2, \tau=0.38)$\\
& DBpedia& ($\sigma_2=0.45, \tau=0.35$) & $(\sigma_2=0.3,\tau=0.37)$ &$(\sigma_2=0.2, \tau=0.37)$ & $(\sigma_2=0.1, \tau=0.38)$ \\
    \hline
    \multirow{2}{*}{RBF kernel $(\sigma_2, \tau, \nu)$}& CIFAR-10& $(\sigma_2=0.7, \tau=0.8)$ &$(\sigma_2=0.6, \tau=0.82)$ &$(\sigma_2=0.6, \tau=0.82)$ &$(\sigma_2=0.45, \tau=0.84)$\\
& Fashion MNIST & ($\sigma_2=1.3,\tau=0.83$)&$(\sigma_2=0.7,\tau=0.82)$ & $(\sigma_2=0.4, \tau=0.84)$& ($\sigma_2=0.3, \tau=0.84$)\\
    \hline
\multirow{2}{*}{ Hash $(L=30, b, \sigma_2, \tau)$}& CIFAR-10 & $(b=8, \sigma_2=0.7, \tau=0.50)$ &$(b=8, \sigma_2=0.4, \tau=0.50)$ &$(b=8, \sigma_2=0.3, \tau=0.52)$ &$(b=8, \sigma_2=0.2, \tau=0.53)$\\
& AG News & ($b=9, \sigma_2=0.7, \tau=0.35$)& ($b=9, \sigma_2=0.4, \tau=0.36$) & ($b=9, \sigma_2=0.25, \tau=0.36$) &($b=9, \sigma_2=0.2, \tau=0.36$)\\
    \hline
\end{tabular}}
\label{tab: best_hyper_indknn}
\end{table}


\begin{table}[ht]
\centering
\caption{The range of hyper-parameters for Ind-KNN.}
\begin{tabular}{c|cccc}
    \toprule
   Hyper-parameters & CIFAR-10& Fashion MNIST & AG News & DBpedia \\
    \hline
Noise scale $\sigma_1$ & \multicolumn{4}{c}{$\sqrt{\frac{T}{6B}}$}\\
\hline
  Noise scale $\sigma_2$ & \multicolumn{4}{c}{$\{0.1, 0.2, ..., 0.9\}$}\\
      \hline
 Minimum threshold $\tau$ (cosine kernel) &[0.48, 0.53]&[0.58, 0.63]&[0.35, 0.40]& [0.35, 0.40]\\
    \hline
 Minimum threshold $\tau$ (RBF kernel)& [0.8, 0.85]&  [0.8, 0.85]&- & -\\
   scale parameter $\nu$ (with RBF kernel) & $1.7$& 1.5 &- &- \\
    \hline
\end{tabular}
%label{tab: time_vs_hash}
\end{table}


%\bibliography{uai2023-template}

\end{document}
