%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{zref-xr}
\zexternaldocument*[appendix-]{zhu_599.tex}
\usepackage{microtype}
\usepackage{graphicx}

% \usepackage{subfigure}
\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{multicol}
% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
\usepackage{hyperref}
\usepackage{times}
\usepackage{appendix}
\usepackage{amsmath}
\usepackage{amsfonts}  
%\usepackage{lmodern}
\usepackage{algorithmic,algorithm}

\usepackage{booktabs}
\usepackage{nicefrac} 
\usepackage{microtype}
\usepackage{xcolor}
\usepackage{multirow}
\usepackage{dsfont}
\usepackage{mathtools,enumitem}
%\usepackage{caption} 
%\captionsetup[table]{skip=0.5pt}

\usepackage{comment}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\DeclareMathOperator*{\argmax}{arg\,max}

% ---------------------------------------------------------------------
% Notation shorthands.
% All of these use the TeX primitive \def, which overwrites an existing
% command of the same name without any warning.
% ---------------------------------------------------------------------
% Blackboard-bold sets / operators.
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
% NOTE(review): \P is normally the pilcrow (paragraph sign) in LaTeX; after
% this line it produces a blackboard-bold P and is only usable in math mode.
\def\P{\mathbb{P}}
% Upright operator names. These expand to \mathrm{...} and therefore do not
% get operator spacing the way \DeclareMathOperator commands would.
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
\def\half{\frac{1}{2}}
% NOTE(review): \th is normally the text command for the letter thorn;
% it is replaced here by an upright "th".
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\df{\mathrm{df}}
% NOTE(review): \dim is a standard LaTeX math operator; this redefinition
% replaces it with a plain \mathrm{dim}.
\def\dim{\mathrm{dim}}
\def\col{\mathrm{col}}
\def\row{\mathrm{row}}
\def\nul{\mathrm{null}}
\def\rank{\mathrm{rank}}
\def\nuli{\mathrm{nullity}}
\def\spa{\mathrm{span}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\diag{\mathrm{diag}}
\def\aff{\mathrm{aff}}
\def\conv{\mathrm{conv}}
% Accented symbols (hat / tilde / bold variants).
\def\hy{\hat{y}}
\def\ty{\tilde{y}}
\def\hbeta{\hat{\beta}}
\def\tbeta{\tilde{\beta}}
\def\htheta{\hat{\theta}}
\def\btheta{\boldsymbol{\theta}}
\def\halpha{\hat{\alpha}}
\def\hf{\hat{f}}
\def\hmu{\hat{\mu}}
\def\hlambda{{\hat{\lambda}}}
\def\heta{{\hat{\eta}}}
\def\hR{{\widehat{R}}}
% Calligraphic letters: \cX expands to \mathcal{X}.
% Note that \cN and \cR are calligraphic N and R, not the number sets
% \mathbb{N} and \mathbb{R}; use \R for the reals.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cK{\mathcal{K}}
\def\cH{\mathcal{H}}
\def\cI{\mathcal{I}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cR{\mathcal{R}}
\def\cP{\mathcal{P}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cW{\mathcal{W}}
\def\cX{\mathcal{X}}
\def\cY{\mathcal{Y}}
\def\cZ{\mathcal{Z}}
% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
% \usepackage[textsize=tiny]{todonotes}
%% Provided macros


% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\newcommand{\yw}[1]{\textit{\textcolor{purple}{[yuxiang]: #1}}} % yuxiang's notes



\title{``Private Prediction Strikes Back!'' Private Kernelized Nearest Neighbors with Individual R\'{e}nyi Filter }
%{Data-adaptive privacy-preserving prediction}
% Private Prediction Strikes Back: Kernelized Nearest Neighbor with Individual Renyi Filter  

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Yuqing Zhu}
\author[1]{Xuandong Zhao}
\author[2]{Chuan Guo}
\author[1]{Yu-Xiang Wang}
% Add affiliations after the authors
\affil[1]{%
   UC Santa Barbara
} \affil[2]{%
FAIR / Meta AI
}

  
\begin{document}
\maketitle

\begin{abstract}

Most existing approaches of differentially private (DP) machine learning focus on \emph{private training}.  Despite its many advantages, \emph{private training} lacks the flexibility in adapting to incremental changes to the training dataset such as deletion requests from exercising GDPR’s \emph{right to be forgotten}. 
We revisit a long-forgotten alternative, known as \emph{private prediction} \citep{dwork2018privacy}, and propose a new algorithm named \emph{Individual Kernelized Nearest Neighbor} (Ind-KNN). Ind-KNN is easily updatable over dataset changes and it allows precise control of the R\'{e}nyi DP at an individual user level --- a user's privacy loss is measured by the exact amount of her contribution to predictions; and a user is removed if her prescribed privacy budget runs out. Our results show that Ind-KNN consistently improves the accuracy over existing private prediction methods for a wide range of $\epsilon$ on four vision and language tasks. We also illustrate several cases under which Ind-KNN is preferable over private training with NoisySGD. Code is available at \url{https://github.com/jeremy43/Ind_kNN}.

\end{abstract}

 % \vspace{-1em}
\section{Introduction}


Differential privacy (DP; \citealp{dwork2006calibrating, dwork2014algorithmic}) is a promising approach for mitigating privacy risks in machine learning (ML). The predominant setting for private ML is to produce the model learned from sensitive data using DP primitives, a.k.a.\ private training~\citep{chaudhuri2011differentially,kasiviswanathan2011can,abadi2016deep}. The resulting trained model can then be safely deployed with peace of mind, because DP ensures that no individual training sample can be identified from the model itself or its downstream predictions. 




Unfortunately, private training comes with several irksome properties that hamper its real-life deployment. 
To begin, private training comes at a significant computation cost that can be restrictive in many applications. The NoisySGD algorithm~\citep{abadi2016deep} requires per-sample gradient computation, which is much more computation- and memory-intensive than standard training.

Secondly, private training outputs a static model that cannot easily adapt to a changing dataset. For instance, additional data can arrive in a streaming fashion continuously. Also, training data could be mislabeled or corrupted~\citep{chen2017targeted, jagielski2018manipulating} and the model needs to be patched accordingly. In addition, if the model is trained on user data, privacy regulations such as GDPR entitle the user to request the removal of their data from the model~\citep{ginart2019making, guo2019certified, bourtoule2021machine} with the so-called \emph{right to be forgotten}~\citep{mantelero2013eu}. These requirements can be satisfied by periodically re-training the model, but such an approach is not applicable to private training due to its high computation cost as well as privacy degradation after repeated training runs.

Thirdly, private training operates under a very strong threat model in which all downstream users can collude with each other in a coordinated attack on any individual training sample. Sometimes it makes sense to make realistic assumptions that limit the adversaries' information or resources. For example, Harvard's Privacy Tools project (now OpenDP) adopts a weaker threat model where each downstream user keeps the results to themselves \citep{dptools}. In this way, they each get to spend the privacy budget independently of everyone else and enjoy higher utility. Private training unfortunately does not have a means to benefit from having weaker adversaries.



\begin{table*}[ht]
\centering
%\setlength{\tabcolsep}{0.8pt}
\caption{The amortized computational and privacy cost of answering $T=2000$ queries on CIFAR-10. The median accuracy of all approaches across five independent runs is aligned to $96.0\%$. We estimate the amortized computational cost as the average time (in seconds) spent to answer a single query, which is the total training time divided by $T$ for Linear NoisySGD~\citep{feldman2021individual} and the total prediction time divided by $T$ for Private kNN~\citep{zhu2020private} and Ind-KNN. We use $\delta=10^{-5}$. In the retraining scenario, we assume that a retraining request is made after every 100 queries, resulting in a total of 20 retraining requests over the $T$ queries.}
% \vspace{-0.5em}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
    \toprule
    &   NoisySGD & NoisySGD (with retrain) & Private kNN & Ind-KNN (ours) & Ind-KNN+hashing (ours)  \\
    \hline
     Computational cost (s)  & 0.008 &0.16 &0.12 & 0.25 & 0.04 \\ 
      \hline
      Privacy loss ($\epsilon$)   &  1.5 & 6.2 & 4.1 & 2.0  & 3.2\\ 
    \hline
\end{tabular}}
\label{tab: intro}
 \vspace{-0.5em}
\end{table*}

To address these issues of private training, we revisit a viable but less-known alternative setting in differentially private machine learning known as \emph{privacy-preserving prediction} (or simply \emph{private prediction}) \citep{dwork2018privacy}. Instead of privately training the models and then using the model for predictions, private prediction aims at generating a sequence of predictions using the data directly. Notable methods include those that perturb the predictions of non-private models \citep{dwork2018privacy,pate2018,bassily2018,dagan2020pac}, or those that perturb the voting scores of the nearest neighbors \citep{zhu2020private}. These methods require no changes to the (non-private) data workflow, and thus could more easily adapt to changing data. 

%Readers who are familiar with DP basics may complain \textit{``Wait a minute! An ML model needs to serve many queries. Isn't the privacy-utility tradeoff going down the toilet?''} 
 
From the privacy-utility trade-off point of view, the private prediction setting may appear to be counter-intuitive, because, for every prediction that it generates, a unit of privacy budget is spent. It is unreasonable to expect private prediction methods to outperform private training methods such as NoisySGD when we need to make many predictions. This was well-documented in the work of \citet{van2020trade}.  However, in the aforementioned situations where either frequent data updates are needed or a weaker adversary is assumed\footnote{Consider the example of a recommendation system: each user makes a much smaller number of predictions than all users collectively.}, private prediction methods can significantly outperform NoisySGD (see Table~\ref{tab: intro} and Figure~\ref{fig: unlearn} for an illustration).  In fact, we will demonstrate that when combined with modern DP accounting techniques, data-adaptive DP algorithm design, and some clever reuse of previous predictions, private prediction can answer thousands of queries with only a small privacy budget.  





In this work, we propose \emph{Individual Kernelized Nearest Neighbors} (Ind-KNN) --- a new private prediction mechanism that significantly increases the number of queries one can answer with an individualized R\'{e}nyi Differential Privacy accountant and other techniques. 
Intuitively, in KNN prediction, training samples that do not belong to the query's neighbor set do not contribute to the prediction, and hence their privacy cost should be negligible.
We show that by slightly modifying KNN and leveraging R\'{e}nyi filter~\citep{feldman2021individual} to account for the privacy cost of each sample individually, we can realize this intuition in the privacy accounting and allow each training sample to participate in the query response until its own privacy budget is exhausted.
In effect, common queries can be answered with relatively low privacy costs due to a large number of similar samples present in the training set.

% \paragraph{Contributions} Our contributions are the following:
\textbf{Experimental results.} We summarize our experimental results as follows:
\begin{enumerate}
 %\vspace{-1em}
\item We show that Ind-KNN consistently outperforms the private prediction benchmark, Private kNN~\citep{zhu2020private}, across four vision and language tasks for $\epsilon \in [0.5, 2.0]$. 
\item We demonstrate that Ind-KNN is a viable alternative to private training methods even in a static data setting. Our results indicate that Ind-KNN achieves higher accuracy than NoisySGD when answering fewer than 2000 queries on CIFAR-10 under $(1.0, 10^{-5})$-DP. 
\item For frequent data updates, Ind-KNN significantly outperforms the private training benchmark Linear NoisySGD~\citep{feldman2021individual}. As shown in Table~\ref{tab: intro}, Linear NoisySGD requires a DP budget of $\epsilon=6.2$ to achieve an accuracy of $96.0\%$ on $2000$ queries of CIFAR-10, while Ind-KNN only requires $\epsilon=2.0$.

\item We describe two simple techniques that significantly enhance the computational efficiency and utility of Ind-KNN. First, we show  that incorporating hashing tricks into Ind-KNN can provide a $6\times$ speedup in making predictions, with only a negligible drop in accuracy. Additionally, we propose to reuse the results of previous queries via post-processing, which allows Ind-KNN to answer an additional $1000$ queries on CIFAR-10 without compromising in privacy or utility.
%\item We also propose two techniques to enhance Ind-KNN's efficiency and utility: incorporating hashing tricks for a $6\times$ speedup with only a negligible drop in accuracy and reusing predictions for an additional 1000 queries on CIFAR-10 without compromising privacy or utility.
 % \vspace{-0.8em}
\end{enumerate}








\noindent\textbf{Related work and novelty.} 
The problem of private prediction was pioneered by \citet{dwork2018privacy} as a weakened goal for private machine learning. Model-based approaches for private prediction either require analyzing the stability of model training \citep{dwork2018privacy,dagan2020pac} or enforce stability of prediction via subsample-and-aggregate \citep{pate2018,bassily2018}. Our method is closest to Private kNN \citep{zhu2020private} but uses kernel-weighted neighbors with a variable $K$ instead of a fixed $K$. This change is critical for adapting the individual R\'{e}nyi DP accountant (and filter) for our purpose. Other components such as adaptive noise-level, prediction reuse, and the fast hashing trick are new to this paper. Technically, we apply the same individual R\'{e}nyi filter \citep{feldman2021individual} that retires data samples when their privacy budget runs out. The difference is that we apply it to KNN rather than noisy gradient descent. KNN naturally has bounded support, so it is efficient to maintain the individual RDP accountants.



 % \vspace{-1em}
\section{Preliminaries}\label{sec:preliminary}




We start with the definition of differential privacy.
\begin{definition}[Differential Privacy~\citep{dwork2006calibrating}] \label{def:dp}
	A randomized algorithm $\cA : \cS \to \Theta$ is $(\epsilon,\delta)$-DP (differentially private) if for every pair of neighboring datasets $S, S'\in \cS$, and every possible (measurable) output set $E \subseteq \Theta$ the following inequality holds: \[\Pr[\cA(S) \in E] \leq e^{\epsilon} \Pr[\cA(S') \in E] + \delta.\]
\end{definition} 

\begin{definition}[R\'enyi Differential Privacy \citep{mironov2017renyi}] \label{def:RDP}
	We say that a mechanism $\cA$ is $(\alpha, \epsilon(\alpha))$-RDP with order $\alpha\in (1,\infty)$ if for all neighboring datasets $S,S'$: 
	\begin{align*}
	&D_{\alpha}(\cA(S)\|\cA(S') )\\
	&= \frac{1}{\alpha-1}\log \E_{\theta\sim \cA(S')}\left[ \left(\frac{p_{\cA(S)}(\theta)}{p_{\cA(S')}(\theta)}\right)^\alpha \right] \leq \epsilon(\alpha).
	\end{align*}
\end{definition}
As $\alpha\rightarrow \infty$, RDP converges to the standard $(\epsilon,0)$-DP. More generally, we can convert RDP to standard $(\epsilon, \delta)$-DP for any $\delta>0$ using the conversions of
\citet{balle2020hypothesis}.

\noindent\textbf{Privacy composition.} RDP features a natural composition theorem that significantly simplifies privacy analysis over compositions, and often leads to a tighter privacy guarantee.   If $\cA_1(\cdot)$ is $(\alpha, \epsilon_{\cA_1}(\alpha))$-RDP and $\cA_2(\cdot)$ is $(\alpha, \epsilon_{\cA_2}(\alpha))$-RDP, then the adaptive composition theorem for RDP says that the composed mechanism $\cA_1 \circ \cA_2$ satisfies $(\alpha, \epsilon_{\cA_1}(\alpha)+\epsilon_{\cA_2}(\alpha))$-RDP.

\noindent\textbf{Privacy-preserving prediction.} 
% \paragraph{Privacy-preserving prediction}
We now formally state the setting of privacy-preserving prediction. Consider a prediction task over a domain $\cX$ and label space $\cY$.  The prediction interface $\cA$ has access to a 
private dataset $S=(x_i, y_i)_{i=1}^n \in (\cX \times \cY)^n$ and outputs a value $a \in \cY$ when given a query $q \in \cX$. We denote by $Q$ a query-generating algorithm that can adaptively generate a query given the previously released outputs.  Namely, we denote by  $\cA(S)  \rightleftharpoons_T Q =(q_t, a_t)_{t=1}^T$ the sequence of query-response pairs generated by the prediction interface $\cA$ over a sequence of $T$ queries on dataset $S$, where $a_t=\cA_t(a_1, \dots, a_{t-1}, S, q_t)$.

The privacy guarantee of private prediction is applied for a sequence of predictions generated by the interface $\cA$.

\begin{definition}[Privacy-preserving prediction interface~\citep{dwork2018privacy}]
 A prediction interface $\cA$ is $(\epsilon, \delta)$-differentially private, if for every interactive query generating algorithm $Q$, the output $\cA(S)  \rightleftharpoons_T Q =(q_t, a_t)_{t=1}^T$ is $(\epsilon, \delta)$-DP with respect to dataset $S$.

\end{definition}


Privacy-preserving prediction algorithms can be useful in a variety of situations where releasing a DP model is restricted or not practical. 
For example, companies that train a privacy-preserving model and only require making a limited number of predictions can rely on a prediction interface instead of releasing the entire model. In addition, in health or financial data scenarios, private prediction algorithms allow for a cloud-based interface to be exposed, which can also help to ensure compliance with regulatory requirements.







\noindent\textbf{Individual RDP.}
% \paragraph{Individual RDP}
Our privacy analysis relies on individual privacy loss, which accounts for the maximum possible impact of an individual data point on a dataset. The following definition states the individual privacy loss in terms of R\'{e}nyi divergence. 
\begin{definition}[Individual RDP~\citep{feldman2021individual}]\label{def: indRDP}
Fix $n\in \mathbb{N}$ and a private data point $z=(x, y)\in \cX\times\cY$. We say that a randomized algorithm $\cA$ satisfies $(\alpha, \rho)$-individual R\'{e}nyi differential privacy for $z$ if for all datasets $S=(z_1, \dots, z_m)$ such that $m\leq n$ and $z_i=z$ for some $i$, it holds that
\[
D_\alpha^{\leftrightarrow}\big(\cA(S)\|\cA(S^{-i})\big)\leq \rho,
\] where $S^{-i}$ denotes the dataset $S$ with its $i$-th element $z_i$ removed, and $D_\alpha^\leftrightarrow$ denotes the max of $D_{\alpha}\big(\cA(S)\|\cA(S^{-i})\big)$ and $D_{\alpha}\big(\cA(S^{-i})\|\cA(S)\big)$.
\end{definition}
%We use the $\leftrightarrow$ notation to denote the two directions of R\'{e}nyi divergence.

Note that the individual RDP parameter $\rho$ is a function of a data point $z$, and thus does not imply the standard RDP guarantee in Definition \ref{def:RDP}. However, we can obtain the standard RDP guarantee by requiring that all data points $z$ satisfy individual RDP with the same $\rho$.

Now, we present an example of individual RDP computation for the Gaussian mechanism.
\begin{lemma}[Linear queries with Gaussian mechanism~\citep{feldman2021individual}]\label{lem: ind_linear}
Let $S=(z_1, \dots, z_n)\in (\cX \times \cY)^n$. Suppose that $\cA$ is a $d$-dimensional linear query with Gaussian noise addition, $\cA(S)=\sum_{j\in[n]}q(z_j) + \cN(0,\sigma^2 \mathbf{1}_d)$ for some $q:\cX\times \cY \to \R^d$. Then $\cA$ satisfies individual RDP for $z_i$ with
\[D_\alpha^{\leftrightarrow}\big(\cA(S)\|\cA(S^{-i})\big)\leq \frac{\alpha \|q(z_i)\|_2^2}{2\sigma^2}.\]
Note that by replacing $\|q(z_i)\|_2$ with the $\ell_2$ global sensitivity of $q(\cdot)$, the expression above recovers the standard RDP of the Gaussian mechanism.
\end{lemma}




% why needs fully adaptive composition?


The following theorem states the composition property of individual privacy. For a sequence of algorithms, as long as the composition of individual RDP parameters does not exceed a pre-specified budget for all data points, the output of the adaptive composition preserves the standard RDP guarantee.
\begin{theorem}[Corollary 3.3~\citep{feldman2021individual}]\label{thm: comp}
Fix any $G\geq 0$ and any $\alpha\geq 1$. For any input dataset $S=(z_1, ..., z_n)$ and for any sequence of algorithms $\cA_1, ..., \cA_T$, let $\rho_t^{(i)}$ denote the individual RDP parameter of the $t$-th adaptively composed algorithm $\cA_t$ with respect to $z_i$.
If $\sum_{t=1}^T\rho_t^{(i)} \leq G$ holds almost surely for all $i\in[n]$, then the adaptive composition $\cA^{(T)}$ satisfies $(\alpha, G)$-RDP.
\end{theorem}
The composition rule described above is known as fully adaptive composition~\citep{rogers2016privacy}, which takes \textit{adaptively-chosen} privacy parameters instead of pre-specified ones in the classical adaptive composition. This type of composition is necessary for individual privacy since the individual RDP parameters themselves are random variables that depend on the outputs released by previous composed mechanisms.

To implement the composition above, we need a tool called the \emph{R\'{e}nyi filter}, which is designed to ensure that the composed individual privacy parameters are maintained within a given budget $G$ for all individuals.
% provide an explicit example of Renyi filter.
In practice, we can implement R\'{e}nyi filter by providing each data point with an individual accountant that estimates its composed individual RDP $\sum_{t=1}^T \rho_t^{(i)}$ and dropping the data point once it exceeds the budget, as shown in Algorithm~\ref{alg: filter}. 

However, despite its tighter privacy analysis, this technique has been criticized for its computational cost of tracking individual privacy costs for all data samples. In this work, we demonstrate that KNN works seamlessly with the individual RDP accountant. Only selected neighbors are required to update their individual privacy accountants, which significantly reduces the computational cost.


\begin{algorithm}[htbp]
 \caption{Adaptive composition $\cA^{(T)}$ with R\'enyi filter}
 \label{alg: filter}
 \begin{algorithmic}[1]
 \STATE{\textbf{Input}: Dataset $S\in (\cX \times \cY)^n$, sequence of algorithms $\cA_{1:T}$ and privacy budget $G$}.
 \FOR{$t=1, ..., T$}
 \STATE{For all $z_i \in S$, compute \\\begin{small}$\rho_t^{(i)}=\sup_{S'\in \cS}D_{\alpha}^{\leftrightarrow}\left(\cA_t(a_{1:t-1}, S')\|\cA_t(a_{1:t-1}, S'^{-i})\right)$\end{small}}
 \STATE{Update the active set $S=\{z_i| \sum_{j=1}^t \rho_j^{(i)}\leq G\}$}
 \STATE{Compute $a_t=\cA_t(a_{1:t-1}, S)$}
 \ENDFOR
 \STATE{\textbf{Return} $(a_1, ..., a_T)$}
 \end{algorithmic}
 \end{algorithm}


 % \vspace{-1em}
\section{Private Prediction with Ind-KNN}
 % \vspace{-0.5em}
%\section{Private Prediction with Ind-KNN}
\label{sec:main}

To overcome the limitations of private training, we propose \emph{Individual Kernelized Nearest Neighbor} (Ind-KNN)---a k-nearest neighbor-based private prediction algorithm that achieves a comparable DP guarantee and test accuracy to that of private training.

\noindent\textbf{Notations and setup.} 
% \paragraph{Notations and setup}
We focus on the task of multi-class classification. 
Given a private dataset $S=(x_i, y_i)_{i=1}^n \in (\cX \times \cY)^n$, we assume $y_i$ is a one-hot vector over $c$ classes, i.e., $y_i \in\{0,1\}^c$. 
Let $\phi(\cdot)$ denote a \emph{public} feature extractor that maps the input $x\in \cX$ to a fixed-length feature representation $\phi(x)\in \R^d$. This could be image features extracted from the penultimate layer of a pre-trained ResNet50 model or language features extracted from the final layer of a transformer model. The feature extractor is used to encode both the private dataset and public queries.
% need a description of the private dataset
\begin{algorithm}[htbp]
\caption{Privacy-preserving prediction with naive kNN}
\label{alg: naive_ind_kNN}
\begin{algorithmic}[1]
\STATE{\textbf{Input}: Dataset $S\in (\cX \times \cY)^n$, sequence of queries $q_1, \dots, q_T$, number of neighbors $k$ and the noise scale $\sigma$.}
\FOR{$t=1$ to $T$}
\STATE{$\cN_k:=$ top-$k$ nearest neighbors of the query $q_t$}
\STATE{$a_t = \argmax_{j\in[c]}\left(\sum_{i \in \cN_k }y_i +\cN(\boldsymbol{0}, \sigma^2\mathbf{1}_c) \right)_j$}
\ENDFOR
\STATE{\textbf{Return} $(a_1, ..., a_T)$}
\end{algorithmic}
\vspace{-1mm}
\end{algorithm}

% why knn does not work?
% why need to release k?
% why using kernel?
Previously, k-Nearest Neighbor (kNN) has been used for privacy-preserving prediction by \citet{zhu2020private} (Algorithm \ref{alg: naive_ind_kNN}).
In this method, when a query $q_t$ arrives, the top k nearest neighbors are selected from the private dataset based on the distance in the feature space, and their labels are utilized for prediction through a Gaussian mechanism.

However, the privacy loss of Algorithm~\ref{alg: naive_ind_kNN} 
accumulates rapidly as the number of queries increases, owing to its conservative privacy analysis that bounds the worst-case individual privacy loss over all individuals. In contrast, the Ind-KNN approach emphasizes individual privacy accounting, providing precise control over privacy loss at an individual data level. This allows each data point's privacy to be charged by the exact amount of its contribution to the query response, and private data is removed once its own privacy budget has been exhausted.






\begin{algorithm}[htbp]
\caption{Kernelized-nearest-neighbor with individual privacy accounting (Ind-KNN)}
\label{alg:ind_kNN}
\begin{algorithmic}[1]
\STATE{\textbf{Input}: Dataset $S\in (\cX \times \cY)^n$, the kernel function $\kappa(\cdot, \cdot)$, the threshold $\tau$, sequence of queries $q_{1:T}$, the noise scales $\sigma_1$, $\sigma_2$ and the individual budget $B$. }
\STATE{Initialize individual budget $z_i = B, \forall i \in [n]$.}
\FOR{$t=1$ to $T$}
\STATE{Update the active set $S=\{(x_i, y_i)|z_i \geq \frac{1}{2\sigma_1^2}\}$.}
\STATE{Release the number of selected neighbors: $K_t:= \sum_{(x_i, y_i) \in S} \mathbb{I}[\kappa(x_i, q_t)\geq \tau] + \cN(0, \sigma_1^2).$}
\FOR{$(x_i, y_i)\in S$}
\STATE{Update the remaining budget $z_i$ after releasing $K_t$: $z_i=z_i - \frac{1}{2\sigma_1^2}\cdot \mathbb{I}[\kappa(x_i, q_t)\geq \tau]$. }
\STATE{Evaluate individual contribution $f_t: \cX \times \cY \to \R^c$ as $f_t(x_i, y_i):=\min\big(\kappa(x_i, q_t)\cdot y_i \cdot \mathbb{I}[\kappa(x_i, q_t)\geq \tau], \sigma_2\sqrt{2K_t\cdot z_i}\cdot \mathbf{1}_c\big)$}

\STATE{Update the remaining budget $z_i$ after releasing label: $z_i = z_i - \frac{||f_t(x_i, y_i)||_2^2}{2\sigma_2^2\cdot K_t}$.}
\ENDFOR
\STATE{$a_t = \argmax_{j \in [c]}\big(\sum_{(x_i, y_i)\in S} f_t(x_i, y_i)+\cN(\boldsymbol{0}, \sigma_2^2\cdot K_t\cdot\mathbf{1}_c)\big)_j$.}
%\STATE{$a_t$ = Algorithm~\ref{alg:noisy_label}($S, \kappa(\cdot, \cdot), q_t, \sigma_1, \sigma_2$).}
\ENDFOR
\STATE{\textbf{Return} $(a_1, ..., a_T)$}
\end{algorithmic}
\vspace{-1mm}
\end{algorithm}



\begin{comment}
\begin{algorithm}[htbp]
\caption{Kernelized-nearest-neighbor with individual privacy accounting (Ind-KNN)}
\label{alg:ind_kNN_old}
\begin{algorithmic}[1]
\STATE{\textbf{Input}: Dataset $S\in (\cX \times \cY)^n$, the kernel function $\kappa(\cdot, \cdot)$, the threshold $\tau$, sequence of queries $q_{1:T}$, the noisy scale $\sigma_1$, $\sigma_2$ and the individual budget $B$. }
\STATE{Initialize individual budget $z_i = B, \forall i \in [n]$.}
\FOR{$t=1$ to $T$}
\STATE{Update the active set $S=\{(x_i, y_i)|z_i >0\}$.}
%\STATE{Compute the kernel weight $\kappa(x, q_t) $ for $ (x, y) \in S$.}
\STATE{The selected neighbors: $\cN_t := \{(x_i, y_i)|\kappa(x_i, q_t) \geq \tau   \text{ for all } (x_i, y_i) \in S\}$.}
\STATE{Drop $(x_i, y_i)$ from $\cN_t$ if $z_i \leq \frac{1}{2\sigma_1^2}$.}
\STATE{Release $|\cN_t|$: $K_t:=|\cN_t| + \cN(0, \sigma_1^2).$}
\FOR{$(x_i, y_i)\in \cN_t$}
\STATE{Update $z_i$ after releasing $K_t$: $z_i=z_i - \frac{1}{2\sigma_1^2}$. }
\STATE{Evaluate individual ``contribution'': $g_i = \min \left(\frac{\kappa(x_i, q_t)^2}{2\sigma_2^2\cdot K_t}, \sigma_2\sqrt{2 K_t z_i}\right)$.}
\STATE{Update $z_i$ after releasing label: $z_i = z_i - g_i$.}
%\STATE{Drop $(x_i, y_i)$ from the active set $S$ if $z_i\leq0$.}
\ENDFOR
\STATE{Compute $a_t = \argmax_{j \in [c]}\big(\sum_{i\in \cN_t}\kappa(x_i, q_t)\cdot y_i +\cN(\boldsymbol{0}, \sigma_2^2\cdot K_t\mathbb{I}_c)\big)_j$.}
%\STATE{$a_t$ = Algorithm~\ref{alg:noisy_label}($S, \kappa(\cdot, \cdot), q_t, \sigma_1, \sigma_2$).}
\ENDFOR
\STATE{\textbf{Return} $(a_1, ..., a_T)$}
\end{algorithmic}
\vspace{-1mm}
\end{algorithm}

 \vspace{-1.em}
\subsection{Ind-KNN}
 \vspace{-0.5em}
\end{comment}
We propose a novel solution, \emph{Individual Kernelized Nearest Neighbor} (Ind-KNN), in Algorithm~\ref{alg:ind_kNN}. Intuitively, nearest-neighbor-based prediction leaks little to no private information when the query point is near a dense region of the training data. This is because the result of the query is determined by a large number of training samples and hence is insensitive to individual training points. We make several modifications to Private kNN to realize this intuition.


First, we introduce individual privacy accounting by assigning each private data point $(x_i, y_i)$ a pre-determined privacy budget $B$, tracked by the variable $z_i$, which is initialized to $B$. For each query, the algorithm updates the active set $S$ to only include data points with $z_i\geq\frac{1}{2\sigma_1^2}$. This ensures that the privacy budget for each individual is not exceeded. 

Second, Ind-KNN improves upon Algorithm~\ref{alg: naive_ind_kNN} by utilizing a pre-specified threshold $\tau$ and a kernel-based similarity function $\kappa(\cdot, \cdot)$ to select only neighbors with similarity above $\tau$. This approach allows only the selected neighbors to be accountable for their privacy loss, preserving the privacy budget of unselected private individuals for future queries. It is worth noting that simply selecting the exact top-$k$ neighbors, as in Algorithm~\ref{alg: naive_ind_kNN}, is not consistent with individual privacy loss. This is because the decision of selection is dependent on the dataset: the $(k+1)$-th nearest neighbor in one dataset may be the top nearest neighbor in another dataset. Hence, all private data points must account for their individual privacy loss, even if only a subset of them contribute to the prediction, according to the definition of individual RDP (Definition~\ref{def: indRDP}).

Moreover, Ind-KNN employs kernel weights for prediction aggregation instead of equal weight for all nearest neighbors. In our experiments, we consider two types of kernel functions, RBF and cosine, to measure the similarity. For example,  the RBF kernel is defined as $\kappa(x, q_t):= e^{\frac{-||\phi(x)-\phi(q_t)||_2^2}{\nu^2}}$, where $\phi(x)$ and $\phi(q_t)$ are the encoded features and $\nu$ is a scalar parameter. This adaptation, made possible by individual privacy accounting, results in a more accurate characterization of each individual's contribution to the query. However, changing from equal weight to kernel weight in Algorithm~\ref{alg: naive_ind_kNN} would not alter its privacy analysis (as the worst-case kernel weight is bounded by 1), but would instead decrease the signal-to-noise ratio (each neighbor's contribution would be less than 1).



Finally, Ind-KNN dynamically adjusts the magnitude of noise added to the noisy prediction by publishing the number of neighbors at each query.
We find that adding noise with variance proportional to $K_t$ is crucial for good performance. This allows us to adjust the margin of the voting space --- the difference between the largest and the second largest coordinate of  $\sum_{(x_i, y_i)\in S} f(x_i, y_i)$ adapted to the noise scale. Specifically, when the margin is significant, adding larger noise will not change the output label, but it reduces each individual's individual privacy loss proportional to the reciprocal of $K_t$, enabling them to participate in more queries in the future.

\textbf{Algorithm.} The modifications made in Ind-KNN are summarized in Algorithm~\ref{alg:ind_kNN}. Specifically, since the number of selected neighbors is considered private information, each selected neighbor accounts for its individual  privacy loss due to releasing $\mathbb{I}[\kappa(x_i, q_t)\geq \tau]$ by subtracting $\frac{1}{2\sigma_1^2}$ from $z_i$ at line 7 of Algorithm~\ref{alg:ind_kNN}. Meanwhile, 
$f_t(x_i, y_i)$ at line 8 accounts for the individual contribution of releasing its label associated with kernel weight. 
The first term represents the ``weighted'' one-hot label for selected neighbors and all-zero vectors for unselected private data.
The second term $ \sigma_2\sqrt{2 K_t z_i}$ ensures that the incurred individual privacy loss of releasing label will not go beyond the remaining budget $z_i$. 


\begin{lemma}[Individual RDP of releasing $a_t$]
Given a query $q_t$, for each $(x_i, y_i)$, define the function $f_t: \cX \times \cY \to \cR^c $ as line $8$ in Algorithm~\ref{alg:ind_kNN}.  Then the release of $a_t = \argmax_{j\in [c]}\big(\sum_{(x_i, y_i)\in S}f_t(x_i, y_i) +\cN(0, \sigma_2^2 \cdot K_t \cdot \mathbf{I}_c)\big)_j$ satisfies $(\alpha, \frac{\alpha \cdot ||f_t(x_i, y_i)||_2^2}{2\sigma_2^2\cdot K_t})$ individual RDP for each $(x_i, y_i)$.
\end{lemma}
The proof directly follows from Lemma~\ref{lem: ind_linear} and the post-processing property of individual privacy. Note that for unselected private data, their individual privacy loss is always zero since their individual contribution $f(\cdot)$ is zero.
\begin{theorem}\label{thm: privacy_indknn}
Algorithm~\ref{alg:ind_kNN} satisfies $(\alpha, B\alpha)$-RDP for all $\alpha\geq 1$.
\end{theorem}
\begin{proof}[Proof sketch]
The proof (deferred to the appendix) makes use of the facts that: (1) the decision rule for ``being selected''  is not influenced by any other private data points, thus, ``unselected'' neighbors do not incur any individual privacy loss. (2) adding/removing one selected neighbor would only change $\sum_{(x_i, y_i)\in S} \mathbb{I}[\kappa(x_i, q_t)\geq \tau]$ by one, thus the release of $K_t$ satisfies $(\alpha, \frac{\alpha}{2\sigma_1^2})$ individual RDP for selected neighbors. (3)  the release of label associated with the kernel weight satisfies $(\alpha, \frac{\alpha ||f_t(x_i,y_i)||_2^2}{2\sigma_2^2 K_t})$ individual RDP.
\end{proof}
\begin{remark}
We remark that the privacy guarantee of Ind-KNN is determined by the given individual budget, and remains the same regardless of the number of predictions made. However, as the number of predictions increases, the exclusion of private data may result in a degradation of the algorithm's utility.
\end{remark}

% add a remark 
% \vspace{-1.em}
\subsection{Efficient Ind-KNN}\label{sec: hash}
% \vspace{-0.5em}
In this section, we present two novel techniques that aim to improve the efficiency of Ind-KNN in terms of both utility and computational cost.

% in which scenarios we can reuse prediction?
% \paragraph{Ind-KNN with prediction reuse}
\noindent\textbf{Ind-KNN with prediction reuse.}
The first technique improves the utility of Ind-KNN by exploiting the previously released predictions. We acknowledge that the query-response pairs that have been disclosed can be considered public information. Therefore, we incorporate those predictions into the active set $S$ without any limitation on their privacy budgets. The results of our experiments demonstrate that this extension effectively mitigates the utility loss caused by the exclusion of private data points and improves the test accuracy when handling a large number of queries. 



% \paragraph{Ind-KNN with hashing}
\noindent\textbf{Ind-KNN with hashing.}
Algorithm~\ref{alg:ind_kNN} requires searching through all private data to answer each query, which can be computationally expensive if the private dataset is large.
To address this issue, we present a variant of Ind-KNN that incorporates locality-sensitive hashing (LSH)~\citep{gionis1999similarity} for efficient nearest neighbor search. The full algorithm of Ind-KNN-Hash is in the appendix.
LSH is a well-established technique to speed up the approximate nearest neighbor search.  The principle behind the algorithm is to apply LSH to group private data points into ``buckets'' based on their hash values. When a query is made, the algorithm only needs to search the bucket that the query falls into, rather than searching through the entire dataset. 

Concretely, Ind-KNN-Hash creates $L$ hash tables $\cF=(f_1, ..., f_L)$, each of which maps a feature $\mu\in \cR^d$ to a $b$-dimensional bucket.  For each table $f$, the algorithm generates $b$ independent random Gaussian vectors from $\cN(\boldsymbol{0}, \mathbf{I}_d)$, denoted by $r_j$ for $1\leq j\leq b$. Then we encode $\mu$ with $f(\mu) = (h_1(\mu),..., h_b(\mu))$, where $h_j(\mu)=0$ if $r_j^\top \mu <0$, otherwise $h_j(\mu)=1$. The algorithm then indexes all private data points into the hash tables using their encoded features. When a query $q_t$ is received, the algorithm uses LSH to retrieve a set of private data points that are hashed into the same bucket in at least one table, which is denoted by $\cF(q_t)$. Finally,  Algorithm~\ref{alg:ind_kNN} is called to label each query with a slight modification on the active set, which is now restricted to the retrieved data points with non-negative individual budgets. Typically, increasing the number of hash tables $L$ and reducing the bucket size $b$ results in more accurate neighbors but higher computational costs. 

Incorporating LSH into Ind-KNN does not impose any additional privacy cost. This is because the encoding of each private data point is based on random Gaussian vectors and is executed independently of any other private data points.



 \vspace{-1em}
\section{Experiments}\label{sec:exp}
 % \vspace{-1em}
%\subsection{Datasets and Features}\label{sec:feature}
We consider the following standard image classification and language classification datasets. For each dataset, we take the training set as the private domain and the testing set as the public domain.% We report the averaged prediction accuracy on a random subset from the testing set. 

\noindent\textbf{Image classification.} 
% \paragraph{Image classification}
We evaluate our method on two widely used image classification benchmarks, CIFAR-10 \citep{Krizhevsky2009LearningML} and Fashion MNIST \citep{Xiao2017FashionMNISTAN}. For CIFAR-10, we employed the recent Vision Transformer (ViT) model~\citep{vit}, which is pre-trained on ImageNet-21k (consisting of 14 million images and 21843 classes). The features extracted from the ViT model are represented as 768-dimensional vectors. For Fashion MNIST, we consider the publicly available ImageNet-pretrained ResNet50~\citep{He_2016_CVPR} from Pytorch as the feature extractor. The model returns a $1000$-dim vector for each input image.

\noindent\textbf{Text classification.} 
We utilize AG News \citep{Zhang2015CharacterlevelCN} and DBPedia \citep{Lehmann2015DBpediaA} datasets to evaluate the performance of Ind-KNN on text classification tasks. We employ sentence embedding models \citep{zhao-etal-2022-compressing, reimers-2019-sentence-bert} to extract features. Specifically, we utilize the \texttt{all-roberta-large-v1} sentence-transformer, which has been fine-tuned on a 1B sentence pairs dataset using a self-supervised contrastive learning objective. The extracted features are 1024-dimensional vectors for each text instance.


We consider the following two algorithms for comparisons:

 \textbf{Linear+NoisySGD}~\citep{tramer2020differentially} is a private training benchmark that has been shown to outperform end-to-end privacy-preserving deep learning methods (including those pre-trained on public data, see \citet{de2022unlocking}) for a wide range of $\epsilon$. We consider this algorithm as a reference point for private training to investigate how well Ind-KNN performs compared to private training while we gain those computational savings. We implement the algorithm by training a linear model with features extracted from the same extractor as Ind-KNN. We use the default batch size $256$ and clip the gradient norm to $0.1$. The model is trained for 10 epochs with a grid search over the learning rate and the noise level is determined by the target privacy budget.
    
 \textbf{Private-kNN}~\citep{zhu2020private} is a private prediction baseline that we consider. For each query, the algorithm first samples a random subset from the private dataset, retrieves the k-nearest neighbors from the subset (based on the extracted features), and then releases the noisy label of kNN prediction using Report-Noisy-Max. We tune the sampling ratio and the number of neighbors on the validation set. The noise scale is calibrated based on the target privacy budget. 













\begin{figure}[hbt]
\centering
\begin{subfigure}[h]{.85\linewidth}
    \centering\includegraphics[width=0.95\linewidth]{img/cifar10_trade_off_vit.pdf}
    \caption{Accuracy of $1000$ queries on CIFAR-10. }
    %\label{fig: cifar10_tradeoff}
\end{subfigure}
\begin{subfigure}[h]{.85\linewidth}
    \centering\includegraphics[width=0.95\linewidth]{img/query_vs_acc_cifar10_vit.pdf}
    \caption{Accuracy vs number of queries on CIFAR-10 under $(1, 10^{-5})$-DP. }
    \label{fig: query_vs_acc}
\end{subfigure}
\caption{Privacy-utility trade-offs on CIFAR-10. We plot the median accuracy across $5$ independent runs.}
\label{fig: cifar10} \vspace{-1.em}
\end{figure}



% \paragraph{Hyper-parameters of Ind-KNN}
\noindent\textbf{Hyper-parameters of Ind-KNN.}
We set the individual RDP budget $B$ such that using RDP to DP conversion on $(\alpha, B\alpha)$-RDP satisfies the predefined privacy budget $(\epsilon, \delta)$. Then, we set the noise scale $\sigma_1$ to be $\sqrt{\frac{T}{6B}}$ to use roughly half of the individual RDP budget $B$ for each data point being selected at every query and tune the noise scale $\sigma_2$ on the validation set. We consider two kernel methods, the RBF kernel $\kappa(x, q) = e^{\frac{-||\phi(x)-\phi(q)||_2^2}{\nu^2}}$ and the cosine similarity $\kappa(x, q) = \cos(\phi(x), \phi(q))$. A linear scaling search is run on the minimum kernel weight threshold $\tau$ for each kernel method. Additional details are given in the  appendix.
   


\begin{figure*}[t]
\centering	
\begin{subfigure}[t]{.3\linewidth}
    \centering\includegraphics[width=0.95\linewidth]{img/fmnist_trade_off.pdf}
    \caption{Accuracy of $500$ queries on FMNIST.}
    \label{fig:fmnist_tradeoff}
\end{subfigure}\qquad
\begin{subfigure}[t]{.3\linewidth}
    \centering\includegraphics[width=0.95\linewidth]{img/agnews_trade_off.pdf}
    \caption{Accuracy of $800$ queries on AG News.}
    \label{fig: agnews_tradeoff}
\end{subfigure}\qquad
\begin{subfigure}[t]{.3\linewidth}
    \centering\includegraphics[width=0.95\linewidth]{ind_knn/img/dbpedia_trade_off.pdf}
    \caption{Accuracy of $800$ queries on DBPedia.}
    \label{fig: dbpedia_tradeoff}
\end{subfigure}
\caption{Privacy-accuracy trade-offs on FMNIST, AG News and DBPedia. We consider $\delta=10^{-5}$ for FMNIST and AG News and $\delta=10^{-6}$ for DBPedia.}
\label{fig: trade_off}
 \vspace{-1em}
\end{figure*}
% \paragraph{Experiment setting}
\noindent\textbf{Experiment setting.}
For all experiments, we use a random seed to generate a validation set of size $T$. For example, we randomly sample $1000$ examples from the CIFAR-10 testing dataset and tune the best hyper-parameters of all approaches on the validation set. We then report the median accuracy across 5 independently sampled query sets. 
All experiments are conducted on a server with an Intel i7-5930K CPU @ 3.50GHz and Nvidia TITAN Xp GPU.
 





\vspace{-0.5em}
\subsection{Main Results}\label{sec: exp_main_result}
 \vspace{-0.5em}
% \paragraph{Privacy-accuracy trade-off on CIFAR-10}
\noindent\textbf{Privacy-accuracy trade-off on CIFAR-10.}
In the top figure of Figure~\ref{fig: cifar10}, we plot the median accuracy evaluated on  $1000$ randomly chosen queries from the CIFAR-10 test set over a range of privacy budget $\epsilon$. The hyper-parameters were fine-tuned for each algorithm at each value of $\epsilon$.  For Ind-kNN, we found that the best hyper-parameter $\tau$ (the minimum threshold) increases as the privacy budget grows. This is because, with a smaller value of $\epsilon$, the added noise requires a larger margin among the selected neighbors' votes to  determine the correct output. This larger margin, in turn, corresponds to a smaller threshold and more selected neighbors.
For Ind-KNN with RBF kernel, we set the kernel bandwidth to $\nu=e^{1.5}$ and search for the optimal minimum threshold $\tau$ on the validation set. We find that different choices of kernel bandwidth in the RBF kernel produce similar accuracy results. 
As shown in Figure~\ref{fig: cifar10}, Ind-kNN with the RBF kernel performs slightly better than with the cosine kernel, and both kernel methods are comparable to Linear NoisySGD across various values of $\epsilon$.

% For Private-kNN, we found that the best hyper-parameter $k$ decreases as the privacy budget increases. For example, we used $k=300$ when $\epsilon=0.5$, while $k=100$ when $\epsilon=2.0$.


% \paragraph{Accuracy vs $|$queries$|$ on CIFAR-10}
\noindent\textbf{Accuracy vs number of queries on CIFAR-10.}
Given a fixed privacy budget, the accuracy of all private prediction methods typically degrades as the number of predictions increases, while the accuracy of private training methods remains unaffected.   In the bottom figure of Figure~\ref{fig: cifar10}, we study how quickly the accuracy of Ind-KNN drops as the number of queries increases. We present the median accuracy of answering $T$ queries over five independent rounds. %We observe a low accuracy region for answering less than 400 queries, owing to the randomness associated with a small number of queries.
The accuracy of Private kNN drops rapidly with the increasing number of queries. This decline is expected, as Private kNN applies the standard R\'enyi composition theorem to analyze privacy loss, requiring the noise level to increase proportionally to the square root of $T$. In contrast, Ind-KNN uses individual privacy accountants, which only require selected neighbors to account for privacy loss, resulting in no significant accuracy drop as more queries are answered. Furthermore, exploiting released predictions allows Ind-KNN to answer an additional $1000$ queries (from $T=2000$ to $T=3000$) without an accuracy drop.
The figure also shows that if the number of queries is less than 2000, Ind-KNN can in fact outperform Linear NoisySGD, making it a practical alternative to private training methods when only a small number of predictions is needed.



% \paragraph{Privacy-accuracy trade-off on Fashion MNIST, AG News and DBPedia}
\noindent\textbf{Privacy-accuracy trade-off on Fashion MNIST, AG News and DBPedia.}
Next, we examine the privacy-accuracy trade-off on Fashion MNIST, AG News and DBPedia datasets. We use Ind-KNN with cosine kernel for all datasets. Figure~\ref{fig:fmnist_tradeoff} shows that Ind-KNN outperforms Private-kNN for all values of the privacy parameter $\epsilon$ on Fashion MNIST. On AG News, we compare the performance of Ind-KNN to that of Linear NoisySGD, and the results are presented in Figure~\ref{fig: agnews_tradeoff}. We evaluate $T=800$ queries on AG News and find that the accuracy of Ind-KNN either surpasses or matches that of Linear NoisySGD for $\epsilon\geq 0.5$. We also observe similar improvements over Private-kNN  on DBPedia.

Overall, Ind-KNN demonstrates its versatility by delivering competitive accuracy results on all three datasets, making it a promising solution for balancing differential privacy and accuracy.




 \begin{figure}[htb]
 \centering 
 \begin{subfigure}[c]{.87\linewidth}
     \centering\includegraphics[width=0.90\linewidth]{img/speed_trade_off.pdf}
    \caption{Amortized computational cost vs retraining frequency. \label{fig: speed}}
 \end{subfigure}\qquad
 \begin{subfigure}[c]{.87\linewidth}
     \centering\includegraphics[width=0.90\linewidth]{img/cifar10_align_ac.pdf}
    \caption{Privacy cost vs $|\text{queries}|$ when accuracy is aligned.}
     \label{fig: cost1}
      \vspace{-0.5em}
 \end{subfigure}
 \caption{(a): We estimate the amortized computational cost by averaging the time (in seconds) spent to answer each query under different retrain settings on CIFAR-10. The x-axis denotes the retraining frequency, i.e., retraining the model after every $Q$ queries received.  
(b): The accumulated privacy cost of answering a stream of $T=2000$ queries when the final accuracy (over 2000 queries) is aligned to $96.0\%$ on CIFAR-10. The red curve fixed the individual privacy budget at the beginning, resulting in a constant privacy loss. The yellow curve reports the median of individual privacy loss across  all private data. }\label{fig: unlearn}
\vspace{-1em}
\end{figure}


 \vspace{-1em}
\subsection{Ablation Studies}
 \vspace{-0.5em}
We first perform an ablation study in Figure~\ref{fig: unlearn} to better understand how the  periodical retraining affects the performance of private training method and our Ind-KNN in terms of computational and privacy cost on CIFAR-10.
% \paragraph{Periodical retraining}

\noindent\textbf{Periodical retraining.}
In Figure~\ref{fig: speed}, we provide empirical measurements of the amortized computational cost associated with periodical retraining on CIFAR-10 of answering a stream of total $T=10^5$ queries. We assume a retraining request is triggered every time the model has answered $Q$ queries. To simplify the analysis, we assume each retraining is performed on the same dataset. For Linear NoisySGD, we retrain the model for 10 epochs and we calculate the per-query computational cost by dividing the total time spent on retraining and answering $T$ queries by $T$. This provides an estimate of the average time required to answer a single query.  For Ind-KNN with the cosine kernel, the average time of making a prediction is reported.   Ind-KNN-Hash uses 30 hash tables with the width parameter $b=8$. % The x-axis indicating the frequency of retraining requests $Q$.
 Our results demonstrate that the computational cost per query remains constant for Ind-KNN and Ind-KNN-Hash, as they do not require retraining the model, and the time required to add or delete individual data points is negligible. In contrast, for Linear NoisySGD, every retraining request incurs a substantial computational cost and the privacy loss grows $\propto \frac{1}{\sqrt{Q}}$ (proportional to the square root of total epochs). These findings highlight the advantage of Ind-KNN and Ind-KNN-Hash over Linear NoisySGD in terms of efficiency and resource utilization for machine unlearning and other scenarios with periodic retraining requests.



 \begin{figure}[htb]
 \centering 
 \begin{subfigure}[t]{.83\linewidth}
     \centering\includegraphics[width=0.9\linewidth]{ind_knn/img/cifar10_hash.pdf}
    \caption{Accuracy of $T=1000$ queries on CIFAR-10.}
     \label{fig: hash_cifar}
 \end{subfigure}\qquad
 \begin{subfigure}[t]{.83\linewidth}
     \centering\includegraphics[width=0.90\linewidth]{ind_knn/img/agnews_hash.pdf}
    \caption{Accuracy of $T=1000$ queries on AG News.}
     \label{fig: hash_ag}
 \end{subfigure} \vspace{-0.5em}
 \caption{Ablation study on hashing under $\delta=10^{-5}$. \label{fig: hash}}
  \vspace{-1em}
\end{figure}
Figure~\ref{fig: cost1} evaluates the accumulated privacy loss of answering a stream of $T=2000$ queries on CIFAR-10.
We tune hyper-parameters for both approaches such that the averaged accuracy of answering $T$ queries is aligned to $96.0\%$. We consider two types of retraining scenarios: $Q=100$ and $Q=200$. 
Periodic retraining has a negligible privacy impact on  Ind-KNN. Therefore, we only use one red curve to indicate the privacy loss of Ind-KNN under two scenarios. The individual privacy budget of Ind-KNN is pre-determined, thus the standard privacy guarantee remains unchanged when making more predictions. The yellow curve plots the median of individual privacy loss over all private data points and reflects how much individual privacy loss deteriorates as the number of answered queries increases. We note the median individual privacy loss is $\epsilon=1.2$ after answering 2000 queries, which suggests that only half of the privacy budget has been spent at an individual level.
The privacy loss curves of Ind-KNN and the two Linear NoisySGD settings meet once six retraining requests have been received. This suggests that if there are more than six retraining requests among the $2000$ queries, the privacy loss of Ind-KNN would be  better than that of Linear NoisySGD. 
% what is yellow curve?

\begin{table}[htbp]
\centering
\caption{ Test Accuracy of $T=1000$ queries on CIFAR-10 under different pre-trained models: vision transformer (ViT)~\citep{vit},  SimCLRv2 model~\citep{clr} and ResNet50~\citep{He_2016_CVPR}. }
\resizebox{0.47\textwidth}{!}{%
\begin{tabular}{c|c|c|c|c}
  \hline
   $\epsilon (\delta = 10^{-5})$ & Method & ResNet50 & SimCLRv2 & ViT \\
  \hline
  \multirow{3}{*}{$\epsilon=0.5$} & Linear NoisySGD  & 86.2\% & 89.7\%& 95.0\%  \\
    & Private kNN &73.1\%& 76.0\% &  94.4\% \\
      & Ind-kNN  &79.4\%& 82.4\%& 95.2\%  \\
  \hline
    \multirow{3}{*}{$\epsilon=2.0$} & Linear NoisySGD &88.4\% & $90.2\%$ & $96.7\%$  \\
    & Private kNN &81.6\% & 84.7\%& $96.3\%$ \\
      & Ind-kNN  & 82.8\% & 86.3\%& $96.4\%$  \\
       \hline
        \multirow{3}{*}{$\epsilon=\infty$} & Linear NoisySGD & 90.0\% & 90.7\% & 97.0\% \\
    & Private kNN & 82.9\% &  85.1\% & 96.6\%\\
    & Ind-kNN  &  84.7\% & 89.2\% &  96.9\% \\
    \hline
\end{tabular}%
} \vspace{-0.5em}
\label{tab: vary_model}
\end{table}


\begin{table}[htbp]
\centering
\setlength{\tabcolsep}{3pt}
\caption{The average time (in seconds) to answer each query on CIFAR-10 and AG News using Ind-KNN and its hashing variants.}
% \vspace{-0.5em}
\begin{tabular}{lcccc}
    \toprule
    Dataset & Table=10& Table=20 &Table=30 &Ind-KNN  \\
    \hline
      CIFAR-10    &   0.02 & 0.03&  0.04& 0.25 \\ 
      \hline
     AG News &0.01 & 0.02 & 0.03 & 0.29\\
    \hline
\end{tabular}
 \vspace{-0.5em}
\label{tab: time_vs_hash}
\end{table}


% \paragraph{Computational cost vs utility}
% what's computational cost?
\noindent\textbf{Ablation study on hashing.}
In Sec~\ref{sec: hash}, we introduce hashing to improve the computational efficiency of Ind-KNN. We now investigate the trade-off between computational cost and utility of Ind-KNN-Hash on CIFAR-10 and AG News.  We set the width parameter $b=8$ for CIFAR-10 and $b=9$ for AG News, and evaluate the  performance of Ind-KNN-Hash with varying numbers of hash tables.
As shown in Figure~\ref{fig: hash} and Table~\ref{tab: time_vs_hash}, the accuracy of hashing variants increases with more hash tables and more computational cost. Notably, the computational cost grows roughly linearly with the number of hash tables.
In particular, Ind-KNN with $30$ hash tables matches the accuracy of the original Ind-KNN for a wide range of epsilon on CIFAR-10 but reduces the running time per query from $0.25$ seconds to $0.04$ seconds. 
Figure~\ref{fig: hash_ag} shows similar observations for AG News. 
 
 



\noindent\textbf{Ablation study on the feature extractor.}
The quality of the feature extractor plays a crucial role in all three pre-trained feature-based methods. Remarkably, with the ViT feature extractor, even the Private kNN achieves an impressive accuracy of $96.3\%$ at $\epsilon=2.0$ on CIFAR-10,  surpassing the previously reported best result of $95.4\%$~\citep{de2022unlocking} achieved using Wide-ResNets. Next, we present an ablation study focusing on three feature extractors and investigate the efficiency of each method on the CIFAR-10 task. Specifically, we consider three widely used vision models:  vision transformer (ViT)~\citep{vit}, the SimCLRv2 model~\citep{clr} and ResNet50~\citep{He_2016_CVPR}. The SimCLRv2-based feature extractor has been considered by prior work Linear NoisySGD~\citep{tramer2020differentially}, which trains a ResNet model on unlabeled ImageNet using SimCLRv2 model and provides a $4096$-dim feature for each input image. For ResNet50, we consider the publicly-available ImageNet-pretrained ResNet50 from Pytorch, which achieves a non-private accuracy at $90.0\%$ for LinearSGD.  As shown in Table~\ref{tab: vary_model}, we find that Linear NoisySGD  outperforms Private kNN and our Ind-kNN across ResNet50 and SimCLRv2. However, the performance gap decreases when applying a better feature extractor.  This can be explained by the corresponding gap in their non-private performance. We also note that Private kNN is more fragile when $\epsilon$ is small, which could be due to its ``loose'' privacy analysis.   Meanwhile, Ind-kNN handles the setting of small $\epsilon$ nicely, and can sometimes outperform Linear NoisySGD with a good feature extractor. 





\vspace{-1.em}
\section{Summary}
\vspace{-1.em}
The paper proposes a new algorithm, Individualized Kernelized Nearest Neighbor (Ind-KNN), for private prediction in machine learning that is more flexible and updatable over dataset changes than private training. By modifying the KNN prediction and leveraging individualized privacy accountants, Ind-KNN allows a precise control of privacy at an individual level. Through extensive experimentation on four datasets, we demonstrate that Ind-KNN outperforms prior work Private kNN in terms of privacy and utility trade-offs. Furthermore, Ind-KNN exhibits superior computational efficiency and utility when dealing with frequent data updates, surpassing the private training method.

\vspace{-0.5em}
\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                        % so you can already fill it to test with the
                        % ‘accepted’ class option
\vspace{-0.5em}
YZ, XZ and YXW are partially supported by NSF Award \#2048091. YZ was supported by a Google PhD Fellowship and XZ was supported by UCSB Chancellor's Fellowship.
\end{acknowledgements}
\bibliography{egbib, DP}
% References
%\newpage
%appendix
%\onecolumn
%\input{ind_knn/10_appendix}

\end{document}
