\section{Data Dependent Smoothing}

% We first require some background on randomized smoothing before introducing our approach. Interested readers can follow \cite{cohen2019certified} for a more detailed description.

% We introduce our main technique for data dependent Gaussian smoothing. 


\subsection{Preliminaries and Notations}
% \vspace{-0.15cm}
% We consider the standard classification problem, 

Let $(x,y)$ be an input-label pair, with input $x \in \mathbb{R}^d$ and label $y \in \mathcal{Y} = \{1,\dots,k\}$, sampled from an unknown data distribution. Unless stated otherwise, we consider a classifier $f_\theta: \mathbb{R}^d \rightarrow \mathcal{P}(\mathcal{Y})$ parameterized by $\theta$, where $\mathcal{P}(\mathcal{Y})$ is the probability simplex over $k$ labels. We say that $f_\theta$ is $\ell_p^r$ certifiably accurate for an input $x$ if and only if $\argmax_c f_\theta^c(x) = \argmax_c f_\theta^c(x+\delta) = y ~~\forall~ \|\delta\|_p \leq r$, where $f_\theta^c$ is the $c^{\text{th}}$ element of $f_\theta$. That is to say, the classifier correctly predicts the label of $x$ and enjoys a constant prediction for all perturbations $\delta$ that are in the $\ell_p$ ball of radius $r$ centered at $x$. As such, the overall $\ell_p^r$ certification accuracy is defined as the average certified accuracy over the data distribution. Following prior art \citep{cohen2019certified,salman2019provably,zhai2020macer}, we focus on $\ell_2^r$ certification. 

% We will denote hard and soft classifiers as $f_\theta: \mathbb{R}^d \rightarrow \mathcal{Y}$ and $\hat{f}_\theta: \mathbb{R}^d \rightarrow \mathcal{P}(\mathcal{Y})$, respectively, where $\mathcal{P}(\mathcal{Y})$ is a probability simplex in $\mathbb{R}^k$. We say that a hard classifier is $\ell_p^r$ certifiably accurate for an input $x$ if and only if $f_\theta(x) = f_\theta(x+\delta) = y, ~\forall~ \|\delta\|_p \leq r$ and equivalently $\argmax_c\hat{f}_\theta^c(x) = \argmax_c\hat{f}_\theta^c(x+\delta) = y, ~\forall~ \|\delta\|_p \leq r$ for soft classifiers, where $\hat{f}_\theta^c$ is the $c^{\text{th}}$ element of $\hat{f}_\theta$. That is to say, the classifier correctly predicts the label of $x$  and enjoys a constant prediction for all perturbations $\delta$ that are in the $\ell_p$ ball of radius $r$ from $x$. As such, the overall $\ell_p^r$ certification accuracy is defined as the average certified accuracy over the data distribution. In this paper and following previous works \cite{\textsc{Cohen}2019certified,salman2019provably,zhai2020\textsc{MACER}}, we focus on $\ell_2^r$ certification.  


% \[g_\theta(x) = \argmax_{c}~ \mathbb{P}_\epsilon \left(f_\theta (x+\epsilon)=c\right).\]
% \begin{definition}
% \label{def_hard_classifier}
% For any $\sigma > 0$, the smoothed classifier of $f_\theta$ is defined as follows with $\epsilon \sim \mathcal{N}(0,\sigma^2I)$:
% \begin{equation}
% \label{eq:def_smoothed_class}
% \begin{aligned}
%     g_\theta(x) = \argmax_{c}~ \mathbb{P}_\epsilon \left(f_\theta (x+\epsilon)=c\right).
% \end{aligned}
% \end{equation}
% \end{definition}
% It is generally difficult to use $g_\theta$ for prediction as that requires the output density of an arbitrary $f_\theta$ under Gaussian input. However, \cite{\textsc{Cohen}2019certified} proposed a Monte Carlo algorithm that succeeds in predicting the class of $x$ by $g_\theta$ with high probability.

\subsection{Overview of Randomized Smoothing}
% \vspace{-0.15cm}
% Randomized smoothing constructs a smoothed classifier $g_\theta$ from an arbitrary classifier $f_\theta$ assigning the most likely class to be predicted by $f_\theta$ if inputs were subjected to isotropic Gaussian perturbations. 
Randomized smoothing constructs a certifiable classifier $g_\theta$ by smoothing a base classifier $f_\theta$. For any $\sigma > 0$, the smooth classifier is defined as: $g_\theta(x) = \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\sigma^2I)} \left[f_\theta (x+\epsilon)\right]$. 
% More importantly, \cite{\textsc{Cohen}2019certified} presented a tight certification radius within which the smoothed classifier $g_\theta$ is certifiable. 
Let $g_\theta$ predict label $c_A$ for input $x$ with some confidence, \ie $\mathbb{E}_\epsilon[f^{c_A}_\theta(x+\epsilon)] = p_A \ge p_B = \max_{c \neq c_A} \mathbb{E}_\epsilon[f^c_\theta(x+\epsilon)]$, then, $g_\theta$ is certifiably robust at $x$ with certification radius:
\begin{equation}
\begin{aligned}
    \label{eq:certification_radius}
        R =  \frac{\sigma}{2} \left(\Phi^{-1}(p_A) - \Phi^{-1}(p_B)\right).
\end{aligned}
\end{equation}
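Here, $\Phi$ is the CDF of the standard Gaussian, and the prediction of $g_\theta$ is constant within the certified radius, \ie $\argmax_c g_\theta^c(x+\delta) = \argmax_c g_\theta^c(x)~\forall \|\delta\|_2 \leq R$. 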
% We refer to $R$ throughout as the certification radius. 



\subsection{Robustness-Accuracy Trade-off}
% \vspace{-0.15cm}
Note that Equation \ref{eq:certification_radius} holds regardless of the prediction $c_A$ made by the smooth classifier $g_\theta$. This suggests that one can perhaps improve the robustness of $g_\theta$, \ie increase the certification radius $R$ within which $g_\theta$ is constant, by increasing the hyperparameter $\sigma$ in Equation \ref{eq:certification_radius}. However, to reason about $\ell_2^r$ certification accuracy, it is not enough to increase the certification radius $R$, as this requires $c_A$ to be the correct prediction for $x$ by $g_\theta$. This reveals the robustness-accuracy trade-off, as one cannot improve $\ell_2^r$ certified accuracy by only increasing the certification radius $R$ (robustness) through the increase in $\sigma$. This is because it comes at the expense of requiring a classifier $g_\theta$ that correctly classifies $x$ with correct label $y$ under large Gaussian perturbations (accuracy). As such, the following inequality should hold: $\mathbb{E}_\epsilon[f^{\textcolor{red}{y}}_\theta(x+\epsilon)] \ge p_A \ge p_B \ge \max_{c \neq \textcolor{red}{y}} \mathbb{E}_\epsilon[f^c_\theta(x+\epsilon)]$. 

% This establishes the role of $\sigma$ in trading off robustness with accuracy.

% the certification radius $R$


% The robustness-accuracy trade-off now comes as one can not improve the $\ell_2^r$ certified accuracy by increasing $\sigma$ as that comes at the expense of requiring a classifier $g_\theta$ that correctly classifies $x$ with correct label $y$ under large Gaussian perturbations (accuracy), \ie the following inequality holds $\mathbb{P}_\epsilon(f(x+\epsilon) = \textcolor{red}{y}) \ge p_A \ge p_B \ge \max_{c \neq \textcolor{red}{y}} \mathbb{P}_\epsilon(f(x+\epsilon) = c)$.

% This reveals the robustness-accuracy trade-off. O

% ne cannot improve the $\ell_2^r$ certification radius (robustness) by increasing $\sigma$ as that comes at the expense of requiring a classifier $g_\theta$ correctly classifying $x$ with correct label $y$ under large Gaussian perturbations (accuracy), \ie the following inequality holds $\mathbb{P}_\epsilon(f(x+\epsilon) = \textcolor{red}{y}) \ge p_A \ge p_B \ge \max_{c \neq \textcolor{red}{y}} \mathbb{P}_\epsilon(f(x+\epsilon) = c)$ \BG{this sentence is too long; break it up into smaller pieces}. This establishes the role of $\sigma$ in trading off robustness with accuracy.


\subsection{Data Dependent Smoothing for Certification}
% \vspace{-0.15cm}

The certification region $\mathcal{R} = \{\delta : \|\delta\|_2 \leq R\}$ at an input $x$ is fully characterized by the classifier $f_\theta$ and the standard deviation of the Gaussian distribution $\sigma$. Moreover, for a given $f_\theta$, the certification region $\mathcal{R}$ varies at different $x$, when $\sigma$ is fixed, due to the nonlinear dependence of the prediction gap
% \vspace{-0.5cm}
$\Phi^{-1}(p_A(x;\sigma)) - \Phi^{-1}(p_B(x;\sigma))$ on $x$. This hints that, for a given $f_\theta$, different inputs $x$ may enjoy a different optimal $
\sigma^*_x$ that maximizes the certification region. To see this, consider the three inputs $x_1$, $x_2$ and $x_3$, all correctly classified by the binary classifier $f_\theta$ as $\mathcal{C}_1$ in Figure \ref{fig:pull_fig}. Using a fixed $\sigma$ to smooth the predictions of $f_\theta$, \ie predict with $g_\theta$, reveals that inputs, depending on how close they are to the decision boundaries, can enjoy different levels of smoothing without affecting the prediction of $g_\theta$. For instance, as shown in Figure \ref{fig:pull_fig} for constant $\sigma$, the input $x_1$, which is far from the decision boundary, could still have been classified correctly with a similarly large prediction gap even if $f_\theta$ were to be smoothed with a larger $\sigma$. This indicates that perhaps the certification radius at $x_1$ could have been enlarged with a larger smoothing $\sigma$. As for $x_2$, we can observe that while the prediction under this choice of $\sigma$ by $g_\theta$ is still correct, the prediction gap $\Phi^{-1}(p_A(x;\sigma)) - \Phi^{-1}(p_B(x;\sigma))$ drops, due to having more Gaussian samples fall in the $\mathcal{C}_2$ region. Thus, a different choice of $\sigma$ could have been used to trade off the drop in prediction gap against the certification radius.
% \begin{wrapfigure}{r}{0.5\textwidth}
% % \vspace{-0.25cm}
% \begin{minipage}
% \begin{algorithmic}
\begin{algorithm}[t]
  \DontPrintSemicolon
  \SetKwFunction{FMain}{\small OptimizeSigma}
  \SetKwProg{Fn}{Function}{:}{}
  \Fn{\FMain{$f_\theta$, $x$, $\alpha$, $\sigma_0$, $n$}}{
  \textbf{Initialize:} $\sigma_x^0 \gets \sigma_0$, $K$ \;
     \For{$k = 0 \dots K-1$ }{
     sample $\hat{\epsilon}_1,\dots \hat{\epsilon}_n \sim \mathcal N(0,I)$ \;
     $ \psi(\sigma^{k}_x) = \frac{1}{n} \sum_{i=1}^n f_\theta(x + \sigma^{k}_x\hat{\epsilon}_i)$\;
     $E_A(\sigma_x^k) = \max_c \psi^c$; $y_A = \argmax_c \psi^c$ \;
     $E_B(\sigma_x^k) = \max_{c \neq y_A} \psi^c$ \;
     $R(\sigma_x^k) =  \frac{\sigma^{k}_x}{2}\left(\Phi^{-1}(E_A) - \Phi^{-1}(E_B)\right)$ \;
    $\sigma^{k+1}_x \gets \sigma^{k}_x + \alpha \nabla_{\sigma_x^k}R(\sigma^{k}_x)$
     }
     $\sigma^*_x \leftarrow \sigma^K_x$ \;
        \KwRet $\sigma^*_x$ \; }
  \caption{ Data Dependent Certification}\label{alg:RS_DS}
% \vspace{-0.6cm}
\end{algorithm}
% \end{algorithmic}
% \end{minipage}
% \end{wrapfigure}
Lastly, for the input $x_3$ that is very close to the decision boundary, the suboptimal choice of $\sigma$ (too large for $x_3$) could result in an incorrect prediction by $g_\theta$. Despite the observation that $\sigma$ plays a significant role in $\ell_2^r$ certification accuracy, certification methods generally (\textbf{i}) choose $\sigma$ arbitrarily and (\textbf{ii}) set it to be constant for all $x$. Based on this observation, for a given smooth classifier with a specific $\sigma_0$ (where $\sigma_0$ can be zero, reducing the smooth classifier to $f_\theta$), we seek to construct another smooth classifier with parameter $\sigma_x^*$ for every input $x$ such that (\textbf{i}) the predictions of both smooth classifiers (smoothing with $\sigma_0$ and $\sigma_x^*$) are identical for all $x$, and (\textbf{ii}) the certification radius of the new smooth classifier at every $x$ is maximized. 
% If both conditions (\textbf{i}) and (\textbf{ii}) hold for every input $x$, then the new smooth classifier with $\sigma_x^*$ for every corresponding $x$ will enjoy a larger certification radius at every $x$ compared to the one smoothed with $\sigma_0$ while both retain the same predictions.
% will enjoy a certified accuracy $\ell_2^r$ that is at least equal to the $\ell_2^r$ certified accuracy of the classifier smoothed with $\sigma_0$. 
To construct a classifier smoothed with $\sigma_x^*$ enjoying the two previous properties, let $c_A$ be the prediction under $\sigma_0$ smoothing, \ie $c_A = \argmax_c \mathbb{E}_{\epsilon\sim\mathcal{N}(0, \sigma_0^2I)}[f_\theta^c(x+\epsilon)]$. We maximize $R$ in Equation \ref{eq:certification_radius} over $\sigma$ for every $x$ by solving:
% \vspace{-0.65cm}
\begin{equation}
\label{eq:our_objective}
\begin{aligned}
    \sigma^*_x =&\argmax_{\sigma} ~\frac{\sigma}{2} \Bigg(\Phi^{-1}\left( \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\sigma^2I)}[f_\theta^{c_A}(x+\epsilon)]\right)  \\ \,\, - \,\, & \Phi^{-1}\left(\max_{c \neq c_A} \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\sigma^2I)}[f_\theta^c(x+\epsilon)]\right) \Bigg).
\end{aligned}
\end{equation}
Since $\Phi^{-1}$ is a strictly increasing function, it is important to note that solving Equation \ref{eq:our_objective} for a fixed $c_A$ can at worst yield a smooth classifier of an identical radius to when the classifier is smoothed with $\sigma_0$ both predicting $c_A$ for $x$. 

\noindent \textbf{Solver}.
While our proposed Objective \ref{eq:our_objective} has a similar form to the \textsc{MACER} regularizer \citep{zhai2020macer} used during training, ours differs in that we optimize $\sigma$ for every $x$ and not the network parameters $\theta$, which are fixed here. A natural solver for Objective \ref{eq:our_objective} is stochastic gradient ascent with the expectation approximated with $n$ Monte Carlo samples. As such, the gradient of the objective at the $k^{\text{th}}$ iteration will be approximated as follows:
$\nabla_{\sigma^k} \frac{\sigma^k}{2}\left[ \Phi^{-1}\left(\gamma^{c_A}(\sigma^k)\right) - \Phi^{-1}\left(\max_{c \neq c_A} \gamma^c(\sigma^k)\right)\right]$,
where $\gamma^c(\sigma^k) = \frac{1}{n}\sum_{i=1}^n f_\theta^c(x+\epsilon_i)$ for $\epsilon_1, \dots, \epsilon_n \sim \mathcal{N}(0,(\sigma^k)^2I)$. However, this estimation of the gradient suffers from high variance due to the dependence of the expectation on the optimization variable $\sigma$ that parameterizes the smoothing distribution $\mathcal{N}(0,\sigma^2 I)$ \citep{williams1992simple}. %Interestingly, one can observe that our objective is very similar to the Evidence Lower Bound (ELBO) objective used to train variational auto encoders (VAEs) \cite{kingma2014auto,rezende2014stochastic}. The similarities between \eqref{eq:our_objective} and ELBO are in the dependence of the expectation on the variables of optimization. 
To alleviate this, we use the \textit{reparameterization trick} suggested by \cite{kingma2014auto,rezende2014stochastic} to compute a lower variance gradient estimate for our Objective \ref{eq:our_objective}. In particular, with the change of variable $\epsilon = \sigma \hat{\epsilon}$ where $\hat{\epsilon} \sim \mathcal{N}(0,I)$, Objective \ref{eq:our_objective} is \textit{equivalent to}:
% \vspace{-0.15cm}
\begin{equation}
\label{eq:our_objective_v2}
\begin{aligned}
    \sigma^*_x =&\argmax_{\sigma} \frac{\sigma}{2} \Bigg(\Phi^{-1}
    \left(\mathbb{E}_{\hat{\epsilon}\sim \mathcal{N}(0,I)}[f_\theta^{c_A}(x+\sigma \hat{\epsilon})]\right) - \\ & \Phi^{-1}\left(\max_{c \neq c_A} \mathbb{E}_{\hat{\epsilon}\sim \mathcal{N}(0,I)}[f_\theta^c(x+\sigma\hat{\epsilon})]\right)\Bigg).
\end{aligned}
\end{equation}
\noindent Note that, unlike before, the expectation over the distribution $\hat{\epsilon} \sim \mathcal{N} (0,I)$ no longer depends on the optimization variable $\sigma$. This allows the gradient of \ref{eq:our_objective_v2} to enjoy a lower variance compared to the gradient of \ref{eq:our_objective} \citep{kingma2014auto,rezende2014stochastic}. Algorithm \ref{alg:RS_DS} summarizes the updates for optimizing $\sigma$ for each $x$ by solving \ref{eq:our_objective_v2} with $K$ steps of stochastic gradient ascent. It is worthwhile to mention that the function \texttt{OptimizeSigma} in Algorithm \ref{alg:RS_DS} is agnostic of the choice of architecture $f_\theta$ and of the training procedure that constructed $f_\theta$. 


% \textcolor{red}{Once $\sigma_x^*$ is attained by \texttt{OptimizeSigma} for a given model $f_\theta$, we certify the smoothed classifier $g_\theta$ under this $\sigma_x^*$ using the Monte Carlo algorithms proposed by \cite{cohen2019certified}.}

% Empirically, we demonstrate the effectiveness of the proposed algorithm by certifying pre-trained models with (i) Gaussian augmentation (\textsc{Cohen}) \cite{cohen2019certified}, (ii) adversarially trained smoothed classifiers (\textsc{SmoothAdv}) \cite{salman2019provably}, and (iii) \textsc{MACER} \cite{zhai2020macer}, where $\sigma^*_x$ is used for each $x$. 



\begin{figure*}[t]
    \centering
    % \includegraphics[width=\textwidth]{ICLR22/new_figures/post_certificate.png}
    \includegraphics[width=\textwidth]{figures/post_certificate_2.png}
    \caption{ \textbf{Memory-based certification of the data dependent classifier.} Given a memory containing an input $x_1$ with a certified region $\mathcal{R}_1$ and a new input $x_2$ with a certified region $\mathcal{R}_2$, three scenarios could arise where $\mathcal{R}_1$ and $\mathcal{R}_2$ intersect. \textbf{Left}: The certified regions intersect while both $x_1$ and $x_2$ share the same prediction. In this case, $x_2$ along with its certified region are directly added to memory. \textbf{Middle}: $x_2$ lies inside $\mathcal{R}_1$ with a different prediction from $x_1$. In this case, $x_2$ is assigned the same prediction as $x_1$ and added to memory along with the largest subset of $\mathcal{R}_2$ that is within $\mathcal{R}_1$. \textbf{Right}: $x_2$ lies outside $\mathcal{R}_1$ with a different prediction from $x_1$. In this case, $x_2$ with its prediction are added to memory along with the largest certified region in $\mathcal{R}_2$ not intersecting with $\mathcal{R}_1$.}
    \label{fig:memory-based-algorithm}
    % \vspace{-0.50cm}
\end{figure*}


\subsection{Memory-Based Certification for Data Dependent Classifiers}\label{sec:memory-algorithm}

Unlike previous approaches where $\sigma$ is constant for all inputs, the data dependent classifier $g_\theta$ with varying $\sigma$ per input cannot be directly certified by the classical Monte Carlo algorithms proposed by \cite{cohen2019certified}. This is because the data dependent classifier $g_\theta$ does not enjoy a constant $\sigma$ within the given certification region, \ie $g_\theta$ tailors a new $\sigma_x$ for every input $x$, including inputs within the certified region of $x$. Informally, let $R(\sigma_{x_1}^*)$ be the radius of certification at $x_1$ granted by the data dependent classifier $g_\theta$. The data dependent classifier \textit{does not guarantee} that there \textit{cannot exist} $x_2$ within the region of certification of $x_1$, \ie $\|x_1 - x_2\|_2 \leq R(\sigma_{x_1}^*)$, where $g_\theta$ with $\sigma_{x_2}^*$ predicts $x_2$ differently from $x_1$, breaking the soundness of certification. 
% \begin{wrapfigure}{r}{0.60\textwidth}\vspace{-0.15cm}
% \begin{minipage}{0.60\textwidth}
\begin{algorithm}[t]
% \small
  \DontPrintSemicolon
  \SetKwFunction{FMain}{TrainBatch}
  \SetKwProg{Fn}{Function}{:}{}
  \Fn{\FMain{$f_\theta$, $\{x_i,y_i\}_{i=1}^B$, $\{ \sigma_{x_i}\}_{i=1}^B$, $\alpha$, $n$}}{
 \For{$i=1, \dots, B$}{
   $\sigma^*_{x_i} = \texttt{\small OptimizeSigma}(f_\theta, x_i, \alpha, \sigma_{x_i}, n)$
 }
 \texttt{\small TrainFunction} $\left(\{x_i,y_i\}_{i=1}^B, \{\sigma^*_{x_i}\}_{i=1}^B\right)$ 
  \tcp{\small any training routine e.g. \textsc{MACER}}
 }
   \caption{ Training with Data Dependent $\sigma_{x_i}$} \label{alg:DS_DS}
\end{algorithm}
% \end{minipage}
% \vspace{-0.50cm}
% \end{wrapfigure} 
To circumvent this problem, we propose a memory-based procedure for certifying our proposed data dependent classifier. Let $\{x_i\}_{i=1}^N$ be a set of previously predicted inputs and $\{\mathcal{C}_i\}_{i=1}^N$ be their corresponding predictions with mutually exclusive $\ell_2$ certified regions $\mathcal{R}_i$ for differently predicted inputs, \ie $\mathcal{R}_i \cap \mathcal{R}_j = \emptyset$ for all $i\neq j$ with $\mathcal{C}_i \neq \mathcal{C}_j$. Let $x_{N+1}$ be a new input with a certified region $\mathcal{R}_{N+1}$ computed by the Monte Carlo algorithms of \cite{cohen2019certified} for the data dependent classifier $g_\theta$ with prediction $\mathcal{C}_{N+1}$. If there exists an $i$ such that $\mathcal{R}_{N+1} \cap \mathcal{R}_i \neq \emptyset$,  $x_{N+1} \in \mathcal{R}_i$, and $\mathcal{C}_{N+1} \neq \mathcal{C}_i$, we adjust the prediction of the data dependent classifier $g_\theta$ to be $\mathcal{C}_i$ and update $\mathcal{R}_{N+1}$ to be the largest subset of $\mathcal{R}_{N+1}$ that is a subset of $\mathcal{R}_i$ (see middle example in Figure \ref{fig:memory-based-algorithm}). %We then add the $x_{N+1}$, $\mathcal{C}_{N+1}$, and $\tilde{\mathcal{R}}_{N+1}$ to memory. 
On the other hand, if $\mathcal{R}_{N+1} \cap \mathcal{R}_i \neq \emptyset$, $x_{N+1} \notin \mathcal{R}_i$, and $\mathcal{C}_{N+1} \neq \mathcal{C}_i$, we update $\mathcal{R}_{N+1}$ to be the largest subset of $\mathcal{R}_{N+1}$ not intersecting with $\mathcal{R}_i$ (see right example in Figure \ref{fig:memory-based-algorithm}). We perform the previous operations for all elements in the memory and add $x_{N+1}, \mathcal{C}_{N+1}, 
\mathcal{R}_{N+1}$ to memory. The aforementioned procedure grants a sound certification for the data dependent classifier preventing by construction overlapping certified regions with different predictions.% $\mathcal{C}$ as $\mathcal{C}_{N+1}$  by the Monte Carlo algorithms of \cite{cohen2019certified}, if $x_{N+1} \in \mathcal{R}_i$ where $\mathcal{C}_{}$
% we adjust the prediction of the data dependent classifier to be $\mathcal{C}_i$ if $x_{N+1} \in \mathcal{R}_i$ and $\mathcal$. Otherwise, \ie $x_{N+1} \notin \mathcal{R}_i \forall i$, we predict with our proposed data dependent classifier and compute the certification region $\mathcal{R}_{N+1}$. We then update the memory with the new input $x_{N+1}$ and its certified region $\mathcal{R}_{N+1}$ while satisfying the mutually exclusive property for differently predicted inputs, \ie $\mathcal{R}_i \cap \mathcal{R}_j = \emptyset, \forall i\neq j, c_i \neq j$. To do so, two cases arise that we summarize in the middle and right examples of Figure \ref{fig:memory-based-algorithm}. In the first case, middle example of Figure \ref{fig:memory-based-algorithm}, we adjust the prediction of the data dependent classifier to $\mathcal{C}_i$ and find the largest subset of $\mathcal{R}_{N+1}$ that is within $\mathcal{R}_1$ and then add it to memory. The other case, right example in Figure \ref{fig:memory-based-algorithm}, where $x_{N+1} \notin \mathcal{R}_1$ with $\mathcal{C}_i \neq \mathcal{C}_{N+1}$, we find the largest subset of $\mathcal{R}_{N+1}$ not intersecting with $\mathcal{R}_1$
% or reduce the certified region to the largest overlapping radius. We then certify the aforementioned data dependent classifier using the Monte Carlo algorithms proposed by \cite{cohen2019certified}. 
While the memory-based certification is essential for a sound certification, empirically, we never found in any of the later experiments a case where two inputs predicted differently suffer from intersecting certified regions. That is to say \textcolor{black}{while our sound certificate works on the memory-enhanced data dependent smooth classifier, we found that} the certified \textcolor{black}{radius of}
% regions in
the memory \textcolor{black}{classifier} for every input is the \textcolor{black}{radius}
% regions 
granted by the Monte Carlo certificates of \cite{cohen2019certified} for 
the data dependent classifier. \textcolor{black}{Therefore and throughout, we refer to the memory-enhanced data dependent smooth classifier and data dependent smooth classifier interchangeably.}
We elaborate more on this and provide an algorithm in the \textbf{Appendix}.


% Figure \ref{fig:memory-based-algorithm} summarizes the three cases in which an overlap happens where our memory based procedure corrects the certified radius alone (Right) or adjusts the predicted label as well (middle).





% $x_{N+1}$ to we have that



% To circumvent this problem, we propose a memory based procedure that corrects the data-dependent certified radii such that we guarantee non-overlapping certified regions between differently predicted samples. In a nutshell at test time, as a new sample $x_i$ approaches, we check if there is an overlap between its certified region and any certified region of a differently predicted sample saved in memory. If that is the case, then we adjust the certified radius of the new sample such that no overlap happens. We then save the sample along with its optimal smoothing parameter, its predicted label, and its certified radius in memory. Figure \ref{fig:memory-based-algorithm} summarizes the three cases in which an overlap happens where our memory based procedure corrects the certified radius alone (Right) or adjusts the predicted label as well (middle).




\subsection{Training with Data Dependent Smoothing}
% \vspace{-0.15cm}
Models that enjoy a large $\ell_2^r$ certification accuracy under the randomized smoothing framework need to enjoy a large certification radius $R$ in Equation \ref{eq:certification_radius} for all $x$ and be able to correctly classify inputs corrupted with Gaussian noise, \ie $g_\theta(x) = y$. While there are several approaches to train $f_\theta$ (or directly $g_\theta$) so as to output correct predictions for inputs corrupted with noise sampled from $\mathcal{N}(0,\sigma^2 I)$, all existing works fix $\sigma$ for all inputs during training. We are interested in complementing these approaches with smoothing distributions that are data dependent. As such, we can employ the training procedure of these approaches but with $\sigma^*_x$ computed by \texttt{OptimizeSigma}. Algorithm \ref{alg:DS_DS} summarizes this proposed training pipeline. The function \texttt{TrainFunction} proceeds by performing backpropagation using any training scheme, given the estimated $\sigma^*_{x_i}$ for each $x_i$. We note that whenever Algorithm \ref{alg:DS_DS} is used, we initialize $\sigma_{x_i}$ at each epoch with $\sigma^*_{x_i}$ computed at the previous epoch. Since \textsc{Cohen}, \textsc{SmoothAdv} and \textsc{MACER} are among the most popular approaches that embed randomized smoothing certificates as part of the training routine,  \texttt{TrainFunction} refers here to any of these three training methods. Empirically, we show that we can boost all three methods even further when models are trained with Algorithm \ref{alg:DS_DS}.




% \textcolor{red}{Motasem: I think we should add the paragraph here}






% \begin{algorithm}[t]
%   \DontPrintSemicolon
%   \SetKwFunction{FMain}{Train}
%   \SetKwProg{Fn}{Function}{:}{}
%   \Fn{\FMain{$f_\theta$, $\{x_i,y_i\}_{i=1}^B$, $\alpha$, $K$, $M$, $S$, $\sigma_0$}}{
%   \textbf{Initialize:} $\sigma^*_{x_i} \gets \sigma_0\,\, \forall i\in\{1,\dots,B\}$ \\
%  \For{$j=1,\dots M$}{
%  \If{$j \geq S$}{
%  \For{$i=1, \dots, B$}{
%   $\sigma^*_{x_i} = \texttt{OptimizeSigma}(f_\theta, x_i, \alpha, \sigma^*_{x_i}, K)$
%  }}
%  \texttt{TrainFunction} $\left(\{x_i,y_i\}_{i=1}^n, \{\sigma^*_{x_i}\}_{i=1}^n\right)$  \;
%  }}
%   \caption{Training with Data Dependent Smoothing} \label{alg:RS_DS}
% \end{algorithm}


% \begin{algorithm}[t]
% \SetAlgoLined
% \textbf{Inputs:} Network $f_\theta$, batch size B, epochs, step size $\alpha$, number of optimization iterations $K$\\
%  \For{$j=1, \dots, \text{epochs}$ }{
%  \For{$i=1, \dots, B$}{
%   $\sigma^*_{x_i} = \texttt{OptimizeSigma}(f_\theta, x_i, \alpha, \sigma_{x_i}, K)$

%  Train with $x_i$ and $\mathcal N(0,\sigma^*_{x_i}I)$ \;
% %  Sample $\epsilon \sim \mathcal N(0,\sigma(x_i)I)$ \;
%  }}
%  \textbf{Output:} model parameters $\theta$
%  \caption{Training with Data Dependent Smoothing} \label{alg:DS_DS}
% \end{algorithm}


% To that regard, we are


% In general, for correctly classified inputs by $g_\theta$, one would generally be interested in doing so with as large $\sigma$ as possible as that can increase the certification radius.



% resulted in incorrect predictions

% ote that one can view the predictions by $g_\theta$ as th average prediction

% could have enjoyed a larger certification radius by $g_\theta$ certified by a larger radius $\sigma$ without changing the correct prediction of $g_\theta$

% far  is far from the decision boundary could have been certified by a larger radius $\sigma$ without changing the correct prediction of $g_\theta$

% points that are close to the decision boundary $x_2$ may have their correct predictions affected much more that points far away from decision boundary, e.g. $x_1$.




% In the first case, figure \ref{fig:pull_fig}, the classifier $g_\theta$ smooths the predictions over all points by the same amount $\sigma$. Note $g_\theta$, can be viewed as the average prediction of $f_\theta$ under Gaussian perturbations, is the average 

% both classified correctly by the binary classifier $f_\theta$ shown in Figure \textcolor{red}{x}. It is to be observed that for the sample $x_1$, which is far away from the decision boundary, the prediction of the classifier $g_\theta$ is less sensitive to the increase in $\sigma$ as compared to the prediction of $g_\theta$ at $x_2$ that is close to the decision boundary. 

