%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xr}
\makeatletter
\newcommand{\printfnsymbol}[1]{%
  \textsuperscript{\@fnsymbol{#1}}%
}
\makeatother
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Structure-Aware Robustness Certificates for Graph Classification\\(Supplementary Material)}

\usepackage{amssymb} % math fonts
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bm}
\usepackage{float}
% macros
\newcommand{\X}{\mathcal{X}} % smoothed input
\newcommand{\x}{\mathbf{x}} % input point
\newcommand{\z}{\mathbf{z}} % pmf input
\newcommand{\J}{\mathcal{J}} % elements in adj that change
\newcommand{\C}{\mathcal{C}} % community
\renewcommand{\z}{\mathbf{z}} % arbitary point
\newcommand{\Q}{\mathbf{Q}} % matrix Q
\newcommand{\R}{\mathcal{R}} % radius
\DeclareMathOperator*{\argmin}{argmin} % argmin 
\DeclareMathOperator*{\argmax}{argmax} % argmax
\DeclareMathOperator*{\Bin}{Bin} % binomial distribution
\DeclareMathOperator*{\Bern}{Bern} % Bernoulli distribution
\renewcommand{\P}{\mathbb{P}} % probability
\DeclarePairedDelimiter\set{\lbrace}{\rbrace} % set {...}
\DeclarePairedDelimiter\norm{\lVert}{\rVert} % norm ||x||
% If your paper is accepted, change the options for the package
% aistats2023 as follows:
%
%\usepackage[accepted]{aistats2023}
%
% This option will print headings for the title of your paper and
% headings for the authors names, plus a copyright note at the end of
% the first column of the first page.

% If you set papersize explicitly, activate the following three lines:
%\special{papersize = 8.5in, 11in}
%\setlength{\pdfpageheight}{11in}
%\setlength{\pdfpagewidth}{8.5in}

% If you use natbib package, activate the following three lines:
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}

% If you use BibTeX in apalike style, activate the following line:
%\bibliographystyle{apalike}

\author[1]{Pierre Osselin \thanks{Equal contribution.}}
\author[1]{Henry Kenlay \printfnsymbol{1}}
\author[1]{Xiaowen Dong}
% Add affiliations after the authors
\affil[1]{%
    Department of Engineering Science, University of Oxford, Oxford, UK
}
  

\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}

% Supplementary material: To improve readability, you must use a single-column format for the supplementary material.



\section{Proofs of propositions}

\subsection{Proof of Proposition 1}

\textbf{Disjoint Unions.} Let $\z \in \R_\Q$ and $\tilde{\z} \in \R_{\Q'}$ be such that $Q_{i} \neq Q'_{i}$ for some $i \in I$. If $\z = \tilde{\z}$, then $\norm{\z_{\J_{i}} - \x_{\J_{i}}}_0 = Q_{i}$ and $\norm{\tilde{\z}_{\J_{i}} - \x_{\J_{i}}}_0 = Q'_{i}$ would hold for the same vector, which is a contradiction since $Q_{i} \neq Q'_{i}$. Hence the regions are pairwise disjoint.

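\noindent \textbf{Partition.} For any $\z \in \X$ and every $i$, let $Q_{i} = \norm{\z_{\J_{i}} - \x_{\J_{i}}}_0$. Since $|\J_{i}| \leq R_{i}$, we have $Q_{i} \leq R_{i}$, so $\z$ belongs to one of the regions with $Q \leq R$; hence $\mathcal{X} = \bigcup_{Q \leq R} \mathcal{R}_{Q}^{R}$.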

\subsection{Proof of Proposition 2}

As the noise for each entry is independent, we can decompose the probabilities as follows:
\begin{equation}
\frac{P(\phi(\tilde{\x}) = \z)}{P(\phi(\x) = \z)} = \prod\limits_{k \in [N]} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)}. 
\end{equation}
Furthermore, as each component belongs to exactly one edge community,
\begin{equation}
    \prod\limits_{k \in [N]} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)} = \prod_{i=1}^{I} \prod\limits_{k \in \C_i} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)}. 
\end{equation}
We note that for $k$ where $\tilde{\x}_k = \x_k$ this fraction is one, so we can focus on the terms where $\tilde{\x}_k \neq \x_k$, i.e., $k \in \J_i$. In equations, this can be written as
\begin{equation}
     \prod_{i=1}^{I} \prod\limits_{k \in \C_i} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)} = \prod_{i=1}^{I} \prod\limits_{k \in \J_i} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)}.
\end{equation}

We can consider what the terms are equal to when $\x_k = \z_k$ and when $\x_k \not = \z_k$ (assuming that $\x_k \not = \tilde{\x}_k$). We get 
\begin{equation}
\frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)} = 
\begin{cases}
\frac{p_i}{1-p_i} & \text{if } \x_k = \z_k \text{ and } \x_k \not = \tilde{\x}_k \\ 
\frac{1-p_i}{p_i} & \text{if } \x_k \not = \z_k \text{ and } \x_k \not = \tilde{\x}_k \\ 
\end{cases}.
\end{equation}
In total there are $R_i$ terms in each inner product, of which $Q_i$ fall in the first case and $R_i-Q_i$ fall in the second case. Thus

\begin{align}
    \prod_{i=1}^{I} \prod\limits_{k \in \J_i} \frac{P(\phi(\tilde{\x})_k = \z_k)}{P(\phi(\x)_k = \z_k)} &= \prod_{i=1}^{I}  \left(\frac{p_i}{1-p_i}\right)^{Q_i}  \left(\frac{1-p_i}{p_i}\right)^{R_i-Q_i} \\
    &= \prod_{i=1}^{I}  \left(\frac{p_i}{1-p_i}\right)^{2Q_i-R_i} \\ 
    &= \prod_{i=1}^{I}  \left(\frac{1-p_i}{p_i}\right)^{R_i-2Q_i}
\end{align}
as required. We provide Fig.~\ref{fig:prop2} as a visual aid to the proof.


\begin{figure}[t]
\centering
\begin{tikzpicture}
\node[left] at (-0.1,0.5) {$\mathbf{x}_{\mathcal{C}}$};
\draw [fill=black] (0,0) rectangle (2.5,1) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (2.5,0) rectangle (5,1) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (5,0) rectangle (7.5,1) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (7.5,0) rectangle (10,1) node[pos=0.5, text=black] {$0$};
\node[right] at (10,0.5) {$\ldots$};

\draw [<->] (0.1,1.2) -- (4.9,1.2) node [pos=0.5,above] {$|\mathcal{C}_1|=R_1$};
\draw [<->] (5.1,1.2) -- (9.9,1.2) node [pos=0.5,above] {$|\mathcal{C}_2|=R_2$};

\node[left] at (-0.1,-1) {$\tilde{\mathbf{x}}_{\mathcal{C}}$};
\draw [fill=white] (0,-1.5) rectangle (2.5,-0.5) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (2.5,-1.5) rectangle (5,-0.5) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (5,-1.5) rectangle (7.5,-0.5) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (7.5,-1.5) rectangle (10,-0.5) node[pos=0.5, text=white] {$1$};
\node[right] at (10,-1) {$\ldots$};


% \draw [<->] (1.35,-3.2) -- (3.65,-3.2) node [pos=0.5, fill=white, inner] {$\mathbf{Q}_1$};
% \draw [<->] (6.35,-3.2) -- (8.65,-3.2) node [pos=0.5, fill=white, inner] {$\mathbf{Q}_2$};
\draw [<->] (1.35,-3.2) -- (3.65,-3.2) node [pos=0.5, fill=white] {$Q_1$};
\draw [<->] (6.35,-3.2) -- (8.65,-3.2) node [pos=0.5, fill=white] {$Q_2$};


\node[left] at (-0.1,-2.5) {$\mathbf{z}$};
\draw [fill=black] (0,-3) rectangle (1.25,-2) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (1.25,-3) rectangle (2.5,-2) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (2.5,-3) rectangle (3.75,-2) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (3.75,-3) rectangle (5,-2) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (5,-3) rectangle (6.25,-2) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (6.25,-3) rectangle (7.5,-2) node[pos=0.5, text=black] {$0$};
\draw [fill=black] (7.5,-3) rectangle (8.75,-2) node[pos=0.5, text=white] {$1$};
\draw [fill=white] (8.75,-3) rectangle (10,-2) node[pos=0.5, text=black] {$0$};
\node[right] at (10,-2.5) {$\ldots$};

\draw [->, rounded corners, thick, gray] (0.625, -3.5) -- (0.625, -3.75) -- (3.5, -3.75) --  (3.5, -4.4) ;
\fill [white] (1.8,-3.8) rectangle (1.95,-3.7);
\fill [white] (3.05,-3.8) rectangle (3.2,-3.7);
\draw [->, rounded corners, thick] (1.875,-3.5) -- (1.875,-4.1) -- (0.625,-4.1) -- (0.625,-4.4) ;
\draw [->, rounded corners, thick] (3.125,-3.5) -- (3.125,-4.2) -- (1.875,-4.2) -- (1.875,-4.4) ;
\draw [->, thick, gray] (4.375, -3.5) -- (4.375, -4.4) ;

\draw [->, rounded corners, thick, gray] (5.625, -3.5) -- (5.625, -3.75) -- (8.5, -3.75) --  (8.5, -4.4) ;
\fill [white] (6.8,-3.8) rectangle (6.95,-3.7);
\fill [white] (8.05,-3.8) rectangle (8.2,-3.7);
\draw [->, rounded corners, thick] (6.875,-3.5) -- (6.875,-4.1) -- (5.625,-4.1) -- (5.625,-4.4) ;
\draw [->, rounded corners, thick] (8.125,-3.5) -- (8.125,-4.2) -- (6.875,-4.2) -- (6.875,-4.4) ;
\draw [->, thick, gray] (9.375, -3.5) -- (9.375, -4.4) ;

\node[left] at (-0.1,-5) {$P(\phi(\x)=\z)$};
\draw [fill=white] (0,-5.5) rectangle (2.5,-4.5) node[pos=0.5, text=black] {$p_1^{Q_1}$};
\draw [fill=white] (2.5,-5.5) rectangle (5,-4.5) node[pos=0.5, text=black] {$(1-p_1)^{R_1-Q_1}$};
\draw [fill=white] (5,-5.5) rectangle (7.5,-4.5) node[pos=0.5, text=black] {$p_2^{Q_2}$};
\draw [fill=white] (7.5,-5.5) rectangle (10,-4.5) node[pos=0.5, text=black] {$(1-p_2)^{R_2-Q_2}$};
\node[right] at (10,-5) {$\ldots$};

\node[left] at (-0.1,-6.5) {$P(\phi(\tilde{\x})=\z)$};
\draw [fill=white] (0,-7) rectangle (2.5,-6) node[pos=0.5, text=black] {$(1-p_1)^{Q_1}$};
\draw [fill=white] (2.5,-7) rectangle (5,-6) node[pos=0.5, text=black] {$p_1^{R_1-Q_1}$};
\draw [fill=white] (5,-7) rectangle (7.5,-6) node[pos=0.5, text=black] {$(1-p_2)^{Q_2}$};
\draw [fill=white] (7.5,-7) rectangle (10,-6) node[pos=0.5, text=black] {$p_2^{R_2-Q_2}$};
\node[right] at (10,-6.5) {$\ldots$};

\end{tikzpicture}
\caption{Pictorial representation of where the terms in Proposition 2 come from.}
\label{fig:prop2}
\end{figure}

\subsection{Proof of Proposition 3}
 We have $\R_\Q = \set{\z \in \X : \norm{\z_{\J_{i}} - \x_{\J_{i}}}_0 = Q_{i}}$. The probability $\P(\phi(\x) \in \mathcal{R}_{\Q})$ corresponds to each set $R_{i}$ having $Q_i$ entries not being flipped or equivalently $R_i - Q_{i}$ entries being flipped. Each node pair is flipped with a probability of 
 $p_{i}$. Since all flips are independent, we can express the probability as $\P(\phi(\x) \in \mathcal{R}_{\Q}) = \prod_{i = 1 }^C \Bin(R_i - Q_{i} \mid R_{i}, p_{i})$.

\section{Implementation}
\label{sec:implementation}

\subsection{Noise sampling}
To illustrate how to sample from the anisotropic noise defined in Eq.~(11), we provide Fig.~\ref{fig:sampling_anisotropic}. Given disjoint regions of node pairs $\mathcal{C}_i$, new graphs are sampled by adding independent Bernoulli samples, with parameters given by the regions, to the appropriate part of the graph.
\begin{figure}[t]
\centering
\includegraphics[width=0.8\textwidth]{figures/Noise_sampling.pdf}    
\caption{Illustration of sampling from the anisotropic noise distribution. Given disjoint regions of node pairs $\mathcal{C}_i$, independent Bernoulli noise with region-specific parameters is added to the corresponding part of the graph.}
\label{fig:sampling_anisotropic}
\end{figure}
\subsection{Estimations of probabilities}

The quantities $p_{y}(\x)$ cannot be computed in closed form for general $f$. Hence, we resort to lower bounding $p_A$ and upper bounding $p_{y}(\x), y \neq c_A$ via sampling. To achieve this, we use the Clopper--Pearson interval~\citep{cai2005one}.

\subsection{Symmetries certification}
Solving the optimization problem defined in Eq. (8) is difficult as certificates have to be computed for every $\tilde \x$ in the ball around $\x$: $\mathcal{B}_{r}(\x)$. However, in practice, $\Phi_{\x, \tilde{\x}} (p_A,c_A)$ displays some symmetries depending on the noise distribution $\phi(\x)$. 

In the case of isotropic noise, the regions $\mathcal{H}_{k}$ and values $\eta_{k}$ depend only on $\norm{\x - \tilde \x}_{0}$. This implies $\Phi_{\x, \tilde{\x}} (p_A,c_A) = \Phi_{\x, \tilde{\x}^{'}} (p_A,c_A)$ for all $\tilde{\x}, \tilde{\x}^{'} \in \mathcal{S}_{r}(\x)$, which reduces the search to a single representative point on each sphere.

In the case of anisotropic noise, the regions $\mathcal{H}_{k}$ and values $\eta_{k}$ depend only on $\norm{\x_{\C_{i}} - \tilde{\x}_{\C_{i}}}_{0}$. This implies $\Phi_{\x, \tilde{\x}} (p_A,c_A) = \Phi_{\x, \tilde{\x}^{'}} (p_A,c_A)$ for all $\tilde{\x}, \tilde{\x}^{'} \in \mathcal{S}_{\mathbf{R}}(\x)$.
\section{Algorithm}
The full algorithm of our method is given in Alg.~\ref{alg:main_alg} and its complexity is analyzed below.
\begin{algorithm}[ht]
\begin{footnotesize}
\caption{Structure-aware randomized smoothing}
\label{alg:main_alg}
\begin{algorithmic}[1]
\STATE \textbf{inputs}: Graph to certify $\x$, noise perturbation $\bm{\epsilon}$, anisotropic structure $(\mathcal{C}_i)_{i \in I} \subset [n]^{2}$, graph classification model $m: \mathcal{X} \rightarrow \mathcal{Y}$, number of samples $N$ and upper bounds on certificate radii $(\mathbf{R}_{max, i})_{i \in I}$.
\STATE \textbf{initialize}: Train model $m$ on classification data $\mathcal{D}$ or load model parameters.
\STATE \textbf{voting}
\begin{ALC@g}
\FOR{$i = 1, \dots, N$}
\STATE{Sample random graph $\tilde{\x}_i \sim \x \oplus \bm{\epsilon}$ }
\STATE{Compute model prediction $y_i \in \mathcal{Y}$}
\ENDFOR
\STATE Compute the label frequency distribution from $(y_i)_{i \in [N]}$, denoted $(p_y, y)_{y \in \mathcal{Y}}$, and identify the most frequent and runner-up (second most frequent) classes $(p_{A}, c_{A})$ and $(p_{B}, c_{B})$
\end{ALC@g}
\STATE \textbf{certification}
\begin{ALC@g}
\FOR{$\mathbf{R} \in \prod\limits_{i} [|\mathbf{R}_{max, i}|] $}
\STATE{Compute $\eta_{\mathbf{Q}}^{\mathcal{R}}$ according to the formula (13) and sort them.}
\STATE{Compute $\mathbb{P}(\phi(\x) \in \mathcal{R}_{\mathbf{Q}})$ with formula (14).}
\STATE{Solve the linear programs described in eq. (9) and (10) greedily}
\STATE{Verify $\underline{\rho_{\x, \tilde{\x}}}(p_A, c_A) - \overline{\rho_{\x, \tilde{\x}}}(p_B, c_B) > 0$}
\ENDFOR
\end{ALC@g}
\STATE \textbf{return} Grid of certification for $\mathbf{R} \in \prod\limits_{i} [|\mathbf{R}_{max, i}|]$
\end{algorithmic} 
\end{footnotesize}
\end{algorithm} 
\subsection{Algorithmic Complexity: Certification}

Let $\x$ be a graph, $N$ the number of samples to perform the Clopper-Pearson statistical test, $n$ the number of nodes in the graph, and $\mathbf{R} \in \prod\limits_{i} [|\mathcal{C}_i|]$ a given radius to certify.
The certification algorithm proceeds as follows:
\begin{enumerate}
    \item Sample $N$ graphs from the noise distribution with complexity $\mathcal{O}(Nn^{2})$, this step is very easily parallelizable.
    \item Forward the $N$ sampled graphs through the model. Given a model forward complexity of $\mathcal{O}(m(n))$ (we omit potential dependency on node or edge feature dimension), the total complexity is $\mathcal{O}(Nm(n))$; this step is very easily parallelizable.
    \item From the estimates $(p_A, p_B)$ and the noise distribution $\bm{\epsilon}$, certify the given radius $\mathbf{R}$. Let $T_{\mathbf{R}} = \prod\limits_{i} (R_{i} + 1)$:
    \begin{enumerate}
        \item Compute the vectors $\eta_{\mathbf{Q}}^{\mathcal{R}}$ and sort them, with respective complexity $\mathcal{O}(CT_{\mathbf{R}})$ and $\mathcal{O}(T_{\mathbf{R}} \log(T_{\mathbf{R}}))$.
        \item Solve the linear programs of eq. (9) and (10) and verify $\underline{\rho_{\x, \tilde{\x}}}(p_A, c_A) - \overline{\rho_{\x, \tilde{\x}}}(p_B, c_B) > 0$, with complexity $\mathcal{O}(T_{\mathbf{R}})$.
    \end{enumerate}
    The total complexity becomes $\mathcal{O}(Nn^{2} + Nm(n) + CT_{\mathbf{R}} + T_{\mathbf{R}}\log(T_{\mathbf{R}}))$.
\end{enumerate}
Regarding the model complexity, some example complexities are the following:
\begin{enumerate}
    \item Graph Neural Network: the complexity is quadratic in the number of nodes due to matrix multiplication: $m(n) = \mathcal{O}(n^{2})$
    \item Label kernel: the complexity is linear in the number of edges, $\mathcal{O}(|\mathcal{E}|) = \mathcal{O}(n^{2})$.
\end{enumerate}

\subsection{Algorithmic Complexity: Optimal radius}
Let $\x$ be a graph, $N$ the number of samples to perform the Clopper-Pearson statistical test, $n$ the number of nodes in the graph.
The algorithm to find the optimal radius proceeds as follows:
\begin{enumerate}
    \item Sample $N$ graphs from the noise distribution with complexity $\mathcal{O}(Nn^{2})$, this step is very easily parallelizable.
    \item Forward the $N$ sampled graphs through the model. Given a model forward complexity of $\mathcal{O}(m(n))$ (we omit potential dependency on node or edge feature dimension), the total complexity is $\mathcal{O}(Nm(n))$; this step is very easily parallelizable.
    \item From the estimates $(p_A, p_B)$ and the noise distribution $\bm{\epsilon}$, find the optimal radius $\mathbf{R}$. Select a vector $\mathbf{R} \in \prod\limits_{i} [|\mathcal{C}_i|]$, and let $T_{\mathbf{R}} = \prod\limits_{i} (R_i + 1)$ and $T = \prod\limits_{i} (R_{i, max} + 1)$:
    \begin{enumerate}
        \item Compute the vectors $\eta_{\mathbf{Q}}^{\mathcal{R}}$ and sort them, with respective complexity $\mathcal{O}(CT_{\mathbf{R}})$ and $\mathcal{O}(T_{\mathbf{R}} \log(T_{\mathbf{R}}))$.
        \item Solve the linear programs of eq. (9) and (10) and verify $\underline{\rho_{\x, \tilde{\x}}}(p_A, c_A) - \overline{\rho_{\x, \tilde{\x}}}(p_B, c_B) > 0$, with complexity $\mathcal{O}(T)$
    \end{enumerate} We output the Pareto front of radii $\mathbf{R}$ according to the partial ordering $\mathbf{R} \preceq \mathbf{R}' \iff \forall i, \mathbf{R}_i \leq \mathbf{R}'_{i}$.
\end{enumerate}

The total naive complexity is $\mathcal{O}(Nn^{2} + Nm(n) + CT^{2} + T^{2}\log(T))$. However, we want to point out that there are multiple places where the complexity could be drastically improved.
\begin{enumerate}
    \item First, the last point is problem agnostic, meaning that, given the estimates $(p_A, p_B)$ (first and second highest label probabilities) and the noise distribution $\bm{\epsilon}$, the corresponding optimal radii $\mathbf{R}$ can be computed. Given a specific scenario, this opens the possibility to precompute tables $\mathbf{R}(p_A, p_B, \bm{\epsilon})$. These can be used to directly output $\mathbf{R}$ or to find the optimal $\mathbf{R}$ more quickly.
    \item Second, the linear programs described in equations (9) and (10) can be efficiently solved greedily. Given we know the closed-form formula for $\mu_k$, making the ordering explicitly dependent on $\mathbf{Q}$, one can compute them only when necessary.
    \item Finally, the partial ordering defined previously is, in practice, indicative of the robustness certification, i.e., if we cannot certify a certain radius, a larger radius will not be certified either. Although we do not propose a formal proof of this property, it holds true in practice, as one can see in the experimental results, and could be exploited for a more efficient search, similar to a multidimensional binary search.
\end{enumerate}

\section{Additional results}

% \begin{figure}[t]
% \centering
%   \begin{tabular}[b]{@{}c@{}}
%     \begin{subfigure}[b]{0.295\textwidth}
%       \includegraphics[width=\textwidth]{Sample UAI 2023 paper/figures/ablation/lowerbound_samplesize.pdf}
%       \caption{Relationship between Lower bound of probability and sample size.}
%       \label{fig:lowerbound}
%     \end{subfigure}\\
%     \begin{subfigure}[b]{0.295\textwidth}
%       \includegraphics[width=\textwidth]{Sample UAI 2023 paper/figures/ablation/radius_isonosie.pdf}
%       \caption{Relationship between isotropic certification radius and  number of samples.}
%       \label{fig:classificationradius}
%     \end{subfigure}
%   \end{tabular}
%   \begin{subfigure}[b]{0.57\textwidth}
%     \includegraphics[width=\textwidth]{Sample UAI 2023 paper/figures/ablation/isoradius_nsamples.pdf}
%     \caption{Relationship between isotropic certification radius and  number of samples.}
%     \label{fig:classificationnsamples}
%   \end{subfigure}
%   \caption{Ablation studies.}
% \end{figure}

% \textbf{Lower bound of probability and sample size}. In Figure \ref{fig:lowerbound}, we analysed the role of sample size in the lower bound estimation of a true probability class $p_A$. As we can see, the lower bound error increases as the confidence rate increases as we require more samples to estimate the true class probability. With sample size, we can see this error is linear in log-log scale. After $1000$ samples, the studied errors are between $10^{-3}$ and $10^{-2}$. In our experiments we used $\alpha = 0.01$.

% \textbf{Isotropic noise and certification radius.} In Figure \ref{fig:classificationradius}, we studied the influence of isotropic noise on the probability class necessary to certify a certain radius. A noise level of $p_{Isotropic} = 0.2$ a classification accuracy of $0.995$ is necessary to certify a radius of $5$. As a result, it is not surprising most randomised smoothing papers applied to graph use default noise level of at least $0.3$, (see, Jia et al. “Certified Robustness of Graph Neural Networks against Adversarial Structural Perturbation”).

% \textbf{Isotropic certification radius and  number of samples.} In Figure \ref{fig:classificationnsamples}, we analysed the same phenomenon as the previous paragraph, however, we illustrated this phenomenon while taking into account the sample size used to estimate $p_A$. In our setting, setting up a sample size of $10^4$ gives the accurate certified radius. In our experiments, we took $N=10^5$.



\textbf{Varying the base classifier.} In Figure~\ref{fig:modelradius}, we compare our anisotropic certification performance across three kernels: the graphlet sampling kernel~\citep{shervashidze2009efficient}, the neighborhood subgraph pairwise distance kernel~\citep{costa2010fast}, and the vertex histogram kernel~\citep{sugiyama2015halting}, for a sample size of $N=10{,}000$. %Although the neighborhood subgraph pairwise distance and vertex histogram kernels perform similarly, we can see the graphlet kernel performs particularly not well. This powerful and expensive kernel computes features based on subgraphs counts, which we suspect is the cause of its misclassification results when anisotropic noise is applied. 
%This experiment demonstrates the certification results rely on the model via its ability to originally (prior to smoothing) be more robust to the specified perturbations.
In general, a model that is robust to noise will lead to  certificates with large radii.

\textbf{Number of sampled perturbations.} In Figure~\ref{fig:annoisensamples}, we analyzed the impact of sample size when computing the anisotropic certification radius in our synthetic experiments. %While an extremely low sample size $N=10$ underestimates the radius, we obtain the same absolute certification for higher sample size.
The certificate performs poorly for a small number of samples. This is because the lower bound on $p_A$ becomes very loose.


\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/ablation/aninoise_samples2.pdf}    
\caption{Influence of the underlying classifier on the anisotropic certificate radius.}
\label{fig:modelradius}
\end{figure}
\bibliography{osselin_756}

\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/ablation/aninoise_samples1.pdf}    
\caption{Influence of sample size on anisotropic certificate radius.}
\label{fig:annoisensamples}
\end{figure}

\end{document}

