\documentclass[accepted]{uai2023}

% \usepackage{aistats2023}
% If your paper is accepted, change the options for the package
% aistats2023 as follows:
%
%\usepackage[accepted]{aistats2023}
%
% This option will print headings for the title of your paper and
% headings for the authors names, plus a copyright note at the end of
% the first column of the first page.

% If you set papersize explicitly, activate the following three lines:
%\special{papersize = 8.5in, 11in}
%\setlength{\pdfpageheight}{11in}
%\setlength{\pdfpagewidth}{8.5in}

% If you use natbib package, activate the following three lines:
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}

% If you use BibTeX in apalike style, activate the following line:
%\bibliographystyle{apalike}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{multirow}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bbm}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{comment}

\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\def\shired{\textcolor{red}}
\def\yanred{\textcolor{red}}
\def\st{\text{s.t.}}


\usepackage{amsmath,amsthm,amssymb,multirow,paralist,mathrsfs,amsfonts,dsfont}
\newtheorem{theorem}{Theorem}
\newtheorem*{theorem1}{Theorem 1}
\newtheorem{proposition}{Proposition}
\newtheorem{property}{Property}

\newtheorem{lemma}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{observation}{Observation}
\newtheorem{remark}{Remark}
\let\oldremark\remark
\renewcommand{\remark}{\oldremark\normalfont}

\usepackage{enumitem}

% \def\indicator{{\bf 1}}
\def\indicator{\mathrm I}
\def\indicator{\mathds I}
\def\indicator{\mathbb I}


\def\PR{\text{PR}}

\def\iPR{\text{iPR}}
\def\aPR{\text{aPR}}
\def\rob{\text{rob}}


\def\CP{\text{CP}}
\def\RCP{\text{RCP}}
\def\LCP{\text{LCP}}
\def\AR{\text{AR}}
\def\cal{\text{cal}}
\def\tr{\text{tr}}
\def\test{\text{test}}
\def\gt{\text{gt}}
\def\and{\mathrm{and}}
\def\class{\mathrm{class}}

\def\calX{\mathcal X}
\def\calE{\mathcal E}

\def\calC{\mathcal C}
\def\calY{\mathcal Y}
\def\calZ{\mathcal Z}
\def\calN{\mathcal N}
\def\calB{\mathcal B}
\def\calR{\mathcal R}
\def\calP{\mathcal P}
\def\calM{\mathcal M}
\def\calD{\mathcal D}
\def\calF{\mathcal F}
\def\calS{\mathcal S}
\def\calT{\mathcal T}

\def\E{\mathbb E}
\def\P{\mathbb P}
\def\R{\mathbb R}



\def\vectort{\mathbf t}
\def\vectorone{{\bf 1}}



\usepackage{newfloat}
\usepackage{listings}







\begin{document}

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{Probabilistically Robust Conformal Prediction}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}


\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistically Robust Conformal Prediction (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1=]{\href{mailto:<subhankar.ghosh@wsu.edu>}{Subhankar Ghosh}{}}
\author[1=]{Yuanjie Shi}
\author[1]{Taha Belkhouja}
\author[1]{Yan Yan}
\author[1]{Janardhan Rao Doppa}
\author[2]{Brian Jones}
% Add affiliations after the authors
\affil[1]{%
    School of Electrical Engineering and Computer Science\\
    Washington State University
}
\affil[2]{%
    Proofpoint Inc.   
}  
\onecolumn
\maketitle




\section{Technical Proofs}

In this section, we prove the theoretical results in the main paper.
To keep the supplement complete and self-contained, we also include the proof of Proposition 1, i.e., Theorem 1 of \citet{gendler2022adversarially}, restated in the framework and notation of our paper.

\begin{proposition}
\label{proposition:AR_coverage_ARCP_appendix}
(Proposition 1 restated, adversarially robust coverage of RSCP, Theorem 1 in \citep{gendler2022adversarially})
Assume the score function $S$ is $M_r$-adversarially inflated.
Let $\calC^\AR(\widetilde X) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\AR(\alpha) \}$ be the prediction set for a testing sample $\widetilde X$.
Then RSCP achieves ($1-\alpha$)-adversarially robust coverage.
\end{proposition}
\begin{proof}
(of Proposition \ref{proposition:AR_coverage_ARCP_appendix})

After reviewing the inflated quantile in the adversarial sense, we extend it to the following probabilistic sense.
\begin{align*}
\P_Z \{ 
S(X + \epsilon, Y) \leq \tau^\AR(\alpha)
\}
\geq &
\P_Z \{
S(X, Y) + M_r
\leq 
\tau^\AR(\alpha)
\}
\\
= &
\P_Z \{
S(X, Y) + M_r
\leq 
Q(\alpha) + M_r
\}
\\
= &
\P_Z \{ S(X, Y) \leq Q(\alpha) \}
\\
= &
\P_{X,Y} \{ S(X, Y) \leq Q(\alpha) \}
\geq 
1 - \alpha ,
\end{align*}
where the first inequality is due to the condition of $M_r$-adversarially inflated conformity score function (Definition 2), the first equality is due to the setting of the inflated threshold $\tau^\AR(\alpha) = Q(\alpha) + M_r$,
and the last inequality is due to the definition of quantile $Q(\alpha)$.
\end{proof}



\begin{proposition}
\label{proposition:PR_coverage_iPRCP_appendix}
(Proposition 2 restated, probabilistically robust coverage of iPRCP)
Assume the score function $S$ is $M_{r, \eta}$-probabilistically inflated.
Let $\calC^\iPR(\widetilde X) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\iPR(\alpha; \eta) \}$ be the prediction set for a testing sample $\widetilde X=X+\epsilon$. 
Then iPRCP achieves ($1-\alpha$)-probabilistically robust coverage.
\end{proposition}


\begin{proof}
(of Proposition \ref{proposition:PR_coverage_iPRCP_appendix})

Denote $A_{r, \eta} = \{ Z \in \calX \times \calY \times \calE_r : S(X + \epsilon, Y) \leq S(X, Y) + M_{r, \eta} \}$, which implies $\P_Z \{ Z \in A_{ r, \eta } \} \geq 1 - \eta$.
Recall $\tau^\iPR(\alpha'; \eta) = Q(\alpha') + M_{r, \eta}$ for $\alpha'$ and $\eta$.
\begin{align*}
&
\P_Z \{ S(X + \epsilon, Y) \leq \tau^\iPR(\alpha'; \eta) \}
\\
= &
\P\{ Z \in A_{r, \eta} \} \cdot \P_Z \{ S(X + \epsilon, Y) \leq \tau^\iPR(\alpha'; \eta) | Z \in A_{r, \eta} \}
\\
&
+ \P\{ Z \notin A_{r, \eta} \} \cdot \P_Z \{ S(X + \epsilon, Y) \leq \tau^\iPR(\alpha'; \eta) | Z \notin A_{r, \eta} \}
\\
\geq & 
( 1 - \eta ) \cdot \P_Z \{ S(X + \epsilon, Y) \leq \tau^\iPR(\alpha'; \eta) | Z \in A_{r, \eta} \}
\\
\geq &
( 1 - \eta ) \cdot \P_Z \{ S(X, Y) + M_{r, \eta} \leq Q(\alpha') + M_{r, \eta} | Z \in A_{r, \eta} \} 
\\
= &
( 1 - \eta ) \cdot \P_{X, Y} \{ S(X, Y) \leq Q(\alpha') \}
\\
\geq &
( 1 - \eta ) ( 1 - \alpha' ) ,
\end{align*}
where the first inequality is due to the non-negativity of probability and the definition of $A_{r, \eta}$, and
the second inequality is due to $M_{r, \eta}$-probabilistically inflated score function (7).


In this case, define $\alpha^*_\iPR(\alpha; \eta) := \max\{ \alpha' : (1-\eta)(1-\alpha') \geq 1-\alpha \}$, and we can use $\tau^\iPR(\alpha^*_\iPR(\alpha; \eta); \eta)$ as the threshold to derive $(1-\alpha)$-probabilistically robust coverage.
However, this requires knowing the conformity score function well enough to access the value of $M_{r, \eta}$ for a given $\eta$ in order to determine $\tau^\iPR(\alpha^*_\iPR(\alpha; \eta); \eta)$, which is not always possible in practice.
\end{proof}




\begin{theorem}
\label{theorem:appendix:prob_robust_coverage_aPRCP}
(Theorem 1 restated, probabilistically robust coverage of aPRCP)
Let $\calC^\aPR(\widetilde X = X + \epsilon) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\aPR(\alpha; s) \}$ be the prediction set for a testing sample $\widetilde X$.
Then aPRCP achieves ($1-\alpha$)-probabilistically robust coverage.
\end{theorem}



\begin{proof}
(of Theorem \ref{theorem:appendix:prob_robust_coverage_aPRCP})

Denote $B = \{ (X, Y) \in \calX \times \calY : Q^\rob(X, Y; \alpha^*_\aPR) \leq \tau^\aPR(\alpha; s)\}$,
which implies that 
\begin{align}\label{eq:prob_B}
\P_{X,Y}\{ (X,Y) \in B \} \geq 1-\alpha+s
\end{align}
due to the definition of $\tau^\aPR(\alpha; s)$ in (9).
We simply check whether $\tau^\aPR(\alpha; s)$ can give us probabilistically robust coverage as follows:
\begin{align}\label{eq:PRCP_coverage}
&
\P_Z \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\nonumber\\
= &
\P_{X, Y}\{ X, Y : Q^\rob(X, Y; \alpha^*_\aPR) \leq \tau^\aPR(\alpha; s) \} \cdot \P_{\epsilon | X, Y} \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\nonumber\\
&
+ \P_{X, Y}\{ X, Y : Q^\rob(X, Y; \alpha^*_\aPR) > \tau^\aPR(\alpha; s) \} \cdot \P_{\epsilon | X, Y} \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\nonumber\\
\geq &
\P_{X, Y}\{ X, Y : Q^\rob(X, Y; \alpha^*_\aPR) \leq \tau^\aPR(\alpha; s) \} \cdot \P_{\epsilon | (X, Y) \in B} \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\nonumber\\
\geq &
\P_{X, Y}\{ (X, Y) \in B \} \cdot \P_{\epsilon | (X, Y) \in B} \{ S(X + \epsilon, Y) \leq Q^\rob(X, Y; \alpha^*_\aPR) \}
\nonumber\\
\geq &
( 1 - \alpha + s ) \cdot \P_{\epsilon | (X, Y) \in B } \{ S(X + \epsilon, Y) \leq Q^\rob(X, Y; \alpha^*_\aPR) \}
\\
\geq &
( 1 - \alpha + s ) ( 1 - \alpha^*_\aPR ),
\nonumber
\end{align}
where the first inequality is due to the non-negativity of probability,
the second inequality is due to $Q^\rob(X,Y;\alpha^*_\aPR) \leq \tau^\aPR(\alpha; s)$ for $(X,Y) \in B$,
the third inequality is due to (\ref{eq:prob_B}),
and the last inequality is due to the definition of robust quantile $Q^\rob(X, Y; \tilde \alpha)$ in (8).

Recall $\alpha^*_\aPR = 1 - (1-\alpha) / (1-\alpha + s)$, so $( 1 - \alpha + s ) ( 1 - \alpha^*_\aPR ) = 1-\alpha$, which shows
\begin{align*}
\P_Z \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\geq 
1 - \alpha .
\end{align*}
\end{proof}


\begin{lemma}
\label{lemma:cross_domain_noise_coverage}
(Inflated probability for cross domain noise)
Assume $ \P_{\epsilon \sim \calP_\epsilon^{cal}}\{\epsilon\} - \P_{\epsilon \sim \calP_\epsilon^{test}}\{\epsilon\} \leq d$ for all $\| \epsilon \| \leq r$.
Then, for any threshold $\tau$, the following inequality holds:
\begin{align}
\label{eq:lemma1}
\P_{ \epsilon \sim \calP_\epsilon^{cal} | X, Y } \{ S(X + \epsilon, Y) \leq \tau \}
-
\P_{ \epsilon \sim \calP_\epsilon^{test} | X, Y } \{ S(X + \epsilon, Y) \leq \tau \} 
\leq
d .
\end{align}
\end{lemma}

\begin{proof}
(of Lemma \ref{lemma:cross_domain_noise_coverage})

\begin{align*}
&
\P_{\epsilon \sim \calP_\epsilon^{cal}} \{ S(X + \epsilon, Y) \leq \tau \}
-
\P_{\epsilon \sim \calP_\epsilon^{test}} \{ S(X + \epsilon, Y) \leq \tau \}
\\
= &
\E_{\epsilon \sim \calP_\epsilon^{cal}} [ \indicator [ S(X + \epsilon, Y) \leq \tau ] ]
-
\E_{\epsilon \sim \calP_\epsilon^{test}} [ \indicator[ S(X + \epsilon, Y) \leq \tau ] ]
\\
= &
\int_\epsilon \P_{\epsilon \sim \calP_\epsilon^{cal}} \{ \epsilon \} \cdot \indicator [ S(X + \epsilon, Y) \leq \tau ] d\epsilon
- \int_\epsilon \P_{\epsilon \sim \calP_\epsilon^{test}} \{ \epsilon \} \cdot \indicator [ S(X + \epsilon, Y) \leq \tau ] d\epsilon
\\
= &
\int_\epsilon \Big( \P_{\epsilon \sim \calP_\epsilon^{cal}} \{ \epsilon \} - \P_{\epsilon \sim \calP_\epsilon^{test}} \{ \epsilon \} \Big) \cdot \indicator [ S(X + \epsilon, Y) \leq \tau ] d\epsilon
\\
\leq &
\int_\epsilon ( d \cdot 1 ) d\epsilon
=
d .
\end{align*}

% The same proof can be applied to $\P_{\epsilon \sim \calD_2} \{ S(X + \epsilon, Y) \leq \tau \}
% -
% \P_{\epsilon \sim \calD_1} \{ S(X + \epsilon, Y) \leq \tau \} \leq d.$
\end{proof}


\begin{theorem}
\label{theorem:appendix:prob_robust_coverage_aPRCP_cross_domain_noise}
(Theorem 2 restated, probabilistically robust coverage of aPRCP for cross domain noise)
Let $\calP_\epsilon^{test}$ and $\calP_\epsilon^{cal}$ denote different distributions of $\epsilon$ during the testing and calibration phase, respectively.
Assume $\P_{\epsilon \sim \calP_\epsilon^{cal}}\{\epsilon\} - \P_{\epsilon \sim \calP_\epsilon^{test}}\{\epsilon\} \leq d$ for all $\| \epsilon \| \leq r$.
Set $\alpha^*_\aPR = 1 - d - ( 1 - \alpha) / (1 - \alpha + s )$ in (9).
Let $\calC^\aPR(\widetilde X = X + \epsilon) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\aPR(\alpha; s) \}$ be the prediction set for a testing sample $\widetilde X$.
Then aPRCP achieves ($1-\alpha$)-probabilistically robust coverage under $\calP_\epsilon^{test}$.
\end{theorem}

\begin{proof}
(of Theorem \ref{theorem:appendix:prob_robust_coverage_aPRCP_cross_domain_noise})
We start from (\ref{eq:PRCP_coverage}) in the proof of Theorem \ref{theorem:appendix:prob_robust_coverage_aPRCP}, which only considers the noise $\epsilon$ drawn from the same distribution during calibration and testing, and extend it to the cross-domain setting as follows.
\begin{align*}
&
\P_{ X, Y, \epsilon \sim \calP_\epsilon^{test}} \{ S(X + \epsilon, Y) \leq \tau^\aPR(\alpha; s) \}
\\
\geq &
( 1 - \alpha + s ) \cdot \P_{\epsilon \sim \calP_\epsilon^{test} | (X, Y) \in B } \{ S(X + \epsilon, Y) \leq Q^\rob(X, Y; \alpha^*_\aPR) \}
\\
\geq &
( 1 - \alpha + s ) \cdot \Big( \P_{\epsilon \sim \calP_\epsilon^{cal} | (X, Y) \in B } \{ S(X + \epsilon, Y) \leq Q^\rob(X, Y; \alpha^*_\aPR) \} - d \Big)
\\
\geq &
( 1 - \alpha + s ) \cdot \Bigg( 1 - \Big( 1 - d - \frac{1-\alpha}{1-\alpha+s} \Big) - d \Bigg)
\\
= &
( 1 - \alpha + s ) \cdot \frac{ 1 - \alpha }{ 1 - \alpha + s }
= 
1 - \alpha ,
\end{align*}
where the first inequality follows (\ref{eq:PRCP_coverage}), 
the second inequality is due to inequality (\ref{eq:lemma1}) in Lemma \ref{lemma:cross_domain_noise_coverage}, and
the third inequality is due to the definition $Q^\rob(X, Y; \alpha^*_\aPR)$ in (8) with $\alpha^*_\aPR = 1 - d - (1-\alpha) / (1-\alpha+s)$.
\end{proof}

\begin{corollary}
\label{corollary:compare_aPRCP_ARCP_appendix}
(Corollary 3 restated)
To achieve the same ($1-\alpha$)-probabilistically robust coverage on $Z$, the following inequalities hold: \begin{align*}
\min_{ \eta \in [0, \alpha] } \tau^\iPR(\alpha; \eta) \leq \tau^\AR(\alpha), ~~
\min_{ s \in [0, \alpha] }  \tau^\aPR(\alpha; s) \leq \tau^\AR(\alpha) .
\end{align*}
\end{corollary}


\begin{proof}
(of Corollary \ref{corollary:compare_aPRCP_ARCP_appendix})
For adaptive PRCP, if $s = 0$, to achieve ($1-\alpha$)-probabilistically robust coverage over $Z$, we must have $\alpha^*_\aPR = 0$.
Since $\alpha^*_\aPR$ controls how aggressively we derive the robust quantile for $(X, Y)$, it indicates that we have to consider $1$-robust quantile.
This is equivalent to deriving the adversarial $S(X+\epsilon, Y)$ for all $(X, Y)$.



For inflated PRCP, if $\eta=0$, to achieve ($1-\alpha$)-probabilistically robust coverage, we have $M_{r, \eta} = M_r$ and $\alpha^*_\iPR = \alpha$, recovering ARCP (adversarially robust conformal prediction).
This case is exactly the same as adaptive PRCP with $s=0$.
Therefore, $\tau^\AR(\alpha) = \tau^\iPR(\alpha; 0) = \tau^\aPR(\alpha; 0)$.


Note that $\min_{s \in [0, \alpha]} \tau^\aPR(\alpha; s) \leq \tau^\aPR(\alpha; 0)$ 
and $\min_{\eta\in [0, \alpha]} \tau^\iPR(\alpha; \eta) \leq \tau^\iPR(\alpha; 0)$,
so by tuning the value of $s$ for aPRCP and the value of $\eta$ for iPRCP, to achieve the same probabilistically robust coverage $1-\alpha$, we can have a more efficient threshold than ARCP.
\end{proof}


\begin{proposition}
\label{proposition:empirical_quantile_concentration_appendix}
(Proposition 3 restated, concentration inequality for quantiles)
Let $Q(\alpha) = \max\{ t : \P_V\{ V \leq t \} \geq 1 - \alpha \}$ be the true quantile of a random variable $V$ given $\alpha$,
and $\widehat Q_n(\alpha) = V_{ ( \lceil (n+1) ( 1 - \alpha ) \rceil ) }$ be the empirical quantile estimated from a set of $n$ random samples $\{V_i\}_{i=1}^n$.
Then with probability at least $1-\delta$, we have
$
% | \sum_{i=1}^n Z_i - p n | \geq \frac{ \log(2 / \delta) }{ \sqrt{n} }  \
% | \widehat Q_n(\alpha) - Q(\alpha) |
% \leq 
% \tilde O( 1 / \sqrt{n} ), 
\widehat Q_n(\alpha + \tilde O(1/\sqrt{n}))
\leq
Q(\alpha)
\leq
\widehat Q_n(\alpha - \tilde O(1/\sqrt{n}))
$  
where $\tilde O$ hides the logarithmic factor.
\end{proposition}

\begin{proof}
(of Proposition \ref{proposition:empirical_quantile_concentration_appendix})

Define $Z_i = \indicator{ [ V_i \leq Q(\alpha) ] }$ where $1 \leq i \leq n$ and $\indicator[\cdot]$ is an indicator function.
Then $Z_{i}$ is a Bernoulli random variable with $\P\{ Z_i = 1 \} = 1 - \alpha$ and $\P\{ Z_i = 0 \} = \alpha$ from the definition of $Q(\alpha)$.
Let $\widehat Z = \frac{1}{n} \sum_{i=1}^n Z_i$ and $\E[\widehat Z] = 1-\alpha.$


According to the Chernoff bound, we know
\begin{align*}
\P\Bigg\{ \Bigg| \frac{1}{n} \sum_{i=1}^n Z_i - \E[\widehat Z] \Bigg| \geq \varepsilon \E[\widehat Z] \Bigg\}
\leq 
2 \exp\Bigg( - n \E[\widehat Z] \varepsilon^2 / 3 \Bigg) 
=
2 \exp\Bigg( - n (1-\alpha) \varepsilon^2 / 3 \Bigg) .
\end{align*}


By setting $\delta = 2 \exp( - n (1-\alpha) \varepsilon^2 / 3 )$, i.e., $\varepsilon = \sqrt{ ( 3 \log(2/\delta) ) / ( ( 1 - \alpha ) n  ) }$, we have with probability at least $1-\delta$:
\begin{align}\label{eq:abs_bound}
\Bigg| \frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq Q(\alpha) ] - ( 1 - \alpha ) \Bigg| 
\leq 
\varepsilon ( 1 - \alpha )
=
\sqrt{ ( 3 ( 1 - \alpha )  \log(2/\delta) ) / n }
=
\tilde O(1 / \sqrt{n}) .
\end{align}

% We have $\hat Q_n(\alpha)= \frac{1}{n} \sum_{i=1}^n Z_{i}$, $Q(\alpha)= E(Z_{i})=\frac{1}{n} \sum_{i=1}^n p_{i}= 1-\alpha$. \\
% Denote $| \hat Q_n(\alpha) - Q(\alpha) |=\gamma$\\
% According to Chernoff bound, we have 
% \begin{align*}
% P[| \sum_{i=1}^n Z_{i}-(1-\alpha)n| \geq \gamma n] \leq 2exp(-2 \gamma^2 n)] =\delta\\
% \gamma \leq \sqrt{\frac{ \log(2/\delta) }{ 2{n} }}
% \end{align*}


Recall the definition of the empirical quantile $\widehat Q_n(\alpha)$ given $\alpha$:
\begin{align*}
\widehat Q_n(\alpha) = \max\Bigg\{ t : \frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq t ] \geq 1 - \alpha \Bigg\} .
\end{align*}
Then we know the following upper bound and lower bound for $1-\alpha$:
\begin{align*}
( 1 - \alpha )
\leq 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq \widehat Q_n(\alpha) ] , ~~~
( 1 - \alpha )
\geq 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq \widehat Q_n(\alpha + 1 / n ) ] .
\end{align*}



Re-arranging (\ref{eq:abs_bound}) and using the above upper/lower bounds, with probability at least $1-\delta$, we have
\begin{align*}
&
( 1 - \alpha ) ( 1 - \varepsilon )
\leq 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq Q(\alpha) ]
\leq 
( 1 - \alpha ) ( 1 + \varepsilon)
\\
\Leftrightarrow ~~~
& 
1 - ( \underbrace{ 1 - ( 1 - \alpha ) ( 1 - \varepsilon ) }_{ = \alpha' } )
\leq 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq Q(\alpha) ]
\leq 
1 - ( \underbrace{ 1 - ( 1 - \alpha ) ( 1 + \varepsilon) }_{ = \alpha'' } )
\\
\Rightarrow ~~~ 
& 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq \widehat Q_n( \alpha' + 1/n ) ]
\leq
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq Q(\alpha) ]
\leq 
\frac{1}{n} \sum_{i=1}^n \indicator[ V_i \leq \widehat Q_n( \alpha'' ) ] 
\\
\Leftrightarrow ~~~
&
\widehat Q_n(\alpha' + 1/n)
\leq 
Q(\alpha)
\leq 
\widehat Q_n(\alpha'') .
\end{align*}


Finally, we analyze $\alpha'$ and $\alpha''$ as follows
\begin{align*}
\alpha' 
=
1 - (1-\alpha) (1-\varepsilon)
=
\alpha + \varepsilon (1-\alpha)
=
\alpha + \sqrt{ 3 ( 1 - \alpha ) \log(2/\delta) / n }
=
\alpha + \tilde O(1/\sqrt{n}),
\\
\alpha''
=
1 - (1-\alpha) (1+\varepsilon)
=
\alpha - \varepsilon (1-\alpha)
=
\alpha - \sqrt{ 3 ( 1 - \alpha ) \log(2/\delta) / n }
=
\alpha - \tilde O(1/\sqrt{n}).
\end{align*}


Therefore, we have
\begin{align*}
\widehat Q_n(\alpha + \tilde O(1/\sqrt{n}))
\leq
Q(\alpha)
\leq
\widehat Q_n(\alpha - \tilde O(1/\sqrt{n})) .
\end{align*}
\end{proof}





\section{Additional Experiments and Implementation Details}

\textbf{Implementation details.}
Table \ref{tab:appendix_Acc_clean_adv} shows the testing accuracy of the different deep models using both standard training ($\sigma=0$) and Gaussian augmented training ($\sigma>0$). %Table \ref{hyper_param} shows the size of subset of data used to tune the hyper-parameter $s$.

% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table*}[!h]
\centering
\begin{tabular}{|c|c|cc|cc|cc|}
\hline
\multirow{2}{*}{Architecture} & \multirow{2}{*}{Training} & \multicolumn{2}{c|}{CIFAR10}             & \multicolumn{2}{c|}{CIFAR100}            & \multicolumn{2}{c|}{ImageNet}            \\ \cline{3-8} 
                              &                                    & \multicolumn{1}{c|}{Clean(\%)} & Adv(\%) & \multicolumn{1}{c|}{Clean(\%)} & Adv(\%) & \multicolumn{1}{c|}{Clean(\%)} & Adv(\%) \\ \hline
\multirow{2}{*}{ResNet-110}   & $\sigma = 0.0$                                & \multicolumn{1}{c|}{89.99}     & 26.71   & \multicolumn{1}{c|}{71.12}     & 12.20   & \multicolumn{1}{c|}{-}         & -       \\ \cline{2-8} 
                              & $\sigma = 0.125$                              & \multicolumn{1}{c|}{81.70}     & 67.80   & \multicolumn{1}{c|}{58.11}     & 42.01   & \multicolumn{1}{c|}{-}         & -       \\ \hline
\multirow{2}{*}{VGG-19}       & $\sigma = 0.0$                                & \multicolumn{1}{c|}{93.10}     & 54.96   & \multicolumn{1}{c|}{72.22}     & 23.10   & \multicolumn{1}{c|}{-}         & -       \\ \cline{2-8} 
                              & $\sigma = 0.125$                              & \multicolumn{1}{c|}{86.50}     & 72.10   & \multicolumn{1}{c|}{55.12}     & 40.85   & \multicolumn{1}{c|}{-}         & -       \\ \hline
\multirow{2}{*}{DenseNet-161} & $\sigma = 0.0$                                & \multicolumn{1}{c|}{95.42}     & 23.28   & \multicolumn{1}{c|}{77.10}     & 04.30   & \multicolumn{1}{c|}{-}     & -   \\ \cline{2-8} 
                              & $\sigma = 0.125$                              & \multicolumn{1}{c|}{88.17}     & 73.15   & \multicolumn{1}{c|}{60.32}     & 46.91   & \multicolumn{1}{c|}{-}         & -       \\ \hline
\multirow{2}{*}{ResNet-50}    & $\sigma = 0.0$                                & \multicolumn{1}{c|}{-}         & -       & \multicolumn{1}{c|}{-}         & -       & \multicolumn{1}{c|}{75.69}     & 19.56   \\ \cline{2-8} 
                              & $\sigma = 0.250$                               & \multicolumn{1}{c|}{-}         & -       & \multicolumn{1}{c|}{-}         & -       & \multicolumn{1}{c|}{68.62}     & 56.15   \\ \hline
\end{tabular}
\caption{Testing accuracy of different deep models on clean and adversarial test examples (generated using the PGD attack algorithm) for all three data sets.}
\label{tab:appendix_Acc_clean_adv}
\end{table*}


% \begin{table*}[]
% \centering
% \begin{tabular}{|l|l|l|l|}
% \hline
% Data                            & CIFAR10 & CIFAR100 & ImageNet \\ \hline
% \# data for parameter $s$ & 3000    & 3000     & 5000     \\ \hline
% \end{tabular}
% \caption{We tune $s$ parameter for the \texttt{aPRCP}(worst-adv) method. The number of data points to tune $s$ for each data set is given in the table.}
% \label{hyper_param}
% \end{table*}



\subsection{Case of Similar Noise Distribution for both Calibration  and Testing}
\noindent{\bf Performance evaluation with a fixed $s$ hyper-parameter and varying $\tilde{\alpha}$. }
We present in Figures \ref{C100_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both} and \ref{C10_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
the probabilistic robust coverage and prediction set size performance of aPRCP using \textit{the Uniform distribution as a noise distribution for both calibration and testing purposes} respectively for the CIFAR100 and CIFAR10 datasets with the three different models that are trained with clean data. Similarly, we present in Figures \ref{C100_gaussian_cal_gaussian_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both} and \ref{C10_gaussian_cal_gaussian_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
the probabilistic robust coverage and prediction set size performance of aPRCP using \textit{the Gaussian distribution as a noise distribution for both calibration and testing purposes}. For calibration, we sample $m_s = 128$ noisy data points from the surrounding of each data point ($||\epsilon||_2 \leq 0.125$). For testing, we sample $n_s = 128$ data points from the surrounding of each testing point ($||\epsilon||_2 \leq 0.125$). We observe that the probabilistic robust coverage for noisy data increases monotonically as we increase the quantile robust coverage for each ball from $1 - \tilde{\alpha} = 0.90$ to $1 - \tilde{\alpha} = 1.0$. These observations hold for both conformal scores (HPS and APS) and using different deep neural network models. 




\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/C100_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C100_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/C10_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on CIFAR10 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C10_uni_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}


\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/C100_gaussian_cal_gaussian_test_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C100_gaussian_cal_gaussian_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}
\clearpage
\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/C10_gaussian_cal_gaussian_test_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on CIFAR10 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C10_gaussian_cal_gaussian_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}


\noindent{\bf Performance evaluation with a fixed $\tilde{\alpha}$ hyper-parameter and varying $s$.}

Figures \ref{C100_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both} and \ref{C10_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
show the probabilistic robust coverage and prediction set size respectively for the CIFAR100 and CIFAR10 datasets with three different deep models that are trained using standard training. For calibration, we sample $m_s = 128$ noisy data points using the uniform sampling distribution from the surrounding of each data point ($||\epsilon||_2 \leq 0.125$). For testing, we sample $n_s = 128$ data points uniformly from the surrounding of each testing point ($||\epsilon||_2 \leq 0.125$). We observe that the probabilistic robust coverage for noisy data increases as we increase the $s$ parameter value from $0.0$ to $0.09$. This observation matches our proposition as a higher $s$ value produces higher coverage. The above observations hold for both conformal scores (APS and HPS) using different deep neural network models. 

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/C100_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$ while varying the $s$ parameter, evaluated on CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C100_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}


\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/C10_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$ while varying the $s$ parameter, evaluated on CIFAR10 dataset for three different models. The target coverage is $90\%$. The results are shown over 50 runs for all three neural network models.}
\label{C10_s_changes_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}


\noindent{\bf Performance evaluation with fixed $s$ and $\tilde{\alpha}$ hyper-parameter and varying sampling radius ($||\epsilon||_2 \leq r$) around test samples.}
Figures \ref{C10_delta_changes_cvg} and \ref{C10_delta_changes_size}
present the probabilistic robust coverage and the prediction set size respectively for the CIFAR10 dataset. Similarly, Figures \ref{C100_delta_changes_cvg} and \ref{C100_delta_changes_size}
present probabilistic robust coverage and prediction set size for the CIFAR100 dataset. We employ three different deep models that are trained with clean data. For calibration, we sample $m_s = 128$ noisy data points using the uniform sampling distribution from the surrounding of each data point ($\|\epsilon\|_2 \leq 0.125$), where $\epsilon$ is sampled uniformly over the segment $[0, 0.125]$. For testing, we sample $n_s = 128$ data points uniformly from the surrounding of each testing point ($\|\epsilon\|_2 \leq r$ with $r \in \{1.0, 2.0, 3.0\}$), where $\epsilon$ is uniformly sampled over the segments $[0, 1]$, $[0, 2]$, and $[0, 3]$, respectively. 
%The $||\epsilon||_2$ for testing is higher than the $||\epsilon||_2$ for the calibration, so we can see the coverage decays at $d = 0.0$ as we increase the radius($||\epsilon||_2$) for the calibration because of the distribution shift. But, when we take $d = 0.1$, we are able to guarantee the probabilistic robust coverage. 
We observe that the probabilistic robust coverage for noisy data decays as we increase the sampling radius. Additionally, we note that when we set the $d$ parameter to $0.1$ (accounting for the change in noise distribution between calibration and testing as per Theorem 2), we guarantee achieving the target coverage. These observations hold for both conformal scores (APS and HPS) using different deep neural network models. 

%We observe that the probabilistic robust coverage for noisy data decays as we increase the sampling radius for the testing because there is a distribution shift between the calibration and the testing and eventually the probabilistic robust coverage is not satisfied for some cases at $d = 0.0$. When we set the $d$ parameter to $0.1$ (accounting for the change in noise distribution between calibration and testing as per Theorem 2), we guarantee achieving the target coverage. These observations hold for both conformal scores (APS and HPS) using different deep neural network models. 



\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/dataset_CIFAR10uni_cal_uni_eval_delta_changes_cvg.png}
\caption{Probabilistic robust coverage evaluated on the CIFAR10 dataset for three different models. The target coverage is $90\%$. The results are shown over 50 runs for all three neural network models.}
\label{C10_delta_changes_cvg}
\end{figure}

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/dataset_CIFAR10uni_cal_uni_eval_delta_changes_size.png}
\caption{Prediction set size evaluated on CIFAR10 dataset for three different deep models. The results are shown over 50 different runs.}
\label{C10_delta_changes_size}
\end{figure}

\begin{figure*}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/dataset_CIFAR100uni_cal_uni_eval_delta_changes_cvg.png}
\caption{Probabilistic robust coverage evaluated on CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C100_delta_changes_cvg}
\end{figure*}


\clearpage

\begin{figure*}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/dataset_CIFAR100uni_cal_uni_eval_delta_changes_size.png}
\caption{Prediction set size evaluated on CIFAR100 dataset for three different deep models. The results are shown over 50 different runs.}
\label{C100_delta_changes_size}
\end{figure*}


\subsection{Case of Dissimilar Noise Distributions for Calibration and Testing}
\noindent{\bf Gaussian distribution for Calibration  and  Uniform distribution for Testing with a fixed $s$ hyper-parameter and varying $\tilde{\alpha}$.}
Figures \ref{C100_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both} and \ref{C10_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both} present the probabilistic robust coverage and prediction set size for the CIFAR100 and CIFAR10 datasets, respectively, with three different deep models that are trained with clean data. For calibration, we sample $m_s = 128$ data points using the Gaussian sampling distribution from the surrounding of each data point ($||\epsilon||_2 \leq 0.125$). For testing, we sample $n_s = 128$ data points uniformly from the surrounding of each testing point ($||\epsilon||_2 \leq 0.125$). We observe that the probabilistic robust coverage increases compared to the case of using the same distribution for sampling during the testing and calibration phases. 

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/C100_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on the CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C100_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/C10_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on the CIFAR10 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{C10_gaussian_cal_uni_eval_ratio_0.0PRCP_fixed_ns_cvg_Size_Both}
\end{figure}


\noindent{\bf Uniform distribution for Calibration and  Gaussian distribution for Testing with a fixed $s$ hyper-parameter and varying $\tilde{\alpha}$.}
Figures \ref{uni_cal_C100_fixed_ms_APS_HPS_sigma_0_0_cvg} and \ref{uni_cal_C100_fixed_ms_APS_HPS_sigma_0_0_size} present the probabilistic robust coverage and prediction set size for the CIFAR100 and CIFAR10 datasets, respectively, with three different deep models that are trained with clean data. For calibration, we sample $m_s = 128$ data points using the uniform sampling distribution from the surrounding of each data point ($||\epsilon||_2 \leq 0.125$). For testing, we sample $n_s = 128$ data points using the Gaussian distribution from the surrounding of each testing point ($||\epsilon||_2 \leq 0.125$). We observe a slightly different performance of aPRCP compared to the case of using the same distribution for noise during the testing and calibration phases. This observation corroborates the statement of Theorem 2 and Remark 2, which relate the gap in probability density between the calibration and testing noise distributions to the probabilistic robust coverage of aPRCP.

\begin{figure}[h!]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C100/C100_uni_cal_gaussian_test_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on the CIFAR100 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{uni_cal_C100_fixed_ms_APS_HPS_sigma_0_0_cvg}
\end{figure}

\begin{figure}[!h]
\centering
\includegraphics[width=.9\linewidth]{Figures1/C10/C10_uni_cal_gaussian_test_ratio_0.0PRCP_fixed_ns_cvg_Size_Both.png}
\caption{Probabilistic robust coverage (top) and prediction set size (bottom) obtained by aPRCP$(\tilde{\alpha} = 0.10)$, aPRCP$(\tilde{\alpha} = 0.03)$, 
aPRCP$(\tilde{\alpha} = 0.06)$, aPRCP$(\tilde{\alpha} = 0.09)$,
and aPRCP$(\tilde{\alpha} = 0.00)$, evaluated on the CIFAR10 dataset for three different deep models. The target coverage is $90\%$. The results are shown over 50 different runs.}
\label{uni_cal_C100_fixed_ms_APS_HPS_sigma_0_0_size}
\end{figure}



\clearpage

\subsection{Performance of \texttt{aPRCP(worst-adv)} with varying $m_s$}
Figures \ref{C10_ms_APS_HPS_sigma_0.25} and \ref{C100_ms_APS_HPS_sigma_0.25} show the performance of aPRCP with three different deep models when varying $m_s$ (number of noisy samples for calibration) for CIFAR10 and CIFAR100 datasets respectively. We show the robust coverage and prediction set size for both \texttt{APS} and \texttt{HPS} conformity scores. Both figures show that the \texttt{aPRCP(worst-adv)} reported performance is consistent for different values of $m_s$.

We show in Figure \ref{C100_ms_APS_HPS_ARCPworstadv_RSCP} the comparison of the prediction set size and the coverage between \texttt{RSCP} and \texttt{aPRCP(worst-adv)} using both \texttt{APS} and \texttt{HPS}. We employ ResNet110 model trained with Gaussian augmented data ($\sigma = 0.125$). We observe that \texttt{RSCP} is more conservative compared to our method \texttt{aPRCP(worst-adv)} for both \texttt{APS} and \texttt{HPS} conformity scores.

We show in Figures \ref{C10_ms_APS_ARCPworstadv_RSCP_different_sigma} and \ref{C100_ms_APS_ARCPworstadv_RSCP_different_sigma} the comparison of the prediction set size and coverage between \texttt{RSCP} and \texttt{aPRCP(worst-adv)} for two different deep models trained with Gaussian augmented data ($\sigma = 0.0625$ and $\sigma = 0.125$). We observe that \texttt{aPRCP(worst-adv)} produces smaller prediction sets than \texttt{RSCP}.



\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{Figures1/C10/CIFAR10ratio_cols_arc_rows_Size_APS_Both.png}
\caption{Robust coverage (top) and prediction set size (bottom) performance of two conformity scores (APS and HPS) for different deep models with varying $m_s$ samples on calibration data for CIFAR10 dataset. The results are reported over 50 different runs. We use all models trained with Gaussian augmented data using standard deviation $\sigma = 0.25$.}
\label{C10_ms_APS_HPS_sigma_0.25}
\end{figure}

\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{Figures1/C100/CIFAR100ratio_cols_arc_rows_Size_APS_Both.png}
\caption{Robust coverage (top) and prediction set size (bottom) performance of two scores for different deep models with varying $m_s$ samples on calibration data for CIFAR100 dataset. The results are reported over 50 different runs. We use all models trained with Gaussian augmented data with standard deviation $\sigma = 0.25$.}
\label{C100_ms_APS_HPS_sigma_0.25}
\end{figure}

\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{Figures1/C100/ratio_1.0_worst_adv_RSCP_rows_Size_Both.png}
\caption{Robust coverage (top) and prediction set size (bottom) performance of two methods, namely, aPRCP(worst-adv) and RSCP, with varying $m_s$ samples on calibration data for CIFAR100 dataset. The results are reported over 50 different runs. We use all models trained with Gaussian augmented data of standard deviation $\sigma = 0.125$.}
\label{C100_ms_APS_HPS_ARCPworstadv_RSCP}
\end{figure}


\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{Figures1/C10/CIFAR10APS_worst_adv_RSCP_rows_Size_Both.png}
\caption{Robust coverage (top) and prediction set size (bottom) performance of two different models trained with Gaussian augmented data using standard deviation $\sigma = 0.0625$ and $\sigma = 0.125$ with varying $m_s$ samples on calibration data for CIFAR10 dataset. The results are reported over 50 different runs.}
\label{C10_ms_APS_ARCPworstadv_RSCP_different_sigma}
\end{figure}


\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{Figures1/C100/CIFAR100APS_worst_adv_RSCP_rows_Size_Both.png}
\caption{Robust coverage (top) and prediction set size (bottom) performance of two different models trained with Gaussian augmented data using standard deviation $\sigma = 0.0625$ and $\sigma = 0.125$ with varying $m_s$ samples on calibration data for CIFAR100 dataset. The results are reported over 50 different runs.}
\label{C100_ms_APS_ARCPworstadv_RSCP_different_sigma}
\end{figure}

\clearpage
\subsection{The effect of Varying $||\epsilon||_2 \leq r$ during calibration}
We show in Figure \ref{fig:C100_radius_changes_worst_adv} 
the robust coverage and the prediction set size achieved by aPRCP(worst-adv) on CIFAR100 with a ResNet model that is trained with Gaussian augmented data ($\sigma = 0.125$). For calibration, we sample $m_s = 128$ noisy data points using the uniform sampling distribution from the surrounding of each data point ($||\epsilon||_2 \leq r$), where $r \in \{0.125, 0.250, 1.0\}$. For testing, we generate data using an adversarial attack algorithm of energy $0.125$. We observe that the effect of small changes in the sampling radius is negligible.
\begin{figure*}[!h]
    \centering
    \begin{minipage}{\linewidth}   
        \hfill
        \begin{minipage}{\linewidth}
        \centering
            \includegraphics[width=0.6\linewidth]{Figures1/legend_clean.png}
        \end{minipage}%% 
        \hfill       
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C100/ratio_1.0ARCP_fixed_ns_cvg_Size_Both.png}
        \end{minipage}%%
        \hfill
    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) performance of the ResNet model trained with Gaussian augmented data using standard deviation $\sigma = 0.125$ with varying radius of robust quantile balls during calibration for the CIFAR100 dataset. The results are reported over 50 different runs.}
    \label{fig:C100_radius_changes_worst_adv}
\end{figure*}


\subsection{Performance of aPRCP(worst-adv) with different Deep models}
Figure \ref{arcp_model_changes_C10_C100} shows the performance of our \texttt{aPRCP}(worst-adv) using DenseNet~\citep{iandola2014densenet} and VGG~\citep{simonyan2014very} models on the CIFAR10 and CIFAR100 datasets. We use the same adversarial attack algorithm for test examples with a magnitude of $r = 0.125$. During calibration, we sample $m_s = 128$ noisy samples ($r = 0.125$) for each calibration example. We observe that the robust coverage is achieved on all three deep models with small prediction sets.

\begin{figure*}[!h]
    \centering
    \begin{minipage}{\linewidth}
    \hfill
        \begin{minipage}{\linewidth}
        \centering
            \includegraphics[width=0.6\linewidth]{Figures1/legend_clean.png}
        \end{minipage}
        \begin{minipage}{.48\linewidth}
            \centering
            (a) CIFAR10
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.48\linewidth}
            \centering
            (b) CIFAR100
        \end{minipage} 
        \begin{minipage}{.48\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/dataset_CIFAR10ARCP_fixed_ns_cvg_Size_Both.png}
        \end{minipage}%% 
        \hfill       
        \begin{minipage}{.48\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C100/dataset_CIFAR100ARCP_fixed_ns_cvg_Size_Both.png}
        \end{minipage}%%
        \hfill

    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) constructed by \texttt{aPRCP(worst-adv)} method for CIFAR10 (left) and CIFAR100 (right) datasets. The neural network models used are trained with Gaussian augmented data using standard deviation $\sigma = 0.25$. The results are reported  over 50 different runs. As can be seen, all models guarantee target coverage and VGG produces larger prediction sizes compared to other models.}
    \label{arcp_model_changes_C10_C100}
\end{figure*}

\clearpage 

\subsection{Results on Adversarial Examples Generated from a probability density distribution}
We evaluate the performance of aPRCP with a  different adversarial attack algorithm, namely NATTACK \citep{Black_box}. This attack algorithm generates a probability density distribution centered around an input from which adversarial examples can be sampled.
We employ this algorithm using an adversarial magnitude $||\epsilon||_2 \leq r = 0.125$ to generate adversarial examples for the test data of CIFAR10 and CIFAR100 on three different deep models trained with Gaussian augmented data ($\sigma = 0.125$). In all our experiments, we set $T = 1000$ as the number of maximum iterations, and a learning rate $\eta = 0.008$.

Both Figures \ref{All_blackBox_C10} and \ref{All_blackBox_C100} show that aPRCP is the only algorithm that can guarantee the adversarial robust coverage. This can be explained by the fact that RSCP requires the design of a specialized scoring function to guarantee coverage while aPRCP uses a quantile-of-quantile design and can employ any existing score function.

\begin{figure*}[!h]
    \centering
    \begin{minipage}{.98\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=.5\linewidth]{MainPaper/legend3.png}
        \end{minipage}     
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/ratio_1.0CIFAR10ARCP_black_box_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/ratio_1.0CIFAR10ARCP_black_box_Size_APS_HPS.png}
        \end{minipage}%% 
    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) constructed by three different CP methods. The target coverage is $90\%$. The results are reported over 50 different runs for the CIFAR10 data set.}
    \label{All_blackBox_C10}
\end{figure*}


\begin{figure*}[!h]
    \centering
    \begin{minipage}{.98\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=.5\linewidth]{MainPaper/legend3.png}
        \end{minipage}     
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C100/ratio_1.0CIFAR100ARCP_black_box_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C100/ratio_1.0CIFAR100ARCP_black_box_Size_APS_HPS.png}
        \end{minipage}%% 
    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) constructed by three different CP methods. The target coverage is $90\%$. The results are reported over 50 different runs for the CIFAR100 data set.}
    \label{All_blackBox_C100}
\end{figure*}



\subsection{Importance of Gaussian Augmented Training}
While aPRCP can work without any assumption on the base classifier, Figure \ref{Why_gaussian_training_C10} shows the importance of the model robustness to produce smaller prediction sets. Both \texttt{RSCP} and \texttt{aPRCP}(worst-adv) construct prediction sets that are larger when the base model is not adversarially robust.
\begin{figure*}[!h]
    \centering
    \begin{minipage}{.98\linewidth}     
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/dataset_CIFAR10ARCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/1dataset_CIFAR10ARCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}%% 
    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) constructed by three different CP methods. The target coverage is $90\%$. The results are reported over 50 different runs for the CIFAR10 data set.}
    \label{Why_gaussian_training_C10}
\end{figure*}

\subsection{aPRCP nominal performance}
Figure \ref{Clean_data_results} shows a comparison of the nominal performance (evaluation on only clean inputs) on the CIFAR10 and CIFAR100 datasets. We employ $m_s = 128$ for calibration and standard training to train the base model. We observe that aPRCP achieves a better trade-off between the nominal performance (evaluation on clean inputs) and the robust performance (evaluation on perturbed inputs). For both datasets, aPRCP achieves a tighter empirical coverage (closer to 90\%) with smaller prediction sets than RSCP.


\begin{figure*}[!h]
    \centering
    \begin{minipage}{.98\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=.6\linewidth]{Figures1/legend_clean.png}
        \end{minipage}     
        \begin{minipage}{.49\linewidth}
            \centering
            (a) CIFAR10
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.49\linewidth}
            \centering
            (b) CIFAR100
        \end{minipage} 
        \hfill
        \begin{minipage}{.49\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/ratio_0.0CIFAR10ARCP_clean_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.49\linewidth}
            \centering
            \includegraphics[width=\linewidth]{Figures1/C100/ratio_0.0CIFAR100ARCP_clean_Coverage_APS_HPS.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.49\linewidth}
            \includegraphics[width=\linewidth]{Figures1/C10/ratio_0.0CIFAR10ARCP_clean_Size_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.49\linewidth}
            \centering
            \includegraphics[width=\linewidth]{Figures1/C100/ratio_0.0CIFAR100ARCP_clean_Size_APS_HPS.png}
        \end{minipage}    
    \end{minipage}
    \caption{Robust coverage (top) and prediction set size (bottom) constructed by Vanilla CP, RSCP, and aPRCP(worst-adv) using HPS and APS conformity scoring functions (target coverage is $90\%$) for the CIFAR10 and CIFAR100 data sets. Results are averaged over 50 different runs.}
    \label{Clean_data_results}
\end{figure*}

\clearpage
\bibliography{reference}
\end{document}

