\documentclass[accepted]{uai2023}

% \usepackage{aistats2023}
% If your paper is accepted, change the options for the package
% aistats2023 as follows:
%
%\usepackage[accepted]{aistats2023}
%
% This option will print headings for the title of your paper and
% headings for the authors names, plus a copyright note at the end of
% the first column of the first page.

% If you set papersize explicitly, activate the following three lines:
%\special{papersize = 8.5in, 11in}
%\setlength{\pdfpageheight}{11in}
%\setlength{\pdfpagewidth}{8.5in}

% If you use natbib package, activate the following three lines:
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}

% If you use BibTeX in apalike style, activate the following line:
%\bibliographystyle{apalike}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{multirow}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bbm}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{comment}

\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Colour-markup helpers: each expands to \textcolor{red}, so \shired{...} and
% \yanred{...} typeset their argument in red (presumably per-author review marks).
\def\shired{\textcolor{red}}
\def\yanred{\textcolor{red}}
% Upright "s.t." abbreviation, typeset through \text.
\def\st{\text{s.t.}}


\usepackage{amsmath,amsthm,amssymb,multirow,paralist,mathrsfs,amsfonts,dsfont}
% Theorem-like environments (declared with amsthm's \newtheorem).
\newtheorem{theorem}{Theorem}
% Unnumbered environment whose heading is the fixed text "Theorem 1".
\newtheorem*{theorem1}{Theorem 1}
\newtheorem{proposition}{Proposition}
\newtheorem{property}{Property}

\newtheorem{lemma}{Lemma}
% Corollaries are numbered on the theorem counter.
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{observation}{Observation}
\newtheorem{remark}{Remark}
% Keep the original \remark as \oldremark, then redefine \remark to issue
% \normalfont immediately after the environment opens.
% NOTE(review): presumably meant to set remark bodies upright -- confirm in the output.
\let\oldremark\remark
\renewcommand{\remark}{\oldremark\normalfont}

\usepackage{enumitem}

% Indicator-function symbol: blackboard-bold I.
% The earlier candidate definitions ({\bf 1}, \mathrm I, \mathds I) were each
% overwritten by this last \def and never took effect, so only this one is kept.
\def\indicator{\mathbb I}


% Upright text labels, mainly for use as sub/superscripts in math.
\def\PR{\text{PR}}

\def\iPR{\text{iPR}}
\def\aPR{\text{aPR}}
\def\rob{\text{rob}}


\def\CP{\text{CP}}
\def\RCP{\text{RCP}}
\def\LCP{\text{LCP}}
\def\AR{\text{AR}}
% Dataset-split labels, used as \calD_\cal, \calD_\tr and \calD_\test in the body.
% NOTE(review): \cal is also the obsolete calligraphic font switch; this \def
% shadows it -- confirm no loaded package still relies on the old meaning.
\def\cal{\text{cal}}
\def\tr{\text{tr}}
\def\test{\text{test}}
\def\gt{\text{gt}}
% NOTE(review): \and is normally the author separator in \author lists; this \def
% shadows it -- confirm the class's title block does not need the original.
\def\and{\mathrm{and}}
\def\class{\mathrm{class}}

% Calligraphic-letter shorthands: \calA expands to \mathcal{A}, and so on
% (listed alphabetically; each definition is independent of the others).
\def\calB{\mathcal{B}}
\def\calC{\mathcal{C}}
\def\calD{\mathcal{D}}
\def\calE{\mathcal{E}}
\def\calF{\mathcal{F}}
\def\calM{\mathcal{M}}
\def\calN{\mathcal{N}}
\def\calP{\mathcal{P}}
\def\calR{\mathcal{R}}
\def\calS{\mathcal{S}}
\def\calT{\mathcal{T}}
\def\calX{\mathcal{X}}
\def\calY{\mathcal{Y}}
\def\calZ{\mathcal{Z}}

% Blackboard-bold shorthands (\P is used as a probability, e.g. \P_{X, Y}\{...\},
% and \R as in \R^d in the body).
% NOTE(review): \P normally prints the pilcrow sign; this \def shadows it --
% confirm the original is not needed anywhere (e.g. by the class or bibliography).
\def\E{\mathbb E}
\def\P{\mathbb P}
\def\R{\mathbb R}



% Bold symbols: bold t and bold 1.
\def\vectort{\mathbf t}
\def\vectorone{{\bf 1}}



\usepackage{newfloat}
\usepackage{listings}







\begin{document}

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{Probabilistically Robust Conformal Prediction}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}


% Template example macro: \swap[sep]{a}{b} expands to b, sep, a (sep defaults to "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistically Robust Conformal Prediction}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1=]{\href{mailto:<subhankar.ghosh@wsu.edu>}{Subhankar Ghosh}{}}
\author[1=]{Yuanjie Shi}
\author[1]{Taha Belkhouja}
\author[1]{Yan Yan}
\author[1]{Janardhan Rao Doppa}
\author[2]{Brian Jones}
% Add affiliations after the authors
\affil[1]{%
    School of Electrical Engineering and Computer Science\\
    Washington State University
}
\affil[2]{%
    Proofpoint Inc.   
}  

\maketitle

\begin{abstract}
Conformal prediction (CP) is a framework to quantify uncertainty of machine learning classifiers including deep neural networks. Given a testing example and a trained classifier, CP produces a prediction set of candidate labels with a user-specified coverage (i.e., the true class label is contained with high probability). Almost all the existing work on CP assumes clean testing data and there is not much known about the robustness of CP algorithms w.r.t.\ natural/adversarial perturbations to testing examples. This paper studies the problem of probabilistically robust conformal prediction (PRCP) which ensures robustness to most perturbations around clean input examples. PRCP generalizes the standard CP (cannot handle perturbations) and adversarially robust CP (ensures robustness w.r.t.\ worst-case perturbations) to achieve better trade-offs between nominal performance and robustness. We propose a novel adaptive PRCP (aPRCP) algorithm to 
%determine an appropriate threshold during the calibration step to 
achieve probabilistically robust coverage. The key idea 
%behind our approach
behind aPRCP is to determine two parallel thresholds, one for data samples and another one for the perturbations on data (a.k.a.\ the ``{\em quantile-of-quantile}'' design). We provide theoretical analysis to show that the aPRCP algorithm achieves robust coverage. Our experiments on CIFAR-10, CIFAR-100, and ImageNet datasets using deep neural networks demonstrate that aPRCP achieves better trade-offs than state-of-the-art CP and adversarially robust CP algorithms.

%Conformal prediction(CP) is a framework to quantify the uncertainty of black-box models. Given a testing example and a trained model, CP produces a prediction set of candidate labels with user-specified coverage(i.e., true class label is contained with high probability) for the classification. Almost all the existing works in CP assume no distribution shift between training and test data and there is not much known about the robustness of CP algorithms against adversarial test examples. This paper studies two problems: one is probabilistically robust conformal prediction (PRCP) which ensures most of the $l_2$-norm bounded perturbations around a clean input example and the other is adversarially robust conformal prediction (ARCP) which ensures finite-sample coverage against $l_2$-norm bounded adversarial examples generated by any adversarial attack algorithm. We propose a novel PRCP algorithm to achieve both probabilistic robust coverage and adversarially robust coverage. We provide theoretical analysis to show that our claims hold. Our experiments on CIFAR-10, CIFAR-100, and ImageNet datasets using different deep neural networks demonstrate that our proposed method achieves better results in both the setting than the baseline method.

%Conformal prediction (CP) is a framework to quantify uncertainty of machine learning classifiers including deep neural networks. Given a testing example and a trained classifier, CP produces a prediction set of candidate labels with a user-specified  coverage (i.e., true class label is contained with high probability). Almost all the existing work on CP assumes clean testing data and there is not much known about the robustness of CP algorithms w.r.t natural/adversarial perturbations to testing examples. This paper studies the problem of probabilistically robust conformal prediction (PRCP) which ensures robustness to most perturbations around clean input examples. PRCP generalizes the standard CP (cannot handle perturbations) and adversarially robust CP (ensures robustness w.r.t worst-case perturbations) to achieve better trade-offs between nominal performance and robustness.  We propose a novel PRCP algorithm to 
%determine an appropriate threshold during the calibration step to 
%achieve probabilistically robust coverage. The key idea 
%behind our approach
%is to determine two parallel thresholds, one for data samples and another one for the perturbations on data. We provide theoretical analysis to show that our PRCP algorithm achieves robust coverage. Our experiments on CIFAR-10, CIFAR-100, and ImageNet datasets using deep neural networks demonstrate that PRCP achieves better trade-offs than state-of-the-art CP and adversarially robust CP algorithms.

% has been recently studied and demonstrated to construct statistically valid coverage for machine learning prediction tasks. Although they nicely help quantify the prediction uncertainty, it is still unclear how to ensure the various types of robustness of the coverage for CP algorithms. Previously the notion of adversarially robust coverage has been studied, which can be guaranteed by inflating the threshold under some certain condition for the score function. In this paper, we propose a more general notation of probabilistically robust coverage for CP algorithms. Due to the difficulty of calibrating the inflation value in the case of probabilistic robustness, alternatively, we propose an adaptive algorithm that can find an appropriate threshold for achieving probabilistically robust coverage, and fit the underlying data distribution at the meantime. The general idea of this adaptive method is to determine two parallel thresholds, one in the sense of data samples and the other one in the sense of perturbation on data, which, shown by our analysis, together build the robust coverage. Experimental results verify the efficacy to guarantee probabilistic robustness of CP algorithms and the adaptivity for the underlying data distribution. 
\end{abstract}

\section{Introduction}

% - Importance of robust and safe ML deployment
% - Conformal prediction is a promising uncertainty quantification tool to enable this goal.
% - In spite of the successes of CP, there is little to no work on developing CP methods that are robust to adversarial examples. There is only one work on adversarially robust CP, but it makes strong assumptions and also considers the worst-case.
% - Motivate probabilistic robustness from practical examples
% - We study the novel problem of probabilistically robust CP.
% - Explain the overall algorithmic methodology
% - Explain the significance of theory
% - Summarize experimental results.

\begin{figure*}[t]
\centering
% \includegraphics[width=.55\linewidth]{Figures/Sktech_AISTATS.png}
\includegraphics[width=.45\linewidth]{Figures/figure_illustration_crop.pdf}
\caption{Conceptual illustration of the adaptive PRCP setting. 
The goal is to improve the robustness of the CP framework to handle perturbations $\epsilon$ bounded by $r$ for every input $X\in\mathcal{X}$. 
The robust quantile corresponding to the $1-\tilde{\alpha}$ region (blue circle around $X$) is computed by accounting for most of the perturbed data $X+\epsilon$ (see (\ref{eq:robust_quantile_x})). 
$s$ is a conservativeness parameter for the robust quantile that can be varied to achieve the target marginal coverage $1-\alpha + s$ (see (\ref{eq:aPRCP_threshold})). 
Adaptive PRCP can find a trade-off between the marginal coverage on feature space $(X, Y)$ and the robustness for perturbation $\epsilon$ by changing the value of $\tilde \alpha$ and $s$ to achieve probabilistically robust coverage (see Definition~\ref{definition:prob_robust_coverage}).
}
\label{fig:illustration}
\end{figure*}


Deep learning has shown significant success in diverse real-world applications.
%over image, natural language, and speech data. 
However, to deploy these deep models in safety-critical applications (e.g., autonomous driving and medical diagnosis), we need uncertainty quantification (UQ) tools to capture the deviation of the prediction from the ground-truth output. For example, in classification tasks, a UQ tool can produce a subset of candidate labels referred to as the {\em prediction set}. Conformal prediction (CP) \citep{vovk1999machine,vovk2005algorithmic,shafer2008tutorial} is a framework for UQ that provides formal guarantees for a user-specified {\em coverage}: ground-truth output is contained in the prediction set with a high probability $1 - \alpha$ (e.g., 90\%). %Additionally, UQ from CP is adaptive and will reflect the difficulty of testing inputs: size of the prediction set will be large for difficult inputs and small for easy inputs. 
There are two key steps in CP. First, in the prediction step, we use a black-box classifier (e.g., deep neural network) to compute %{\em conformity} (or 
{\it (non-)conformity} scores which measure similarity between calibration examples and a testing input. Second, in the calibration step, we use the conformity scores on a set of calibration examples to find a threshold to construct a prediction set which meets the coverage constraint (e.g., $1 - \alpha = 90\%$). The {\em efficiency} of CP \citep{sadinle2019least} is measured in terms of size of the prediction set (the smaller the better) which is important for human-ML collaborative systems \citep{rastogi2022unifying}. 

In spite of the recent successes of CP \citep{vovk2005algorithmic}, there is little known about the robustness of CP to adversarial perturbations of clean inputs. Most CP methods \citep{cauchois2020robust,gibbs2021adaptive,tibshirani2019conformal,podkopaev2021distribution,guan2022prediction} are brittle as they assume clean input examples and cannot handle {\em any} perturbations. 
The recent work on adversarially robust CP \citep{gendler2022adversarially} ensures robustness to {\em all} perturbations bounded by a norm ball with radius $r$. 
However, this conservative approach of dealing with {\em worst-case} perturbations can degrade the nominal performance (evaluation on only clean inputs) of the CP method. 
For example, the prediction set size can be large even for clean and easy-to-classify inputs, which increases the burden on human experts in human-ML collaborative systems \citep{cai2019human, rastogi2022unifying}. The main research question of this paper is: {\em how can we develop probably correct CP
algorithms for ensuring robustness to most perturbations
for (pre-trained) deep classifiers?} \footnote{= Equal contribution by first two authors}

To answer this question, we present a general notion for probabilistically robust coverage that balances the standard conformal coverage and the adversarial (worst-case) coverage as the fundamental setting.
To address this challenge, we develop the adaptive PRCP algorithm (aPRCP), which is based on the principle of ``{\em quantile-of-quantile}'' design and consists of two parallel quantiles, as illustrated in Figure~\ref{fig:illustration}: one defined in the perturbed noise space (see (\ref{eq:robust_quantile_x})) and the other in the data space (see (\ref{eq:aPRCP_threshold})). 
%We also proposed an approach inspired by the RSCP method for worst-case robustness \citep{gendler2022adversarially} to the probabilistic setting called inflation based PRCP (iPRCP). iPRCP is not a practical solution due the need to find inflation constant which depends on the conformity score. Hence, we do all our analysis using the proposed aPRCP method.
%(Section \ref{subsection:aPRCP}), which consists of two parallel quantiles -- one defined in the perturbed noise space (see (\ref{eq:robust_quantile_x})), the other one in the data space (see (\ref{eq:aPRCP_threshold})).
% PRCP generalizes both standard CP and adversarially robust CP using a probability parameter $p \in [0, 1]$: $p$=0 and $p$=1 corresponds to standard CP (no perturbations) and adversarially robust CP (all perturbations) respectively. 
Our analysis fixes one quantile probability as a given hyper-parameter, and finds the other one to achieve the target probabilistically robust coverage.
%, but for the conformal prediction setting. 
% nominal performance and robustness by appropriately setting the value of parameter $p$. 
% The key idea behind our PRCP algorithm is to adaptively determine two thresholds: one for data samples and another one for the perturbations on data. 
We provide theoretical analysis for probabilistic correctness of aPRCP at the population level and the approximation error of empirical quantiles as a function of the number of samples. As a result, aPRCP achieves improved trade-offs between nominal performance (evaluation on clean inputs) and robust performance (evaluation on perturbed inputs) for both probabilistic and worst-case settings as illustrated in Figure~\ref{All_ARCP_PRCP_mainPaper1}, which is analogous to the recent work on probabilistically robust learning \citep{robey2022probabilistically}.

%Our comprehensive experimental evaluation demonstrates that aPRCP achieves improved trade-offs between nominal performance (evaluation on clean inputs) and robust performance (evaluation on perturbation inputs) for both probabilistic and worst-case settings. 

%We also provide experimental analysis showing both nominal and robust performance (probabilistic and worst-case) of aPRCP and its efficiency.  %and the approximation error of empirical quantiles as a function of the number of samples.


\noindent {\bf Contributions.} The key contribution of this paper is the development, theoretical analysis, and empirical evaluation of the aPRCP algorithm. Our specific contributions include:
% \vspace{-2.0ex}
\begin{itemize}

\setlength\itemsep{0em}
\item A general notion of probabilistically robust coverage for conformal prediction against perturbations of clean input examples.
\item Development of the adaptive PRCP algorithm based on the principle of ``{\em quantile-of-quantile}'' design.
%\item A general notion of probabilistically robust coverage against noisy samples sampled from $l_2$-norm bounded ball and marginal coverage against adversarially generated data from $l_2$-norm bounded ball.
%\item Two PRCP algorithms: one practical algorithm adaptive PRCP (aPRCP) and  other based on  probabilistically inflated score function (iPRCP). We prove that both algorithms achieve our claim.
% Principled two-step calibration algorithm to adaptively determine the thresholds for PRCP setting.
\item Theory to show that aPRCP algorithm achieves probabilistically robust coverage for adversarial examples.
\item Experimental evaluation of aPRCP method on classification benchmarks using deep models to demonstrate its efficacy over prior CP methods on CIFAR-10, CIFAR-100, and ImageNet. %Our anonymized code is in the Appendix for review purposes. 

\end{itemize}

\begin{figure*}[!h]
    \centering
    \begin{minipage}{.98\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=0.6\linewidth]{MainPaper/legend11.png}
        \end{minipage}     
        \begin{minipage}{.24\linewidth}
            \centering
            (a)
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.24\linewidth}
            \centering
            (b)
        \end{minipage} 
        \begin{minipage}{.24\linewidth}
            \centering
            (c)
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.24\linewidth}
            \centering
            (d)
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.24\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_PRCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.24\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_PRCP_fixed_ns_Coverage_APS_HPS1.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.24\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_PRCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.24\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_PRCP_fixed_ns_Size_APS_HPS1.png}
        \end{minipage}    
    \end{minipage}
    \caption{Results on the CIFAR-100 dataset using a ResNet model to illustrate the trade-offs between nominal performance (evaluation on clean data) and robust performance (evaluation on adversarial examples) for Vanilla CP, RSCP, and variants of the aPRCP algorithm. (a) and (c) show the evaluation against clean examples and their corresponding noisy samples (i.e., $\tilde{X} = X + \epsilon; \|\epsilon\|_2 \leq r$) w.r.t.\ probabilistic robustness. (b) and (d) show the evaluation against clean examples and their corresponding bounded adversarial examples. aPRCP(worst-adv) is the variant of aPRCP that works for worst adversarial data. Vanilla CP fails to achieve coverage for worst-case adversarial data. RSCP achieves a robust coverage much higher than the target (nominal) coverage, resulting in large prediction sets. aPRCP achieves better results (tighter coverage and smaller prediction set size) than vanilla CP and RSCP in terms of the joint performance on clean, noisy, and worst-adversarial data.}
    \label{All_ARCP_PRCP_mainPaper1}
\end{figure*}


% \begin{figure*}[!h]
% \centering
% %\vspace{.3in}
% %\centerline{\fbox{This figure intentionally left non-blank}}
% \includegraphics[height=5.5cm, width=.65\linewidth]{Figures/first.png}
% %\vspace{.3in}
% \caption{Results on ImageNet dataset using ResNet-50 to illustrate the trade-offs between nominal performance (evaluation on clean data) and robust performance (evaluation on adversarial examples) for different CP algorithms, namely, vanilla CP, RSCP, and variants of the PRCP algorithm. (a) and (c) show the evaluation of CP algorithms against clean testing data and the corresponding worst-case adversarial data. PRCP(worst-adv) is the variant of PRCP with $(1-\tilde{\alpha})$=1.  (b) and (d) show the evaluation of CP algorithms on a sample of adversarial perturbations for each clean testing input w.r.t probabilistic robustness, i.e., most (95\%) perturbation samples should achieve coverage. Vanilla CP fails to achieve coverage for worst-case adversarial data. RSCP achieves a marginal coverage much higher than the target (nominal) coverage, resulting in large prediction sets for both clean and worst-case adversarial data. PRCP achieves better results (tighter coverage and smaller prediction set size) than vanilla CP and RSCP in terms of the joint performance on clean and adversarial data.}
% (a) and (b) show marginal coverage, and (c) and (d) show the corresponding average prediction set size results. 
% Marginal coverage (reporting the percentage of true test labels within the prediction set) and the average set size achieved by different Conformal Prediction (CP) methods on ImageNet dataset:   (b) and (d) show the evaluation of CP algorithm on most (95\%) perturbations around the input examples. Vanilla CP fails to achieve coverage for adversarial data. RSCP achieves a marginal coverage farther then the target (nominal) coverage, yielding a large prediction set. Our goal (ARCP/PRCP) is to reduce the uncertainty (i.e., reduce the prediction set) under any robustness setting (described by adversarial or random perturbations) by reducing the gap between target and empirical coverage.}
% \label{coverage:vgg}
% \end{figure*}

\section{Background and Problem Setup}

We consider the problem of uncertainty quantification (UQ) of pre-trained deep models for classification tasks in the presence of adversarial perturbations. Suppose $(X, Y)$ is a data sample where $X$ is an input from the space $\mathcal{X}$ and $Y \in \mathcal{Y}$ is the corresponding ground-truth output. For classification tasks, $\mathcal{Y}$ is a set of $C$ discrete class-labels $\{1, 2, \cdots, C\}$. 
Let $\epsilon$ denote $l_2$-norm bounded noise, i.e., $\epsilon \in \calE_r = \{ \epsilon \in \calX : \| \epsilon \|_2 \leq r \}$, that is independent of the data sample $(X, Y)$.
Let $\calP_{X, Y}$ and $\calP_\epsilon$  denote the underlying distribution of $(X, Y)$ and $\epsilon$, respectively. We also define $Z = (X, Y, \epsilon)$ as the joint random variable and the perturbed input example $\widetilde X = X + \epsilon$ for notational simplicity.


%Let $\calZ$ = $\calX \times \calY$ be the joint space of input-output pairs and the underlying distribution on $\calZ$ be $\calD_{\calZ}$.
 %As per the standard notation in conformal prediction, $X$ is a random variable and $x$ is a data sample.

%\yanred{$x$ denotes a certain sample; $X$ represents random variable in CP fashion}

%\vspace{1.0ex}

\noindent {\bf Uncertainty Quantification.} Let $\calD_\tr$ and $\calD_\cal$ correspond to sets of training and calibration examples drawn from a target distribution $\calP_{X, Y}$.
%over input-output pairs $\mathcal{Z}$. 
We assume the availability of a pre-trained deep model $F_{\theta}: \mathcal{X} \mapsto \mathcal{Y}$, where $\theta$ stands for the parameters of the deep model. For a given testing input $\widetilde X$, we want to compute UQ of the deep model $F_{\theta}$ in the form of a prediction set $\mathcal{C}(\widetilde X)$, a subset of candidate class-labels $\{1, 2, \cdots, C\}$. The performance of UQ for clean data samples (i.e.,  $\epsilon$=0) is measured using two metrics. First, the (marginal) {\em coverage} is defined as the probability that the ground-truth output $Y$ is contained in $\mathcal{C}(X)$ for a testing example $(X, Y)$ from the same data distribution $\calP_{X, Y}$, i.e., $\mathbb{P}(Y \in \mathcal{C}(X))$. The empirical coverage \texttt{Cov} is measured over a given set of testing examples $\calD_\test$. Second, {\em efficiency}, denoted by \texttt{Eff}, measures the cardinality of the prediction set $\mathcal{C}(X)$. Smaller prediction set means higher efficiency. It is easy to achieve the desired coverage (say 90\%) by always outputting $\mathcal{C}(X)$=$\mathcal{Y}$ at the expense of poor efficiency. 

% \vspace{1.0ex}

\noindent {\bf Conformal Prediction (CP).} 
CP is a framework that allows us to compute UQ for any given predictor through a conformalization step. 
The key element of CP is a score function $S$ that computes the {\em conformity} (or {\em non-conformity}) score, which measures similarity between labeled examples and is used to compare a given testing input to the calibration set $\calD_\cal$. %For example, in regression tasks, absolute residual $V((x,y^*) \in \mathcal{Z})$=$|y^* - F_{\theta}(x)|$ \citep{vovk2005algorithmic} is often used as the non-conformity score.  
Since any non-conformity score can be intuitively converted to a conformity measure \citep{vovk2005algorithmic}, we use non-conformity measure for ease of technical exposition. Let $S(X, Y)$ denote the non-conformity score function of data sample $(X, Y)$. For a sample $(X_i, Y_i)$ from the calibration set $\calD_\cal$, we use $S_i = S(X_i, Y_i)$ as a shorthand notation of its non-conformity score.



A typical method based on split conformal prediction has a threshold $\tau$ to compute UQ in the form of a prediction set for a given testing input $X$ and deep model $F_{\theta}$. A small set of calibration examples $\calD_\cal$ is used to select the threshold $\tau$ for achieving the given coverage $1-\alpha$ (say 90\%) empirically on $\calD_\cal$. 
Let $Q(\alpha) := \min \{ t : \P_{X, Y} \{ S(X, Y) \leq t \} \geq 1 - \alpha \}$ be the true quantile of the non-conformity score for $(X, Y)$.
Let $\calD_\cal = \{(X_i, Y_i)\}_{i=1}^n$ denote a calibration set with $n$ exchangeably drawn random samples from the underlying distribution $\calP_{X, Y}$.
We denote the ($1-\alpha$)-quantile derived from $\{S_i\}_{i=1}^n$ by $Q(\alpha; \{S_i\}_{i=1}^n) = S_{(\lceil (1-\alpha) (n+1) \rceil)}$, i.e., the $\lceil (1-\alpha)(n+1) \rceil$-th smallest score. The prediction set for a new testing input $X$ is given by $\mathcal{C}(X) = \{y: S(X, y) \le \tau \}$ using a threshold $\tau$. 
CP provides valid guarantees that $\mathcal{C}(X)$ has coverage $1-\alpha$ on future examples drawn from the same distribution $\calP_{X, Y}$.





% \begin{algorithm}[!h]

%     \caption{Split Conformal Prediction (CP)} %\yanred{(aligned to $1-\alpha$)} }
%     \label{alg:gen_CP}
%     \begin{algorithmic}[1]
    
%     \STATE \textbf{Input}: Significance level $\alpha \in (0, 1)$;
%     Randomly split data into training set $\calD_\tr$ and calibration set $\calD_\cal = \{ Z_1,\cdots, Z_n \}$.
    
%     % A test point $X_{n+1}$ for which we need to predict a set, error rate $\alpha \in (0, 1)$, localized hyperparameter $\lambda_L \in \mathbb{R}^+$, regularized hyperparameter $\lambda_R \in [0, 1].$
%     % \STATE Randomly split the $\mathcal{D}$ into $\mathcal{D}_{tr}$ and $\mathcal{D}_{cal}$.
    
%     \STATE If predictor $F_\theta$ is not given, train a prediction model $F_\theta$ on the training set $\calD_\tr$
%     %\STATE Train a Temperature scaling parameter $ \tau $ using $ \mathcal{D}_{temp}$.
%     %\STATE Find a hyperparameter $K_{reg}$ using $\mathcal{D}_{hyper}$ from Equation []. 
    
%     \STATE Compute non-conformity score $V_i$ for each example $Z_i \in \calD_\cal$

%     % such that $V_i = |\left[V(X_i, Y_i; \hat{\mu}) + \lambda_R * R\right]| * H(\hat{\mu}(X_{n+1}), \hat{\mu}(X_{i});\lambda_L)$

%     \STATE Compute $\hat{Q}^\CP(\alpha, V_{1:n})$ as the $\lceil (1-\alpha)(1 + |\mathcal{D}_{cal}|) \rceil$th smallest value in $\{V_i\}_{i \in \calD_\cal }$ as in (\ref{eq:CP}).
    

%     %   \STATE $\hat {\mathcal{C}}(X_{n + 1}) = \color{red}{what ??}$
%     \STATE $\hat {\mathcal{C}}(x_{n+1}) = \{ y : V(x_{n+1}, y) \leq \hat Q^\CP(\alpha, V_{1:n}) \} $ is the prediction set for a testing input $x_{n + 1}$
%     \end{algorithmic}
   
% \end{algorithm}

\begin{comment}
For classification, recent work has proposed conformity scores based on ordered probabilities \citep{NEURIPS2020_244edd7e,angelopoulos2021uncertainty}. The conformity score of adaptive prediction sets (APS) \citep{NEURIPS2020_244edd7e} is defined as follows. For a given input $X$, we get the sorted probabilities for all classes using the deep model $F_\theta$, $\pi(X,y^1) \geq \cdots \pi(X,y^C)$, and compute the score: 
\begin{equation}
\small
    S^{\text{APS}}(X, k) =  \pi(X,y^1)  + \cdots + \pi(X,y^{k-1}) + U \cdot \pi(X,y^{k})
    \label{APS}
\end{equation}
where $U \in [0,1]$ is a random variable to break ties. Suppose $L$ is the index of the ground-truth class label $Y$ in the ordered list of probabilities $\pi(x,y)$. We will employ APS as the vanilla CP baseline in our experiments.
\end{comment}

For classification, several non-conformity scores can be employed. The homogeneous prediction sets (HPS) score is defined \citep{vovk2005algorithmic, lei2013distribution} as follows:
\begin{equation}
\label{HPS_eq}
    S^{\text{HPS}}(X, y ) = 1 - F_{\theta}(X)_y,
\end{equation}
where $F_{\theta}(X)_y \in [0, 1]$ is the probability corresponding to the true class $y$ using the deep model $F_{\theta}$. Recent work has proposed the adaptive prediction sets (APS) \citep{romano2020classification} score that is based on ordered probabilities. The score function of APS is defined as follows:
\begin{align}
\label{APS_eq}
    S^{\text{APS}}(X, y ) = &\sum_{y^{'} \in \mathcal{Y}} F_{\theta}(X)_{y^{'}}\mathds{1}\left\{ F_{\theta}(X)_{y^{'}} >  F_{\theta}(X)_y\right\}
\nonumber\\
&+ u \cdot F_{\theta}(X)_y,
\end{align}
where $u$ is a random variable uniformly distributed over $[0, 1]$ and $\mathds{1}$ is the indicator function.

% \vspace{1.0ex}

\noindent {\bf Problem Definition.} The high-level goal of this paper is to study methods to improve the robustness of the standard CP framework to adversarial/noisy examples of the form $\widetilde X = X + \epsilon$, where $\epsilon$ is the additive perturbation from $\calE_r = \{ \epsilon \in \R^d : \| \epsilon \|_p \leq r\}$. 
Specifically, we propose a novel adaptive probabilistically robust conformal prediction (aPRCP) algorithm which accounts for a $(1-\tilde \alpha)$ fraction of the perturbations in $\calE_r$ for each data point $(X, Y)$, where $\tilde \alpha$ is the parameter of the robust quantile defined in (\ref{eq:robust_quantile_x}).
Setting $\tilde \alpha = 0$ as an extreme case makes aPRCP handle all perturbations (i.e., worst-case), similar to RSCP \citep{gendler2022adversarially}.
% as opposed to two other extreme cases: standard CP that cannot handle any perturbations, $(1-\rho)$=0 and adversarially robust CP (ARCP) that can handle all perturbations, $(1-\rho)$=1. 
We theoretically and empirically analyze aPRCP to demonstrate improved trade-offs between nominal performance (evaluation on clean inputs) and robust performance (evaluation on perturbed inputs). 
Figure \ref{fig:illustration} conceptually illustrates the PRCP problem setting.% and Figure \ref{All_ARCP_PRCP_mainPaper1} demonstrates how PRCP with $\tilde{\alpha}$=0.05 produces similar prediction set sizes on both clean and adversarial data, and performs better than both vanilla CP and adversarially robust CP algorithm, RSCP \citep{gendler2022adversarially}. 


% \section{Notations}
% \label{section:problem_setup}

% Before we proceed to the technical part of our proposed algorithm, we first introduce some useful notations.
% Let $(X, Y)$ be a data sample where $X \in \calX$ is the sample feature and $Y \in \calY$ from a feature space $\calX$ and label space $\calY$.
% Let $\epsilon$ denote the additive random noise from $\calE_\delta = \{ \epsilon \in \R^d : \| \epsilon \| \leq \delta \}$ that is independently from data $(X, Y)$.
% Denote $\calP_{X, Y}$ and $\calP_\epsilon$ the underlying distribution of $(X, Y)$ and $\epsilon$, respectively.
% We also define $Z = (X, Y, \epsilon)$ as the joint random variable and noisy feature $\widetilde X = X + \epsilon$ for the notion simplicity.
% Let $S(X, Y)$ denote the non-conformity score function on $(X, Y)$.
% For a sample $(X_i, Y_i)$ from a calibration set, we use $S_i = S(X_i, Y_i)$ as a shorthand notation of its non-conformity score.
% Let
% $Q(\alpha) := \min \{ t : \P_{X, Y} \{ S(X, Y) \leq t \} \geq 1 - \alpha \}$
% be the true quantile of the score function value for $(X, Y)$.
% In the context of conformal prediction, on a calibration set $\calD_\cal = \{(X_i, Y_i)\}_{i=1}^n$ with $n$ exchangeably drawn random samples from the underlying distribution $\calP_{X, Y}$, we denote $Q(\alpha; \calD_\cal) = S_{(\lceil (1-\alpha) n \rceil)}$ by the ($1-\alpha$)-quantile derived from $\calD_\cal$ where we treat $Q(\alpha) = Q(\alpha; \calD_\cal)$, since they are both defined on random variables.
% Denote
% $\hat Q_n(\alpha) := \min \{ t : \sum_{i=1}^n \indicator [ S(X_i, Y_i) \leq t ] \geq n ( 1 - \alpha ) \} = S_{ ( \lceil (1-\alpha) n \rceil ) }$ as the empirical


\section{Robust Conformal Prediction}
\label{section:RCP}

This section describes our proposed adaptive probabilistically robust conformal prediction (aPRCP) algorithm. First, we introduce the notion of adversarially robust coverage and extend it to  probabilistically robust coverage. Next, we motivate the significance of aPRCP algorithm and study the theoretical connection between aPRCP and adversarially robust CP setting \citep{gendler2022adversarially} in terms of probabilistically robust coverage and prediction set size. Finally, we analyze the gap between empirical and population level quantiles in terms of the number of data samples.




\subsection{Probabilistically Robust Coverage}


This section extends the notion of the inflation condition on the conformity score function from the worst-case adversarial robustness setting to the more general probabilistic robustness setting. We start with the following definitions, which were originally introduced for the ARCP setting \citep{gendler2022adversarially} and capture the inflation property of the score function used to derive adversarial robustness.


\begin{definition}
\label{definition:adv_robust_coverage}
(Adversarially robust coverage)
A prediction set $\calC(\widetilde X)$ provides ($1-\alpha$)-adversarially robust coverage if for a desired coverage probability $1-\alpha \in (0,1)$:
\begin{align}
\label{eq:adv_robust_coverage}
% \P_{X, Y} \{ S(X+\epsilon, Y) \leq \tau, ~~\forall \epsilon \in \calE_\rho \} \geq 1 - \alpha .
\P_{X, Y} \{ Y \in \calC(\widetilde X = X + \epsilon), \forall \epsilon \in \calE_r \} \geq 1 - \alpha .
\end{align}
\end{definition}




\begin{definition}
\label{definition:adv_inflated_score}
($M_r$-adversarially inflated score function)
$S: \calX \times \calY \rightarrow \R$ is an $M_r$-adversarially inflated score function if the following inequality holds:
\begin{align}
\label{eq:adv_inflated_score}
&
S(X + \epsilon, Y) 
\leq 
S(X, Y) + M_r,
\nonumber\\
&
\qquad \qquad \qquad \qquad  
\forall X \in \calX, Y \in \calY \text{ and } \epsilon \in \calE_r .
\end{align}
\end{definition}



The strategy of RSCP algorithm \citep{gendler2022adversarially} for the ARCP setting is to directly add an inflated quantity $M_r$ to the quantile determined from the clean data $(X, Y)$, 
\begin{align}
\label{eq:ar_threshold}
\tau^\AR(\alpha) := Q(\alpha) + M_r,
\end{align}
and construct a prediction set $\calC^\AR(\widetilde X) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\AR(\alpha) \}$ for the perturbed input $\widetilde X = X + \epsilon$.
To this end, since $Q(\alpha)$ provides $(1-\alpha)$ marginal coverage on clean data $(X, Y)$, $\tau^\AR(\alpha)$ thus guarantees $(1-\alpha)$-adversarially robust coverage on adversarial data $(X + \epsilon, Y)$.
% In addition, since ARCP achieves adversarially robust coverage, it is invariant to the underlying distribution $\calP_\epsilon$.
This result is summarized in the following proposition.
\begin{proposition}
\label{proposition:AR_coverage_ARCP}
(Adversarially robust coverage of RSCP, Theorem 1 in \citet{gendler2022adversarially})
Assume the score function $S$ is $M_r$-adversarially inflated.
Let $\calC^\AR(\widetilde X) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\AR(\alpha) \}$ be the prediction set for a testing sample $\widetilde X$.
Then RSCP achieves ($1-\alpha$)-adversarially robust coverage.
\end{proposition}






Now we extend the notion of adversarially robust coverage to the more general and relaxed condition, i.e., probabilistically robust coverage, by introducing the definition below.
\begin{definition}
\label{definition:prob_robust_coverage}
(Probabilistically robust coverage)
A prediction set $\calC(\widetilde X)$ provides ($1-\alpha$)-probabilistically robust coverage if for a desired coverage probability $1-\alpha \in (0, 1)$:
\begin{align}
\label{eq:prob_robust_coverage}
% \P_{X, Y, \epsilon} \{ S(X+\epsilon, Y) \leq \tau \} \geq 1 - \alpha
\P_{X, Y, \epsilon} \{ Y \in \calC(\widetilde X = X + \epsilon) \} \geq 1 - \alpha .
\end{align}
% Nested probabilistically robust coverage:
% \begin{align*}
% \P_{X, Y} \{ \P_{\epsilon | X, Y} \{S(X+\epsilon, y) \leq \tau \} \geq 1 - \beta \} \geq 1 - \alpha
% \end{align*}
\end{definition}


We highlight that the key difference between adversarially robust coverage (Definition \ref{definition:adv_robust_coverage}) and probabilistically robust coverage (Definition \ref{definition:prob_robust_coverage}) is whether the distribution of the perturbation $\epsilon$ is involved in the comparison with the target probability $1-\alpha$:
probabilistically robust coverage goes through the joint distribution involving $\epsilon$, i.e., $\P_{X, Y, \epsilon}\{ \cdot \}$ in (\ref{eq:prob_robust_coverage}) instead of $\P_{X, Y}\{ \cdot, \forall \epsilon \in \calE_r \}$ in (\ref{eq:adv_robust_coverage}).
Based on this understanding, we can see that a conformal prediction method can achieve ($1-\alpha$)-probabilistically robust coverage if it can satisfy ($1-\alpha$)-adversarially robust coverage.
For the same target probability $(1-\alpha)$, adversarially robust coverage is more difficult to achieve than probabilistically robust coverage. Hence, the notion of probabilistic robustness for CP is more general and relaxed.


Naturally, we now extend the definition of the uniform inflated score function (Definition \ref{definition:adv_inflated_score}) to the following one.
\begin{definition}
\label{definition:prob_inflated_score}
($M_{r, \eta}$-probabilistically inflated score function)
$S : \calX \times \calY \rightarrow \R$ is an $M_{r, \eta}$-probabilistically inflated score function if the following inequality holds for $\eta \in [0, \alpha]$:
\begin{align}
\label{eq:prob_inflated_score}
\P_Z \big\{ 
S(X + \epsilon, Y) 
\leq 
S(X, Y) + M_{r, \eta}
\big\} 
\geq 
1 - \eta .
\end{align}
\end{definition}


The above definition regarding the inflation of the score function is general and includes (\ref{eq:adv_inflated_score}) given in Definition \ref{definition:adv_inflated_score} as a special case:
By simply setting $\eta = 0$, we get $\P_Z\{ S(X + \epsilon, Y) \leq S(X, Y) + M_{r, 0} \} \geq 1$, i.e., $M_{r, 0} = M_r$.
% This equivalence is useful to understand the connection between ARCP and inflated PRCP.
Again, we highlight that the above condition involves the joint distribution on $Z$, as in Definition \ref{definition:prob_robust_coverage}.




Based on the extension from adversarial to probabilistic robustness setting, it is easy to develop a similar principle on the {\it inflated} score function to derive probabilistically robust coverage, which we refer to as inflated probabilistically robust conformal prediction (\texttt{iPRCP}).
To this end, let
$$\tau^\iPR(\alpha; \eta) := Q(\alpha^*_\iPR) + M_{r, \eta},$$ 
where $\alpha^*_\iPR = 1 - ( 1 - \alpha ) / ( 1 - \eta )$.
$\tau^\iPR(\alpha; \eta)$ is the threshold determined by iPRCP that treats $\eta$ from probabilistically inflated score function as a hyper-parameter.
We use $\alpha^*_\iPR$ as the probability for deriving the quantile on clean data, as (\ref{eq:ar_threshold}) in ARCP.


\begin{proposition}
\label{proposition:PR_coverage_iPRCP}
(Probabilistically robust coverage of iPRCP)
Assume the score function $S$ is $M_{r, \eta}$-probabilistically inflated.
Let $\calC^\iPR(\widetilde X) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\iPR(\alpha; \eta) \}$ be the prediction set for a testing sample $\widetilde X=X+\epsilon$. 
Then iPRCP achieves ($1-\alpha$)-probabilistically robust coverage.
\end{proposition}



This result shows that we can guarantee the ($1-\alpha$)-probabilistically robust coverage if we use $\tau^\iPR(\alpha; \eta)$ to construct the prediction set $\calC^\iPR$.
While the idea is simple and follows the inflation quantile used in the ARCP setting, it implies that we {\it have to know $M_{r, \eta}$}, the inflated quantity on the clean quantile. This requires us to know the score function very well. Otherwise, we have to design a score function that satisfies the desired condition,
similar to how the randomly smoothed score function was designed by RSCP algorithm to work for the ARCP setting \citep{gendler2022adversarially}.
It was carefully designed to offer a uniform Lipschitz continuity with the requirement of an additional set of Gaussian random samples. This design may introduce additional restrictions, since extra samples are required every time the score function is applied, including each calibration and testing sample. Therefore, we would like to address the following question: {\em Can we design an adaptive algorithm to fit the underlying distribution without any prior knowledge or special design of the score function?}





\begin{algorithm}[t]

    \caption{adaptive PRCP (\texttt{aPRCP}) } %\yanred{(aligned to $1-\alpha$)} }
    \label{alg:aPRCP}
    \begin{algorithmic}[1]
    
    \STATE \textbf{Input}: target probability $\alpha \in (0, 1)$; the hyper-parameter $s$; set $\tilde \alpha = 1- \frac{1-\alpha}{1-\alpha+s}$;
    split data into disjoint training set $\calD_\tr$ and calibration set $\calD_\cal$ with $|\calD_\cal| = n$.
    
    % A test point $X_{n+1}$ for which we need to predict a set, error rate $\alpha \in (0, 1)$, localized hyperparameter $\lambda_L \in \mathbb{R}^+$, regularized hyperparameter $\lambda_R \in [0, 1].$
    % \STATE Randomly split the $\mathcal{D}$ into $\mathcal{D}_{tr}$ and $\mathcal{D}_{cal}$.
    
    % \STATE If predictor $F_\theta$ is not given, train a prediction model $F_\theta$ on the training set $\calD_\tr$.
    
    \STATE Train a classifier $F_\theta$ on $\calD_\tr$.
    
    %\STATE Train a Temperature scaling parameter $ \tau $ using $ \mathcal{D}_{temp}$.
    %\STATE Find a hyperparameter $K_{reg}$ using $\mathcal{D}_{hyper}$ from Equation []. 
    
    \STATE Draw $\epsilon_{ij} \sim \calP_\epsilon$ where $i \in \{1,\cdots, n\}$ and $j \in \{1,\cdots, m\}$ denote the indices of data $(X_i, Y_i)$ and its $m$ perturbations.
    
    \STATE Compute scores: $S_{ij} = S(X_i + \epsilon_{ij}, Y_i)$, $\forall i, j$.
    
    \STATE Compute empirical robust quantiles: \\
    $\widehat Q^\rob_i = \widehat Q^\rob(X_i, Y_i; \tilde \alpha) = Q(\tilde \alpha, \{S_{ij}\}_{j=1}^m)$ via (\ref{eq:robust_quantile_x}), $\forall i$.
    \label{algorithm:line:empirical_robust_quantile}

    \STATE Determine threshold $\tau^\aPR(\alpha; s) = \widehat Q^\rob_{ ( \lceil (n+1) ( 1 - \alpha + s ) \rceil ) }$ from empirical robust quantiles according to (\ref{eq:aPRCP_threshold}).
    \label{algorithm:line:threshold_aPRCP}
    
    \STATE Receive $\widetilde{X}_{n+1}$ and construct prediction set:\\
    ~~~ $\calC(\widetilde{X}_{n+1}) = \{ y \in \mathcal{Y}: S(\widetilde{X}_{n+1}, y) \leq \tau^\aPR(\alpha; s) \}$.
    \end{algorithmic}
\end{algorithm}


\subsection{Adaptive PRCP Algorithm}
\label{subsection:aPRCP}





This section presents our adaptive algorithm for achieving probabilistically robust coverage (aPRCP).
We summarize it in Algorithm \ref{alg:aPRCP} and elaborate it below.
First, we define the $(1 - \tilde \alpha)$-{\it robust quantile} for a given $X$ as follows
\begin{align}
\label{eq:robust_quantile_x}
&
Q^\rob(X, Y; \tilde \alpha)
\nonumber\\
& \qquad \quad
:=
\min\{ t : \P_\epsilon \{ S(\widetilde X, Y) \leq t \} \geq 1 - \tilde \alpha \}.
\end{align}
Given $(X, Y)$ and $\tilde \alpha$, $Q^\rob(X, Y; \tilde \alpha)$ returns the quantile from all randomly perturbed $\widetilde X=X + \epsilon$ over $\epsilon \in \calE_r$.
It acquires the inflated quantity from a local region of $X$ as $\tilde \alpha$ indicates how conservative this inflation can be.
We denote the empirical robust quantile (in Line \ref{algorithm:line:empirical_robust_quantile} of Algorithm \ref{alg:aPRCP}) by $\widehat Q^\rob$.


Next, we define the threshold of the proposed adaptive PRCP (\texttt{aPRCP}) for a hyper-parameter $s \in [0, \alpha]$ as follows.
\begin{align}
\label{eq:aPRCP_threshold}
% \alpha^\aPR(\alpha; s)
&
\tau^\aPR(\alpha; s)
= 
\min\{ t : 
\nonumber\\
& \qquad 
\P_{X, Y} \{ Q^\rob(X, Y; \alpha^*_\aPR) \leq t \} \geq 1 - \alpha + s \} ,
\end{align}
where
$\alpha^*_\aPR
=
1 - ( 1 - \alpha ) / ( 1 - \alpha + s )$ is a conservativeness parameter for the robust quantile in (\ref{eq:robust_quantile_x}) that depends on the target probability $\alpha$ and the hyper-parameter $s$.
In practice, the empirical threshold $\widehat \tau^\aPR = \widehat Q^\rob_{( \lceil (n+1) ( 1 - \alpha + s ) \rceil ) }$ is selected from empirical robust quantiles $\{\widehat Q^\rob_i\}_{i=1}^n$ (in Line \ref{algorithm:line:threshold_aPRCP} of Algorithm \ref{alg:aPRCP}).
Our aPRCP algorithm is adaptive since it finds $\alpha^*_\aPR$ that is adaptive to the underlying distribution of $(X, Y)$ as long as $\alpha$ and $s$ are fixed a~priori.
The following formal result guarantees the probabilistically robust coverage for the aPRCP algorithm.
\begin{theorem}
\label{theorem:prob_robust_coverage_aPRCP}
(Probabilistically robust coverage of aPRCP)
Let $\calC^\aPR(\widetilde X = X + \epsilon) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\aPR(\alpha; s) \}$ be the prediction set for a testing sample $\widetilde X$.
Then aPRCP achieves ($1-\alpha$)-probabilistically robust coverage.
% \begin{align*}
% \P_Z \{ S(X+\epsilon) \leq \tau^\aPR(\alpha; s, \alpha^*_\aPR) \} 
% \geq 
% 1 - \alpha .
% \end{align*}
\end{theorem}

\begin{remark}
In fact, $\tau^\aPR(\alpha; s)$ is the ($1-\alpha+s$)-th quantile (going through $(X, Y)$) of the ($1-\alpha^*_\aPR$)-robust quantiles (going through $\epsilon$).
One benefit of aPRCP is the transfer of the inflation from the score function to the specified probability (i.e., an $s$ increase in probability). Therefore, it is not required to have prior knowledge of either $M_r$ as in ARCP or $M_{r, \eta}$ as in iPRCP.
Instead, aPRCP requires finding a feasible and a good value for $\alpha^*_\aPR$ by treating $s$ as a hyper-parameter, though it inflates the specified probability, i.e., $1-\alpha+s \geq 1 - \alpha$, and $1-\alpha^*_\aPR \geq 1 - \alpha$.
\end{remark}


\begin{theorem}
\label{theorem:prob_robust_coverage_aPRCP_cross_domain_noise}
(Probabilistically robust coverage of aPRCP for cross-domain noise)
Let $\calP_\epsilon^{test}$ and $\calP_\epsilon^{cal}$ denote different distributions of $\epsilon$ during the testing and calibration phases, respectively.
Assume $\P_{\epsilon \sim \calP_\epsilon^{cal}}\{\epsilon\} - \P_{\epsilon \sim \calP_\epsilon^{test}}\{\epsilon\} \leq d$ for all $\| \epsilon \| \leq r$.
Set $\alpha^*_\aPR = 1 - d - ( 1 - \alpha) / (1 - \alpha + s )$ in (\ref{eq:aPRCP_threshold}).
Let $\calC^\aPR(\widetilde X = X + \epsilon) = \{ y \in \calY : S(\widetilde X, y) \leq \tau^\aPR(\alpha; s) \}$ be the prediction set for a testing sample $\widetilde X$.
Then aPRCP achieves ($1-\alpha$)-probabilistically robust coverage.
\end{theorem}

\begin{remark}
The key assumption we make is $\P_{\epsilon \sim \calP_\epsilon^{cal}}\{\epsilon\} - \P_{\epsilon \sim \calP_\epsilon^{test}}\{\epsilon\} \leq d$, which is analogous to $L^1$-distance used in the domain adaptation literature \citep{redko2020survey,ben2006analysis}.
One can interpret it as the maximal gap in probability density between the calibration and testing distributions for any fixed $\epsilon$.
As per our analysis, when this gap can be bounded by a sufficiently small constant $d$, with an inflated nominal coverage in the robust quantile (i.e., setting $\alpha^*_\aPR = 1 - d - (1-\alpha)/(1-\alpha+s)$ in (\ref{eq:aPRCP_threshold})), we can guarantee probabilistically robust coverage for aPRCP.
\end{remark}


\subsection{ Connection Between ARCP and PRCP }
\label{subsection:connection}

Although ARCP algorithm can achieve adversarially robust coverage, we can still connect ARCP and PRCP in the sense of {\it probabilistically robust coverage} and understand their performance in terms of {\it efficiency}. Recall that efficiency of conformal prediction algorithms refers to the measured size of prediction sets for testing samples when some desired coverage is achieved.
For example, for the same target probability $1-\alpha$, a smaller threshold indicates better efficiency. The following result shows the possibly improved efficiency of iPRCP and aPRCP compared to ARCP once their hyper-parameters are tuned properly (i.e., $\eta$ for iPRCP and $s$ for aPRCP).

\begin{corollary}
\label{corollary:compare_aPRCP_ARCP}
To achieve the same ($1-\alpha$)-probabilistically robust coverage on $Z$, the following inequalities hold: \begin{align*}
\min_{ \eta \in [0, \alpha] } \tau^\iPR(\alpha; \eta) \leq \tau^\AR(\alpha), ~~
\min_{ s \in [0, \alpha] }  \tau^\aPR(\alpha; s) \leq \tau^\AR(\alpha) .
\end{align*}
\end{corollary}
When all three algorithms achieve ($1-\alpha$)-probabilistically robust coverage,
the ones with smaller thresholds, i.e., iPRCP and aPRCP, yield better efficiency.
The idea of the above result is to particularly set $\eta=0$ and $s=0$, which makes iPRCP and aPRCP degenerate to ARCP, resulting in the same threshold.
For aPRCP with $s=0$, we have $\alpha^*_\aPR=0$, i.e., $1$-robust quantile %(adversarial quantile) 
for each $(X, Y)$ used, which recovers ARCP.
% Then it is easy to see that iPRCP and aPRCP with tuned parameter values can be potentially more efficient than ARCP in terms of achieving probabilistically robust coverage.





\subsection{Approximation Error of Empirical Quantiles}

In the above sections, we presented algorithms and their analysis directly in the population sense, including the true quantiles $Q(\alpha)$ and $Q^\rob(X, Y; \tilde \alpha)$. However, when executing a given conformal prediction method on exchangeable samples $\calD_\cal$, we employ empirical quantiles in practice. To close this gap between theory and practice, we additionally discuss the concentration inequalities for empirical approximation to these quantities (i.e., the gap between empirical and true quantiles) as a function of the number of samples.

\begin{proposition}
\label{proposition:empirical_quantile_concentration}
(Concentration inequality for quantiles)
Let $Q(\alpha) = \min\{ t : \P_V\{ V \leq t \} \geq 1 - \alpha \}$ be the true quantile of a random variable $V$ given $\alpha$,
and $\widehat Q_n(\alpha) = V_{ ( \lceil (n+1) ( 1 - \alpha ) \rceil ) }$ be the empirical quantile estimated from a set of $n$ random samples $\{V_1, \dots, V_n\}$.
Then with probability at least $1-\delta$, we have
$
% | \sum_{i=1}^n Z_i - p n | \geq \frac{ \log(2 / \delta) }{ \sqrt{n} }  \
% | \widehat Q_n(\alpha) - Q(\alpha) |
% \leq 
% \tilde O( 1 / \sqrt{n} ), 
\widehat Q_n(\alpha + \tilde O(1/\sqrt{n}))
\leq
Q(\alpha)
\leq
\widehat Q_n(\alpha - \tilde O(1/\sqrt{n}))
$   
where $\tilde O$ hides the logarithmic factor.
\end{proposition}

The above result shows that more data samples from the underlying distribution for $(X, Y)$ or $\epsilon$ will help in improving the approximation of empirical quantiles on score function $S$ at a rate of $\tilde O(1/\sqrt{n})$, where $n$ is the number of samples.
Note that we only use this proposition to fill the gap between empirical and true quantiles.
Some prior work also studied similar concentration results \citep{vovk2012conditional}.









%\section{Experiments and Results}
\section{Experiments and Results}

In this section, we present the empirical evaluation of our proposed aPRCP algorithm along different dimensions. %, and compare with relevant state-of-the-art CP methods.


% We present the experimental evaluation of PRCP and the effect of its probabilistic parameter $\rho$ over the existing work, namely Vanilla CP  \citep{vovk2005algorithmic} and RSCP  \citep{gendler2022adversarially} to demonstrate the effectiveness of PRCP in achieving better robust coverage and improved trade-offs between nominal and robustness performance. We denote PRCP when $p=0$ as ARCP, which the special-case algorithm that is evaluated against the worst-case scenario.


\begin{figure*}[!h]
    \centering
    \begin{minipage}{.92\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=.35\linewidth]{MainPaper/legend.png}
        \end{minipage}     
        \begin{minipage}{.33\linewidth}
            \centering
            (a) CIFAR10
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            (b) CIFAR100
        \end{minipage} 
        \begin{minipage}{.33\linewidth}
            \centering
            (c) ImageNet
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR10PRCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100PRCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_ImageNetPRCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR10PRCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100PRCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_ImageNetPRCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}
    \end{minipage}
    \caption{Probabilistic robust coverage (top) and prediction set size (bottom) constructed by \texttt{Vanilla CP}, \texttt{RSCP}, and \texttt{aPRCP}($\tilde{\alpha} = 0.1$) using HPS and APS scoring functions (target coverage is $90\%$). Results are reported over 50 runs.}
    \label{All_PRCP_mainPaper1}
\end{figure*}

\subsection{Experimental Setup}

\noindent {\bf Classification Datasets.} We consider three benchmark datasets for evaluation: CIFAR10 \citep{krizhevsky2009learning}, CIFAR100  \citep{krizhevsky2009learning}, and ImageNet  \citep{deng2009imagenet} using the standard training and test split. 
%We employ the same methodology as  \citep{angelopoulos2021uncertainty} to create calibration data and validation data for tuning hyper-parameters.

\noindent {\bf Deep Neural Network Models.} We consider ResNet-110  \citep{he2016deep} as the main model architecture for CIFAR10 and CIFAR100 and ResNet-50 for ImageNet in our experiments. We provide results on additional deep neural networks in the Appendix due to space constraints noting that we find similar patterns.
We train each model using two different approaches : {\em 1) Standard training:} The training is only performed using clean training examples; and {\em 2) Gaussian augmented training:} The training procedure employs Gaussian augmented examples  \citep{gendler2022adversarially} parameterized by a given standard deviation $\sigma=0.125$. 
%for CIFAR10 and CIFAR100 datasets and $\sigma$=0.25 for the ImageNet dataset. %We provide details of the hyper-parameters used in the {\bf Appendix}.

\noindent {\bf Methods and Baselines.} We consider two relevant state-of-the-art CP algorithms as our baselines. First, we employ \texttt{Vanilla CP} \citep{NEURIPS2020_244edd7e} designed for clean input examples. Second, we use randomly smoothed conformal prediction (\texttt{RSCP})  \citep{gendler2022adversarially} which is designed to handle worst-case adversarial examples. We employ the publicly available implementations of \texttt{Vanilla CP}\footnote{\url{https://github.com/msesia/arc}} and \texttt{RSCP}\footnote{\url{https://github.com/Asafgendler/RSCP}} using the best settings suggested by their authors. 

We consider different configurations of our proposed adaptive probabilistically robust CP (\texttt{aPRCP}) algorithm. \texttt{aPRCP}(worst-adv) refers to the configuration where the evaluation of \texttt{aPRCP} is performed over adversarial examples generated using an adversarial attack algorithm. \texttt{aPRCP}($\tilde{\alpha}$) refers to the configuration where the evaluation is performed over noisy examples with a bounded perturbation on the test data. We provide additional results using different values for $\tilde{\alpha}$ in the Appendix.
%For calibration purpose, we uniformly sample $m_s=128$ perturbed examples within a distance $r_1 = 8$ for ImageNet and $r_1 = 6$ for other datasets from each calibration input noting that we also present ablation analysis by varying $m_s$ in the {\bf Appendix}. We tune the hyper-parameter $s$ for PRCP algortihm using the validation data. 

% We present the experimental evaluation of PRCP and the effect of its probabilistic parameter $\rho$ over the existing work, namely Vanilla CP  \citep{vovk2005algorithmic} and RSCP  \citep{gendler2022adversarially} to demonstrate the effectiveness of PRCP in achieving better robust coverage and improved trade-offs between nominal and robustness performance. We denote PRCP when $p=0$ as ARCP, which the special-case algorithm that is evaluated against the worst-case scenario.
\noindent{\bf Adversarial Attack Algorithms.}
To generate adversarial examples, we employ the white-box \texttt{PGD} attack algorithm  \citep{gendler2022adversarially} to evaluate \texttt{Vanilla CP} algorithm. For \texttt{RSCP} and \texttt{aPRCP(worst-adv)}, we employ an adapted \texttt{PGD} algorithm for smoothed classifiers as proposed in \citet{salman2019provably}. We provide additional results using different adversarial algorithms in the Appendix.

% We also generate test data using black-box adversarial attack algorithm \citep{blackBox}. Instead of searching an optimal adversarial example for an input, this method finds a probability density distribution centered around a small region of the input. Any data falling to that region is more likely an adversarial example.
% We use adversarial perturbation of energy of $r = 0.125$ for CIFAR10 and CIFAR100 datasets, and for ImageNet dataset the energy is $r = 0.25$.

\noindent {\bf Evaluation Methodology.}  We present all our experimental results for desired coverage as $(1-\alpha)$=90\%. %We evaluate all methods in terms of two metrics computed on any given testing set: {\em Coverage} (fraction of testing examples for which prediction set contains the ground-truth output) and {\em Efficiency} (average length of cardinality of prediction set, small values mean high efficiency). 
We report the average metrics (coverage and prediction set size) over 50 different runs for all datasets. We consider two different evaluation settings at the inference time as described below.

% (a) \textbf{Clean data evaluation:} We only employ the clean testing examples to measure coverage and prediction set size. This evaluation is referred as the nominal performance. %of a given CP method.


(a) \textbf{Probabilistic robustness evaluation}: We randomly sample $n_s = 128$ examples for each clean testing input: $X^{j}=X+\epsilon_j$ ($j = 1, \dots, n_s$), where $\|\epsilon_j\|_2 \leq r = 0.125$ for the CIFAR data and $\|\epsilon_j\|_2 \leq r = 0.25$ for the ImageNet data.  For a better span during the sampling procedure for each clean testing input, we sample two perturbations $\epsilon_j$ for each radius $r^{(k)}$ in $0<r^{(1)}<\cdots<r^{(K)}\le r$ such that $\|\epsilon_j\|_2=r^{(k)}$. 

We define both coverage and prediction set size metrics to adapt to the probabilistic robustness setting as follows:
{\em Coverage}: fraction of examples for which the prediction set contains the ground-truth output.
\begin{equation}
\small
    \text{\normalsize Coverage} = \frac{1}{n_s} \sum_{j=1}^{n_s} \mathbbm{1}[Y_{n+1} \in \tilde{C}(X_{n+1} + \epsilon_j)].
    \label{eq:cvg_prob}
\end{equation}

{\em Efficiency}: average prediction set size, small values mean high efficiency.
\begin{equation}
    \text{Prediction Set Size} = \frac{1}{n_s}\sum_{j=1}^{n_s}\lvert\tilde{C}(X_{n+1}+\epsilon_j)\rvert,
    \label{eq:set_prob}
\end{equation}
where $\|\epsilon_j\|_2 \leq r = 0.125$ for the CIFAR datasets, and $\|\epsilon_j\|_2 \leq r = 0.25$ for the ImageNet dataset. These re-defined metrics allow us to evaluate \texttt{aPRCP}($\tilde \alpha$) with different values of the probability parameter $\tilde{\alpha}$ for probabilistic robustness. We provide additional results explaining the impact of the choice of the sampling distributions in the Appendix.


(b) \textbf{Worst-case evaluation:} We employ adversarial attack algorithms as mentioned above to create one worst-case adversarial example ($\tilde{X}$) for each clean testing input ($X$). We define both metrics for this setting as follows:
\begin{equation}
    \text{Coverage} = \mathbbm{1}[Y_{n+1} \in \tilde{C}(\tilde{X}_{n+1})].
    \label{eq:cvg}
\end{equation}
\begin{equation}
    \text{Prediction Set Size} = \lvert\tilde{C}(\tilde{X}_{n+1})\rvert.
    \label{eq:set}
\end{equation}



%For this evaluation, we sample uniformly $m_s =64$ examples based on each calibration point and $n_s = 128$ examples based on each test input. All $X^{j}=X+\epsilon_j$ test samples are within a distance $r_2=6$ of the original example $X$. For a better span during the sampling procedure for test examples, we sample $2$ perturbations $\epsilon_j$ for each $r^{(k)}$ in $0<r^{(1)}<\cdots<r^{(k)}\le r_2$ such that $\|\epsilon_j\|=r^{(k)}$. 

% We re-define both coverage and prediction set size metrics to adapt to the probabilistic setting as follows:
% \begin{equation}
% \begin{split}
%     \text{Coverage} = \mathbbm{1}\bigg[ \bigg[\frac{1}{n_s} & \sum_{j=1}^{n_s} \mathbbm{1}[Y_{n+1} \in \tilde{C}(X_{n+1} + \epsilon_j)] \bigg] \\ & \geq 1- \tilde{\alpha}
%     \bigg]
% \end{split}
%     \label{eq:cvg_prob}
% \end{equation}


    

\begin{figure*}[!h]
    \centering
    \begin{minipage}{.92\linewidth}
        \begin{minipage}{\linewidth}
            \centering
            \includegraphics[width=.35\linewidth]{MainPaper/legend.png}
        \end{minipage}     
        \begin{minipage}{.33\linewidth}
            \centering
            (a) CIFAR10
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            (b) CIFAR100
        \end{minipage} 
        \begin{minipage}{.33\linewidth}
            \centering
            (c) ImageNet
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR10ARCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%% 
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_ImageNetARCP_fixed_ns_Coverage_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.33\linewidth}
            \centering
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR10ARCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}    
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_CIFAR100ARCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}%%
        \hfill
        \begin{minipage}{.33\linewidth}
            \includegraphics[width=\linewidth]{MainPaper/dataset_ImageNetARCP_fixed_ns_Size_APS_HPS.png}
        \end{minipage}
    \end{minipage}
    \caption{Adversarially robust coverage (top) and prediction set size (bottom) constructed by Vanilla CP, RSCP, and aPRCP(worst-adv) using HPS and APS scoring functions (target coverage is $90\%$). Results are reported over 50 runs.}
    \label{All_ARCP_mainPaper1}
\end{figure*}

\subsection{Results and Discussion}

{\bf Probabilistic Robust Coverage Performance.} 
Figure~\ref{All_PRCP_mainPaper1} shows the probabilistic robustness performance (in terms of coverage and prediction set size) obtained by \texttt{Vanilla CP}, \texttt{RSCP}, and \texttt{aPRCP}($\tilde{\alpha} = 0.1$) for all three datasets using standard training. We make the following observations. 1) The \texttt{Vanilla CP} algorithm fails to achieve the target probabilistic robust coverage. 2) The \texttt{RSCP} algorithm achieves the desired probabilistic coverage, but its empirical coverage is significantly larger than 90\%. This yields very large prediction sets. Using APS, \texttt{RSCP} yields on average a prediction set of 30 labels for CIFAR100 and 60 for ImageNet. 3) \texttt{aPRCP}($\tilde{\alpha} = 0.1$) produces smaller prediction sets by keeping the actual coverage close to the target coverage. \texttt{aPRCP}($\tilde{\alpha} = 0.1$) reduces the prediction set size by an average of 20 labels for CIFAR100 and ImageNet compared to the \texttt{RSCP} method using either of the two non-conformity scores.



{\bf Adversarially Robust Coverage Performance.}
Figure~\ref{All_ARCP_mainPaper1} shows the robust coverage and prediction set size obtained by \texttt{Vanilla CP}, \texttt{RSCP}, and \texttt{aPRCP}(worst-adv) on the worst-case examples for three different datasets using Gaussian augmented training. We observe similar patterns as in the probabilistic robust coverage results. 1) \texttt{Vanilla CP} fails to achieve the target coverage empirically. For all datasets, it achieves empirical coverage lower than 80\%. 2) Similar to the probabilistic robustness results, the \texttt{RSCP} method achieves an empirical coverage larger than 95\% for all datasets, yielding significantly larger prediction sets for all datasets. 3) \texttt{aPRCP}(worst-adv) produces smaller prediction sets by keeping the actual coverage close to the target coverage (by a margin of 2\%) on worst-case adversarial examples. \texttt{aPRCP}(worst-adv) reduces the prediction set size by more than 10 labels for CIFAR100 and ImageNet compared to the \texttt{RSCP} method using either of the two non-conformity scores (HPS and APS).






\section{Related Work}
% \vspace{-0.1in}
\textbf{Conformal Prediction.} CP is a general framework for uncertainty quantification that provides marginal coverage guarantees without any assumptions on the underlying data distribution \citep{shafer2008tutorial}. CP can be used for regression \citep{vovk2018cross,lei2018distribution,romano2019conformalized,izbicki2019flexible,guan2019conformal,gupta2022nested,kivaranovic2020adaptive,barber2021predictive,foygel2021limits} to produce prediction intervals and for classification \citep{lei2013distribution,sadinle2019least,romano2020classification,angelopoulos2021uncertainty,NCP} to produce prediction sets. Prior work has also considered instantiations of the CP framework to handle differences between training and test distributions that are caused by long-term distribution shift \citep{gibbs2021adaptive}, covariate shift \citep{tibshirani2019conformal}, and label-distribution shift \citep{podkopaev2021distribution}. However, none of these existing works focuses on the robustness setting where the distributional shift is caused by a bounded adversarial perturbation. While using adversarial training seems intuitive to mitigate this problem, it was shown that vanilla CP cannot achieve the target coverage on adversarial data \citep{gendler2022adversarially}.

\textbf{Robust Conformal Prediction.} Developing CP methods that achieve robust coverage under natural or adversarial perturbations is a new line of research that requires theoretical and empirical analysis. % to improve our knowledge of CP. 
Very few works have proposed variants of CP to handle adversarially robust settings. The work on cautious deep learning \citep{hechtlinger2018cautious} proposed a CP-based prediction set construction that accounts for adversarial examples. However, this method does not provide any theoretical guarantees. Recently, randomly smoothed conformal prediction (RSCP) \citep{gendler2022adversarially} was proposed as a generalization for adversarial examples using randomized smoothing. This generalization is achieved by introducing a constant inflation condition that adjusts the CP quantile to adversarial perturbations. This adjustment is proportional to the potential adversarial perturbations that can affect the test data. Hence, RSCP is prone to producing large prediction sets along with high marginal coverage to achieve robustness.

We study the general setting of probabilistically robust CP and develop probably correct algorithms to achieve improved trade-offs for nominal and robust performance over vanilla CP and RSCP. The key differences between our work (aPRCP) and RSCP are: 1) aPRCP uses a {\em quantile-of-quantile} design and does not require finding a score inflation constant like RSCP. 2) RSCP requires the design of a specialized scoring function while aPRCP can employ any existing score function. 3) Unlike RSCP, aPRCP does not incur test-time overhead from generating samples.

\section{Summary and Future Work}

% \vspace{-0.1in}

% This paper studies the novel problem of adaptive probabilistic robustness for conformal prediction (aPRCP) based uncertainty quantification of deep classifiers. We develop the aPRCP algorithm and show its effectiveness to achieve
% probabilistic robust coverage against noisy samples drawn from a $l_2$-norm bounded ball and marginal coverage against adversarial examples generated from $l_2$ norm bounded ball. Our experiments show that aPRCP is most effective compared to other baseline methods. One future direction could be to use our method to improve probabilistic robustness and adversarial robustness in regression problems. The second direction could be to extend the assumption that the adversarial noise and sampling noise for probabilistic cases is $l_2$-norm bounded and consider some techniques that can handle $l_0$, $l_1$, and $l_{\infty}$ cases \citep{lee2019tight, teng2020ell_1}.

This paper studied the novel problem of probabilistic robustness for conformal prediction (PRCP) based uncertainty quantification of deep classifiers. We developed the adaptive PRCP (aPRCP) algorithm based on the principle of quantile-of-quantile design and theoretically analyzed its effectiveness to achieve improved trade-offs between performance on clean data and robustness to adversarial examples. Our experiments on multiple image datasets using deep classifiers demonstrated the effectiveness of aPRCP over vanilla CP methods and adversarially robust CP methods. Future work should study and analyze end-to-end PRCP algorithms.

% This paper studied a novel direction of Conformal Prediction algorithms for robust coverage.  Inputs are prone to be distorted by a bounded perturbation at test time. Such perturbation can originate from adversarial attacks or natural perturbations. We propose and study in this paper probabilistically robust conformal prediction (PRCP) which ensures robustness to most perturbations around each input examples. The resulting coverage exhibits a controlled trade-off between nominal and adversarial coverage. To achieve this goal, this paper proposes a theoretically-sound two-step calibration algorithm. This calibration adaptively computes a threshold on the data population and another on the bounded possible perturbations. The experimental results proved that the proposed algorithm is able to achieve robust coverage on different settings with various models. Future work includes exploring this trade-off for an end-to-end CP framework that can solve Empirical Risk Minimization problems while finding the most efficient prediction set.
% \input{files/Introduction}
% \input{files/conformal}
% \input{files/algorithms}
% \input{files/experiments}
% \input{files/Related_work}

\section*{Acknowledgements}

This research is supported in part by Proofpoint Inc. and the
AgAID AI Institute for Agriculture Decision Support, supported by the National Science Foundation and United States
Department of Agriculture - National Institute of Food and
Agriculture award \#2021-67021-35344. The authors would
like to thank the anonymous reviewers for their feedback
and suggestions, which helped improve the paper.

\newpage
% \bibliographystyle{unsrt}
% \bibliographystyle{plain}
%\begin{thebibliography}{}
\bibliography{reference}
%\end{thebibliography}




\end{document}

