\section{Appendix}

\subsection{Proofs}
\label{app:proofs}

\begin{lemma} \label{proof:lemma}
The local robustness of a multi-class linear model $f(\X) = \mathbf{w}^\top \X + b$ (with $\mathbf{w} \in \R^{d \times C}$ and $b \in \R^C$) at point $\X$ with respect to a target class $t$ is given by the following. Define weights $\U_i = \W_t - \W_i \in \R^d, \forall i \neq t$, where $\W_t, \W_i$ are columns of $\mathbf{w}$ and biases $c_i = {\U_i}^\top\X + (b_t - b_i) \in \R$. Then, 
\begin{align*}
    p^\text{robust}_\sigma(\X) = \cdf \left( \frac{c_i}{\sigma \| \U_i \|_2} \tensor \right)\\
    \mathrm{where}~~\matU = \frac{\U_i}{\| \U_i \|_2} \tensor \in \R^{(C-1) \times d}
\end{align*}
and $\cdf$ is the ($C-1$)-dimensional Normal CDF with zero mean and covariance $\matU \matU^\top$.
\end{lemma}

\begin{proof}
First, we rewrite \probust{} in the following manner, by defining $g_i(\X) = f_t(\X) - f_i(\X) > 0$, which is the ``decision boundary function''.

\begin{align*}
    p_\sigma^\text{robust} = P_{\epsilon \sim \mathcal{N}(0,\sigma^2)} \left[ \max_{i} f_i(\X + \epsilon) < f_t(\X + \epsilon) \right] \\= P_{\epsilon \sim \mathcal{N}(0,\sigma^2)} \left[ \bigcup_{i=1; i \neq t}^C g_i(\X + \epsilon) > 0 \right]
\end{align*}

Now, assuming that $f,g$ are linear such that $g_i(\X) = {\U_i}^\top \X + g_i(0)$, we have $g_i(\X + \epsilon) = g_i(\X) + {\U_i}^\top \epsilon$, and obtain

\begin{align}
p_\sigma^\text{robust} &= P_{\epsilon \sim \mathcal{N}(0,\sigma^2)}\left[ \bigcup_{i=1; i \neq t}^C {\U_i}^{\top}\epsilon > -g_i(\X) \right] \\
&= P_{z \sim \mathcal{N}(0,I_d)} \left[ \bigcup_{i=1; i \neq t}^C \frac{\U_i}{\| \U_i \|_2}^{\top}z > - \frac{g_i(\X)}{\sigma \| \U_i \|_2} \right] \label{appeqn:key_step}
\end{align}

This step simply involves rescaling and standardizing the Gaussian to be unit normal. We now make the following observations:
\begin{itemize}
    \item For any matrix $\matU \in \R^{(C-1) \times d}$ and a $d$-dimensional Gaussian random variable $z \sim \mathcal{N}(0, I_d) \in \R^d$, we have $\matU z \sim \mathcal{N}(0, \matU \matU^\top)$, i.e., a $(C-1)$-dimensional Gaussian random variable.
    \item CDF of a multivariate Gaussian RV is defined as $P_z [\bigcup_i z_i < t_i]$ for some input values $t_i$
\end{itemize}

Using these observations, we construct $\matU = \frac{\U_i}{\| \U_i \|_2} \tensor \in \R^{(C-1) \times d}$ and obtain

\begin{align*}
p^\text{robust}_{\sigma} &= P_{r \sim \mathcal{N}(0, \matU\matU^\top)} \left[ \bigcup_{i=1; i \neq t}^C r_i < \frac{g_i(\X)}{\sigma \| \U_i \|_2} \right] \\
&= \text{CDF}_{\mathcal{N}(0, \matU\matU^{\top})} \left( \frac{g_i(\X)}{\sigma \| \U_i \|_2} \tensor \right)
\end{align*}

where $g_i(\X) = {\U_i}^\top \X + g_i(0) = {(\W_t - \W_i)}^\top\X + (b_t - b_i)$

\end{proof}



\vspace{1cm}



\begin{lemma} (\textbf{Extension to non-Gaussian noise})
    For high-dimensional data ($d \rightarrow \infty$), Lemma \ref{estimator-linear-models} generalizes to any coordinate-wise independent noise distribution that satisfies Lyapunov's condition. 
\end{lemma} 

\begin{proof}
    Let $\epsilon \sim \mathcal{R}$ be sampled from some distribution $\mathcal{R}$. Applying Lyapunov's central limit theorem to Equation~\ref{appeqn:key_step} in the previous proof, we have $\frac{\U}{\sigma \| \U \|_2}^\top \epsilon = \sum_{j=1}^{d} \frac{\U_j}{\sigma\| \U \|_2} \epsilon_j ~~\substack{d\\\longrightarrow} ~~\mathcal{N}(0, 1)$, which holds as long as the terms of the sequence $\{\frac{\U_j}{\| \U \|_2} \epsilon_j\}$ are independent random variables and satisfy the Lyapunov condition. In particular, this implies that $\matU z \sim \mathcal{N}(0, \matU \matU^\top)$, and the proof proceeds similarly to the Gaussian case after this step.
\end{proof}


\vspace{1cm}

\begin{lemma} (\textbf{Extension to non-isotropic Gaussian}) Lemma \ref{estimator-linear-models} can be extended to the case of $\epsilon \sim \mathcal{N}(0, \mathcal{C})$ for an arbitrary positive definite covariance matrix $\mathcal{C}$:

\begin{align*}
    p^\text{robust}_\sigma(\X) = \Phi_{\matU \mathcal{C} \matU^\top} \left( \frac{c_i}{\| \U_i \|_2} \tensor \right)
\end{align*}
    
\end{lemma}

\begin{proof}
    We observe that the Gaussian random variable $\frac{\U_i}{\| \U_i \|_2}^\top \epsilon \vert_{\substack{i=1\\i \neq t}}^C = \matU \epsilon$ has mean zero as $\epsilon$ is mean zero. Computing its covariance matrix, we have $\E_\epsilon \left[ \matU \epsilon \epsilon^\top \matU^\top \right] = \matU \E_\epsilon (\epsilon \epsilon^\top) \matU^\top = \matU \mathcal{C} \matU^\top$. We use this result after equation \ref{appeqn:key_step} in the proof of Lemma \ref{estimator-linear-models}.
\end{proof}


\vspace{1cm}


\begin{thm}
    The \textbf{Taylor estimator} for the local robustness of a classifier $f$ at point $\X$ with respect to target class $t$ is given by linearizing $f$ around $\X$ using a first-order Taylor expansion, with decision boundaries $g_i(\X) = f_t(\X) - f_i(\X)$, $\forall i \neq t$, leading to
    \begin{align*}
        p^\text{taylor}_{\sigma}(\X) = \cdf \left( \frac{g_i(\X)}{\sigma \|\grad g_i(\X)\|_2} \tensor \right) 
    \end{align*}
    with $\matU$ and $\Phi$ defined as in the linear case.
\end{thm}

\begin{proof}
    Using the notations from the previous Lemma \ref{proof:lemma}, we can linearize $g(\X + \epsilon) \approx g(\X) + \grad g(\X)^\top \epsilon$ using a first order Taylor series expansion.
    Thus we use $\U_i = \grad g_i(\X)$ and $c_i = g_i(\X)$, and plug it into the result of Lemma \ref{proof:lemma}.
\end{proof}




\begin{thm} The \textbf{estimation error} of the Taylor estimator for a classifier with a quadratic decision boundary $g_i(\X) = \X^\top A_i \X + \U_i^\top \X + c_i$ for positive-definite $A_i$, is upper bounded by
    \begin{align*}
        | p^{robust}_{\sigma}(\X) - p^{taylor}_{\sigma}(\X) | \leq k \sigma^{C-1} \prod_{\substack{i=1\\i\neq t}}^{C} \frac{\lambda_{\max}^{A_i}}{\| \U_i \|_2}  
    \end{align*}
    for noise $\epsilon \sim \mathcal{N}(0, \sigma^2 / d)$, in the limit of $d \rightarrow \infty$. 
\end{thm} 

\begin{proof}
Without loss of generality, assume that $\X = 0$. For any other $\X_1 \neq 0$, we can simply perform a change of variables of the underlying function to center it at $\X_1$ to yield a different quadratic. We first write an expression for $p^{robust}_\sigma$ for the given quadratic classifier $g_i(\X)$ at $\X = 0$. 

\begin{align*}
    p^{robust}_\sigma(0) &= P_{\epsilon} \left( \bigcup_i g_i(\epsilon) > 0 \right) \\
                          &= P_{\epsilon} \left( \bigcup_i \U_i^\top \epsilon + c_i > - \epsilon^\top A_i \epsilon \right) 
\end{align*}

Similarly, computing $p^{taylor}_{\sigma}$, we have $\grad g_i(0) = \U_i^\top$ and $g_i(0) = c_i$, resulting in

\begin{align*}
    p^{taylor}_{\sigma}(0) &= P_{\epsilon}\left(\bigcup_i g^{taylor}_i(\epsilon) > 0\right) \\
    &= P_{\epsilon}\left(\bigcup_i \U_i^\top \epsilon + c_i > 0 \right)
\end{align*}

Subtracting the two, we have 

\begin{align*}
    &|p^{robust}_\sigma(0) - p^{taylor}_\sigma(0)| \\
    &= \left| P \left(\bigcup_i 0 >  \U_i^\top \epsilon + c_i > - \epsilon^\top A_i \epsilon \right) \right| \\
    &= \left| P \left(\bigcup_i 0 >  \frac{\U_i^\top \epsilon + c_i}{\sigma \| \U_i \|_2} > - \frac{\epsilon^\top A_i \epsilon}{\sigma \| \U_i \|_2} \right) \right| 
\end{align*}

For high-dimensional Gaussian noise $\epsilon \sim \mathcal{N}(0, \sigma^2 / d)$, with $d \rightarrow \infty$, we have that $\| \epsilon \|^2 = \sum_i \epsilon_i^2 \rightarrow \sigma^2$ from the law of large numbers. See \cite{vershynin2018high} for an extended discussion. Thus we have $\epsilon^\top A \epsilon \leq \lambda_{\max}^A \| \epsilon \|^2 = \lambda_{\max}^A \sigma^2$.

Also let $z_i = \frac{\U_i^\top \epsilon + c_i}{\sigma \| \U_i \|_2}$ be a random variable. We observe that $z_i \vert_i$, the tensor extension of $z_i$, has a covariance matrix of $\matU \matU^\top$ as before. Let us also define $\mathcal{C}_i = \frac{\lambda_{\max}^{A_i}}{\| \U_i \|_2}$.

\begin{align*}
    |&p^{robust}_\sigma(0) - p^{taylor}_\sigma(0)| \\&= \left|P \left( \bigcup_i 0 > z_i(\epsilon) > - \frac{\epsilon^\top A_i \epsilon}{\sigma \| \U_i \|_2} \right) \right| \\
    &\leq \left| P\left(\bigcup_i 0 > z_i > - \frac{\lambda_{\max}^{A_i}}{\| \U_i \|_2} \sigma\right) \right|~~~(\epsilon^\top A \epsilon < \lambda_{\max}^A \sigma^2) \\
    &= \left| \int ... \int^{0}_{-\mathcal{C}_i \sigma} \text{pdf}(z_i \vert_i)~ \mathrm{d}z_i \vert_i \right| ~~~(\text{Defn of mvn cdf}) \\
    &\leq \max_{z_i \vert_i} \text{pdf}(z_i \vert_i) ~ \prod_i |\mathcal{C}_i \sigma |~~~(\text{Upper bound pdf with its max})\\
    &\leq (2\pi)^{-(C-1)/2} \det(\matU \matU^\top)^{-1/2} \prod_{\substack{i=1\\i\neq t}}^C \mathcal{C}_i \sigma \\ &=  k \left(\sigma^{C-1} \prod_{\substack{i=1\\i\neq t}}^C \frac{\lambda_{\max}^{A_i}}{\| \U_i \|_2} \right)
\end{align*}

where $k = \max_z \text{pdf}(z) = (2 \pi)^{-(C-1)/ 2} \det(\matU \matU^\top)^{-1/2}$, which is the max value of the Gaussian pdf. Note that as the rows of $\matU$ are normalized, the diagonal entries of $\matU \matU^\top$ are all equal to one, and thus $\det(\matU \matU^\top) \leq 1$ by Hadamard's inequality.

\end{proof}

We note that these bounds are rather pessimistic, as in high-dimensions $\epsilon^\top A_i \epsilon \sim \lambda_{\mn}^{A_i} \leq \lambda_{\max}^{A_i}$, and thus in reality the errors are expected to be much smaller. 

\vspace{1cm}


\begin{thm}
    The \textbf{MMSE estimator} for the local robustness of a classifier $f$ at point $\X$ with respect to target class $t$ is given by an MMSE linearization of $f$ around $\X$, for decision boundaries $g_i(\X) = f_t(\X) - f_i(\X)$, $\forall i \neq t$, leading to
    \begin{align*}
        &p^\text{mmse}_{\sigma}(\X) = \cdf \left( \frac{ \Tilde{g}_i(\X)}{\sigma \| \grad \Tilde{g}_i(\X)\|_2} \tensor \right) \\
        &\mathrm{where}~~\Tilde{g}_i(\X) = \frac{1}{N}\sum_{j=1}^{N} g_i(\X + \epsilon_j) ~,~ \epsilon_j \sim \mathcal{N}(0, \sigma^2)
    \end{align*}
    with $\matU$ and $\Phi$ defined as in the linear case, and $N$ is the number of perturbations. 

\end{thm}

\begin{proof}
We would like to improve upon the Taylor approximation to $g(\X + \epsilon)$ by using an MMSE local function approximation. Essentially, we would like to find $\U \in \R^d$ and $c \in \R$ such that 

\begin{align*}
    (\U^*(\X), c^*(\X)) = \arg\min_{\U,c} \E_{\epsilon \sim \mathcal{N}(0, \sigma^2)} (g(\X+\epsilon) - \U^{\top} \epsilon - c)^2
\end{align*}

A straightforward solution, obtained by setting the derivatives of the objective with respect to $\U$ and $c$ to zero, gives us the following:

\begin{align*}
    \U^*(\X) &= \E_\epsilon \left[ g(\X + \epsilon) \epsilon^{\top} \right] / \sigma^2 \\&= \E_\epsilon \left[ \grad g(\X + \epsilon) \right] ~~~~~ (\text{Stein's Lemma}) \\
    c^*(\X) &= \E_\epsilon g(\X + \epsilon)
\end{align*}

Plugging in these values of $\U^*, c^*$ into Lemma \ref{proof:lemma}, we have the result.

\end{proof}



\vspace{1cm}


\begin{thm} The \textbf{estimation error} of the MMSE estimator for a classifier with a quadratic decision boundary $g_i(\X) = \X^\top A_i \X + \U_i^\top \X + c_i$ for positive-definite $A_i$, is upper bounded by
    \begin{align*}
        | p^{robust}_{\sigma}(\X) - p^{mmse}_{\sigma}(\X) | \leq k \sigma^{C-1} \prod_{\substack{i=1\\i\neq t}}^C \frac{|\lambda_{\max}^{A_i} - \lambda_{\mathrm{mean}}^{A_i}|}{\| \U_i \|_2}
    \end{align*}
    for noise $\epsilon \sim \mathcal{N}(0, \sigma^2 / d)$, in the limit of $d \rightarrow \infty$ and $N \rightarrow \infty$.  
\end{thm}

\begin{proof}
We proceed similarly to the proof made for the Taylor estimator, and without loss of generality, assume that $\X = 0$. Computing $p^{mmse}_{\sigma}$, we have $\E_{\epsilon} \grad g_i(\epsilon) = \U_i^\top$ and $\E_{\epsilon} g_i(\epsilon) = c + \E (\epsilon^\top A_i \epsilon) = c + \E(\mathrm{trace}(\epsilon^\top A_i \epsilon)) = c + \E(\mathrm{trace}(A_i \epsilon \epsilon^\top)) = c + \mathrm{trace}(A_i) \sigma^2 / d = c + \sigma^2 \lambda_{\mn}^{A_i}$, resulting in

\begin{align*}
    p^{mmse}_{\sigma}(0) &= P_{\epsilon}\left(\bigcup_i \hat{g}_i(\epsilon) > 0\right) \\
    &= P_{\epsilon}\left(\bigcup_i \U_i^\top \epsilon + c > - \sigma^2 \lambda_{\mn}^{A_i} \right)
\end{align*}

Subtracting the two, we have 

\begin{align*}
    &|p^{robust}_\sigma(0) - p^{mmse}_\sigma(0)| \\
    & \leq \left|P \left(\bigcup_i - \sigma^2 \lambda_\mn^{A_i} >  \U_i^\top \epsilon + c > - \sigma^2 \lambda_{\max}^{A_i} \right) \right| \\
    &= \left|P \left(\bigcup_i - \sigma \frac{\lambda_{\mn}^{A_i}}{\| \U_i \|_2} >  \frac{\U_i^\top \epsilon + c}{\sigma \| \U_i \|_2} > - \sigma \frac{\lambda_{\max}^{A_i}}{\| \U_i \|_2} \right) \right| 
\end{align*}

Similar to the previous proof, let $z_i = \U_i^\top \epsilon + c$ be a random variable, and let $z_i \vert_i$ be the tensor extension of $z_i$, following our previous notation.

\begin{align*}
    |&p^{robust}_\sigma(\X) - p^{mmse}_\sigma(\X)| \\
    &\leq \left| P\left(\bigcup_i  - \lambda_{\mn}^{A_i} \sigma^2 > z_i > - \lambda_{\max}^{A_i} \sigma^2\right) \right| \\
    &= \left| \int ... \int^{-\lambda_{\mn}^{A_i} \sigma^2}_{-\lambda_{\max}^{A_i} \sigma^2} \text{pdf}(z_i \vert_i)~ \mathrm{d}z_i \vert_i \right| \\
    &\leq \max_{z_i \vert_i} \text{pdf}(z_i \vert_i) ~ \sigma^{C-1} \prod_i \frac{|(\lambda_{\max}^{A_i} - \lambda_{\mn}^{A_i}) |}{\| \U_i \|_2}\\
    &= k \sigma^{C-1} \prod_i \frac{|\lambda_{\max}^{A_i} - \lambda_{\mn}^{A_i}|}{\| \U_i \|_2}
\end{align*}

where $k = \max_z \text{pdf}(z_i \vert_i) = (2 \pi)^{-(C-1)/ 2} \det(\matU \matU^\top)^{-1/2}$ like in the Taylor case. Note that as the rows of $\matU$ are normalized, the diagonal entries of $\matU \matU^\top$ are all equal to one, and thus $\det(\matU \matU^\top) \leq 1$ by Hadamard's inequality.

\end{proof}

We note that these bounds are rather pessimistic, as in high-dimensions $\epsilon^\top A_i \epsilon \sim \lambda_{\mn}^{A_i} \leq \lambda_{\max}^{A_i}$, and thus in reality the errors are expected to be much smaller. 

\subsubsection{Approximating the Multivariate Gaussian CDF with mv-sigmoid}\label{app:mv-sigmoid-explain} 

One drawback of the Taylor and MMSE estimators is their use of the \emph{mvn-cdf}, which does not have a closed form solution and can cause the estimators to be slow for settings with a large number of classes $C$. In addition, the \emph{mvn-cdf} makes these estimators non-differentiable, which is inconvenient for applications which require differentiating \probust{}. To alleviate these issues, we approximate the \emph{mvn-cdf} with an analytical closed-form expression. As CDFs are monotonically increasing functions, the approximation should also be monotonically increasing.

To this end, it has been previously shown that the \emph{univariate} Normal CDF $\phi$ is well-approximated by the sigmoid function \cite{hendrycks2016gaussian}. It is also known that when $\matU \matU^\top = I$, \emph{mvn-cdf} is given by $\Phi(\X) = \prod_i\phi(\X_i)$, i.e., it is given by the product of the univariate normal CDFs. Thus, we may choose to approximate $\Phi(\X) \approx \prod_i \text{sigmoid}(\X_i)$. However, when the inputs are large, this can be further simplified as follows:

\begin{align*}
    &\Phi_{I}(\X) = \prod_i \phi(\X_i) \approx \prod_i \frac{1}{1 + \exp(-\X_i)}\\
    &= \frac{1}{1 + \sum_i \exp(-\X_i) + \sum_{j,k} \exp(-\X_j - \X_k) + ...} \\
    &\approx \frac{1}{1 + \sum_i \exp(-\X_i)} ~~~(\text{for} ~~\X_i \rightarrow \infty~~ \forall i)
\end{align*}

%\begin{defn}
%    The multivariate sigmoid is defined as $\text{mv-sigmoid}(\X) = \frac{1}{1 + \sum_{i} \exp(-\X_i)}$ 
%\end{defn}

We call the final expression the ``multivariate sigmoid'' (\emph{mv-sigmoid}) which serves as our approximation of \emph{mvn-cdf}, especially at the tails  of the distribution. While we expect estimators using \emph{mv-sigmoid} to approximate ones using \emph{mvn-cdf} only when $\matU \matU^\top = \mathbf{I}$, we find experimentally that the approximation works well even for practical values of the covariance matrix $\matU\matU^\top$. Using this approximation to substitute \emph{mv-sigmoid} for \emph{mvn-cdf} in the \ptaylor{} and \pmmse{} estimators yields the \ptaylormvs{} and \pmmsemvs{} estimators, respectively.

%\paragraph{Relationship with Softmax.}
%The commonly-used softmax function applied to the logits (henceforth $p^\text{softmax}_{T}$ for softmax with temperature $T$) is identical to \emph{mv-sigmoid}. In this form, $p^\text{softmax}_{T}$ and the Taylor estimator are relatively similar (the only difference is that the latter divides the logits by the gradient norm, and uses the \emph{mvn-cdf} function instead of \emph{mv-sigmoid}) and it is reasonable to ask whether $p^\text{softmax}_{T}$ itself is be a ``good enough'' estimator of $p^\text{robust}_{\sigma}$ in practice. We find that, outside of a specific linear setting, $p^\text{softmax}_T$ does not well-approximate $p^\text{robust}_{\sigma}$ because $p^\text{softmax}_T$ does not account for the alignment of decision boundary pairs ($\matU \matU^\top$), nor does it use the gradient information used in all of our estimators. A more detailed discussion can be found in Appendix XXX.

\subsubsection{Relationship between mv-sigmoid, softmax, and the Taylor estimator}
\label{app:softmax-explain}

A common method to estimate the confidence of model predictions is to use the softmax function applied to the logits $f_i(\X)$ of a model. We note that softmax is identical to \emph{mv-sigmoid} when directly applied to the logits of neural networks: 

\begin{align*}
    &\text{softmax}_t\left( f_i(\X) ~\Big\vert_{\substack{i = 1}}^{C} \right) = \frac{\exp(f_t(\X))}{\sum_{i=1}^C \exp(f_i(\X))} = \\& \frac{1}{1 + \sum\limits_{\substack{i=1\\i \neq t}}^{C} \exp(f_i(\X) - f_t(\X))} = \text{mv-sigmoid}\left( g_i(\X) ~\Big\vert_{\substack{i = 1\\i\neq t}}^{C} \right)
\end{align*}

Recall that $g_i(\X) = f_t(\X) - f_i(\X)$ is the decision boundary function. Note that this equivalence only holds for the specific case of logits. Comparing the expressions of softmax applied to logits above and the Taylor estimator, we notice that they are only different in that the Taylor estimator divides by the gradient norm, and uses the \emph{mvn-cdf} function instead of \emph{mv-sigmoid}. Given this similarity to the Taylor estimator, it is reasonable to ask whether softmax applied to logits (henceforth $p^\text{softmax}_{T}$ for softmax with temperature $T$) itself can be a ``good enough'' estimator of $p^\text{robust}_{\sigma}$ in practice. In other words, does $p^\text{softmax}_T$ well-approximate $p^\text{robust}_{\sigma}$ in certain settings?

In general, this cannot hold because softmax does not take in information about $\matU \matU^\top$, nor does it use the gradient information used in all of our estimators, although the temperature parameter $T$ can serve as a substitute for $\sigma$ in our expressions. In Appendix \ref{app:proofs}, we provide a theoretical result for a restricted linear setting where softmax can indeed match the behavior of \ptaylormvs{}, which happens precisely when $\matU \matU^\top = \mathbf{I}$ and all the class-wise gradients are equal. In the next section, we demonstrate empirically that the softmax estimator $p^{\text{softmax}}_T$ is a poor estimator of average robustness in practice.

\paragraph{The softmax estimator} We observe that for linear models with a specific noise perturbation $\sigma$, the common softmax function taken with respect to the output logits can be viewed as an estimator of \probust{}, albeit in a very restricted setting. Specifically,

\begin{lemma}
    For multi-class linear models $f(\X) = \mathbf{w}^\top \X + b$, such that the decision boundary weight norms $\| \U_i \|_2 = k, \forall i \in [1, C], i \neq t$,
    \begin{align*}
        p^\text{softmax}_{T} = p^\text{taylor\_mvs}_{\sigma}~~~~\text{where}~~~~T = \sigma k
    \end{align*}
\label{lemma:softmax}
\end{lemma}

\begin{proof} Consider softmax with respect to the $t^{th}$ output class and define $g_i(\X) = f_t(\X) - f_i(\X)$, with $f$ being the linear model logits. Using this, we first show that softmax is identical to \emph{mv-sigmoid}:

\begin{align*}
        p^\text{softmax}_T(\X) &= \text{softmax}_t(f_1(\X)/T, ..., f_C(\X)/T) \\
        &= \frac{\exp(f_t(\X)/T)}{\sum_i \exp(f_i(\X)/T)} \\ 
        &= \frac{1}{1 + \sum_{i; i\neq t} \exp((f_i(\X) - f_t(\X))/T)} \\ 
        &= ~\text{mv-sigmoid} \left[ g_i(\X)/T \tensor \right]
\end{align*}

Next, by denoting $\U_i = \W_t - \W_i$, each row has equal norm $\| \U_i \|_2 = \| \U_j \|_2, \forall i,j,t \in [1,...C]$ which implies: 

\begin{align*}
        p^\text{taylor\_mvs}_\sigma(\X) &= \text{mv-sigmoid} \left[ \frac{g_i(\X)}{\sigma \| \U_i \|_2} \tensor \right]\\ 
        &= \text{mv-sigmoid} \left[ g_i(\X)/T \tensor \right]~~ (\because \text{$T = \sigma k $})\\ 
        & = p^\text{softmax}_T(\X)
\end{align*}
\end{proof}

Lemma~\ref{lemma:softmax} indicates that the temperature parameter $T$ of softmax roughly corresponds to the $\sigma$ of the added Normal noise with respect to which local robustness is measured. Overall, this shows that under the restricted setting where the local linear model consists of decision boundaries with equal weight norms, the softmax outputs can be viewed as an estimator of the \ptaylormvs{} estimator, which itself is an estimator of \probust{}. However, due to the multiple levels of approximation, we can expect the quality of \psoftmax{}'s approximation of \probust{} to be poor in general settings (outside of the very restricted setting), so much so that in general settings, \probust{} and \psoftmax{} would be unrelated.

\subsection{Datasets}
\label{app:datasets}

The MNIST dataset consists of images of gray-scale handwritten digits spanning 10 classes: digits 0 through 9. The FashionMNIST (FMNIST) dataset consists of gray-scale images of articles of clothing spanning 10 classes: t-shirt, trousers, pullover, dress, coat, sandal, shirt, sneaker, bag, and ankle boot. For MNIST and FMNIST, each image is $28 \times 28$ pixels. For MNIST and FMNIST, the training set consists of 60,000 images and the test set consists of 10,000 images.

The CIFAR10 dataset consists of color images of common objects and animals spanning 10 classes: airplane, car, bird, cat, deer, dog, frog, horse, ship, and truck. The CIFAR100 dataset consists of color images of common objects and animals spanning 100 classes: apple, bowl, chair, dolphin, lamp, mouse, plain, rose, squirrel, train, etc. For CIFAR10 and CIFAR100, each image has 3 color channels and is $32 \times 32$ pixels. For CIFAR10 and CIFAR100, the training set consists of 50,000 images and the test set consists of 10,000 images.


\subsection{Models}
\label{app:models}

For MNIST and FMNIST, we train a linear model and a convolutional neural network (CNN) to perform 10-class classification. The linear model consists of one hidden layer with 10 neurons. The CNN consists of four hidden layers: one convolutional layer with $5 \times 5$ filters and 10 output channels, one convolutional layer with $5 \times 5$ filters and 20 output channels, one linear layer with 50 neurons, and one linear layer with 10 neurons. 

For CIFAR10 and CIFAR100, we train a Vision Transformer model to perform 10-class and 100-class classification, respectively, by fine-tuning a Vision Transformer that was pre-trained on ImageNet (\url{https://huggingface.co/google/vit-base-patch16-224-in21k}) on each dataset. For these models, the test set consists of 100 images. We chose this number of datapoints so that \pmc{} would run within a reasonable amount of time.
We also train a ResNet18 model to perform 10-class and 100-class classification, respectively. The model architecture is described in \citep{he2016deep}. For CIFAR10 and CIFAR100, we also train the ResNet18 models using varying levels of gradient norm regularization to obtain models with varying levels of robustness. The larger the weight of gradient norm regularization ($\lambda$), the more robust the model.

All models were trained using stochastic gradient descent. Hyperparameters were selected to achieve decent model performance. The emphasis is on analyzing the estimators’ estimates of local robustness of each model, not on high model performance. Thus, we do not focus on tuning model hyperparameters. All models were trained for 200 epochs. The test set accuracy for each model is shown in Table~\ref{table:app-model-acc}.

\begin{table*}[ht!]
    \centering
    \begin{tabular}{c|c|c|c}
    Dataset      & Model  & $\lambda$  & Test set accuracy \\
    \midrule
    MNIST        & Linear  & 0 & 92\%                         \\
    MNIST        & CNN     & 0 & 99\%                         \\
    \midrule
    FashionMNIST & Linear  & 0 & 84\%                         \\
    FashionMNIST & CNN     & 0 & 91\%                         \\
    \midrule
    CIFAR10      & Vision Transformer & 0 & 99\%                         \\
    CIFAR10      & ResNet18 & 0 & 94\%                         \\
    CIFAR10      & ResNet18 & 0.0001 & 93\%                         \\
    CIFAR10      & ResNet18 & 0.001 & 90\%                         \\
    CIFAR10      & ResNet18 & 0.01 & 85\%                         \\
    \midrule
    CIFAR100     & Vision Transformer & 0 & 91\%                        \\
    CIFAR100     & ResNet18 & 0 & 76\%                        \\
    CIFAR100     & ResNet18 & 0.0001 & 74\%                         \\
    CIFAR100     & ResNet18 & 0.001 & 69\%                         \\
    CIFAR100     & ResNet18 & 0.01 & 60\%                         
    \end{tabular}
    \vspace*{3mm}
    \caption{Test set accuracy of models.}
    \label{table:app-model-acc}
\end{table*}


\subsection*{Experiments} 
Due to file size constraints, Section A.4 can be found in the Supplementary material.

\clearpage



\subsection{Experiments}
\label{app:experiments}
In this section, we provide the following additional experimental results:

\begin{enumerate}
    \item Figure \ref{app:convergence} shows results on the convergence of \pmc{}. \pmc{} takes a large number of samples to converge and is computationally inefficient.
    \item Figure \ref{app:convergence_mmse} shows results on the convergence of \pmmse{}. \pmmse{} takes only a few samples to converge and is more computationally efficient than \pmc{}.
    \item Figure \ref{app:distribution_probust} shows the distribution of \probust{} as a function of $\sigma$. Consistent with theory in Section \ref{sec:methods}, (1) as noise increases, \probust{} decreases, and (2) \pmmse{} accurately estimates \pmc{}.
    \item Table \ref{app:runtimes} presents estimator runtimes. Our analytical estimators are more efficient than the naïve estimator (\pmc{}).
    \item Figure \ref{app:accuracy_probust} shows the accuracy of the analytical robustness estimators as a function of $\sigma$. \pmmse{} and \pmmsemvs{} are the best estimators of \probust{}, followed closely by \ptaylormvs{} and \ptaylor{}, trailed by \psoftmax{}.
    \item Figure \ref{app:accuracy_robust} shows the accuracy of the analytical estimators for robust models. For more robust models, the estimators compute \probust{} more accurately over a larger $\sigma$.
    \item Figures \ref{fig2:mvsig-mvncdf} and \ref{app:mvsigmoid} show that \emph{mv-sigmoid} well-approximates \emph{mvn-cdf} over $\sigma$.
    \item Figure \ref{fig4:probust-and-psoftmax} shows that \psoftmax{} is not a good approximator of \probust{}.
    \item Figure \ref{app:robustness_bias} shows the distribution of \probust{} among classes (measured by \pmmse{}), revealing that models display robustness bias among classes. 
    \item Figures \ref{fig-supp:topk-vs-bottomk} and \ref{fig6:topk-vs-bottomk} show the application of \pmmse{} and \psoftmax{} to identification of robust and non-robust points. \probust{} better identifies robust and non-robust points than \psoftmax{}. 
    \item Figures \ref{app:noisy_mnist}, \ref{app:noisy_fmnist}, \ref{app:noisy_cifar10}, and \ref{app:noisy_cifar100} show examples of noisy images with the level of noise analyzed in our paper. Overall, the noise levels seem visually significant.
 \end{enumerate}

%appendix/a_convergence_pmc = 02a_p_emp_convergence_n50000_baseline
\begin{figure*}[htbp!]
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-5.9cm}Relative error \hspace{1.2cm}Absolute error}
        %column labels
        \hspace{1.4cm}MNIST CNN
        \hspace{1.8cm} FMNIST CNN
        \hspace{1.4cm} CIFAR10 ResNet18
        \hspace{1cm} CIFAR100 ResNet18
    \end{flushleft}
         
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/abs/mnist_cnn_sigma0.5.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/abs/fmnist_cnn_sigma0.5.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/abs/cifar10_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/abs/cifar100_resnet18_sigma0.05.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/rel/mnist_cnn_sigma0.5.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/rel/fmnist_cnn_sigma0.5.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/rel/cifar10_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/a_convergence_pmc/rel/cifar100_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \caption{Convergence of \pmc{}. In practice, \pmc{} takes around $n=10{,}000$ samples to converge and is computationally inefficient.}\label{app:convergence}
\end{figure*}


%appendix/b_convergence_pmmse = 02b_p_mmse_convergence_n500_baseline
\begin{figure*}[htbp!]
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-5.8cm}Relative error \hspace{1.1cm}Absolute error}
        %column labels
        \hspace{1.4cm} MNIST CNN
        \hspace{1.9cm} FMNIST CNN
        \hspace{1.2cm} CIFAR10 ResNet18
        \hspace{0.9cm} CIFAR100 ResNet18
    \end{flushleft}
         
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/abs/mnist_cnn_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/abs/fmnist_cnn_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/abs/cifar10_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/abs/cifar100_resnet18_sigma0.05.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/rel/mnist_cnn_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/rel/fmnist_cnn_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/rel/cifar10_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/b_convergence_pmmse/rel/cifar100_resnet18_sigma0.05.pdf}
    \end{subfigure}
    \caption{Convergence of \pmmse{}. In practice, \pmmse{} takes around $n=5$--$10$ samples to converge and is more computationally efficient than \pmc{}.} \label{app:convergence_mmse}
\end{figure*}

%appendix/c_probust_over_noise = 02b_p_over_noise/p_all_over_noise

%appendix/c_probust_over_noise = 02c_p_vs_sigma/p_all_vs_sigma

\begin{figure*}[htbp!]
    \centering
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/mnist_linear.pdf}
        \caption{MNIST, Linear}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/mnist_cnn.pdf}
        \caption{MNIST, CNN}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/fmnist_linear.pdf}
        \caption{FMNIST, Linear}
    \end{subfigure}
    
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/fmnist_cnn.pdf}
        \caption{FMNIST, CNN}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/cifar10_resnet18.pdf}
        \caption{CIFAR10, ResNet18}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/c_probust_over_noise/cifar100_resnet18.pdf}
        \caption{CIFAR100, ResNet18}
    \end{subfigure}

    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/j_vit_cifar/06c_p_vs_sigma/p_all_vs_sigma/vit_cifar10.pdf}
        \caption{CIFAR10, Vision Transformer}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/j_vit_cifar/06c_p_vs_sigma/p_all_vs_sigma/vit_cifar100.pdf}
        \caption{CIFAR100, Vision Transformer}
    \end{subfigure}
    \caption{Distribution of \probust{} over $\sigma$. As noise increases, \probust{} decreases. In addition, \pmmse{} accurately estimates \pmc{}.}\label{app:distribution_probust}
\end{figure*}

%appendix/d_accuracy_of_estimators = 02d_pemp_vs_pothers_over_sigma/rel
\begin{figure*}[htbp!]
    \centering
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/mnist_linear.pdf}
        \caption{MNIST, Linear}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/mnist_cnn.pdf}
        \caption{MNIST, CNN}
    \end{subfigure}

    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/fmnist_linear.pdf}
        \caption{FMNIST, Linear}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/fmnist_cnn.pdf}
        \caption{FMNIST, CNN}
    \end{subfigure}
    
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/cifar10_resnet18.pdf}
        \caption{CIFAR10, ResNet18}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/d_accuracy_of_estimators/cifar100_resnet18.pdf}
        \caption{CIFAR100, ResNet18}
    \end{subfigure}

    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/j_vit_cifar/06d_pemp_vs_pothers_over_sigma/rel/vit_cifar10.pdf}
        \caption{CIFAR10, Vision Transformer}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/j_vit_cifar/06d_pemp_vs_pothers_over_sigma/rel/vit_cifar100.pdf}
        \caption{CIFAR100, Vision Transformer}
    \end{subfigure}
    \caption{Accuracy of \probust{} estimators over $\sigma$. The smaller the noise neighborhood $\sigma$, the more accurately the estimators compute \probust{}. \pmmse{} and \pmmsemvs{} are the best estimators of \probust{}, followed closely by \ptaylormvs{} and \ptaylor{}, trailed by \psoftmax{}.} \label{app:accuracy_probust}
\end{figure*}


%appendix/e_accuracy_of_estimators_robust_models = 02e_pemp_vs_pmmse_over_sigma_robust_models/rel
\begin{figure*}[h]
    \centering
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/e_accuracy_of_estimators_robust_models/cifar10_resnet18.pdf}
        \caption{CIFAR10, ResNet18}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/e_accuracy_of_estimators_robust_models/cifar100_resnet18.pdf}
        \caption{CIFAR100, ResNet18}
    \end{subfigure}
    \caption{Accuracy of \probust{} estimators over $\sigma$ for robust models. For more robust models, the estimators compute \probust{} more accurately over a larger $\sigma$.} \label{app:accuracy_robust}
\end{figure*}

\begin{figure}
  \centering
  \includegraphics[width=0.4\linewidth]{figures/fig2_cifar10_resnet18_gnorm0.0_sigma0.05.pdf}
  \caption{Correlation of \emph{mvn-cdf(z)} and \emph{mv-sigmoid(z)} for the CIFAR10 ResNet18 model. The formulation of $z$ is described in Section~\ref{sec:exp_correctness}. In practice, \emph{mv-sigmoid} approximates \emph{mvn-cdf} well.}
  \label{fig2:mvsig-mvncdf}
\end{figure}

%appendix/f_mvsigmoid_vs_mvncdf = 03_mvncdf_vs_mvsigmoid/cifar10_resnet18
\begin{figure*}[h]
    \centering
    \includegraphics[width=0.4\linewidth]{figures/appendix/f_mvsigmoid_vs_mvncdf/cifar10_resnet18_gnorm0.0_correlation_over_sigmas.pdf}
    \caption{Quality of the \emph{mv-sigmoid} approximation of \emph{mvn-cdf} as $\sigma$ varies. \emph{mv-sigmoid} approximates \emph{mvn-cdf} well across $\sigma$.} \label{app:mvsigmoid}
\end{figure*}

%fig4 -- p_robust and p_softmax
%fig4a: scatterplot, non-robust model
%02i_pemp_vs_pmmse_corr_robust_models_scatterplots/cifar10_resnet18_sigma0.1_gnormreg0.png
%fig4b: the more robust the model, the more the two are related
%02h_pemp_vs_pmmse_corr_robust_models_lineplots/cifar10_resnet18_cifar100_resnet18_sigma0.1.png
%fig4c: scatterplot, robust model
%02i_pemp_vs_pmmse_corr_robust_models_scatterplots/cifar10_resnet18_sigma0.1_gnormreg0.01.png
\begin{figure*}[ht!]
    % \centering
    \centerline{
    \begin{subfigure}[h]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/fig4a_cifar10_resnet18_sigma0.1_gnormreg0.pdf}
        \captionsetup{justification=centering}
        \caption{CIFAR10 \\ Non-robust model ($\lambda=0$)}
        \label{fig4a:ps-nonrob-model}
    \end{subfigure}
    \begin{subfigure}[ht]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/fig4b_cifar10_resnet18_cifar100_resnet18_sigma0.1.pdf}
        \captionsetup{justification=centering}
        \caption{CIFAR10 and CIFAR100 \\ Varying model robustness}
        \label{fig4b:ps-rob-models-lineplot}
    \end{subfigure}
    \begin{subfigure}[h]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/fig4c_cifar10_resnet18_sigma0.1_gnormreg0.01.pdf}
        \captionsetup{justification=centering}
        \caption{CIFAR10 \\ Robust model ($\lambda=0.01$)}
        \label{fig4c:ps-rob-model}
    \end{subfigure}
    }
    \caption{Relationship between \probust{} and \psoftmax{} for CIFAR10 and CIFAR100 ResNet18 models. (a) For a non-robust model, \probust{} and \psoftmax{} are not strongly correlated. (b) As model robustness increases, the two quantities become more correlated. (c) However, even for robust models, the relationship between \probust{} and \psoftmax{} is mild. Together, these results indicate that, consistent with the theory in Section~\ref{sec:methods}, \psoftmax{} is not a good estimator for \probust{} in general settings.}
    \label{fig4:probust-and-psoftmax}
\end{figure*}









%appendix/g_robustness_bias = 02f_p_distr_vs_classes/p_all_over_classes
\begin{figure*}[h]
    \centering
         
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/mnist_linear_sigma0.5.pdf}
        \caption{MNIST, Linear}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/mnist_cnn_sigma0.9.pdf}
        \caption{MNIST, CNN}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/fmnist_linear_sigma0.5.pdf}
        \caption{FMNIST, Linear}
    \end{subfigure}
    
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/fmnist_cnn_sigma0.9.pdf}
        \caption{FMNIST, CNN}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/cifar10_resnet18_sigma0.1.pdf}
        \caption{CIFAR10, ResNet18}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/g_robustness_bias/cifar100_resnet18_sigma0.1.pdf}
        \caption{CIFAR100, ResNet18}
    \end{subfigure}
    \caption{Local robustness bias among classes. \probust{} reveals that the model is less locally robust for some classes than for others. The analytical estimator \pmmse{} properly captures this model bias.} \label{app:robustness_bias}
\end{figure*}

%table: naive method is inefficient, analytical method is efficient
\begin{table*}[h]
\centering
\begin{tabular}{l|l|l|l|l|l}
    \multicolumn{2}{c}{}   & \multicolumn{2}{|c|}{CPU: Intel x86\_64}   & \multicolumn{2}{|c}{GPU: Tesla V100-PCIE-32GB} \\
    \midrule
    Estimator   & \# samples ($n$)   & Serial   & Batched   & Serial   & Batched \\
    \midrule
    \pmc{}   & \begin{tabular}[c]{@{}l@{}}$n=100$\\ $n=1000$\\ $n=10000$\end{tabular}               
             & \begin{tabular}[c]{@{}l@{}}0:00:59\\ 0:09:50\\ \textit{1:41:11}\end{tabular}                                               
             & \begin{tabular}[c]{@{}l@{}}0:00:42\\ 0:07:22\\ \textit{1:14:38}\end{tabular}                                                
             & \begin{tabular}[c]{@{}l@{}}0:00:12\\ 0:02:00\\ \textit{0:19:56}\end{tabular}                                                
             & \begin{tabular}[c]{@{}l@{}}0:00:01\\ 0:00:04\\ \textit{0:00:35}\end{tabular} \\
    \midrule
    \ptaylor{}   & N/A
                 & 0:00:08                                                                                                                      
                 & 0:00:07                                                                                                                      
                 & 0:00:02                                                                                                                      
                 & $<$ 0:00:01 \\
    \midrule
    \ptaylormvs{}   & N/A
                    & 0:00:08                                                                                                                   
                    & 0:00:07                                                                                                                  
                    & 0:00:01                                                                                                                   
                    & $<$ 0:00:01 \\
    \midrule
    \pmmse{}   & \begin{tabular}[c]{@{}l@{}}$n=1$\\ $n=5$\\ $n=10$\\ $n=25$\\ $n=50$\\ $n=100$\end{tabular} 
               & \begin{tabular}[c]{@{}l@{}}0:00:08\\ \textit{0:00:41}\\ 0:01:21\\ 0:03:21\\ 0:06:47\\ 0:13:57\end{tabular} 
               & \begin{tabular}[c]{@{}l@{}}0:00:10\\ \textit{0:00:31}\\ 0:01:02\\ 0:02:44\\ 0:05:38\\ 0:11:31\end{tabular} 
               & \begin{tabular}[c]{@{}l@{}}0:00:02\\ \textit{0:00:06}\\ 0:00:11\\ 0:00:26\\ 0:00:51\\ 0:01:42\end{tabular} 
               & \begin{tabular}[c]{@{}l@{}}0:00:02\\ \textit{0:00:02}\\ 0:00:02\\ 0:00:03\\ 0:00:04\\ 0:00:06\end{tabular} \\
    \midrule
    \pmmsemvs{}   & \begin{tabular}[c]{@{}l@{}}$n=1$\\ $n=5$\\ $n=10$\\ $n=25$\\ $n=50$\\ $n=100$\end{tabular} 
                  & \begin{tabular}[c]{@{}l@{}}0:00:08\\ \textit{0:00:41}\\ 0:01:21\\ 0:03:24\\ 0:06:47\\ 0:13:28\end{tabular} 
                  & \begin{tabular}[c]{@{}l@{}}0:00:08\\ \textit{0:00:32}\\ 0:01:00\\ 0:02:37\\ 0:05:35\\ 0:11:32\end{tabular} 
                  & \begin{tabular}[c]{@{}l@{}}0:00:01\\ \textit{0:00:05}\\ 0:00:10\\ 0:00:25\\ 0:00:51\\ 0:01:42\end{tabular} 
                  & \begin{tabular}[c]{@{}l@{}}0:00:01\\ \textit{0:00:01}\\ 0:00:02\\ 0:00:02\\ 0:00:03\\ 0:00:06\end{tabular} \\
    \midrule
    \psoftmax{}   & N/A                                                                             
                  & 0:00:01                                                                                                                              
                  & $<$ 0:00:01                                                                                                                              
                  & $<$ 0:00:01                                                                                                                              
                  & $<$ 0:00:01                                                                                                                             
\end{tabular}
\caption{Runtimes of each \probust{} estimator. Each estimator computes \probustwsigma{0.1} for the CIFAR10 ResNet18 model for 50 data points. For estimators that use sampling, the row with the minimum number of samples necessary for convergence is italicized. Runtimes are in the format of hour:minute:second. The analytical estimators (\ptaylor{}, \ptaylormvs{}, \pmmse{}, and \pmmsemvs{}) are more efficient than the naïve estimator (\pmc{}).} \label{app:runtimes}
\end{table*}

\subsubsection{\probust{} identifies images that are robust to and images
that are vulnerable to random noise}

For each dataset, we train a simple CNN to distinguish between images with high and low \pmmse{}. We train the same CNN to also distinguish between images with high and low \psoftmax{}. The CNN consists of two convolutional layers and two fully-connected feedforward layers with a total of 21,878 parameters. For a given dataset, for each class, we take the images with the top-25 and bottom-25 \pmmse{} values. This yields 500 images for CIFAR10 (10 classes $\times$ 50 images per class) and 5,000 images for CIFAR100 (100 classes $\times$ 50 images per class). We also perform the same steps using \psoftmax{}, yielding another 500 images for CIFAR10 and another 5,000 images for CIFAR100. For each dataset, the train/test split is 90\%/10\% of points.

Then, we compare the performance of the two models. For CIFAR10, the test set accuracy for the \pmmse{} CNN is 0.92 while that for the \psoftmax{} CNN is 0.58. For CIFAR100, the test set accuracy for the \pmmse{} CNN is 0.74 while that for the \psoftmax{} CNN is 0.55. The higher the test set accuracy of a CNN, the better the CNN distinguishes between images. Thus, the results indicate that \probust{} identifies images that are robust or vulnerable to random noise better than \psoftmax{} does.

We also provide additional visualizations of images with the highest and lowest \probust{} and images with the highest and lowest \psoftmax{} (Figures~\ref{fig-supp:topk-vs-bottomk} and~\ref{fig6:topk-vs-bottomk}).


\subsubsection{Softmax probability is not a good proxy for average-case robustness}

To examine the relationship between \probust{} and \psoftmax{}, we calculate \pmmse{} and \psoftmax{} for CIFAR10 and CIFAR100 models of varying levels of robustness, and measure the correlation of their values and ranks using Pearson and Spearman correlations. Results are in Appendix~\ref{app:experiments} (Figure~\ref{fig4:probust-and-psoftmax}). For a non-robust model, \probust{} and \psoftmax{} are not strongly correlated (Figure~\ref{fig4a:ps-nonrob-model}). As model robustness increases, the two quantities become more correlated (Figures~\ref{fig4b:ps-rob-models-lineplot} and~\ref{fig4c:ps-rob-model}). However, even for robust models, the relationship between the two quantities is mild (Figure~\ref{fig4c:ps-rob-model}). That \probust{} and \psoftmax{} are not strongly correlated is consistent with the theory in Section~\ref{sec:methods}: in general settings, \psoftmax{} is not a good estimator for \probust{}.

\newpage
While working on the paper, we hypothesized that \probust{} (e.g., \pmmse{}) might be correlated with model accuracy. However, we did not find this in practice. Instead, what we find is that \probust{} succeeds in identifying canonical data points of a class, and does so much better than \psoftmax{}. We first assess this finding through visual inspection, finding that images with higher \probust{} tend to be more canonical and clear images, and that this distinction is less apparent for \psoftmax{} (Figures~\ref{fig4:topk-vs-bottomk-main} and~\ref{fig-supp:topk-vs-bottomk}). We then use a model to classify these images as an additional, more objective assessment of this pattern (as discussed in Section~\ref{subsec:case-studies}).

%fig4 -- 2x4 images
%top-k and bottom-k images
% 02g_topk_bottomk_images/cifar10_resnet18/p_mmse/
% - cifar10_resnet18_p_mmse_sigma0.1_class8_bottomk.png
% - cifar10_resnet18_p_mmse_sigma0.1_class8_topk.png
% - ... class1 x 2
% 02g_topk_bottomk_images/cifar10_resnet18/p_sm
% - cifar10_resnet18_p_sm_sigma0.1_class8_bottomk.png
% - cifar10_resnet18_p_sm_sigma0.1_class8_topk.png
% - ... class1 x 2

%     \begin{flushleft}
%         %row labels
%         \hspace{-0.1cm}\rotatebox{90}{\hspace{-8.7cm}Truck \hspace{2.4cm}Boat \hspace{2.1cm}Airplane}
%         %column labels
%         \hspace{0.9cm}Lowest \pmmsewsigma{0.1}
%         \hspace{1cm} Highest \pmmsewsigma{0.1}
%         \hspace{1cm} Lowest \psoftmax{}
%         \hspace{1cm} Highest \psoftmax{}
%     \end{flushleft}
         
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class0_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class0_topk.png}
%     % \hspace{0.2cm}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class0_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class0_topk.png}
%     \end{subfigure}
    
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class8_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class8_topk.png}
%     \end{subfigure}
%     % \hspace{0.2cm}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class8_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class8_topk.png}
%     \end{subfigure}
    
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class9_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_mmse_sigma0.1_class9_topk.png}
%     \end{subfigure}
%     % \hspace{0.2cm}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class9_bottomk.png}
%     \end{subfigure}
%     \begin{subfigure}{0.23\textwidth}
%         \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/resnet18_cifar10_p_sm_sigma0.1_class9_topk.png}
%     \end{subfigure}
%     \caption{Images with the lowest and highest \probust{} (\pmmse{}) and \psoftmax{} values among CIFAR10 classes. Images with high \probust{} tend to be brighter and have stronger object-background contrast (making them more robust to random noise) than those with low \probust{}. The difference between images with high and low \psoftmax{} is less clear. Thus, \probust{} better captures the model's local robustness with respect to an input than \psoftmax{}.}
%     \label{fig4:topk-vs-bottomk}
% \end{figure}




\begin{figure*}[htbp!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-6.5cm}Car \hspace{3cm}Boat}
        %column labels
        \hspace{1.3cm}Lowest \pmmsewsigma{0.1}
        \hspace{1.6cm} Highest \pmmsewsigma{0.1}
        \hspace{1.5cm} Lowest \psoftmax{}
        \hspace{1.5cm} Highest \psoftmax{}
    \end{flushleft}
         
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_mmse_sigma0.1_class8_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_mmse_sigma0.1_class8_topk.pdf}
    % \hspace{0.2cm}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_sm_sigma0.1_class8_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_sm_sigma0.1_class8_topk.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_mmse_sigma0.1_class1_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_mmse_sigma0.1_class1_topk.pdf}
    \end{subfigure}
    % \hspace{0.2cm}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_sm_sigma0.1_class1_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar10_resnet18_p_sm_sigma0.1_class1_topk.pdf}
    \end{subfigure}
    \caption{Additional images with the lowest and highest \probust{} and \psoftmax{} values among CIFAR10 classes. Images with high \probust{} tend to be brighter and have stronger object-background contrast (making them more robust to random noise) than those with low \probust{}. The difference between images with high and low \psoftmax{} is less clear. Thus, \probust{} better captures the model's local robustness with respect to an input than \psoftmax{}.}
    \label{fig-supp:topk-vs-bottomk}
\end{figure*}


%fig4 -- 2x4 images
%top-k and bottom-k images
% 02g_topk_bottomk_images/cifar10_resnet18/p_mmse/
% - cifar100_resnet18_p_mmse_sigma0.1_class8_bottomk.png
% - cifar100_resnet18_p_mmse_sigma0.1_class8_topk.png
% - ... class23 x 2
% 02g_topk_bottomk_images/cifar10_resnet18/p_sm
% - cifar100_resnet18_p_sm_sigma0.1_class8_bottomk.png
% - cifar100_resnet18_p_sm_sigma0.1_class8_topk.png
% - ... class23 x 2
\begin{figure*}[htbp!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-6.7cm}Cloud \hspace{2.8cm}Bicycle}
        %column labels
        \hspace{1.2cm}Lowest \pmmsewsigma{0.05}
        \hspace{1.5cm} Highest \pmmsewsigma{0.05}
        \hspace{1.5 cm} Lowest \psoftmax{}
        \hspace{1.5cm} Highest \psoftmax{}
    \end{flushleft}
         
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_mmse_sigma0.05_class8_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_mmse_sigma0.05_class8_topk.pdf}
    % \hspace{0.2cm}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_sm_sigma0.05_class8_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_sm_sigma0.05_class8_topk.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_mmse_sigma0.05_class23_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_mmse_sigma0.05_class23_topk.pdf}
    \end{subfigure}
    % \hspace{0.2cm}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_sm_sigma0.05_class23_bottomk.pdf}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \includegraphics[width=\linewidth, trim={0.2cm, 0.2cm, 0.2cm, 0.2cm}]{figures/appendix/h_topk_bottomk/cifar100_resnet18_p_sm_sigma0.05_class23_topk.pdf}
    \end{subfigure}
    \caption{Images with the lowest and highest \probust{} and \psoftmax{} values among CIFAR100 classes. Images with high \probust{} tend to be brighter and have stronger object-background contrast (making them more robust to random noise) than those with low \probust{}. The difference between images with high and low \psoftmax{} is less clear. Thus, \probust{} better captures the model's local robustness with respect to an input than \psoftmax{}.}
    \label{fig6:topk-vs-bottomk}
\end{figure*}

%3x1 noisy images
%top-k and bottom-k images
% 00_noisy_images/mnist/
% - mnist_imgidx10_class0_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
% - mnist_imgidx2_class1_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
% - mnist_imgidx1_class2_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
\begin{figure*}[htpb!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-7.7cm}Digit 2 \hspace{1.6cm}Digit 1 \hspace{1.3cm} Digit 0}
        %column labels
        \hspace{1.3cm}Original 
        \hspace{1.1cm} $\sigma=0.2$ 
        \hspace{1.2cm} $\sigma=0.4$
        \hspace{1.2cm} $\sigma=0.6$
        \hspace{1.2cm} $\sigma=0.8$
        \hspace{1.2cm} $\sigma=1.0$
    \end{flushleft}

    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/mnist_imgidx10_class0_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/mnist_imgidx2_class1_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/mnist_imgidx1_class2_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    \caption{Examples of noisy images for MNIST.}
    \label{app:noisy_mnist}
\end{figure*}


%3x1 noisy images
%top-k and bottom-k images
% 00_noisy_images/fmnist/
% - fmnist_imgidx7_classshirt_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
% - fmnist_imgidx2_classtrousers_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
% - fmnist_imgidx0_classankle_boot_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.png
\begin{figure*}[htbp!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-8cm}Ankle boot \hspace{1.1cm}Trousers \hspace{1.5cm} Shirt}
        %column labels
        \hspace{1.3cm}Original 
        \hspace{1.1cm} $\sigma=0.2$ 
        \hspace{1.2cm} $\sigma=0.4$
        \hspace{1.2cm} $\sigma=0.6$
        \hspace{1.2cm} $\sigma=0.8$
        \hspace{1.2cm} $\sigma=1.0$
    \end{flushleft}

    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/fmnist_imgidx7_classshirt_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/fmnist_imgidx2_classtrousers_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/fmnist_imgidx0_classankle_boot_noisy_sigmas0.0_0.2_0.4_0.6_0.8_1.0.pdf}
    \end{subfigure}
    \caption{Examples of noisy images for FMNIST.}
    \label{app:noisy_fmnist}
\end{figure*}

%3x1 noisy images
%top-k and bottom-k images
% 00_noisy_images/cifar10/
% - cifar10_imgidx22_classdeer_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
% - cifar10_imgidx2_classship_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
% - cifar10_imgidx10_classairplane_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
\begin{figure*}[htbp!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-7.5cm}Ship \hspace{1.7cm}Airplane \hspace{1.5cm} Deer}
        %column labels
        \hspace{1.3cm}Original 
        \hspace{1cm} $\sigma=0.02$ 
        \hspace{1cm} $\sigma=0.04$
        \hspace{1cm} $\sigma=0.06$
        \hspace{1cm} $\sigma=0.08$
        \hspace{1.1cm} $\sigma=0.1$
    \end{flushleft}

    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar10_imgidx22_classdeer_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar10_imgidx10_classairplane_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar10_imgidx2_classship_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    \caption{Examples of noisy images for CIFAR10.}
    \label{app:noisy_cifar10}
\end{figure*}


%3x1 noisy images
%top-k and bottom-k images
% 00_noisy_images/cifar100/
% - cifar100_imgidx4_classsea_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
% - cifar100_imgidx8_classcloud_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
% - cifar100_imgidx15_classlion_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.png
\begin{figure*}[htbp!]
    \vspace{1cm}
    \centering
    \begin{flushleft}
        %row labels
        \hspace{-0.1cm}\rotatebox{90}{\hspace{-7.5cm}Lion \hspace{1.8cm}Cloud \hspace{1.7cm} Sea}
        %column labels
        \hspace{1.3cm}Original 
        \hspace{1cm} $\sigma=0.02$ 
        \hspace{1cm} $\sigma=0.04$
        \hspace{1cm} $\sigma=0.06$
        \hspace{1cm} $\sigma=0.08$
        \hspace{1.1cm} $\sigma=0.1$
    \end{flushleft}

    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar100_imgidx4_classsea_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar100_imgidx8_classcloud_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    
    \begin{subfigure}{0.9\textwidth}
        \includegraphics[width=\linewidth]{figures/appendix/i_noisy_images/cifar100_imgidx15_classlion_noisy_sigmas0.0_0.02_0.04_0.06_0.08_0.1.pdf}
    \end{subfigure}
    \caption{Examples of noisy images for CIFAR100.}
    \label{app:noisy_cifar100}
\end{figure*}



%\begin{figure}
%  \centering
%  \includegraphics[width=0.7\linewidth]{figures/fig3_cifar10_resnet18_sigma0.1.png}
%  \caption{Convergence of the naïve estimator \pmc{} for the CIFAR10 ResNet18 model as the number of noisy %samples increases. In practice, \pmc{} is statistically inefficient.}
%  \label{fig3:pmc-convergence}
%\end{figure}