\documentclass[accepted]{uai2022} 

%\pdfpagewidth=8.5in
%\pdfpageheight=11in
%\usepackage{ijcai22}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands t
    \bibliographystyle{plainnat}


\usepackage{times}
\usepackage{soul}
\usepackage{caption}
\usepackage[utf8]{inputenc}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{listings}
%\usepackage[hidelinks]{hyperref,color}
%\usepackage{ stmaryrd }
%\urlstyle{same}

\usepackage{graphicx}
\usepackage[capitalise,noabbrev]{cleveref}
\usepackage{subfig}
\newtheorem{defn}{Definition}
\newtheorem{prop}{Proposition}
\newtheorem{thm}{Theorem}
\newtheorem{lemma}{Lemma}
\usepackage{ dsfont }
\usepackage{booktabs}
\usepackage[nomargin,inline,draft]{fixme}
\fxusetheme{color}
\fxuseenvlayout{color}
\usepackage{multirow}
\usepackage{textcomp}

% \usepackage[colorlinks=true,linkcolor=blue]{hyperref}        

\usepackage{xcolor}
\usepackage{hyperref}

% Link colours for hyperref.
% Every key=value pair must be separated by a comma: a missing comma makes the
% key-value parser absorb the following pair into the previous value, silently
% dropping that option. Each key is set exactly once (the last setting wins).
\hypersetup{
  colorlinks=true,
  citecolor=violet,
  linkcolor=red,
  urlcolor=blue,
}
  
%% inline notes
% Custom colours used to tag each co-author's inline review notes.
% NOTE(review): `ao' is defined as pure blue (0, 0, 1) -- confirm this is the intended shade.
\definecolor{ao}{rgb}{0.0, 0.0, 1.0}
\definecolor{amethyst}{rgb}{0.6, 0.4, 0.8}
\definecolor{Green}{rgb}{0.55, 0.71, 0.0}
% Register one fixme author per co-author. The first argument is the command
% prefix (e.g. `lc' provides \lcnote), the second is the environment prefix,
% and the third is the coloured tag for that author's notes.
\FXRegisterAuthor{lc}{alc}{\color{magenta}[Liron]}
\FXRegisterAuthor{ge}{age}{\color{ao}[Gil]}
\FXRegisterAuthor{gc}{agc}{\color{amethyst}[Gabriella]}
\FXRegisterAuthor{ll}{all}{\color{Green}[Liel]}

\input{Chouraqui_457-macros}


\title{A  Geometric Method for Improved  Uncertainty Estimation in Real-time}
\author[1]{Gabriella~Chouraqui\thanks{Equal Contribution}}
\author[1]{Liron~Cohen}
\author[1]{Gil~Einziger}
\author[1]{Liel~Leman$^*$}


% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Ben-Gurion University of the Negev, Israel
}
%\affil[$\dagger$]{%
%    Equally Contributed
%}

% \author{Gabriella Chouraqui, Liron Cohen, Gil Einziger, Liel Leman}
\date{October 2021}
\begin{document}
\maketitle

\begin{abstract}
    Machine learning classifiers are probabilistic in nature, and thus inevitably involve uncertainty. Predicting the probability that the classification of a specific input is correct is called uncertainty (or confidence) estimation and is crucial for risk management.
    %
    Post-hoc model calibrations can improve models' uncertainty estimations without the need for retraining, and without changing the model.
    %However, the performance of such methods is limited to the quality of the model's confidence estimations. 
Our work puts forward a geometric-based approach
for uncertainty estimation.
% 
    Roughly speaking, we use the geometric distance of the current input from the existing training inputs as a signal for estimating uncertainty and then calibrate that signal (instead of the model's estimation) using standard post-hoc calibration techniques. 
    %
    We show that our method yields better uncertainty estimations than recently proposed approaches by extensively evaluating multiple datasets and models. In addition, we also demonstrate the possibility of performing our approach in near real-time applications. 
    %We suggest an algorithm for determining the geometric separation, demonstrating improved predictions of the model's confidence across diverse datasets and models, and allowing for near real-time operation. 
 %
Our code is available on GitHub~\citep{Code}.

  %%   For determining the geometric separation, we suggest two algorithms:  a computationally-intensive process and a lightweight approximation.  
    %
%%  Interestingly, this simple geometric approximation demonstrates improved predictions of the model's confidence across diverse models and datasets, and is feasible in near real-time suggesting its potential in practical mission-critical settings. 
    
    %We demonstrate that our geometric approach  improves confidence estimations across diverse datasets and calibrated models, and that it can be approximated in near real-time. 
    %
    %In addition, we show that our approach allows for multiple estimations per second. 
    
%    Accurately determining the confidence level of a machine learning model for a specific input is crucial in assessing the risks involved in utilizing models within mission-critical contexts.
    %
%    Standard machine learning models provide an ad-hoc, built-in confidence measure for each classification, based on \lcnote{can we say smt here about the how?} \llnote{basically its happens by their structure , tree based models are doing that by the fraction of leafs, neural network based models doing that by the output neurons.}
    %
%    Our work puts forward a geometric-based approach for confidence evaluation. Roughly speaking, we use the geometric separation of the current input from the existing training inputs to predicate the model's confidence.
    %
%    For determining the geometric separation, we suggest two algorithms:  a computationally-intensive process and a lightweight approximation.  
    %
%    Interestingly, this simple geometric approximation demonstrates improved predictions of the model's confidence across diverse models and datasets, and is feasible in near real-time suggesting its potential in practical mission-critical settings. 
    
\end{abstract}



\section{Introduction}


Machine learning models such as neural networks, random forests, and gradient boosted trees are extensively used in domains ranging from computer vision to
transportation and are slowly revolutionizing computer science~\citep{Survey1,transport}. 
Dealing with uncertainty is a fundamental challenge for all machine learning-based applications. In principle, classifications are always probabilistic, implying that misclassifications are inevitable.


%~\citepp{990517,DBLP:journals/corr/YangYLL15,Survey1,FREUND1997119,Chapelle:2010:YLR:3045754.3045756,transport}. 
%Some of the most exciting opportunities for machine learning include financial, medical,  autonomous driving, and space exploration systems. 
%As the cost of errors in such critical settings may be catastrophic, there is a crucial need to ensure that they operate correctly. Unfortunately, machine learning models inherently include miss-classification errors. 

%Improving the \emph{accuracy} of the model, i.e.,~the model's average success ratio in classifying test inputs, is the holy grail. Still, it can be a tall order, and classification errors are  inevitable.
 %
 
\emph{Uncertainty calibration} is the process of adapting machine learning models' confidence estimations to be consistent with the actual success probability of the model~\citep{pmlr-v70-guo17a}.  
The model's \emph{confidence} evaluation on its classifications, i.e., the model's prediction of the success ratio on a specific input, is an essential aspect of mission-critical machine learning applications as it provides a realistic estimate of the classification's success probability and facilitates informed decisions about the \emph{current} situation. 
%
Even a very accurate model may run into an unexpected situation, which could then be communicated to the user by the confidence estimation. 
%
For example, consider an autonomous driver that uses a model to identify and classify traffic signs. The model is very accurate, and in most cases, its classifications are correct with high confidence. However, one day, it encounters a traffic sign obscured by, e.g., heavy vegetation.
In such a case, the model's classification is more likely to be incorrect. Thus, estimating confidence (i.e., uncertainty) is an essential tool for assessing unavoidable risks, enabling system designers to address the risks better, potentially avoiding unexpected and catastrophic implications. Our autonomous driver, for example, may reduce the speed and activate additional sensors until it reaches higher confidence. 
%
Thus, indeed, all popular machine learning models have mechanisms for determining confidence that can be calibrated to maximize the quality of confidence estimations~\citep{Niculescu2005Predicting,CNNCalibration,Ana2019Verified} and there is a concentrated effort to calibrate models better and facilitate more dependable applications~\citep{Survey2,Sun2007}. 

Existing calibration methods can be divided into two types: \emph{post-hoc} methods that perform a transformation mapping classifiers' raw outputs to their expected probabilities~\citep{NEURIPS2019_8ca01ea9,pmlr-v70-guo17a,gupta2021calibration}, and \emph{ad-hoc} methods that adapt the training process to produce better-calibrated models~\citep{ThulasidasanCBB19,pmlr-v97-hendrycks19a}. Post-hoc calibration methods are easier to apply as they do not change the model and do not require us to retrain a model. That said, ad-hoc methods may lead us to better model training in the first place and thus better models.
With the success of the two approaches, recent approaches suggest using ensemble methods whose estimation is a (weighted) average of multiple calibration methods~\citep{Ashukha2020Pitfalls,pmlr-v161-ma21a}. 


%Quite a
%few techniques in this category can be used to learn the mapping, such as Temperature Scaling (TS) [Guo et al., 2017]
%[Kull et al., 2019], Ensemble Temperature Scaling (ETS)
%[Zhang et al., 2020], and cubic spline [Gupta et al., 2021],
%etc. However, the expressivity of the learnable mapping
%could still be limited in all of them. This is evidenced by the
%fact that in TS a single temperature parameter T is tuned,
%while ETS brings in three additional ensemble parameters.
%Thus, it is desirable to explore a more sophisticated form of
%the mapping function. (2) Another line of methods adapt the


%Indeed, all popular machine learning models have mechanisms for determining confidence that can be calibrated to maximize the quality of confidence estimations~\citep{Niculescu2005Predicting,CNNCalibration,Ana2019Verified}. This paper puts forward a geometric-based approach for confidence evaluation. We examine the geometric distance of the current input from the existing training inputs and use it to estimate the model's confidence. 
%

This paper presents a post-hoc uncertainty estimation method, but of a different kind. While current post-hoc methods use the model itself as their signal for calibration, we use the training dataset (without modifying the model). 
More precisely, our method is based on geometric notions calculated on the training dataset.
In fact, our geometric choice of signal is orthogonal to the calibration method applied, in the sense that we can employ it with various post-hoc calibration methods.
%In the current paper, we propose a geometric-based post-hoc uncertainty estimation method. 
%Our method is post-hoc as  it does not modify the model, however, it uses an additional signal which is external to the model to perform confidence estimations --- the training dataset. More precisely, our method is based on geometric notions calculated on the training dataset. 
%
%Intuitively, we determine a radius around the current input where we are geometrically closer to the model's predicted class than to any other class.  We predict a confidence when the current input is closer to training inputs with the same classification than it is to training inputs with different classifications. 


Roughly speaking, we examine the geometric distance of the current input from the existing training inputs and use it to estimate the model's confidence. 
 Intuitively, the confidence is high when the current input is close to training set inputs in the same classification and is far from training set inputs with other classifications. Dually, the confidence would be low when there are very close training set inputs with different classifications.
 %
 To maximize this geometric signal, the inputs should be normalized. That is, the size, format, etc.\ of the images should be consistent. Thus, in this paper, we employ such well-behaved datasets.



%\textbf{Contribution: }
Our work demonstrates that geometry can facilitate better uncertainty estimations for diverse models and datasets. 
We first provide an algorithm for calculating the maximal geometric \emph{separation} of an input.
%However, this calculation is computationally heavy rendering it infeasible for near real-time estimations.
However, calculating the geometric separation for an input requires evaluating the whole space of training inputs, making it a computationally expensive method that is not always feasible.  For example, an autonomous driver needs to reach decisions within a short time frame to be effective.  
Therefore, we also suggest a lightweight approximation called \emph{fast-separation}, and show that it provides an approximation of geometric separation.  

Our next challenge is to move from a separation value to a confidence estimation. For this, we apply numerical analysis tools. Thus, to obtain a confidence estimation in real-time we only need to apply a regression function to the calculated separation value. 
Interestingly, our extensive simulation across different models and datasets shows that our geometric-based method yields better confidence estimations when compared to popular libraries used in the industry~\citep{scikit-learn}
%\gcnote{we are using only sklearn}\lcnote{TO DO: add cite}
, as well as recently proposed calibration methods~\citep{Ana2019Verified,gupta2021distribution,pmlr-v70-guo17a,pmlr-v119-zhang20k}. 
%
 Furthermore, our evaluation shows that using our method with the fast-separation approximation allows for multiple confidence estimations per second, making it real-time applicable.
%Interestingly, our approach is made feasible by accelerations in finding nearest neighbors. %enable our calculations to be performed in near real-time.
%our extensive evaluation shows that our geometric fast-separation approximation demonstrates improved predictions of the model's confidence across diverse datasets and popular calibrated models.\lcnote{need to say "almost"}
%Furthermore, our process allows for multiple confidence estimations per second, making it real-time applicable.
%\lcnote{can we say that: our technique leverages advancements in finding nearest neighbors that enable our calculations to be performed in near real-time. Perhaps in discussion I think it is irrelevant in intro. (?)}
%\section{OLD Introduction}
%1. ML is good.
%2. In critical systems, we need to ensure that the ML methods work or at least realistically estimate the success probability. 
%3. In general, all models have an accuracy measure that is calculated on the test set. Such a measure gives an estimation of the model's overall accuracy for a "typical" input. However, many works demonstrate that some inputs are harder than others to correctly classify~\citep{works-on-adverserial-machine-learning}. Therefore, we require a method that estimates the "hardness" of specific inputs. 
%
%4. Indeed, by using formal methods, we can now prove that some inputs are robust in the sense that no adversarial perturbation can change the classification. However, these approaches are lacking as formal verification by its very nature relies on computationally intensive offline techniques.  In our context, using offline techniques implies that we can create better models by maximizing the percentage of robust training set inputs. However, once we encounter new data we still need a method of estimating the classification success for that particular data.
%
%5. Luckily, many models suggest some confidence measures that correlate to the classification's accuracy. However, our empirical valuation of such techniques shows that they can be used solely as rough estimates and do not provide the success probability, which is why many practitioners focus on other methods to apply ML in critical systems.  
%\subsection{contribution}
%6. Our work suggests a geometry-based measure that examines how well different inputs in each class are separated from other classes. We call such a measure \emph{separation} and show that it can be used to determine the model's accuracy for such an input and that it can be computed online within acceptable computation time.  We then apply numerical analysis tools to translate between separation measure and the specific model used. Our process does not use formal methods, but rather interpolation and extrapolation methods which are computationally light and would not bottleneck the system. 
%
%We perform an extensive evaluation containing three datasets, and four popular ML models and demonstrate superior ability to predict the success probability. Specifically...  Mention specifics about our results (preferably with numbers). We are X percent more accurate or our error is smaller. 
%Next, we evaluate the computational intensity of our work and demonstrates that its run-time is reasonable, and may be considered real-time in many scenarios. Further, we show methods to approximate the separation by using sampling and expose a run-time/accuracy trade-off to the user. Thus, one can also benefit our work when the allowed computation time is very limited. 
%
%TODO: add a paragraph about the significance of our work within the broader ML context, and perhaps even move to discussion rather than here. 

\section{Related Works}
%\lcnote{add all relevant papers on post-hoc calibration from the UAI paper from last year}

The dependability of machine learning models is a key challenge in the research community~\citep{space_bugs}. Various works demonstrate vulnerabilities in popular machine learning models~\citep{Biggio,Biggio2014}%biggio1
, or show explicit methods to generate adversarial inputs to such models~\citep{Zhou}. Unfortunately, such vulnerabilities are fundamental to the field and cannot be avoided. 

As mentioned above, uncertainty calibration is about estimating the model's success probability of classifying a given example. Post-hoc calibration methods apply some transformation to the model's confidence (without changing the model); such transformations include Temperature Scaling (TS)~\citep{pmlr-v70-guo17a,NEURIPS2019_8ca01ea9}, Ensemble Temperature Scaling (ETS)~\citep{pmlr-v119-zhang20k}, and cubic spline~\citep{gupta2021distribution}.
In brief, these methods are limited by the best learnable mapping between the model's confidence estimations and the actual confidence. That is, post-hoc calibration methods are limited to mapping each confidence value to another calibrated value. In comparison, our method uses geometric distance as a signal for calibration, and its improvement over existing post-hoc calibration methods arises because, in the models and datasets included in our evaluation, geometric distances differentiate between correct and incorrect classifications better than the model's predicted probabilities do.
Another work that uses a geometric distance in this context is that of~\citet{Dalitz09Reject}. There, the confidence score is computed directly from the geometric distance, while we first fit a function on a subset of the data in order to learn the specific behavior of the dataset and model. Moreover, the work of~\citet{Dalitz09Reject} only applies to the k-nearest neighbor model, while our method is applicable to all models.


The recently proposed work of~\citet{Ana2019Verified} uses a fitting function on the confidence values and then divides the inputs into bins of equal size and outputs the function's average in each bin. The work of~\citet{gupta2021distribution} uses a similar idea but divides the inputs into uniform-mass (rather than equal-size) bins.
It is interesting to note that while most post-hoc calibration methods are model agnostic, recent methods have begun to look at the non-probabilistic outputs of a neural network, called logits (i.e., the outputs before applying softmax)~\citep{CNNCalibration,Ding2020,wenger2019}. Thus, some of the new post-hoc calibration methods are applicable only to neural networks.


Ensemble methods are similar to post-hoc calibration methods as they do not change the model, but they consider multiple signals to determine the model's confidence~\citep{Ashukha2020Pitfalls,pmlr-v161-ma21a}. In principle, ensemble methods complement our approach. For example, one can include our estimator in an ensemble, e.g., by averaging its prediction with other methods. 
Ad-hoc calibration is about training models in new manners aimed to yield better uncertainty estimations. Important techniques in this category include mixup training~\citep{ThulasidasanCBB19}, pre-training~\citep{pmlr-v97-hendrycks19a},  
label-smoothing~\citep{NEURIPS2019_f1748d6b}, data augmentation~\citep{Ashukha2020Pitfalls},  self-supervised learning~\citep{NEURIPS2019_a2b15837}, Bayesian approximation (MC-dropout)~\citep{pmlr-v48-gal16,NIPS2017_84ddfb34},  Deep Ensemble (DE)~\citep{DeepEnsembles}, Snapshot
Ensemble~\citep{SnapshotEnsemble}, Fast Geometric Ensembling (FGE)~\citep{FastEnsembling}, and SWA-Gaussian (SWAG)~\citep{SWAG}. 
%Based on a Bayesian approach~ \citep{pmlr-v48-gal16} suggest the MC-dropout method that approximate the posterior distribution using dropout on the training process.
%showed that the posterior distribution for a specific input can be estimated by performing several forward passes while dropout is setting to on . The randomness resulted by dropout can help us to approximate the posterior distribution with minimum statistical difficulties. 
%Another interesting method is Deep-ensemble~\cite{DeepEnsembles} that uses ensembles of neural networks (that captures “model uncertainty” by averaging predictions over multiple models) to estimate the uncertainty. 


Ad-hoc calibration is perhaps the best approach in principle, as it tackles the core of the model's calibration directly. However, because it offers specific training methods, it is of less use for large and already-trained models, and the impact of each work is limited to a specific model type (e.g., DNNs in~\citep{FastEnsembling}). In comparison, post-hoc and ensemble methods (and our own method) often work for numerous models.



%CALIBRATION
%The topic of improving confidence estimations has received much attention in recent years.
%The confidence in the classification is a  statistical measure estimating the likelihood of a specific classification to be correct.  %as a single classification is always either correct or incorrect Confidence estimates likelihood of each such outcome. %Thus, the confidence in the classification is a  statistical measure. 
%Model calibration is about detecting biases in models' built-in confidence estimations and correcting them according to an experiment on a validation set~\citep{Niculescu2005Predicting,CNNCalibration}. 
%For example,~\citep{ijcai2021-knowledge} suggests a method for improving confidence in edge prediction on knowledge graphs based on graph theory properties. 
%Our work follows a similar direction and seeks to improve confidence estimations using geometric separation. 
%We specifically target datasets where such separation exists by design. For example, traffic signs use multiple visual signals to be easily differentiated. 




%Other calibration methods are based on binning, the inputs are divided into bins according to some algorithms and then ..recent works used 

%The intuitive idea in our work is to leverage the geometric properties of the currently encountered image to the images used in the training set. Our work shows that such properties provide better confidence estimation than the current methods used by models to estimate their confidence. In addition, our technique leverages advancements in finding nearest neighbors that enable our calculations to be performed in near real-time. Geometric properties provide good results in identifying traffic signs and handwriting, but they are not universally helpful. For example, a shirt may be similar in shape to a dress, but traffic signs are made to be very different from one another to allow drivers to identify them effortlessly. 

%Our algorithms work for diverse machine learning models and allow near-real-time operation. However, since we use geometric properties to predict the confidence, our method is effective mainly in datasets that are \emph{geometric} that is, datasets when the classes are geometrically different from each other. Examples of such datasets include traffic signs recognition. The designers of such signs use multiple graphical hints to allow people to easily distinguish between different signs (e.g., the stop sign has a distinct shape).  However, not all datasets are geometric. For example, a shirt may look similar to a dress.  Yet, while our method is not a complete solution, many important datasets are geometric. 


%ROBUSTNESS
Our geometric method is largely inspired by the approach of robustness proving in machine learning models. In this field, formal methods are used to prove that specific inputs are robust to small adversarial perturbations. That is, one formally proves that all images in a certain geometric radius around a specific train-set image receive the same classification~\citep{mooly,KatzBDJK17,marta,Gehr2018AISA,Ehlers17,DBLP:conf/aaai/EinzigerGSS19}. These works are not applicable to uncertainty calibration as they can only produce proofs in an offline manner, and thus apply only to training set inputs rather than to the current input. However, the underlying intuition that geometrically similar inputs should be classified the same also appears in our approach.
%
Indeed, our work shows that geometric properties of the inputs can help us quantify the uncertainty in certain inputs, and that, in general, inputs that are less geometrically separated and are ``on the edge'' between multiple classifications are more error-prone than points that are highly separated from other classes. Thus, our work reinforces the intuition behind applying formal methods to prove robustness and supports the intuition that more robust training models would be more dependable.

%Our work bears some resemblance to works on the robustness of machine learning models, as both approaches seek geometric properties. 
%However, we only quantify the probability of correct classification and do not guarantee it. 
%On the other hand, not all images are robust, and robustness gives no information when the image is not robust. Thus, it is not helpful for confidence estimation. 
%Furthermore, proving robustness is a process that requires computationally-intensive formal methods, unlike our confidence evaluation method, which can be performed in near real-time. 
%Thus, robustness does not help in general to reason about a specific input in near real-time (e.g., to check if the autonomous driver is confident about a traffic sign as it drives). Our confidence evaluation method, in contrast, can be performed in near real-time. 



\section{Geometric Confidence Evaluation}
This section lays the foundations for a geometric estimation of the model's confidence level on a given instance. Our work assumes that the inputs are normalized. That is, they are fixed-sized images and within the same format. Under such conditions, we can measure the geometric distance between various inputs.  
%\lcnote{this should be mentioned in the intro maybe}


Formally, a model receives a data input, $x$, and outputs the pair $\langle\cls(x),\con(x)\rangle$, where $\cls(x)$ is the model's classification of $x$ and $\con(x)$ reflects the probability that the classification is correct.  
Our current work evaluates $\con(x)$ from a geometric point of view. We estimate the environment around $x$ where points are closer to inputs of certain classifications over the others. 
In~\cref{sec:sep} we define a geometric separation measure, and provide an algorithm to calculate it. We explain that such a computation is too cumbersome for real-time systems, and so we suggest a lightweight approximation in~\cref{sec:stab}. Finally,~\cref{sec:conf} explains how we use the geometric signal to derive $\con(x)$, that is, how we map a real number corresponding to the geometric separation to a number in $[0,1]$ corresponding to the confidence ratio.

%We provide two geometric measures that allow for such an evaluation and describe how to derive confidence using these measures.\lcnote{we don't really know this last part for sep}


\subsection{Separation Measure}
\label{sec:sep}
We look at the displacement of $x$ compared to nearby data inputs within the training set. Intuitively, when $x$ is close to other inputs in $\cls(x)$ (i.e., inputs with the same classification as $x$) and is far from inputs with other classifications, then the model is correct with a high probability, implying that $\con(x)$ should be high. On the other hand, when there are training inputs with a different classification close to $x$,  we estimate that  $\cls(x)$ is more likely to be incorrect. 

Below we provide definitions that allow us to formalize this intuitive account. In what follows, we consider a model $\model$ to consist of a machine learning model (e.g., a gradient boosted tree or a neural network), along with a labeled train set, $\tr$, used to generate the model. 
%
We use an implicit notion of distance, and denote by $\dis(x,y)$ the distance between inputs $x$ and $y$, and by $\Dis(x,A)$ the distance between the input $x$ and the set $A$ (i.e., the minimal distance between $x$ and the inputs in $A$).

\begin{defn}[Safe and Dangerous inputs]
\label{def:Tr_C(x)}
Let $\model$ be a model.  
For an input $x$ in the sample space we define:
%, $\mathcal{X}$ be a sample space and $\tr \subset \mathcal{X}$ a train set.
%, and $\mathcal{Y}$ a set of possible classifications.
%For an input $x \in \mathcal{X}$ we define:
\[\friend(x) :=\{x'\in \tr : \cls(x')=\cls(x)\}.\]
We denote by $\nfriend(x)$ the set $\tr \setminus \friend(x)$.\\
An input $x$ in the sample space is labeled as \emph{safe} if it is closer to $\friend(x)$ than to $\nfriend(x)$, and it is labeled as \emph{dangerous} otherwise.
\end{defn}


\begin{defn}[Zones]
\label{def:zone}
Let $x$ be a safe (dangerous) point. 
A \emph{zone} for $x$, denoted $z_x$, is such that for any input $y$, if $\dis(x,y)<z_x$, then $\Dis(y,\friend(x))<\Dis(y,\nfriend(x))$ ($\Dis(y,\friend(x))\geq\Dis(y,\nfriend(x))$). 
For each $x$ we denote the maximal such zone by $\zone(x)$.
%The zone of a safe (dangerous) input $x$, $\zone(x)$, is the maximal distance 
%from $x$ such that for any input $y$, if $d(x,y)<\zone(x)$, then $\Dis(y,\friend(x))<\Dis(y,\nfriend(x))$ ($\Dis(y,\friend(x))\geq\Dis(y,\nfriend(x))$). 
\end{defn}

In other words, a zone of a safe (dangerous) input $x$, $\zone(x)$, is a  radius around $x$ such that all inputs in this ball are  closer to an input in $\friend(x)$ ($\nfriend(x)$) than to any input in $\nfriend(x)$ ($\friend(x)$), respectively. 

\begin{defn}[Separation]
\label{def:sep}
The separation of a data input $x$ with respect to the model $\model$ is  $\zone(x)$ when  $x$ is a safe input, and $-1\cdot \zone(x)$
when $x$ is a dangerous input. 
\end{defn}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.45\textwidth]{figures/danger&good_zone.pdf}
    \caption{Geometric representation of safe and dangerous inputs, maximal zones, and separation values.
    The various classifications are illustrated via different shapes.}
    \label{fig:Sep-circles}
\end{figure}

That is, the separation of $x$ encapsulates the maximal zone for $x$ (provided by the absolute value) together with an indication of whether the point is safe or dangerous (provided by the sign).
%
The separation of $x$ depends only on the classification of $x$ by the model and the train set. This is because our definition partitions the inputs in $\tr$ into two sets: one with $\cls(x)$, $\friend(x)$, and one with all other classifications, $\nfriend(x)$. These sets vary between models only when they disagree on the classification of $x$.
Note that $x$'s for which the distance from $\friend(x)$ equals the distance from $\nfriend(x)$ are considered dangerous inputs, and their separation measure will be zero.

As mentioned,~\Cref{def:zone} and~\Cref{def:sep} use an implicit notion of distance. Such a notion can accept any distance metric (e.g., $L_1, L_2$ or $L_{\infty}$). However, in this work, we fix the metric to $L_2$ as it is a standard measure for safety features in adversarial machine learning~\citep{Robustness-Moosavi}, in addition to it being easy to illustrate and intuitive to understand.
Furthermore, our methodology relies on calculating the nearest neighbors of a given input, and for $L_2$, this can be done using standard and well-optimized libraries. Accordingly, all our definitions and calculations assume the $L_2$ metric (Euclidean distance). 

%Definition~\ref{def:sep} formally defines the separation of an input input which is our main signal for estimating confidence. 
  \Cref{fig:Sep-circles} provides a geometric illustration of safe and danger zones, and separation values. For illustration purposes, the figure uses the $L_2$ norm with two dimensions, whereas our data usually includes many more dimensions. For example, a $30\times30$ traffic sign image will have 900 dimensions.
% 
In the figure, $x$ is a safe input, and the green highlighted ball represents its maximal zone which reflects how far we can get from $x$ and still be closer to training set inputs classified the same as $x$ than any other inputs. 
%
The input $y$ is a dangerous input, and the red highlighted ball represents its maximal zone which dually represents how far we need to distance ourselves from $y$ so that inputs classified as $y$ become closer than other inputs.  
Thus, $x$ will have a positive separation value, while $y$ will have a negative one.

Next, we provide a formula for calculating the separation of a given input $x$ within the $L_2$ distance metric. 


\begin{defn}
\label{def:practicalsep}
Given a model $\model$ and an input $x$, define:
\[\sep(x)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x,x') }{2\dis(x',x'')}\]
\end{defn}

\begin{lemma}\label{lem:sep}
Let $x, x', x'' \in \mathbb{R}^n$ be inputs such that $\dis(x,x')<\dis(x,x'')$. 
The maximal distance $M(x,x',x'')$ such that every $y\in \mathbb{R}^n$ with $\dis(x,y)<M(x,x',x'')$ satisfies 
%$y$ is closer to $x'$ than to $x''$ $(
$\dis(y,x')< \dis(y,x'')$ is \[\frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')}.\]
\end{lemma}
\begin{proof}
Since any three points in space define a plane we focus on the plane defined by these three points. 
\begin{figure}[h!]
    \centering
    \includegraphics[width=0.35\textwidth]{figures/gabi-proof.pdf}
    \caption{Illustration of the proof of~\Cref{lem:sep}}
    \label{fig:Sep-proof}
\end{figure}

\Cref{fig:Sep-proof} demonstrates a geometric positioning of the points, and the main constructions in the proof.
%
The perpendicular bisector to the line between $x'$ and $x''$ divides the plane into two parts: one in which all the points are closer to $x''$ than to $x'$ (the lower part in the figure) and one in which all the points are closer to $x'$ than to $x''$ (the upper part in the figure).
Our goal is thus to establish the distance between $x$ and the lower part of the plane. Hence, $M(x,x',x'')$ amounts to the distance from $x$ to the perpendicular bisector to the line between $x'$ and $x''$.
Using trigonometric calculations, it is straightforward to verify that indeed 
\[M(x,x',x'')=\frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')}.\qedhere\]
%\[M+\frac{\dis(x',x'')}{2}=\dis(x,x'')sin(\alpha)=
%=\dis(x,x'')cos(90-\alpha)
%\]\[\frac{\dis^2(x,x'')-\dis^2(x,x')-\dis^2(x',x'')}{2\dis(x',x'')}\]
\end{proof}

\begin{prop} \label{prop:sep}
$\sep(x)$ is the separation of $x$ with respect to the model $\model$ (in~\Cref{def:sep}).  
\end{prop}
\begin{proof}
Let $x$ be a safe input, and $y$ be an input such that \[\dis(x,y)< \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x,x') }{2\dis(x',x'')}.\]
We first show that $y$ is closer to $\friend(x)$ than to $\nfriend(x)$.
%
Let $z''\in \nfriend(x)$;
%such that $\Dis(y,\nfriend(x))=\dis(y,z'')$. \gcnote{i'm not sure thats what we want.. we need to assure that it append to every z'', since with diffrent angels we may have diffrent sep, i also need the z' to be minimal}
it suffices to show that there exists $z' \in \friend(x)$ such that $\dis(y,z')<\dis(y,z'')$. % we will choose $z'$ such that  $\dis(x,z')=\Dis(x,\friend(x))$
%such that  $\dis(x,z')<\dis(x,z'')$.
%
Notice that
\[\dis(x,y)<\max_{x' \in \friend(x)} \frac{\dis^2(x,z'') - \dis^2(x,x') }{2\dis(x',z'')}. \]
%since  with any $z''\in \nfriend$ the value can only be bigger than the minimun. 
Therefore, there exists a $z'\in \friend(x)$ for which  
\[\dis(x,y)< \frac{\dis^2(x,z'') - \dis^2(x,z') }{2\dis(z',z'')}.\]
Thus, since $x$ is a safe point,  using~\Cref{lem:sep}, we conclude that $\dis(y,z')< \dis(y,z'')$.
%since  with any $z''\in \nfriend$ the value can only be bigger than the minimun, and due to our choice of $z'$ as the closest point. 
%\gcnote{do we need to explain more this transition?}\lcnote{yes. in a sentence.}
%
The proof follows similar arguments for dangerous points,  taking the distances as $-\sep$ and flipping the inequalities.

To show maximality, observe that the intersection point marked by $w$ in~\Cref{fig:Sep-proof}, which is at distance $\sep(x)$ from $x$,  can be easily shown to be of equal distances from $\friend(x)$ and $\nfriend(x)$.
%\lcnote{do we need this explanation? what additional info it provides?}
%Therefore for $y$ such that \[\dis(x,y)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x',x) }{2\dis(x',x'')}\] and according to the lemma above 
%$\dis(y,\hat{x'})=\dis(y,\hat{x''})$ for $\hat{x'},\hat{x''}$ that holds the minimiun/maximum.
%\lcnote{put $w$ in the figure}
%\lcnote{Is it really easy to show? plz check}\gcnote{yes}
%we prove that there exists an input $y$ such that 
%\[\dis(x,y)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x',x) }{2\dis(x',x'')}\], but $y$ is not closer to an input in $\friend(x)$ than to any input in $\nfriend(x)$.
%\lcnote{TBD}
%
%We want to prove that any input $y$ such that \[\dis(x,y)< \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x',x) }{2\dis(x',x'')}\] is closer to an input in $\friend(x)$ than to any input in $\nfriend(x)$.
%
%We will prove by negative- for an input x assume that there exist an input $\hat{x}\in \mathcal{X} $ in $\sep$- radius of $x$ and an input $x'' \in \nfriend$ such that $ \forall x' \in \friend, d(\hat{x},x'')<d(\hat{x},x'')$
\end{proof}


%\begin{defn}[Separation]
%Given a model $\model$, a training set $\tr$, and a datan input $x$, the separation of $x$ w.r.t. the model $\model$, denoted $\sep(x)$, is defined as follows:
%\begin{itemize}
%    \item if the closest input to $x$ in $\tr$ is of classification $\cls(x)$, $\sep(x)$ is the maximal radius over $x$ such that any input from $\tr$ within the ball is closer to an input in classification $\cls(x)$ than to any other classification.
%    In this case, we refer to $\sep(x)$ as the \emph{safe zone} of x.
%    \item if the closest input to $x$ in $\tr$ is of classification different than $\cls(x)$, then $\sep(x)$ is the maximal radius over $x$ such that any input from $\tr$ within the ball is closer to an input in classification other than $\cls(x)$ than to classification $\cls(x)$.
%    In this case, we refer to $\sep(x)$ as the \emph{danger zone} of x.
%\end{itemize}
%\end{defn}

While separation provides the maximal zone, it is expensive to calculate. As can be seen  in~\Cref{def:practicalsep}, to estimate the separation of one specific input, we go over many triplets of inputs. The exact amount is unbounded and depends on the dataset.  Thus, separation is infeasible to compute in near real-time. Therefore, when time or computation resources are limited, we require a different and computationally simpler notion. 
Accordingly, the following section provides an efficient approximation of the separation measure.
%that provides most of the benefits at a lower complexity. 

\subsection{Fast-Separation Approximation}
\label{sec:stab}
We approximate the separation of a given input using only its distance from $\friend(x)$ and its distance from $\nfriend(x)$. 
This simplification allows us to calculate a zone for any given point, which is not necessarily the maximal one. 
The reliance on these two distances enables a faster calculation since we do not perform an exhaustive search over many triplets of inputs.
In particular, we do not consider the geometric positioning of the inputs that determine the distance from these sets. 
%Only require the distances from $\friend(x)$ and $\nfriend(x)$, 
%


 
\begin{defn}[Fast-Separation] 
\label{def:stab}
%\lcnote{add model}
%Given a sample space $\mathcal{X}$, classes set $\mathcal{Y}$, and a classification function $f: \mathcal{X} \rightarrow \mathcal{Y} $ trained on a training set $\mathcal{T} \subset \mathcal{X}$.  
%Let $d$ be some distance metric on $\mathcal{X} \times \mathcal{X}$.
%The separation of a test instance $x \in \mathcal{X}$, denoted by $\sep(x)$, is half the difference between the distance from x to the 2 nearest neighbours with diffrent classes.

Given a model $\model$, the fast-separation of an input $x$, denoted $\stab(x)$, is defined as:
%the difference between the distance of the nearest training instance x' and the second nearest training instance x'' that has a different classification.
\[\stab(x)=\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2}\]
\end{defn}

Notice that just as is the case for separation, if $x$ is a safe input, its fast-separation value will be strictly positive and non-positive otherwise. 


\Cref{fig:Stab_theory} illustrates the notion of fast-separation. In particular, it exemplifies why it only provides an approximation of the more accurate separation measure. It encapsulates a zone that is less than or equal to that of separation. 
Sub-figure  (a) demonstrates a case in which $\stab(x)=\sep(x)$, while sub-figure (b) presents a case where $\sep(x)$ is considerably larger than $\stab(x)$.
    
%This notion is illustrated in Figure~\ref{fig:Stab_theory}. 
%Geometrically, the separation of x is the radius of the environment around x where all the examples are closer to the classification of x' than the classification of x''. 
%For example if there are 2 examples with same the distance from x, the separation is 0, and as the difference between the two distances increases the separation increases. A positive separation means that the area around x include only training example with the same label. It is important to notice that x might be classified with another class and that the certification of separation is \textbf{only} on training set examples and is not a property of the model.\lcnote{change}



\begin{figure}[!t]
    \centering
    \subfloat[$\stab(x) = \sep(x) = 0.5$ ]{{\includegraphics[width=0.75\linewidth]{figures/same_sep_stab.pdf} }}
    \\
    \centering 
\subfloat[$0.5=\stab(x)   \neq \sep(x) = 3.5$ ]{{\includegraphics[width=0.8\linewidth]{figures/diff_stab_sep.pdf} }}
    
    
    \caption{Geometric representation of the  induced zones of $\stab$ and $\sep$ for different input alignments.
    $\stab$ is represented by blue arrows and $\sep$ by green arrows.}
    \label{fig:Stab_theory}
\end{figure}

\begin{prop}
\label{prop:stabseprel}
$\stab$ is a lower bound of the separation $\sep$ in the sense that 
for every safe (dangerous) input $0 \leq \stab(x) \leq \sep(x)$ ($\sep(x) \leq \stab(x) \leq 0$).
%\gcnote{why not $ \mid\sep(x)\mid> \mid \stab(x) \mid$}
%\lcnote{we could say it like this but I also want to show that on dangerous points we get negative points and on safe ones we get a positive ones}
\end{prop}

\begin{proof}
%We start by showing that for any safe input $x$,  $0 \leq \stab(x) \leq \sep(x)$.
%$0 \leq  \stab(x) $ since, for safe inputs, $\Dis(x,\nfriend(x)) > \Dis(x,\friend(x))$.
%To show that $\stab(x) \leq \sep(x)$, observe the following:\lcnote{the subscripts are unreadable}

%\[\sep(x)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')} \geq\]
% \[\min_{x'' \in \nfriend(x)}  \max_{x' \in \friend(x) : \dis^2(x,x'')-\dis^2(x,x')>0} \frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')}\geq\]
% \[\min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x) : \dis^2(x,x'')-\dis^2(x,x')>0} \frac{\dis(x,x'')-\dis(x,x')}{2}\geq\]
% \[\min_{x'' \in \nfriend(x)}\frac{\dis(x,x'')-\Dis(x,\friend(x))}{2} = \]
% \[\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2}=\stab(x)\]
% The first inequality holds since  $\Dis(x,\nfriend(x)) > \Dis(x,\friend(x))$ and therefore $\{x' \in \friend(x) : \dis^2(x,x'')-\dis^2(x,x')>0\}\neq \emptyset$.
% The second inequality holds due to the triangle inequality. The third one\lcnote{shouldn't this be an equality?} holds since  $\Dis(x,\friend(x))$ yields the maximal value to the expression and last holds since  $\Dis(x,\nfriend(x))$ yields the minimal value to the expression.
%%%%%%%%%%%%%%%%%%%%%%%%

%\gcnote{new:}
%%%%%%
Let $x$ be a safe input. 
Since~\Cref{prop:sep} shows that $\sep(x)$ is the \emph{maximal} zone, it suffices to show that 
$\stab(x)$ is a zone of $x$.
Let $y$ be a point such that \[\dis(x,y)< \stab(x)= \frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2} .\]
We show that $\Dis(y,\friend(x))<\Dis(y,\nfriend(x))$.
Take $z' \in \friend(x)$ and  $z'',w \in \nfriend(x)$  such that $\dis(x,z')=\Dis(x,\friend(x))$,  $\dis(x,z'')=\Dis(x,\nfriend(x))$, 
and $\dis(y,w)=\Dis(y,\nfriend(x))$.
%
Using the triangle inequality we get:
\begin{gather*}
     \Dis(y,\friend(x))\leq \dis(y,z')\leq \dis(x,z')+ \dis(x,y)\\
    < \dis(x,z')+ \frac{\dis(x,z'')-\dis(x,z')}{2}
    =\frac{\dis(x,z'')+\dis(x,z')}{2}\\
    = \dis(x,z'') - \frac{\dis(x,z'')-\dis(x,z')}{2}
    < \dis(x,z'')-\dis(x,y)\\
    \leq \dis(x,w)-\dis(x,y) \leq \dis(y,w)
    = \Dis(y,\nfriend(x))
\end{gather*}
For dangerous points, the proof follows similar arguments, switching $\friend(x)$ and $\nfriend(x)$.
%\gcnote{we can maybe shorten the transitions that are easier}
%%%%%%%%%%%%%%%%%%%%%%%%%
%\genote{I don't understand what happened in the first transition. What is $x''$ in the second line?}
%\gcnote{x'' didn't change, the is that we are considering in the max only x' that are closer to x than x'' (for each x'' it is diffrent) we need that step in order to use triangle inequality' else the expression is not guarently positive and the inequality is not always true}
%\[\sep(x)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')} \geq\]
%\[\min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis(x,x'')-\dis(x,x')}{2}=\]
%\[\min_{x'' \in \nfriend(x)}\frac{\dis(x,x'')-\Dis(x,\friend(x))}{2} = \]
%\[\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2}=\stab(x)\]
%The first inequality is due to the triangle inequality,  the second equality holds since  $\Dis(x,\friend(x))$ yields the maximal value to the expression, and the  third equality holds since  $\Dis(x,\nfriend(x))$ yields the minimal value to the expression. 
%$0 \leq  \stab(x) $ since, for safe inputs, $\Dis(x,\nfriend(x)) > \Dis(x,\friend(x))$.
%
% Next we show that for any dangerous input $x$,  $\sep(x) \leq \stab(x) \leq 0$.\lcnote{I am not sure why this is so different from the previous one. Can we make one dual transition and then claim that we proceed as in the safe case?}
% %
% For a dangerous input $x$, there exist an input $x'' \in \nfriend(x)$
% such that for all $x' \in \friend(x)$, $\dis^2(x,x'')-\dis^2(x,x')<0$ , we will choose the $x'' \in \nfriend(x)$ that satisfy $\dis(x,x'')=\Dis(x,\nfriend(x))$
% \[\sep(x)= \min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')} \leq\]
% \[ \max_{x' \in \friend(x)} \frac{\dis^2(x,x'')-\dis^2(x,x')}{2\dis(x',x'')}\leq\]
% \[
% \max_{x' \in \friend(x)} \frac{\Dis(x,\nfriend(x))-\dis(x,x')}{2} =\]
% \[\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2}=\stab(x)\]
% The first inequality is since the minimum is smaller than any element, the second holds due to the triangle inequality, and the last is similar to the proof above.
\end{proof}

\Cref{prop:stabseprel} shows that the absolute value of $\stab(x)$ is always smaller than or equal to that of $\sep(x)$ and that they have the same sign. Thus, fast-separation is an approximation of separation in the sense that it uses smaller zones. 
%
The following proposition further provides an approximation bound for fast-separation.

\begin{prop}
\label{prop:bound}

The following holds for any point $x$:
\[|\sep(x)-\stab(x)|\leq \frac{\Dis(x,\friend(x))+ \Dis(x,\nfriend(x))}{2}.\]
\end{prop}

\begin{proof}
We prove the bound for safe points $x$; the proof for dangerous points is similar. 
Let $x$ be a safe point. By definition:
%we will prove only that side the other is symmetric.
%\[\min_{x' \in \friend(x)} \dis(x,x') \leq \min_{x'' \in \nfriend(x)} \dis(x,x'').\]
\begin{align*}
&|\sep(x)-\stab(x)|= \sep(x)-\stab(x)  \\
=&\min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x,x') }{2\dis(x',x'')}\\
&\quad -\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2} 
\end{align*}
Let $z'' \in \nfriend(x)$  be a point such that $\dis(x,z'')=\Dis(x,\nfriend(x))$, and let $z'\in \friend(x)$ be a point at which the maximum of $\frac{\dis^2(x,z'') - \dis^2(x,x') }{2\dis(x',z'')}$ over $x' \in \friend(x)$ is attained. Then, we have: 
\begin{small}
\begin{align}
&|\sep(x)-\stab(x)| \notag \\
\leq&\max_{x' \in \friend(x)} \frac{\dis^2(x,z'') - \dis^2(x,x') }{2\dis(x',z'')} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2}\label{eq1: 1}\\
=&\frac{\dis^2(x,z'') - \dis^2(x,z') }{2\dis(z',z'')} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2}\label{eq1: 2}\\
\leq& \frac{\dis(x,z'') + \dis(x,z') }{2} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2}\label{eq1: 3}\\
=&\frac{\dis(x,z') + \Dis(x,\friend(x))}{2} \\
\leq &\frac{\Dis(x,\friend(x))+ \Dis(x,\nfriend(x))}{2}
\label{eq1: 5}
\end{align}
\end{small}
The first inequality (\Cref{eq1: 1}) holds due to the definition of the minimum function.
The second inequality (\Cref{eq1: 3}) is due to the triangle inequality.
% and the fact that $x$ is a safe point.
The last inequality (\Cref{eq1: 5}) holds because $x$ is a safe point, and hence the distance between $x$ and $z'$ cannot be greater than the distance from $x$ to $\nfriend(x)$.
%(else $\sep(x)$ will be negative in contradiction to the fact that x is a safe point).
\end{proof}

Notice that the above bound is tight, in the sense that there exists an example witnessing the exact bound, as shown in~\Cref{fig:bound} below.
\begin{figure}[!h]
    \centering
    \includegraphics[width=0.35\textwidth]{figures/ex_fig.pdf}
    \caption{Example of a point $x$ with $|\sep(x)-\stab(x)|=\frac{\Dis(x,\nfriend(x)) + \Dis(x,\friend(x))}{2} $}
    \label{fig:bound}
\end{figure}



%\genote{I do not like text try to rephease.}
%Just as is the case for separation, we show that fast-separation values are insensitive to the random choice of the train set. We used a similar experiment to the one described in~\Cref{fig:std-boxplot} to obtain that  the fast-separation values change very little between various partitions to  train/test  sets.\lcnote{TODO: run the experiment! }


%Our experimental results,  presented in~\Cref{sec:results}, demonstrate that while $\stab(x)$ is slightly weaker than $\sep(x)$ in determining confidence level, it still improves on current standard methods for confidence calculations.\lcnote{is it really?}
%What is more, $\stab(x)$ is considerably faster to calculate, making it applicable in real-time.




\subsection{Predicting Confidence from Separation}
\label{sec:conf}
At this point, we showed how to calculate geometric separation and fast-separation for points (the latter can be done efficiently). Next, we take the calculated (fast-)separation value and derive a confidence estimation $\con$. We use a validation set that is disjoint from the train and test sets to calculate for each input $\sep$ or $\stab$  values. 
We perform a fitting to map $\sep$ or $\stab$ values to confidence probabilities. The fitting is done between $\sep$ (or $\stab$) values and the ratios of correct classifications (on the validation set) for each unique value. 
E.g., if for $\stab$ value of $10$ we see that 90\% of the points are classified correctly then we'll add the pair $\langle 10, 0.9 \rangle$ to the fitting function. 
%
We expect very low confidence values for highly negative (fast-)separation values, and we expect to approach 100\% confidence when the values become positive enough. 
%
The regression function we finally get accepts a (fast-)separation value in $\mathbb{R}$ and outputs a scalar in $[0,1]$ indicating the confidence estimation, i.e., the~predicted success probability for inputs with that (fast-)separation score. 

In principle, our method can accept most post-hoc calibration methods to perform the fitting. In this paper, we use isotonic regression as our fitting function. Such a function was shown to work best for the tested workloads both for our geometric signal and for the model's original signal, as done in~\citep{scikit-learn}.%\lcnote{where did we see they use isotonic reg?} 


%After experimenting with a few options, we selected isotonic regression as it provided the best results for the tested workloads and models. 

%We further check if the point is correctly classified by the model. 
%Then, for each of the separation measures, we employ a fitting function on those values. 
%Concretely we use isotonic regression as our fitting function.
%
%Next, we fit a Sigmoid function using the least-squares method. We choose a Sigmoid because it is a continuous step function, and it behaves as we anticipate the confidence to behave (as is demonstrated in~\Cref{fig:Sigmoid}). 



\section{Experimental Results}
\label{sec:results}

This section provides experimental results following the method described in the previous section. We first introduce the datasets, models, and evaluation criteria and then continue to experimental results. 



%This section explains our evaluation's methodology, including datasets, models and evaluation certeria. 
\subsection{Methodology}\label{sec:method}
\subsubsection{Datasets}
Our evaluation uses the following standard datasets.
\begin{itemize}
   \item \emph{Modified National Institute of Standards and Technology database (MNIST)}~\citep{MNIST}, which consists of  hand-written images designed for training various image processing systems. It includes 70,000 $28\times28$ grayscale images belonging to one of ten labels.%: the digits 0 to 9.
   %The MNIST database is a large collection of hand-written images designed for training various image processing systems in which each image is of size $28 \times 28$ and in black and white. It has 60,000 training input images belonging to one of ten labels, i.e.~the digits 0 to 9.
   
   \item \emph{Fashion MNIST (Fashion)}~\citep{fashion_MNIST}, which is a dataset comprising $28\times28$ grayscale images of 70,000 fashion products from 10 categories. 
   %The Fashion MNIST database is intended as a drop-in replacement for the classic MNIST dataset. It consist of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a $28\times28$ grayscale image, associated with a label from 10 classes.

   \item \emph{German Traffic Signs Recognition Benchmark (GTSRB)}~\citep{GTSRB}, which is a large image set of traffic signs devised for
    the single-image, multi-class classification problem. It
    consists of 50,000 RGB images of traffic signs, belonging to 43 classes.% In GTSRB, each image is of size 32×32 and has three channels (Red, Green and Blue).
    
    
    \item \emph{American Sign Language (SignLang)}~\citep{SignLanguage}, which is a database of hand gestures representing a multi-class problem with 24 classes of letters. It consists of 30,000 $28\times28$ grayscale images.
    
    \item \emph{Canadian Institute for Advanced Research (CIFAR10)}~\citep{CIFAR10}, which is a dataset containing $32\times32$ RGB images of 60,000 objects from 10 classes. 

\end{itemize}
%{\color{blue}{We chose various datasets in order to show that the method is applicable in many conditions.
%Before we apply the models we first normalize the sizes of the pictures.}}
For each dataset\footnote{As is standard practice, we used normalized datasets (e.g., same image size). See our github for details.}, we randomly partitioned the data into  three subsets: train set $\tr$ (60\%), validation set $\vs$ (20\%) and test set $\ts$ (20\%). %\footnote{In GTSRB, the data is in RGB format. Since our distance metric requires a scalar value, we converted the data black and white.}
%
The train set is used to calculate fast-separation and train the model. 
The validation set is used to evaluate the confidence estimation associated with each fast-separation value. These values, in turn, are used to fit the fitting function (isotonic regression, as described in~\Cref{sec:conf}). Finally, the test set is used to evaluate the confidence on new inputs that were \emph{not} present in the train and validation sets. 


%variations of train and validation sets change our confidence estimation by less than .
%Thus, Our experiment demonstrates that (fast-)separation is stable to the random partition between training and test sets and allows us to use a small number of partitions throughout our evaluation. 
%Most practitioners randomly partition their data to a train set used to train the model and a test set used to evaluate it. The definition of (fast-)separation depends on the train set distances. 
%However,  \Cref{fig:std_boxplot} shows that the (fast-)separation values change very little between various partitions to train and test sets. 
%The figure was calculated on (how many?) runs on (which?) dataset when randomly selecting $ZZ\%$ of the dataset as the train set and YY\% as the test set. 
%\genote{TODO: Add the figure! fig:std boxplot and write a meaningful caption describing the figure.}

\subsubsection{Models}
In our evaluation, we use the following popular machine learning models: Random Forest (RF)~\citep{RFTheory}, Gradient Boosting Decision Trees (GBDT)~\citep{GBTheory}, and Convolutional Neural Network (CNN)~\citep{CNN}.
%, and Nearest Neighbour (1NN)~\citep{KNN}.
% \lcnote{citations}
%\begin{itemize}
%    \item Random Forest (RF)
    %- an ensemble model consisting of many decisions trees. It uses bagging and feature randomness when building each tree to create an uncorrelated forest of trees whose prediction by committee is more accurate than that of any individual tree.
%    \item \emph{Gradient Boosting Decision Trees (GBDT)}- Like other boosting methods, gradient boosting combines weak "learners" into a single strong learner in an iterative fashion.
 %   \item Convolutional Neural Network (CNN)
 %   \item Nearest Neighbour (1NN)
%\end{itemize}
We chose these models because they are different: RF and GBDT are tree-based, while CNN is a neural network. 
%
For RF and GBDT, we configured the meta parameters (e.g., the maximal depth of trees) by cross-validation on the train set. For CNN, we used the configuration suggested by practitioners. %~\citep{CNN-Fashion-Config,CNN-GTSRB-Config,CNN-KMNIST-Config,CNN-MNIST-Config}. 
%\genote{Please add something about the calibration method used.}
%
% \gcnote{cite}
% \lcnote{what about 1NN?}\gcnote{nn dont really have params, we can say that we used k-nn with k=1 but i think nn is enough}
Our specific configurations as well as the accuracy scores of each of the models are detailed in~\cite{Code}. %https://github.com/NoSleepDeveloper/Geometric-Calibrator.
% \lcnote{TO DO: write in the Git from which link you took each config for CNN}

%\subsection{Interpolation}
%For each dataset and model we fitted a logarithm function, $\fit$, to the set of inputs(\sep, 
%according to the observed success probability in the validation set of each separation score. 

%In case there were few inputs that received the same separation score, we grouped inputs with similar separation score (according to the train set) together so that there are at least YY inputs to evaluate the success probability. 
%We selected the logarithm with the minimal root mean square error on the validation set.
%\gcnote{not sure if its "select"... the algorithm choose the logarithm that minimize RMSE}
%Finally, the interpolation value yields a prediction of the confidence $\con$. That is, our polynomial accepts a separation value in $\mathbb{R}$, and outputs a scalar in $[0,1]$ indicating the predicted success probability for inputs with that separation score. 

 
 
\subsubsection{Evaluation Criteria}
To evaluate our method, we compare our (fast-)separation-based confidence estimation to: (a) the built-in confidence in the Sklearn library, (b) the scaling-binning calibrator~\citep{Ana2019Verified} which we call $SBC$, (c) the histogram binning calibrator~\citep{gupta2021distribution} which we call $HB$, (d) the temperature scaling calibrator~\citep{pmlr-v70-guo17a} which we call $TS$, and  
(e) the ensemble temperature scaling calibrator~\citep{pmlr-v119-zhang20k} which we call $ETS$. 
$TS$ and $ETS$ are calibration methods for neural networks thus we only apply those to CNNs. 
%
Each method received the same baseline model as an input yielding a slightly different calibrated model. 
%
%\lcnote{TO DO : change the table to the calibrated?} \llnote{I removed the line that talks about that we run our method on calibrated model (now its on uncalibrated as SBC and sklearn get the models and calibrate them by their own methods.}
Note that our method is evaluated against the uncalibrated model as our method does not affect the model. Moreover, it allows us to compare our method against different calibration methods, as shown in~\Cref{tab:mainresult}.

%\lcnote{We used prebuild function that takes our previously build model and calibrates it. in details : we fit the model on a training dataset and calibrate with 'prefit' parameter using a hold out validation dataset.}
%For any confidence measure $\CON$, our evaluation goes over every input in the test set, and determines if its classification is correct or incorrect. If the classification is correct the error is $\abs| 1-\CON |$,  otherwise, the error is $\CON$.  
%To measure accuracy we use the standard summation of the root mean square error (RMSE) on all inputs in the test set. 
%\gcnote{i don't think we need this explanation.. we just compute mean square error}
To evaluate the confidence predictions, we use the \emph{Expected Calibration Error (ECE)}, which is a standard method to evaluate confidence calibration of a model~\citep{Xing2020distance,Krishnan2020Improving}.
Concretely, the predictions sample of size $n$ is partitioned into $M$ equally spaced bins  $(B_{m})_{m\leq M}$, and ECE measures the difference between the sample accuracy in the $m^{th}$ bin and the average
confidence in it~\citep{naeini2015obtaining}. 
Formally, ECE is calculated by the following formula: 
%ECE is a standard method to evaluate confidence calibration of a model~\citep{Xing2020distance,Krishnan2020Improving}. %CNNCalibration,Seo2018Confidence
%Roughly speaking, ECE is a weighted average over the difference between the accuracy and the confidence.
%Concretely, ECE approximates the accuracy of the samples  by partitioning the predictions sample of size $n$ into $M$ equally spaced bins  $(B_{m})_{m\leq M}$, and is defined as follows:
%\lcnote{do we need $\left|B_{m}\right|$?}\gcnote{I think yes since it come from the expectation definition, we can use this definition:$\underset{\hat{P}}{\mathbb{E}}[|\mathbb{P}(\hat{Y}=Y \mid \hat{P}=\con)-\con|]$ }
\[\mathrm{ECE}=\sum_{m=1}^{M} \frac{\left|B_{m}\right|}{n} \left| \operatorname{acc}\left(B_{m}\right)-\operatorname{conf} \left(B_{m}\right)\right|\] 
where:
\begin{itemize}
    \item $\operatorname{acc}\left(B_{m}\right)=\frac{1}{\left| B_{m}\right|} \cdot \left| \{x\in B_m : \cls(x)~\text{is correct} \}\right| $, and 
    \item $\operatorname{conf}\left(B_{m}\right)=\frac{1}{\left|B_{m}\right|} \sum_{x \in B_{m}} {\con(x)}$.
\end{itemize}

%
%\genote{TO DO: define it properly please.}
%The formal definition-
%\[\underset{\hat{P}}{\mathbb{E}}[|\mathbb{P}(\hat{Y}=Y \mid \hat{P}=\con)-\con|]\]
%
%We classify each input in the test set, and calculate our confidence notion $\con$  using the interpolation either with $\sep$ or with $\stab$.\lcnote{introduce a macro for both sep and stab} 
%We compare our confidence estimation with the confidence estimation of each separate model. That is, various models use different standard techniques to estimate confidence. 
%On each input of the test set we calculate $\con(\sep(x))$ and compare it to the true accuracy on that input $\mathds{1}[\model (x) =y_{true}(x)]$ using RMSE measurement.

 \begin{figure}[t]
     \centering
     \includegraphics[width=0.45\textwidth]{figures/fitting_func.pdf}
     %\lcnote{TO DO: A drawing showing that Sigmoid is the right fitting function on a couple of examples + an explanation of the figure}
     \caption{An illustration of the inputs to the fitting function (blue and red dots), and the functions fitted by Sigmoid (black line) and isotonic regression (green line). The inputs are for the MNIST dataset, and the Random Forest model.}
%     possible fitting functions on MNIST Random Forest.
%     The points indicate the accuracy of our non- calibrated model as function of their confidence, on these points we fitted 3 kinds of functions: logistic regression(black line), isotonic regression (green line) or using HB calibrator(blue line).  }
     \label{fig:Sigmoid}
 \end{figure}

\renewcommand{\arraystretch}{1.2}
\begin{table*}[htb!]
\centering
\caption{ECE(\%) measures with $95\%$ confidence intervals when varying the calibration method, model and dataset.
The values in parentheses show the percentage of relative improvement of ${\stab}$ over the other calibration methods.}
\resizebox{\textwidth}{!}{
\large
\begin{tabular}{cccccccccccc}
\hline
\multicolumn{1}{c}{\textbf{Dataset}}   & \multicolumn{1}{c}{\textbf{Model}} & 
\multicolumn{1}{c}{$\mathbf{\stab}$}  & 
\multicolumn{1}{c}{$\mathbf{\sep}$}   & 
\multicolumn{1}{c}{\textbf{Sklearn-Iso}}  & 
\multicolumn{1}{c}{\textbf{Sklearn-Platt}}  & 
\multicolumn{1}{c}{\textbf{SBC}}  & 
\multicolumn{1}{c}{\textbf{HB}}  &
\multicolumn{1}{c}{\textbf{TS}}  & 
\multicolumn{1}{c}{\textbf{ETS}} \\ \hline
\multirow{3}{*}{\textbf{MNIST}}       & CNN    & .19{\smaller\textpm.04}    & .19{\smaller\textpm.04}   & \hfil-                             & \hfil-                  			  & 8.71{\smaller\textpm.83(97.8\%)}     & .49{\smaller\textpm.09(61.2\%)}   & \multicolumn{1}{l}{.29{\smaller\textpm.06(34.4\%)}}   & .27{\smaller\textpm.05(29.6\%)}   \\        
                                      & RF     & .39{\smaller\textpm.06}    & .40{\smaller\textpm.06}   & .90{\smaller\textpm.12(56.6\%)}    & 1.48{\smaller\textpm.07(73.6\%)}   & 3.66{\smaller\textpm.38(89.3\%)}     & .53{\smaller\textpm.04(26.4\%)}   & \hfil-                                       			        & \hfil-                 \\        
                                      & GB     & .36{\smaller\textpm.07}    & .35{\smaller\textpm.09}   & 1.74{\smaller\textpm.15(79.3\%)}   & 1.94{\smaller\textpm.13(81.4\%)}   & 8.23{\smaller\textpm.24(95.6\%)}     & .48{\smaller\textpm.08(24.9\%)}   & \hfil-                                       				    & \hfil-                 \\ \hline        
\multirow{3}{*}{\textbf{GTSRB}}       & CNN    & .40{\smaller\textpm.10}    & .38{\smaller\textpm.07}   & \hfil-                        	 & \hfil-                   		  & 28.44{\smaller\textpm2.08(98.5\%)}   & .88{\smaller\textpm.32(54.5\%)}   & \multicolumn{1}{l}{1.11{\smaller\textpm.40(63.9\%)}}   & .99{\smaller\textpm.41(59.5\%)}  \\        
                                      & RF     & .37{\smaller\textpm.04}    & .36{\smaller\textpm.07}   & 2.57{\smaller\textpm.13(85.6\%)}   & 4.27{\smaller\textpm.14(91.3\%)}   & 13.71{\smaller\textpm.38(97.3\%)}    & .81{\smaller\textpm.16(54.3\%)}   & \hfil-                                       				    & \hfil-                 \\        
                                      & GB     & .65{\smaller\textpm.11}    & .67{\smaller\textpm.13}   & 9.96{\smaller\textpm.30(93.4\%)}   & 20.25{\smaller\textpm2.17(96.7\%)} & 31.08{\smaller\textpm.43(97.9\%)}    & 1.36{\smaller\textpm.24(52.2\%)}  & \hfil-                                      				        & \hfil-                 \\ \hline        
\multirow{3}{*}{\textbf{SignLang}}    & CNN    & .01{\smaller\textpm.01}    & .01{\smaller\textpm.01}   & \hfil-                      		 & \hfil-                			  & 17.83{\smaller\textpm.90(99.9\%)}    & .22{\smaller\textpm.12(95.4\%)}   & \multicolumn{1}{l}{.25{\smaller\textpm.09(96.0\%)}}   & .24{\smaller\textpm.09(95.8\%)}   \\        
                                      & RF     & .08{\smaller\textpm.02}    & .09{\smaller\textpm.03}   & .39{\smaller\textpm.06(79.4\%)}    & 1.74{\smaller\textpm.08(95.4\%)}   & 16.88{\smaller\textpm.66(99.5\%)}    & .19{\smaller\textpm.06(57.8\%)}   & \hfil-                                   					    & \hfil-                 \\        
                                      & GB     & .08{\smaller\textpm.03}    & .08{\smaller\textpm.02}   & 4.05{\smaller\textpm0.18(98.0\%)}  & 5.96{\smaller\textpm.17(98.6\%)}   & 30.97{\smaller\textpm.17(99.7\%)}    & .47{\smaller\textpm.04(82.9\%)}   & \hfil-                                    				        & \hfil-                 \\ \hline        
\multirow{3}{*}{\textbf{Fashion}}     & CNN    & .79{\smaller\textpm.13}    & .76{\smaller\textpm.13}   & \hfil-                      		 & \hfil-                    		  & 7.33{\smaller\textpm.51(89.2\%)}     & 1.93{\smaller\textpm.20(59.0\%)}  & \multicolumn{1}{l}{.84{\smaller\textpm.11(5.9\%)}}    & .88{\smaller\textpm.15(10.2\%)}   \\        
                                      & RF     & .74{\smaller\textpm.16}    & .79{\smaller\textpm.10}   & .91{\smaller\textpm.11(18.6\%)}    & 3.74{\smaller\textpm.12(80.2\%)}   & 3.45{\smaller\textpm.31(78.5\%)}     & 1.08{\smaller\textpm.15(31.4\%)}  & \hfil-                                     					    & \hfil-                 \\        
                                      & GB     & .73{\smaller\textpm.13}    & .73{\smaller\textpm.08}   & 3.80{\smaller\textpm.20(80.7\%)}   & 5.71{\smaller\textpm3.91(87.2\%)}  & 3.90{\smaller\textpm.46(81.2\%)}     & 1.06{\smaller\textpm.14(31.1\%)}  & \hfil-                                    			            & \hfil-                 \\ \hline        
\multirow{3}{*}{\textbf{CIFAR-10}}    & CNN    & 1.27{\smaller\textpm.19}   & 1.20{\smaller\textpm.15}  & \hfil-                     		 & \hfil-                     		  & 3.57{\smaller\textpm.40(64.4\%)}     & 5.99{\smaller\textpm.26(78.7\%)}  & \multicolumn{1}{l}{5.16{\smaller\textpm.22(75.3\%)}}   & 5.16{\smaller\textpm.43(75.3\%)} \\        
                                      & RF     & 1.15{\smaller\textpm.24}   & 1.19{\smaller\textpm.23}  & 3.25{\smaller\textpm.28(64.6\%)}   & 4.59{\smaller\textpm.24(74.9\%)}   & 2.99{\smaller\textpm.26(61.5\%)}     & 2.51{\smaller\textpm.39(54.1\%)}  & \hfil-                                       					& \hfil-                 \\        
                                      & GB     & 1.25{\smaller\textpm.21}   & 1.31{\smaller\textpm.16}  & 7.57{\smaller\textpm.25(83.4\%)}   & 8.39{\smaller\textpm.18(85.1\%)}   & 2.70{\smaller\textpm.34(53.7\%)}     & 2.80{\smaller\textpm.24(55.3\%)}  & \hfil-                                       				    & \hfil-                 \\ \hline        
			   			   
\end{tabular}}
\label{tab:mainresult}
\end{table*}



\subsection{Fitting Function}

%The fitted line is non-decreasing everywhere and lies as close to the observations as possible.
%
Post-hoc calibration methods based on fitting functions typically use either a logistic (Sigmoid) or an isotonic regression~\citep{Zadrozny2002Transforming}. 
%
Isotonic regression fits a non-decreasing free-form line to a sequence of observations. In comparison, Sigmoid is a smooth, continuous approximation of a step function.  
We used both fitting functions on our (fast-)separation values and obtained similar accuracy. 
We opt here to present the isotonic regression as it provides the best results, as motivated by~\Cref{fig:Sigmoid}. 

\Cref{fig:Sigmoid} illustrates an example of the success ratio of the Random Forest model for MNIST inputs with varying values of $\stab$ scores (similar behavior was observed for the various models and datasets).  Since each individual classification is either correct or incorrect, we clustered inputs with similar scores together (into 50 bins overall) and report the average success ratio of each bin.  The black line represents the Sigmoid function and the green line represents the isotonic regression.
As can be observed, both regressions are nearly identical on all the points with positive $\stab$ values. We eventually chose isotonic regression because it better fitted the few points with negative $\stab$ values. Interestingly, these points were consistently a poor fit for the Sigmoid regression, rendering it slightly less accurate on average. 
%
Also, observe that the transition is around the value 0, indicating that the distinction of safe and dangerous points is meaningful in confidence evaluation.
%Furthermore, as can be expected, there are relatively few instances with negative $\stab$ values. 

%\subsection{Invariance of Geometric Estimations}
%Our first experiment empirically tests the deviation in $\stab$ values for different choices of validation and train sets. That is, we repeat these estimations for five different partitions of train and validation sets and test how the $\stab$ value changes for each input in the test set. 
%We calculated the 95\% confidence intervals for estimates done under these random partitions and found that their maximal value on all runs is $0.0014$ \llnote{its not the correct number ill update it}. In other words, in the vast majority of cases, the $\stab$ value varies by less than $0.14\%$ by the random partition.  
%
%The fact that the $\stab$ values remain robust against random partitions, yields that our calculated confidence estimations remains robust as well. 
%
%Thus, from now on we use a small number of selection of train and validation sets throughout our evaluation.

\subsection{Confidence Evaluation}


\cref{tab:mainresult} presents the main experimental results of our work. The table summarizes  ECEs for our method (with bin size $30$). % the default methods implemented in the Sklearn library, and the $HB$~\citep{gupta2021distribution} and $SBC$~\citep{Ana2019Verified} calibration methods.  

Each entry in the table describes the ECE, the 95\% confidence interval, and (in parentheses)  the improvement of our fast-separation-based method over each competitor method. The improvement is calculated as the ratio between the difference between the competitor's ECE and our ECE, and the competitor's ECE.
%
In this experiment, we perform ten random splits of the data into train, validation, and test sets for each model and dataset. We then measure the ECE of the confidence estimation for all test set items, average the result and take the 95\% confidence intervals. 

First, observe that $\stab$ and $\sep$ yield very similar ECEs, and that the differences between them are usually statistically insignificant. Thus, we conclude that $\stab$ is a very good approximation of $\sep$ despite being considerably  simpler to compute. 
%
The next interesting comparison is between $\stab$ and SKlearn. We use the same fitting function (Isotonic regression) in both cases, but SKlearn performs the calibration on the model's natural uncertainty estimation, and $\stab$ performs the calibration on geometric distances. Thus, the benefit of our approach stems from the geometric signal and not from the chosen fitting function. 

Observe that our $\stab$ improves the confidence estimations consistently and across the board when compared to SKlearn, SBC, and HB.  Specifically, we obtain improvements of up to 99\% across all tested models and datasets. 
% It consistently improves the confidence estimations of uncalibrated models and improves the estimations of calibrated models in the vast majority of cases. 
Such results demonstrate the potential of geometric signals to improve the effectiveness of uncertainty estimation. 

%In practice, for each dataset from MNIST, KMNIST, Sign Language, Fashion-MNIST, and GTSRB we took 10 partitions of the data into 3 parts: train set (60\%), validation set (20\%), and test set (20\%). 
%For each partition, we calculate the separation of the validation set and the test set, according to the train set.
%Next, we show that our new method for confidence evaluation indeed improves against the standard sklearn method. 
%\lcnote{TODO 2: improvement table for $\sep$ and $\stab$.}

% \begin{table}[]
% \resizebox{\columnwidth}{!}{\begin{tabular}{@{}llcc@{}}
% \toprule
% \textbf{dataset} & \textbf{model} & \textbf{$\stab$ Improvement for uncalibrated model} & \textbf{$\stab$ Improvement for calibrated model} \\ \midrule
% MNIST        & RF    & 18.6208\% & 15.5064\%            \\
% MNIST        & GB    & 21.7991\% & 17.5977\%            \\
% MNIST        & 1NN & 45.9632\% & 45.9632\%            \\
% GTSRB        & RF    & 49.0716\% & 42.7422\%            \\
% GTSRB        & GB    & 2.8359\%  & -9.1332\%            \\
% GTSRB        & 1NN & 31.2733\% & 31.2733\%            \\
% SignLanguage & RF    & 17.7951\% & -1.5268\%            \\
% SignLanguage & GB    & 21.6981\% & 9.8458\%             \\
% SignLanguage & 1NN & 53.1283\% & 53.1283\%            \\
% Fashion      & RF    & 30.515\%  & 28.7221\%            \\
% Fashion      & GB    & 32.9901\% & 20.3763\%            \\
% Fashion      & 1NN & 33.9331\% & 33.9331\%            \\
% KMNIST       & RF    &           &                      \\
% KMNIST       & GB    &           &                      \\
% KMNIST       & 1NN &           & \multicolumn{1}{l}{} \\ \bottomrule
% \end{tabular}}
% \end{table}

%Interestingly, we observe that the geometric fast-separation provides an improved signal for confidence even on so-called `non-geometric' datasets such as Fashion. In datasets like MNIST, GTSRB, SignLang, and KMNIST, the varying input classes are geometrically very different from each other. For example, traffic signs vary in shape, size, and color and are designed to be easily distinguishable from one another. However, this is not the case, say, for the Fashion dataset where a t-shirt may sometimes be similar in shape and colors to a dress.
%The results presented in~\Cref{tab:mainresult}, nonetheless, demonstrate that our method provides improved confidence evaluation for such datasets as well. 
%
% \llnote{Not sure if we want to mention color . We only have one dataset that is color-full (German signs), but in order for our methods to be the same we turn the RGB color scale to Gray scale .Do you think it's necessary to implement it in RGB? }
%\lcnote{good input. Let's discuss it on Mon}

%Now we show that indeed the fast-separation based confidence evaluation allows us for calculating confidence in real-time. 
%For this, we ...\lcnote{describe the experiment + the nearest neighbor implementation.}

\subsection{Real-Time Computation}
\Cref{tab:runtime} provides the computational advantage of our method. We used a Macbook Pro with an Intel Core i5 with four processor cores@2.3 GHz and 8GB RAM in this experiment. We measure the throughput of confidence evaluations in predictions per second and the 95\% confidence intervals using five trials for each measurement.

Observe that the dominant factor in operation speed is the dataset. These differences are due to variations in training set sizes, where larger training sets result in slower operation. 
 Importantly, our method runs at 22--47 predictions per second on all but the CIFAR-10 dataset. 
 Such performance is within the ballpark for camera-based applications. For reference, TV is broadcast at 60 frames per second, and most animation films use up to 24 frames per second. Thus, our performance is within an applicable scale.  
 CIFAR-10 is considerably larger, and thus our performance on that dataset is a bit slow. While we can also use parallelism to obtain a faster runtime and mitigate this issue, we plan to address larger training sets in future work. 
 
 


\begin{table}[t!]
\centering
\caption{Number of confidence estimations per second for the $\stab$-based method with $95\%$ confidence intervals. }

\smaller
\begin{tabular}{@{}ccccc@{}}
\hline
\multicolumn{1}{l}{\textbf{Dataset}} &
  \multicolumn{1}{l}{\textbf{Model}} &
  \textbf{Predictions per second}   \\ \midrule
\multirow{3}{*}{\textbf{MNIST}}       & CNN &  22.73 {\tiny\textpm1.90} \\
                              & RF    & 22.42 {\tiny\textpm 0.66 } \\
                              & GBDT    & 22.91 {\tiny\textpm 0.46} \\
\hline
\multirow{3}{*}{\textbf{GTSRB}}        & CNN & 25.29 {\tiny\textpm 0.69 }\\
                              & RF    & 23.23 {\tiny\textpm 1.19} \\
                              & GBDT    &  21.78 {\tiny\textpm 2.18} \\
\hline
\multirow{3}{*}{\textbf{SignLang}}     & CNN & 46.61 {\tiny\textpm 0.18} \\
                              & RF    & 45.16 {\tiny\textpm 2.71}  \\
                              & GBDT & 46.84 {\tiny\textpm 0.40} \\
                              
\hline
\multirow{3}{*}{\textbf{Fashion}}      & CNN &  22.99 {\tiny\textpm 0.16} \\
                              & RF    &  22.85 {\tiny\textpm 0.03} \\
                              & GBDT &  23.20 {\tiny\textpm 0.31} \\
                              
\hline
% \multirow{3}{*}{\textbf{KMNIST}}      & CNN & 22.53 \textpm 0.02 \\
%                               & RF    &   23.33 \textpm 0.21\\
%                               & GBDT    & 23.31 \textpm 0.24\\
%                               \hline
\multirow{3}{*}{\textbf{CIFAR-10}}      & CNN & 7.08 {\tiny\textpm 0.24} \\
                              & RF    & 6.76    {\tiny\textpm 0.27}\\
                              & GBDT    & 6.84 {\tiny\textpm 0.41}\\
                              \hline
\end{tabular}%}
%\gcnote{if it's interesting- average on 10 shuffles* ~7000-14000 examples }
\label{tab:runtime}
%\lcnote{are these the real numbers of predictions per second? you mean that each predication takes ~6 seconds? That's far from real time}
\end{table}

%\lcnote{we need the real time values to be of calculating confidence estimation. I.e. the whole process.}
%\gcnote{I don't think we need to show the models but only the datasets}
%\genote{can we show the faster library as well?}


%\begin{figure}[!h]
%    \centering
%    \subfloat[\centering Running time with faiss library ]{{\includegraphics[width=5cm]{figures/faiss_time.png} }}%
%    \qquad
%    \subfloat[\centering Running time with sklearn library ]{{\includegraphics[width=5cm]{figures/sklearn_time.jpeg} }}%
 %   \caption{Running time of separation computation}%
 %   \label{fig:Running_time}%
%\end{figure}


\section{Conclusion}
Our work uses post-hoc calibration techniques but on a geometry-based signal rather than on the model's confidence estimation. We 
demonstrated the feasibility of our approach in estimating uncertainty for multiple models, and for multiple datasets. Our evaluation shows that our fast-separation method ($\stab$) outperforms post-hoc calibration methods based on the model's confidence consistently and across the board. Our approach reduces the error in confidence estimations by up to 99\% compared to alternative methods (depending on the specific dataset and model).   

%
In addition, we showed that for moderately-sized standard datasets our method achieves near-real-time operation. As suggested by our analysis and indicated by the experimental results, the complexity of calculating $\stab$ depends on the training-set size, which implies that the computation would be slower for very large datasets and would not run in near real time. 
%{\color{blue}
%It is important to mention the limitation of our method, First, as we already mention, for medium sized dataset our method perform in real time, but for lager dataset the computation time may grows. Second we can't create a "calibrated model" by itself, any use of our method require the model with the dataset and the fitting function.
%}
Another related limitation of the work presented in this paper is that our current approach also requires the training set. 
%However, there is a tradeoff here that our work aims to emphasize: by comparing the model to the training data, we attain considerably better results across the board for multiple models and datasets. 
%
Since shipping the model with as little communication or storage overhead as possible is important, our future research focuses on utilizing a smaller data structure that approximates the entire training set. Thus, we will develop ways to control the effect of the dataset size on the run-time by calculating confidence estimations only on a subset of the dataset, e.g., by pre-processing the data and removing data inputs that are geometrically close to each other, thereby reducing the overheads. 




%{\color{blue}
%delete?
%In the near future, once anonymity restrictions are lifted, we will make all the code used in this work as an open-source library and pursue the adaptation of our technique into standard machine learning libraries. In the meanwhile, our code is available at~\href{https://anonymous.4open.science/r/A-Geometric-Method-for-Improved-Uncertainty-Estimation-in-Real-time-4CDE/}{anonymous-Git}.
%}
%Our work improves the accuracy of confidence estimations for machine learning models. 
%Current work towards this goal usually  
% optimizes confidence by calibrating the models. Our work, instead, focuses on observing geometric properties for the current input. We demonstrate almost consistent improvements compared to calibrated models.


%Writing:
%\begin{enumerate}

%\item proof sep- in process
%\item models \& dataset description
%\item methodology
%\item related work
%\item bound on sep-stab -?
%\end{enumerate}
%Code:
%\begin{enumerate}
%\item sep \& stab inv in data splitting - in process
%\item real time calculation-in process
%\item sep- without bound
%\lcnote{put here the list in priority order and mark those you did as "DONE" and those you are currently working on as "in progress"}
%\end{enumerate}

%\section{Appendix}
%\subsection{Geometry invariance}
%Showing geometry invariance to our both metrics(separation and stability)
%\begin{figure}[!h]
%    \centering
%    \subfloat[\centering Table of mean confidence interval ]{{\includegraphics[width=7cm]{figures/invariance_table.png} }}%
%    \qquad
%    \subfloat[\centering Box-plot of datasets with different models ]{{\includegraphics[width=7cm]{figures/invariance_plot.png} }}%
%    \caption{Invariance experiment}%
%    \label{fig:Running_time}%
%\end{figure}
% \begin{thm}
% The calculation of $\delta$ can be done in real-time.
% \end{thm}

%\begin{enumerate}
%    \item fitting
%    \item calibration
%    \item isotonic regression
%     \item compare against uncertainty estimation methods: MC dropout, deep ensembles 
%    \item code + readme
  %  \item test other error matrices (MSE, LL?) -- important!
%    \item \st{more datasets (RGB, CIFAR)}
%    \item \st{another calibration method (into the table + explain in related works)}
%    \item \st{Add separation to the evaluation}
%    \item \st{add a paragraph about the methods that use NN layers}
%    \item specify that we are data set driven in the intro
%    \item introduce `uncertainty estimation' terminology
%    \item literature on epistemic and aleatoric uncertainties
    %The predictive uncertainty of DNNs can be divided into two main subtypes: epistemic and aleatoric or data uncertainty . Epis- temic uncertainty can be formalized by means of a probability distribution over the model parame- ters and accounts for our unawareness about them. Epistemic is also dubbed as model uncertainty and can be clarified given enough data. In terms of CNNs (Convolutional Neural Networks), epis- temic or model uncertainty describes uncertainty over the network (architecture, hyperparameters, weights). Epistemic uncertainty can be examined via the comparison of several samples acquired from stochastic neural networks. The other type is called aleatoric uncertainty and explained by the noise inherent in observations that is the model’s input-dependent uncertainty. Aleatoric uncertainty exists, for example, due to motion noise and sensor noise, but the epistemic uncertainty may disappear in the case of zero parameter ignorance. Aleatoric uncertainty cannot be reduced with more data; however, it can be formalized by a distribution over model outputs. Aleatoric uncertainty can be further decomposed into two parts: heteroscedastic and homoscedastic . Heteroscedastic uncertainty is handy when modeling assumptions embrace vari- able noise across the pieces of input space. Ho- moscedasticity uncertainty is supposed to be con- stant for different inputs.
    
%    \item ``How do you deal with recoding/scaling/normalisation of input data? How do you select the best space to represent your data?''
%    \item look at papers from UAI in recent years we can compare to (for example \url{https://proceedings.mlr.press/v161/ma21a/ma21a.pdf})
%\end{enumerate}



%\bibliographystyle{named}
\bibliography{Chouraqui_457}

% \color{blue}{
% \section{Appendix}

% \section{Proofs}

% We want to bound the distance between $\sep$ and $\stab$.

% % \begin{lemma}\label{lem:sep2}
% % For a safe point $x$, the example $z'\in \friend$ that holds the maximum on the expression
% % \[\sep(x)= \max_{x' \in \friend(x)} \frac{\dis^2(x,z'') - \dis^2(x,x') }{2\dis(x',z'')}\]
% % for $z'' \in \nfriend(x)$ such that $\dis(x,z'')=\Dis(x,\nfriend(x))$
% % is in a bounded distance from x:
% % \[\dis(x,z') \leq \Dis(x,\nfriend(x))\].
% % \end{lemma}
% % \begin{proof}
% % Since $x$ is a safe point,  $\sep(x)>0$.
% % \begin{equation}\begin{split}
% %   \label{eq:1}
% %   \sep(x)=\min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x,x') }{2\dis(x',x'')} \leq \\
% %   \max_{x' \in \friend(x)} \frac{\Dis^2(x,\nfriend(x)) - \dis^2(x,x') }{2\dis(x',z'')}
% %   \end{split}
% % % \end{equation}
% % %%


% % If  $z'\in \friend$ that holds the maximum on the expression is such that $\dis(x,z') > \Dis(x,\nfriend(x))$
% %  we get that $\sep(x)<0$ in contradiction. 


% % \end{proof}



% \begin{lemma}\label{lem:sep1}
% For any point $x$, the distance between $\sep(x)$ and  $\stab(x)$ holds:
% \[|\sep(x)-\stab(x)|\leq \frac{\Dis(x,\friend(x))+ \Dis(x,\nfriend(x))}{2}.\]
% \end{lemma}
% \begin{proof}

% Let $x$ be a safe input. Then by definition:
% %we will prove only that side the other is symmetric.
% %\[\min_{x' \in \friend(x)} \dis(x,x') \leq \min_{x'' \in \nfriend(x)} \dis(x,x'').\]
% \begin{align*}
% &|\sep(x)-\stab(x)|= \sep(x)-\stab(x)  \\
% =&\min_{x'' \in \nfriend(x)} \max_{x' \in \friend(x)} \frac{\dis^2(x,x'') - \dis^2(x,x') }{2\dis(x',x'')}\\
% -&\frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2} 
% \end{align*}
% We define $z'' \in \nfriend$ to be the example such that $\Dis(x,\nfriend(x))=\dis(x,z'')$ and gets:
% \begin{align}
% &|\sep(x)-\stab(x)|\\
% \leq& \max_{x' \in \friend(x)} \frac{\dis^2(x,z'') - \dis^2(x,x') }{2\dis(x',z'')} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2}\label{eq: 2}\\
% \leq&\max_{x' \in \friend(x)} \frac{\dis(x,z'') + \dis(x,z') }{2} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2}\label{eq: 3}\\
% =&\max_{x' \in \friend(x)} \frac{\dis(x,z') + \Dis(x,\friend(x))}{2} \\
% \leq &\frac{\Dis(x,\friend(x))+ \Dis(x,\nfriend(x))}{2}
% \label{eq: 5}
% \end{align}
% The first inequality (\Cref{eq: 2}) holds due to the definition of the minimum function.
% The second (\Cref{eq: 3}) is due to the triangle inequality.
% % and the fact that $x$ is a safe point.
% The last inequality (\Cref{eq: 5} holds since for a safe point, the maximal distance from $x$ of a point from $\friend$ can't be greater that the distance from $x$ to $\nfriend$.



% % Let choose the $z'\in \friend$ that holds the maximum on the expression.
% % \[\frac{\dis^2(x,z'') - \dis^2(x,z') }{2\dis(z',z'')} - \frac{\dis(x,z'')- \Dis(x,\friend(x))}{2} \leq\]

% % due to triangle inequality and the fact that we assumed that $x$ is a safe point.
% % It is 
% % Using \cref{lem:sep2}.

% \end{proof}
% Notice that this bound is tight, that mean that we can find an example with this exact distance as showed in \cref{fig:example}.
% \begin{figure}[!h]
%     \centering
%     \includegraphics[width=0.35\textwidth]{figures/ex_fig.pdf}
%     \caption{Example of point $x$ with $|\sep(x)-\stab(x)|=\frac{\Dis(x,\nfriend(x)) + \Dis(x,\friend(x))}{2} $}
%     \label{fig:example}
% \end{figure}



% \section{stab works for all metrics}

% % \begin{defn}[Safe and Dangerous inputs]
% % \label{def:Tr_C(x)}
% % Let $\model$ be a model.  
% % For an input $x$ in the sample space we define:
% % %, $\mathcal{X}$ be a sample space and $\tr \subset \mathcal{X}$ a train set.
% % %, and $\mathcal{Y}$ a set of possible classifications.
% % %For an input $x \in \mathcal{X}$ we define:
% % \[\friend(x) :=\{x'\in \tr : \cls(x')=\cls(x)\}.\]
% % We denote by $\nfriend(x)$ the set $\tr \setminus \friend(x)$.\\
% % An input $x\in \mathcal{X}$ is labeled as \emph{safe} if it is closer to $\friend(x)$ than to $\nfriend(x)$, and it is labeled as \emph{dangerous} otherwise.
% % \end{defn}


% % \begin{defn}[Zones]
% % \label{def:zone}
% % Let $x$ be a safe (dangerous) point. 
% % A \emph{zone} for $x$, denoted $z_x$, is such that for any input $y$, if $d(x,y)<z_x$, then $\Dis(y,\friend(x))<\Dis(y,\nfriend(x))$ ($\Dis(y,\friend(x))\geq\Dis(y,\nfriend(x))$). 
% % For each $x$ we denote the maximal such zone by $\zone(x)$.
% % %The zone of a safe (dangerous) input $x$, $\zone(x)$, is the maximal distance 
% % %from $x$ such that for any input $y$, if $d(x,y)<\zone(x)$, then $\Dis(y,\friend(x))<\Dis(y,\nfriend(x))$ ($\Dis(y,\friend(x))\geq\Dis(y,\nfriend(x))$). 
% % \end{defn}


% \begin{lemma}\label{lem:stab_all_l}


% For every metric and for every input point $x$, $\stab$ is a zone of $x$
% \[\stab(x)= \frac{\Dis(x,\nfriend(x))- \Dis(x,\friend(x))}{2}\]
% \end{lemma}
% \begin{proof}
% We will prove the case in which $x$ is a safe point, the other side is similar.
% Notice that if $\stab$ is a zone of $x$ that mean that for every input in $\stab$-radius of $x$ the distance to a point in $\friend(x)$ is smaller than the distance to a point in $\nfriend(x)$. So given $x$ if we look at $x' \in \friend(x)$, $x'' \in \nfriend(x)$ the nearest points of $x$,
% and given an input $y$ such that $d(x,y) =r$
% %we want that for every $y$ such that $d(x,y) \leq \stab$
% using the triangle inequality we get:
% \[d(y,x') \leq r+ d(x,x')\]
% \[d(y,x'') \geq  d(x,x'') -r \]
% We want to find $r$ for which
% \[d(y,x')\leq d(y,x'')\]
% and we get
% \[d(x,x'')-r \geq d(x,x')+r\]
% \[r \leq \frac{d(x,x'')-d(x,x')}{2}\]
% \end{proof}
% \begin{figure}[!b]
%     \centering
%     \includegraphics[width=0.15\textwidth]{figures/stab1.png}
%     \caption{Illustration of the proof of~\cref{lem:stab_all_l}}
%     \label{fig:Stab-proof}
% \end{figure}
% }
\end{document}
