\documentclass[./neurips2023.tex]{subfiles}
\externaldocument[I-]{./neurips2023}

\makeatletter
% \addFileDependency{<file>}: announce <file> (name including extension) as a
% dependency of this document: it is written to the log/terminal via \typeout,
% appended to the \listfiles list, and reported as missing if it does not exist.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or 
% .pdf file etc and it exists! If it doesn't exist, it will appear 
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles 
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}% trailing % stops the line end from becoming a space token
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}% no stray space if ever used in body text
}\makeatother

% \myexternaldocument{<base>}: make the labels of <base> available through
% \externaldocument and register both <base>.tex and <base>.aux as file
% dependencies via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
% Import the main paper's labels a second time, here without a prefix (the
% \externaldocument call near the top of this file uses the `I-' prefix).
% NOTE(review): presumably this is what lets un-prefixed references such as
% \eqref{eq:duality} resolve -- confirm against the main file's labels.
\myexternaldocument{./neurips2023}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\newpage
\appendix
\onecolumn

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proof for theorem~\ref{I-thm:1}}
\label{sec:A1}

Let $\train = \{(\vx_i, y_i)\}$ denote the training dataset of size $N$. Then the minimum over the dataset $\train$ is obtained by solving
\begin{equation}
   \min_{\psi} \mathbb{E}_{\tau \sim U[0,1]}\left[\frac{1}{N} \sum_{i} \rho(I[\psi(\vx_i, \tau) \geq 0],y_i;\tau)\right]
    \label{eq:a1_1}
\end{equation}
Let $\gQ(\vx, \tau)$ denote the solution obtained using algorithm~\ref{I-alg:quantrep}. Let $\gP(\vx, \tau)$ denote the solution obtained by solving \eqref{eq:a1_1}. 

We aim to show that $I[\gQ(\vx_i, \tau) \geq 0.5] = I[\gP(\vx_i, \tau) \geq 0]$ for all the points in $\train = \{(\vx_i, y_i)\}$. 

First, observe that, since the base classifier $f_{\theta}(\vx)$ is obtained using MAE we have that $I[\gQ(\vx_i, 0.5) \geq 0.5] = I[f_{\theta}(\vx_i) > 0.5] = I[\gP(\vx_i, 0.5) \geq 0]$. This is because the loss in \eqref{eq:a1_1} at $\tau=0.5$ is nothing but the MAE loss. 

Next for arbitrary $\tau$, we show that $I[\gQ(\vx_i, \tau) \geq 0.5] = I[\gP(\vx_i, \tau) \geq 0]$ over the dataset $\train = \{(\vx_i,y_i)\}$.

We approximate the indicator function as $I[\vx \geq 0] \approx \lim_{k\to \infty} K_k(\vx)$. For instance one can consider $K_k(\vx) = \sigmoid(\vx k)$. Observe that a solution to minimize \eqref{eq:a1_1} can be obtained by
\begin{equation}
    \gP(\vx,\tau) = \lim_{k \to \infty} \argmin_{\psi} \mathbb{E}_{\tau \sim  U[0,1]} \left[\frac{1}{N} \sum_{i} \rho\left( K_k\left(\psi(\vx_i, \tau)\right), y_i; \tau \right) \right]
\end{equation}
Let
\begin{equation}
    \gP^{(k)}(\vx,\tau) =  \argmin_{\psi} \mathbb{E}_{\tau \sim  U[0,1]} \left[\frac{1}{N} \sum_{i} \rho\left( K_k\left(\psi(\vx_i, \tau)\right), y_i; \tau \right) \right]
\end{equation}
Also, since $f(\vx)$ optimizes MAE, we have for some $k$, $K_k(\gP^{(k)}(\vx, 0.5)) = f(\vx)$. That is, for some $k$, 
\begin{equation}
    \begin{aligned}
        I[f(\vx) \geq 1-\tau] &= I[K_k(\gP^{(k)}(\vx, 0.5)) \geq 1-\tau] \\
            &= I[K_k(\gP^{(k)}(\vx, \tau)) \geq 0.5] \\
            &= I[\gP^{(k)}(\vx, \tau) \geq 0 ]
    \end{aligned}
\end{equation}
where the second equality follows from the duality in \eqref{eq:duality}. Moreover, for any $k, k'$, we have that $I[K_k(\gP^{(k)}(\vx, \tau)) \geq 0.5]$ is equivalent to $I[K_{k'}(\gP^{(k')}(\vx, \tau)) \geq 0.5]$, since both $\gP^{(k')}(\vx, \tau)$ and $\gP^{(k)}(\vx, \tau)$ would be able to classify the points perfectly at $\tau$. So, we have that 
\begin{equation}
    I[f(\vx) \geq 1-\tau] = I[\gP(\vx, \tau) \geq 0]
\end{equation}
On the other hand, for all data points in $\train$ (from the construction of $\gQ(\vx, \tau)$),
\begin{equation}
    I[f(\vx_i) \geq 1-\tau] = I[\gQ(\vx_i, \tau) \geq 0.5]
\end{equation}
Since $I[\gQ(\vx_i, \tau) \geq 0.5] = I[\gP(\vx_i, \tau) \geq 0]$ for all data points in $\train$, it follows that $\gQ(\vx_i, \tau)$ optimizes \eqref{eq:a1_1}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Proof for theorem~\ref{I-thm:2}}
\label{sec:A3}

The proof follows from the fact that 
\begin{equation}
    \gQ(\vx_i, \tau) \geq 0 \Leftrightarrow f(\vx_i) \geq (1-\tau) \Leftrightarrow P(g(\vx_i) + \epsilon(\vx_i)\geq 0) \geq 1- \tau
\end{equation}
Let $\tau^* = P(g(\vx_i) + \epsilon(\vx_i)\geq 0)$. Then we have
\begin{equation}
\begin{aligned}
    \int_{\tau=0}^{1} I[\gQ(\vx_i, \tau) \geq 0] d\tau &= \int_{\tau=0}^{1} I[\tau^* \geq (1-\tau)] d\tau \\
    &= \int_{\tau=0}^{1} I[\tau \geq (1-\tau^*)] d\tau = \int_{\tau=(1-\tau^*)}^{1} 1 d\tau = \tau^*
\end{aligned}
\end{equation}
Thus the theorem follows.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Results when training only the last layer}

\input{Table_OOD.tex}

The observations made in the main article also hold true when training is done only on the last layer by considering the features. 

\paragraph{OOD Detection:} These experiments were performed using Densenet and Resnet34 architectures on CIFAR10 and SVHN datasets. The OOD datasets are the same as in the main article. Table~\ref{table:2} shows the results obtained when quantile representations are used only on the last layer.

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5b.png}
 \caption{Quantile Representations (Resnet34)}
 \label{fig:quantvsactual(a)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5a.png}
 \caption{Original Features (Resnet34)}
 \label{fig:quantvsactual(b)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5e.png}
 \caption{Scatterplot}
 \label{fig:quantvsactual(e)}
\end{subfigure}
\caption{Do quantile representations capture the relevant information for classification? (a) Cross-correlations obtained using quantile representations for Resnet34 on CIFAR10. (b) Cross-correlations obtained using train features for Resnet34 on CIFAR10. (c) Scatterplot with best fit line (using Locally Weighted Scatterplot Smoothing~\cite{cleveland_robust_1979}) of the cross-correlation of features. Observe that as the correlation becomes important (i.e., close to $-1$ or $1$) quantile representations are more consistent with raw features.}
\label{fig:quantvsactual1}
\end{center}
\vskip -0.2in
\end{figure*}

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5d.png}
 \caption{Quantile Representations (Densenet)}
 \label{fig:quantvsactual(c)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5c.png}
 \caption{Original Features (Densenet)}
 \label{fig:quantvsactual(d)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A5f.png}
 \caption{Scatterplot}
 \label{fig:quantvsactual(f)}
\end{subfigure}
\caption{Do quantile representations capture the relevant information for classification? (a) Cross-correlations obtained using quantile representations for Densenet on CIFAR10. (b) Cross-correlations obtained using train features for Densenet on CIFAR10. (c) Scatterplot with best fit line (using Locally Weighted Scatterplot Smoothing~\cite{cleveland_robust_1979}) of the cross-correlations. Observe that as the correlation becomes important (i.e., close to $-1$ or $1$) quantile representations are more consistent with raw features.}
\label{fig:quantvsactual2}
\end{center}
\vskip -0.2in
\end{figure*}

\paragraph{Calibration Experiments} The same observations---that quantile probabilities have a calibration error which does not change with distortion, and that this error could not be corrected using simple Platt scaling or isotonic regression---hold true when training only the last layer as well. This is illustrated in figure~\ref{fig:Acalib}.

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\begin{subfigure}[b]{0.245\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure2a_old.png}
 \caption{ECE (Resnet34)}
 \label{fig:Acalib(a)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.245\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure2b_old.png}
 \caption{Accuracy (Resnet34)}
 \label{fig:Acalib(b)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.245\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure2c.png}
 \caption{ECE (Densenet)}
 \label{fig:Acalib(c)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.245\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure2d.png}
 \caption{Accuracy (Densenet)}
 \label{fig:Acalib(d)}
\end{subfigure}
\hfill
\caption{Quantile representations can be effective for calibration because they estimate probabilities using Equation~\eqref{eq:quantprob}, which has been shown to be robust to corruptions. As demonstrated using the CIFAR10C dataset \cite{DBLP:conf/iclr/HendrycksD19}, the Expected Calibration Error (\texttt{ECE}) of the probabilities obtained from quantile representations (\texttt{QUANT}) does not increase with the severity of the corruptions. In contrast, when using the standard Maximum Softmax Probability (\texttt{MSP}) method, the calibration error increases as the severity of the corruptions increases.}
\label{fig:Acalib}
\end{center}
\vskip -0.2in
\end{figure*}

\paragraph{Cross-correlation of features}

To illustrate that the quantile representations capture the aspects of the data distribution relevant to classification, we perform the following experiment: construct the cross-correlation between features using (i) quantile representations and (ii) feature values extracted using the training data. If our hypothesis is accurate, then the cross-correlations obtained using quantile representations and feature values would be similar. 

In Figures~\ref{fig:quantvsactual1} and~\ref{fig:quantvsactual2}, we present the results of using features from Resnet34 and Densenet on the CIFAR10 dataset. Figures~\ref{fig:quantvsactual(a)} and~\ref{fig:quantvsactual(b)} show the results for Resnet34, and Figures~\ref{fig:quantvsactual(c)} and~\ref{fig:quantvsactual(d)} show the results for Densenet. To visualize the cross-correlations, we use a heatmap with row and column indices obtained by averaging the linkage of train features. This index is common for both quantile representations and extracted features. It is evident from the figures that the cross-correlation between features is similar whether it is computed using extracted features or quantile representations. 

\section{A case where quantile representations do not capture the entire distribution}
\label{sec:A2}

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure_A2a.png}
 \caption{Original Data}
 \label{fig:intro_app(a)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure_A2b.png}
 \caption{OOD Detection using quantile representations}
 \label{fig:intro_app(b)}
\end{subfigure}


\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure_A2c.png}
 \caption{OOD Detection using Probabilities}
 \label{fig:intro_app(c)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/Figure_A3.png}
 \caption{Using Random Labels}
 \label{fig:intro_app(d)}
\end{subfigure}
\caption{ Illustrating a case where quantile representations do not capture the distribution perfectly. (a) Original Dataset. (b) The region detected as in-distribution by using quantile representations. (c) Region detected as in-distribution by using the outputs from a single classifier. Observe that quantile representations still perform better than  single classifier outputs. (d) Using random labels instead of ground-truth. Observe that the two moons structure is faithfully preserved in this image. The brightness of {\color{red}Red} indicates the chance of being in-distribution.}
\label{fig:intro_appendix}
\end{center}
\vskip -0.2in
\end{figure*}

In figure~\ref{fig:intro_appendix} we illustrate an example where quantile representations do not capture the entire distribution. Here we use the same data as in figure~\ref{fig:intro}, but with different class labels. This is shown in figure~\ref{fig:intro_app(a)}. When we perform the OOD detection we get the region as in figure~\ref{fig:intro_app(b)}. Observe that while it does detect points far away from the data as out-of-distribution, the moon structure is not identified. In particular, the space between the moons is not considered OOD. This illustrates a case when quantile representations might fail. 

However, OOD detection using a single classifier also fails, as illustrated in figure~\ref{fig:intro_app(c)}. Observe that the region identified by quantile representations is much better than the one obtained using a single classifier. 

\paragraph{A simple fix for OOD detection:} If OOD detection were the aim, then it is possible to change the approach slightly by considering \emph{random labels} instead of the ground-truth labels. This allows us to identify arbitrary regions where the data is located. This is illustrated in figure~\ref{fig:intro_app(d)}. Observe that this method can be used to identify any region in the space by suitably sampling and assigning pseudo-labels. In this case, we identify the training data perfectly. 

% Actually, I think we can further improve this by carefully considering the labels instead of taking random labels. OOD detection also depends on generalization. Random labels by definition give only 50% accuracy. So, we need to consider random labels within each class so that generalization happens. I expect this to improve OOD detection pretty well.

\section{Sanity Check - Preserving Monotonicity Property}
\label{sec:sanity_monotonicity}

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\includegraphics[width=\textwidth]{./img/A9.png}
\caption{Checking that the quantile representations learnt using algorithm~\ref{I-alg:quantrep} satisfies the monotonicity property.}
\label{fig:sanity_monotonicity}
\end{center}
\vskip -0.2in
\end{figure*}

Note that quantile representations obtained by optimizing the simultaneous loss \eqref{eq:simulcheckloss} should follow the monotonicity property: $\gQ(\vx,\tau_0) \leq \gQ(\vx,\tau_1) \Leftrightarrow \tau_0 \leq \tau_1$. Since our approach is an alternative, the quantile representation learnt using algorithm~\ref{I-alg:quantrep} should satisfy this property as well. We verify this as follows: considering the ResNet34 architecture trained on the CIFAR10 dataset, we plot the \emph{logits} obtained at different quantiles in figure~\ref{fig:sanity_monotonicity}.  

% \section{Cannot Correct Calibration Error}
% \label{sec:A4}

% \begin{figure}[ht]
% \vskip 0.2in
% \begin{center}
% \begin{subfigure}[b]{0.45\textwidth}
%  \centering
%  \includegraphics[width=\textwidth]{./img/A4a.png}
%  \caption{Densenet}
%  \label{fig:A4(a)}
% \end{subfigure}
% \hfill
% \begin{subfigure}[b]{0.45\textwidth}
%  \centering
%  \includegraphics[width=\textwidth]{./img/A4b.png}
%  \caption{Resnet}
%  \label{fig:A4(b)}
% \end{subfigure}

% \caption{Correcting calibration error on the validation set may not improve performance on corrupted datasets. The figure illustrates the use of Platt scaling and Isotonic regression to correct the calibration error on the validation set, but this leads to an increase in the calibration error on the corrupted dataset. This suggests that it may not be possible to correct the calibration error simply by adjusting the probability scores.}
% \label{fig:A4}
% \end{center}
% \vskip -0.2in
% \end{figure}

% Figure~\ref{I-fig:calib} shows that calibration error from quantile representations is robust to noise. So, an obvious question which follows is - Can we then correct it using validation data and improve the calibration score? It turns out that this is not possible. \textbf{Remark:} A similar result is also obtained in Proposition 1 of \cite{NEURIPS2021_5b168fdb}. 

% To verify this we perform the same experiment as in section~\ref{I-ssec:calibration}. Further we use Platt Scaling and Isonotic regression on validation data and accordingly transform the probability estimates for the corrupted datasets. These results are shown in figure~\ref{I-fig:A4}.

% Surprisingly, we find that correcting for calibration on the validation set actually leads to an increase in the calibration error on corrupted datasets. It is currently unclear why this is the case. Our hypothesis is that this calibration error may be due to the insufficient modeling of the underlying distribution by $f(\vx)$. For example, if the true underlying model is quadratic but we are using a linear model for classification, the calibration error caused by this mismatch cannot be corrected.

% \textbf{Remark:} Using techniques as described in \cite{DBLP:journals/corr/abs-2206-02757} can be useful to correct this, since the correction is dependent on the underlying output features and not just the probabilities.

\section{Matching Quantile-Representations to correct the distribution}
\label{sec:A6}

In this part we illustrate the matching of quantile-representations to correct for distribution shifts following the ideas from \cite{doi:10.1080/01621459.2014.929522}. Let $X$ denote the original distribution of the data, and let $\Phi(X)$ denote the modified distribution. We assume that the function $\Phi(.)$ is deterministic but unknown. 

If both $X$ and $\Phi(X)$ are known, then it is easy to estimate $\Phi(.)$ using some model such as neural networks. However, in reality we do not have this information. Once the environment changes, the data collected will be very different from the original ones and we do not know how $\Phi(.)$ distorts the original data. So, the aim is to estimate $\Phi(.)$ without the knowledge of $X$ and $\Phi(X)$. This is where the fact that quantile representations capture the distribution information becomes relevant.

\paragraph{Is this even possible?} Let $\gQ(\vx, \tau)$ denote the quantile representation obtained using $X$, and $\gQ_{\Phi}(\vx, \tau)$ denote the quantile representation obtained using $\Phi(X)$. Let the data collected in the new environment be $\{\hat{\vx}_i\}$, then we should have that
\begin{equation}
    \int_{\tau=0}^{1} |\gQ(\Phi^{-}(\hat{\vx}_i), \tau) - \gQ_{\Phi}(\hat{\vx}_i, \tau)| d\tau = 0
    \label{eq:get_phi}
\end{equation}
Using this it is possible to estimate $\Phi^{-}$ and hence $\Phi$. 

Observe the following: the functions $\gQ(.,.)$ and $\gQ_{\Phi}(.,.)$ are learnt from the data using the labels, and depend on them. So, one needs the labels to specify the directions in which the distribution should be the same. 

For instance, consider the following example: assume we wish to classify candidates as suitable/not-suitable for a job based on a set of features. Now, what is suitable/not-suitable changes with time, as does the ability (represented in the features) of the general population. So, we collect data at time $t=t_0$, $\{(\vx_{i,t_0}, y_{i,t_0})\}$ and at time $t=t_1$, $\{(\vx_{i,t_1}, y_{i,t_1})\}$. However, we do not know the relation between $\vx_{i, t_0}$ and $\vx_{i, t_1}$. In such cases, matching quantile representations can be useful.

\begin{figure*}[ht]
\vskip 0.2in
\begin{center}
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6a.png}
 \caption{Original Dataset ($X_{t_0}$)}
 \label{fig:A6(a)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6b.png}
 \caption{Shifted Dataset ($X_{t_1}$)}
 \label{fig:A6(b)}
\end{subfigure}

\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6c.png}
 \caption{Boundaries at different quantiles (Original data)}
 \label{fig:A6(c)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6d.png}
 \caption{Boundaries at different quantiles (Shifted data)}
 \label{fig:A6(d)}
\end{subfigure}

\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6e.png}
 \caption{Distribution of $X_{t_1}$}
 \label{fig:A6(e)}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.45\textwidth}
 \centering
 \includegraphics[width=\textwidth]{./img/A6f.png}
 \caption{Distribution of data $\Phi(X_{t_0})$}
 \label{fig:A6(f)}
\end{subfigure}

\caption{Matching quantile representations. Observe that the estimated distribution at time $t_1$ is similar to the actual distribution at time $t_1$. This shows that the estimate of $\Phi()$ is accurate.}
\label{fig:A6}
\end{center}
\vskip -0.2in
\end{figure*}

\paragraph{Illustration: Matching of quantile representations} The above procedure is illustrated in figure~\ref{fig:A6}. Consider the data at $t_0$ as in figure~\ref{fig:A6(a)} and data at $t_1$ as in figure~\ref{fig:A6(b)}. This data in figure~\ref{fig:A6(a)} is generated using 2d Gaussian distribution with centers $[[0,0],[1,1]]$ and standard deviation $[[0.1,0.3],[0.3,0.11]]$. We refer to this distribution as $X_{t_0}$. Data in figure~\ref{fig:A6(b)} is obtained by generating a new sample with the same distribution as $X_{t_0}$ and transforming it using a random orthogonal matrix. We refer to this distribution using $X_{t_1}$. Note that there is no correspondence between the data samples at $X_{t_0}$ and $X_{t_1}$. Figures~\ref{fig:A6(c)} and \ref{fig:A6(d)} illustrate the quantile representations obtained using the class labels at both these times. We then estimate $\Phi()$ using \eqref{eq:get_phi}. Figure~\ref{fig:A6(e)} shows the density at $X_{t_1}$ and Figure~\ref{fig:A6(f)} shows the density of $\Phi(X_{t_0})$. Observe that the estimate of the density and the actual density match. This shows that quantile representations can be used to correct distribution shifts.

\paragraph{Caveat:} However, quantile-representations cannot estimate a $\Phi()$ which does not change the distribution of the samples. For instance, if $X_{t_1} = - X_{t_0}$ and $X_{t_0}$ is symmetric around $0$, then the quantile-representations are identical. The conditions under which $\Phi(.)$ can be estimated are left for future work.

\paragraph{Advantage of using quantile-representations} A question which follows is - Why not simply retrain the classifier at $t_0$? (i) As can be gleaned from the above experiments, it is not possible to estimate $\Phi()$ from the single classifier alone, but can be done using quantile-representations (ii) The labels considered for constructing the quantile-representations need not be the same as the classification labels. They would correspond to important attributes of the data. For instance, one can consider aspects like technical skill of the candidate instead of simply suitable/not-suitable classification. 

\section{Training Details and Compute}
\label{sec:A7}

Training quantile representations was done on a DGX server using 4 GPUs. It takes around 10 hours (40 GPU hours in total) to learn the quantile representations for each model. We use stochastic gradient descent with cyclic learning rate for optimization. The base\_lr is taken to be $0.02$ and max\_lr is taken to be $1.0$, with exponentially decreasing learning rate using $\gamma = 0.99994$. The batch\_size is taken to be $1024$ for resnet34. The number of steps for the cyclic learning is taken to be $2(\nicefrac{size\_dataset}{(2batch\_size)} + 1)$. The $size\_dataset$ describes the size of the training data. 

\section{Why Quantile Regression?}
\label{sec:A8}

If the goal of a regression problem is to predict the likely range of estimates (prediction interval) and not just a single estimate as the Ordinary Least Square Regression (OLS) does, the method is required to be more general and robust. The method for producing such estimates, relatively unknown in the machine learning community, is known as quantile regression. While OLS regression minimizes the squared-error loss function to predict a single point estimate, quantile regressions minimize the quantile loss in predicting a certain quantile. The 50th percentile, otherwise known as the median, represents the quantile loss as the sum of absolute errors (MAE). Other quantiles could give endpoints of a prediction interval; for example, a middle-80-percent range is defined by the 10th and 90th percentiles. The quantile loss differs depending on the evaluated quantile, such that more negative errors are penalized more for higher quantiles and more positive errors are penalized more for lower quantiles. In other words, quantile loss varies with the error, depending on the quantile, commonly interpreted as quantile for under- and over-estimated predictions. The higher the quantile, the more the quantile loss function penalizes underestimates and the less it penalizes overestimates. Quantiles allow for an understanding of a probability distribution of a data set in which only the specifications of the positions are known. Thus, wherever predictions are subject to high uncertainty, the quantile loss should be the preferred loss function. Quantiles give some information about the shape of a distribution---in particular whether a distribution is skewed or not; they are robust to outliers and can model extreme events well. Conditional quantiles obtained via regression are used as a robust alternative to classical conditional means in econometrics and statistics, as they can capture the uncertainty in a prediction, and model tail behaviors, while making very few distributional assumptions.

Quantile regression has only relatively recently started being applied in the energy-growth nexus literature. In the past, it has been used extensively in pediatric medicine (offering an optimistic perspective for precision medicine), survival and duration time studies \cite{huang2017quantile}, the determination of wages, discrimination effects, and income inequality. Also, it has been used in the finance literature in studies that dealt with bank failure and the time occurrence of this failure \cite{schaeck2008bank}. Regarding the more recent application in the energy-growth nexus field, it is not well documented in the relevant studies why asymmetries would be present in the way income and wealth is generated in different countries given the consumption of energy in those countries and other stylized parameters. One reason, quite understandable, why to use this method, is for testing whether poorer countries will be affected the same way by energy conservation measures as the rich ones. Another reason as stated by \cite{TROSTER2018440} in their study on renewable energy, oil prices, and economic growth for the United States is that their study would allow them to determine whether extremely low or high changes in energy consumption prices would lead to economic growth. Therefore we can have very specific and accurate answers to what will happen if there is 1\% energy reduction in poor countries. This information would otherwise have to be included in dummy variables and other forms of robust estimation that assign less weight to observations that are characterized as outliers. Among the various other statistical twists offered by the method, the quantile regression may be favored because it does not assume a parametric distribution and it estimates the entire conditional distribution of the dependent variable. Generally, this method is regarded as more versatile and informative \cite{rodriguez2017five}.

A switch from the squared error to the tilted absolute value loss function allows gradient descent-based learning algorithms to learn a specified quantile instead of the mean. It means that we can apply all neural network and deep learning algorithms to quantile regression \cite{huang2017quantile,schaeck2008bank}. The application of quantiles in deep learning, although relatively recent, is critical for model interpretability. In the past, \cite{DBLP:journals/tai/TambwekarMDS22} extended the notion of conditional quantiles to the binary classification setting—allowing uncertainty quantification in the predictions, increased resilience to label noise thus furnishing new insights into the functions learnt by the models. This was accomplished by defining a new loss called binary quantile regression loss, in the classification setting. The estimated quantiles to obtain individualized confidence scores provide an accurate measure of a prediction being misclassified. These scores were then aggregated to compute two additional metrics, namely, confidence score and retention rate, which can be used to withhold decisions and increase model accuracy. Thus, in a non-parametric binary quantile classification framework, authors could demonstrate that quantiles aid in explainability as they can be used to obtain several uni-variate summary statistics that can be directly applied to existing explanation tools.

Therefore, it is reasonable to recognize the relevance and precedence of quantiles in classification, in particular for obtaining the conditional quantiles of the underlying latent function learnt by a binary classifier using a customized loss inspired by quantiles~\cite{TROSTER2018440}. 

% Experiment on ImagenetC: We have to do the following experiment on imagenet. (i) Simply retrain the last layer with new data and (ii) Use quantile representations to correct the classifier. The hypothesis is that matching quantile representations would give better classifier. For this however, the way we generate quantile-representations should be much more efficient. This is future work.

% \section{Future Work}
% \label{sec:A5}

% There are several different ways these ideas can be extended:

% \begin{itemize}
%     \item In this article we have only used the features from last layer to construct quantile representations. However, the algorithm in section \ref{ssec:genquantrep} can actually be used for entire neural networks as well. Intuitively, the parameter $\tau$ selects the weights of the deep neural network which in turn generates the predictions. We hypothesize that this should improve the OOD detection capabilities since the original data is used instead of the generated features.
% \end{itemize}
% comments on literature---------------%
% E. Parzen, “Quantile probability and statistical data modeling,” Stat. Sci.,vol. 19, pp. 652–662, Nov. 2004 \\
% R. Koenker and G. B. Bassett, “Regression quantiles,” Econometrica, vol. 46, pp. 33–50, 1978.
% S. Portnoy and R. Koenker, “Adaptive l-estimation for linear models,” Ann. Statist., vol. 17, no. 1, pp. 362–381, 1989.--IMPORTANT
% P. Chaudhuri, K. Doksum, and A. Samarov, “On average derivative quantile regression,” Ann. Statist, vol. 25, no. 2, pp. 715–744, 1997.
% R. Koenker, Quantile Regression, Ser. Econometric Society Monographs. Cambridge, U.K.: Cambridge Univ. Press, 2005.---IMPORTANT
% R. Maronna, D. Martin, and V. Yohai, Robust Statistics: Theory and Methods. Hoboken, NJ, USA: Wiley, Jun. 2006.
%P. Chaudhuri, “Generalized regression quantiles: Forming a useful toolkit for robust linear regression,” in L1 Stat. Anal. Related Methods–Proc. 2nd Int. Conf. L1 Norm Related Methods. Amsterdam, The Netherlands: North Holland, 1992, pp. 169–185.--IMPORTANT
% I. Takeuchi, Q. V. Le, T. D. Sears, and A. J. Smola, “Nonparametric quantile estimation,” J. Mach. Learn. Res., vol. 7, pp. 1231–1264, Jul. 2006.
% N. Tagasovska and D. Lopez-Paz, “Single-model uncertainties for deep learning,” in Proc. Adv. Neural Inf. Process. Syst., 2019, pp. 6417–6428.
% H. Zou and M. Yuan, “Composite quantile regression and the Oracle model selection theory,” Ann. Statist., vol. 36, no. 3, pp. 1108–1126, 2008.--IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% G. Kordas, “Smoothed binary regression quantiles,” J. Appl. Econ., vol. 21, pp. 387–407, 2006.
% M. H. Farrell, T. Liang, and S. Misra, “Deep neural networks for estimation and inference: Application to causal effects and other semiparametric estimands,” Econometrica, vol. 89, no. 1, pp. 182–213, 2021.--IMPORTANT
% P. Anand, R. Rastogi, and S. Chandra, “A new asymmetric $\epsilon$-insensitive pinball loss function based support vector quantile regression model.”--IMPORTANT
% A. Nguyen, J. Yosinski, and J. Clune, “Deep neural networks are easily fooled: High confidence predictions for unrecognizable images,” Proc. IEEE Conf. Comput. Vis. Pattern Recognit., 2015, pp. 427–436. --IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% B. Lakshminarayanan, A. Pritzel, and C. Blundell, “Simple and scalable predictive uncertainty estimation using deep ensembles,” in Proc. 31st Int. Conf. Neural Inf. Process. Syst., 2017, pp. 6405–6416.
%J. Liu, Z. Lin, S. Padhy, D. Tran, T. Bedrax Weiss, and B. Lakshminarayanan, “Simple and principled uncertainty estimation with deterministic deep learning via distance awareness,” in Adv. Neural Inf. Proc. Syst., Curran Associates, Inc., 2020, pp. 7498–7512.
% H. Jiang, B. Kim, M. Guan, and M. Gupta, “To trust or not to trust a classifier,” Adv. Neural Inf. Process. Syst., vol. 31, pp. 5546–5557, 2018.----IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% A. Ghosh, H. Kumar, and P. S. Sastry, “Robust loss functions under label noise for deep neural networks,” in Proc. 31st AAAI Conf. Artif. Intell., 2017, pp. 1919–1925.--IMPORTANT
%M. Neykov, P. Cizek, P. Filzmoser, and P. Neytchev, “The least trimmed quantile regression,” Comput. Statist. Data Anal., vol. 56, no. 6, pp. 1757–1770, 2012.
% Y.-Y. Yang, C. Rashtchian, H. Zhang, R. Salakhutdinov, and K. Chaudhuri, “A closer look at accuracy vs. robustness,” Adv. Neural Inf. Proc. Syst., Curran Associates, Inc., 2020, pp. 8588–8601.--IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% OOD papers by Balaji Lakshminarayanan--IMPORTANT
% C. Rudin, “Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead,” Nature Mach. Intell., vol. 1, no. 5, pp. 206–215, May 2019.--IMPORTANT..--IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% S. M. Lundberg et al., “From local explanations to global understanding with explainable AI for trees,” Nature Mach. Intell., vol. 2, no. 1, pp. 56–67, Jan. 2020, doi: 10.1038/s42256-019-0138-9.
% M. T. Ribeiro, S. Singh, and C. Guestrin, “Why should I trust you?: Explaining the predictions of any classifier,” in Proc. 22nd ACM SIGKDD Int. Conf. Knowl. Discov. Data Mining, New York, NY, USA: Association for Computing Machinery, 2016, pp. 1135–1144.---IMPORTANT--LIME----IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% P. Chaudhuri, “On a geometric notion of quantiles for multivariate data,” J. Amer. Stat. Assoc., vol. 91, pp. 862–872, 1996.
% M. Hallin, D. Paindaveine, and M. Šiman, “Multivariate quantiles and multiple-output regression quantiles: From $L_1$ optimization to halfspace depth,” Ann. Statist., vol. 38, no. 2, pp. 635–669, Apr. 2010.
% A. Tambwekar, A. Maiya, S. Dhavala, and S. Saha, “Estimation and Applications of Quantiles in Deep Binary Classification,” IEEE Transactions on Artificial Intelligence, vol. 3, no. 2, pp. 275–286, Apr. 2022, doi: 10.1109/TAI.2021.3115078.--IMPORTANT----IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% Transferable Calibration with Lower Bias and Variance in Domain Adaptation, Part of Advances in Neural Information Processing Systems 33 (NeurIPS 2020)--JORDAN---IMPORTANT--REVOLVE OUR PAPER AROUND THIS
% Estimating Confidence of Predictions of Individual Classifiers and Their Ensembles for the Genre Classification Task, Mikhail Lepekhin, Serge Sharoff--take a look!
% https://www.stat.cmu.edu/~jinglei/conf_class_R2.pdf   take a look
% https://www.nematilab.info/bmijc/assets/120518_paper1.pdf   take a look

\bibliography{main}
\bibliographystyle{plain}

\end{document}