\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument[main-]{uai2023-template}

\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Expectation consistency for calibration of neural networks \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:lucas.clarte@epfl.ch?Subject=Your UAI 2023 paper}{Lucas Clart\'e}{}}
\author[2]{Bruno Loureiro}
\author[3]{Florent Krzakala}
\author[1]{Lenka Zdeborov\'a}
% Add affiliations after the authors
\affil[1]{%
École Polytechnique Fédérale de Lausanne (EPFL) \\
Statistical Physics of Computation lab. \\
Lausanne, Switzerland
}
\affil[2]{%
    Département d’Informatique \\
    École Normale Supérieure - PSL \& CNRS \\
    45 rue d’Ulm, Paris, France
}
\affil[3]{%
    École Polytechnique Fédérale de Lausanne (EPFL)\\
    Information, Learning and Physics lab.\\
    Lausanne, Switzerland
  }

\usepackage{amsmath, amssymb, amsthm, xfrac, mathtools, amsfonts}
\usepackage{hyperref}
\usepackage{pgfplots}
\pgfplotsset{compat=newest}
\pgfplotsset{scaled y ticks=false}
\usepgfplotslibrary{groupplots}
\usepgfplotslibrary{dateplot}
\usepackage{bm}
\input{macros}
  
\begin{document}
  
\onecolumn %% Remove this line if a two-column layout is desired for the supplement
\maketitle
\appendix

\section{Details on training procedure}
\label{sec:training_procedure}

\paragraph{SVHN} For the SVHN dataset~\cite{netzer_reading_2011}, the Resnet20 model, of depth 20 and containing 0.27M parameters, was trained for $50$ epochs, using SGD with a learning rate $\eta = 0.1$, weight decay $10^{-4}$ and momentum $0.9$. $90\%$ of the data points were used for training and the rest was used for validation.
\paragraph{CIFAR10} ResNet models (of depth 20 and 56, with Resnet56 having 0.85M parameters) were trained for $50$ epochs, using SGD with a learning rate $\eta = 0.1$, weight decay $10^{-4}$ and momentum $0.9$. The DenseNet121 (containing 7.9M parameters) was trained with the same parameters as the ResNets, except for the learning rate $\eta = 0.01$. As in~\cite{he_resnet_2016}, images in the training set were randomly cropped and flipped horizontally.

\paragraph{CIFAR100} On CIFAR100, we used pre-trained models from the Github repository \url{https://github.com/chenyaofo/pytorch-cifar-models}. These models were trained on the entirety of the training set, so the test set containing $10000$ images was split in half into a validation and test set, containing $5000$ images each.

\subsection{Additional plots}

In Figure~\ref{app:reliability_diagram}, we plot the reliability diagram of Resnet20 and Resnet56 on SVHN and CIFAR10 respectively. We observe that the uncalibrated models are overconfident (as the confidence is higher than the corresponding accuracy), and both TS and EC mitigate this overconfidence.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.45\textwidth]{appendix_real_data/svhn_calibration.pdf}
    \includegraphics[width=0.45\textwidth]{appendix_real_data/resnet56_calibration.pdf}
    \caption{Reliability diagram of Resnet20 on the SVHN dataset (Left) and Resnet56 on the CIFAR10 dataset (Right). Before calibration, both models are overconfident. TS and EC improve calibration and mitigate overconfidence.}
    \label{app:reliability_diagram} 
\end{figure}

\section{State evolution equation}
\label{app:se_equations}

In this section, we focus on the data model introduced in Section 5. Recall that we consider a dataset of $n$ samples $\mathcal{D} = (x^{\mu}, y^{\mu})_{\mu = 1}^n$ generated by 
\begin{equation}
    \Vec{x} \sim \mathcal{N}(\Vec{0}, \mathcal{I}_d / d), \quad \Vec{w}_* \sim \mathcal{N}(\Vec{0}, \mathcal{I}_d), \quad \mathbb{P}(y = 1 | \wstar^{\top} \Vec{x}) = \sigma_{\star}(\wstar^{\top} \Vec{x})
\end{equation}
and we fit the following logistic regression model, with $\sigma$ the sigmoid function: 
\begin{equation}
    \ferm(\Vec{x}) = \sigma(\werm^{\top} \Vec{x})
\end{equation}
by minimizing the following empirical risk 
\begin{equation}
    \mathcal{R}(\Vec{w}, \mathcal{D}, \lambda) = - \sum_{\mu = 1}^n \log \sigma(y^{\mu} \Vec{w}^{\top} \Vec{x}^{\mu}) + \sfrac{\lambda}{2} \| \Vec{w} \|^2
\end{equation}
we thus have $\werm = \arg \min_{\Vec{w}} \mathcal{R}(\Vec{w}, \mathcal{D}, \lambda)$. For a new sample $\Vec{x}$, we are interested in the joint distribution of $\fstar(\Vec{x})$ and $\ferm(\Vec{x})$. As these two functions only depend on the scalar products $\wstar^{\top}\Vec{x}$, $\werm^{\top}\Vec{x}$, it suffices to compute the joint distribution of these scalar products. By the Gaussianity of $\Vec{x}$, whose covariance is $\mathcal{I}_d / d$, we just need to compute the \textit{overlaps} $m = \frac{1}{d} \wstar^{\top} \werm$ and $q = \frac{1}{d} \| \werm \|^2$. In the high-dimensional limit where $n, d \to \infty$ but where we keep the \textit{sampling ratio} constant $\sfrac{n}{d} = \alpha$, it is possible to compute the value of $m$ and $q$. The idea is to introduce the distribution
\begin{equation}
    \mu_{\beta, \mathcal{D}, \lambda}(\Vec{w}) = \frac{1}{\mathcal{Z}_{\beta}} \exp \left( - \beta \mathcal{R}(\Vec{w}, \mathcal{D}, \lambda) \right)
\end{equation}
where $\mathcal{Z}_{\beta}$ is a normalization constant. In the limit $\beta \to \infty$, $\mu_{\beta, \mathcal{D}, \lambda}$ converges to a Dirac distribution peaked at $\werm = \arg \min \mathcal{R}(\Vec{w}, \mathcal{D}, \lambda)$. To compute $m, q$, one needs to compute the expression of $\log \mathcal{Z}_{\beta}$ and its limit when $\beta \to \infty$. In the high-dimensional regime where both the dimension and number of samples diverge with a fixed ratio, this can be done using the \textit{replica method} from statistical physics \cite{zdeborova2016statistical}. As these computations are not the focus of the present paper, we refer to \cite{loureiro_learning_2021, clarte_theoretical_2022} for the detailed computations. In the end, if we define 
\begin{align}
    \mathcal{Z}_*(y, \omega, v_*) &= \int \dd z \sigma_*(y \times z) \mathcal{N}( z | \omega, v_*) \\
    f(y, \omega, v) &= \arg \min_z \left[  \frac{(z - \omega)^2}{2v} - \log \sigma(y z) \right]
\end{align}
then $m, q$ are the solution of the following self-consistent equations: 
\begin{align}
    \begin{cases}
        m &= \frac{\hat{m}}{\lambda + \hat{v}} \\
        q &= \frac{ \hat{q} + \hat{m}^2 }{(\lambda + \hat{v})^2}   \\
        v &= \frac{1}{\lambda + \hat{v}}
    \end{cases}, \quad 
    \begin{cases}
        \hat{m} &= \alpha \mathbb{E}_{ \xi \sim \mathcal{N}(0, q)} \left[ \int \dd y \partial_{\omega} \mathcal{Z}_*(y, \sfrac{m}{q}\xi, v_*) f(y, \xi, v) \right]  \\
        \hat{q} &= \alpha \mathbb{E}_{ \xi \sim \mathcal{N}(0, q) } \left[ \int \dd y \mathcal{Z}_*(y, \sfrac{m}{q}\xi, v_*) f^2(y, \xi, v) \right] \\
        \hat{v} &= - \alpha \mathbb{E}_{ \xi \sim \mathcal{N}(0, q) } \left[ \int \dd y \mathcal{Z}_*(y, \sfrac{m}{q}\xi, v_*) \partial_{\omega} f(y, \xi, v) \right]
    \end{cases}
\end{align}
with $v_* = \rho - \sfrac{m^2}{q}$, where $\rho = \| \wstar \|^2 / d$ is the variance of the scalar product $\wstar^{\top} \Vec{x}$.

\paragraph{Calibration in the high-dimensional regime} Once we have obtained the overlaps $m, q$, we can derive the expression of the calibration $\Delta_{\ell}$: 
\begin{align}
    \Delta_{\ell} &= \ell - \mathbb E \left[ \fstar(\vec x) | \ferm(\vec x) = \ell \right] = \ell - \mathbb P \left[ y = 1 | \ferm(\vec x) = \ell \right] = \ell - \int \dd z \sigma_{\star}(z) \mathcal{N}(z | \frac{m}{q} \sigma^{-1}(\ell), \rho - \sfrac{m^2}{q})
\end{align} 
The last equality comes from the fact that the scalar product $\wstar^{\top} \Vec{x}$ conditioned on $\werm^{\top} \Vec{x} = \sigma^{-1}(\ell)$ follows a Gaussian distribution with mean $\sfrac{m}{q} \sigma^{-1}(\ell)$ and variance $\rho - \sfrac{m^2}{q}$. As a consequence, the expression of ECE is 

\begin{align}
    \mathrm{ECE} &= \mathbb{E}_{\Vec{x}} \left[ | \Delta_{\ferm(\Vec{x})}| \right] = \mathbb{E}_{\xi = \werm^{\top}\Vec{x}} \left[ | \Delta_{\sigma(\xi)}|\right] = \int \dd \xi |\Delta_{\sigma(\xi)}| \mathcal{N}(\xi | 0, q)
\end{align}

{\color{black}
\section{Experiments on corrupted dataset}

We describe below an experiment where EC can significantly improve over TS for real data: we train different architectures on several image classification tasks, as in Figure 1. However, here for the validation and test set some classes are replaced with random labels. For SVHN and CIFAR10, the labels $y = 0$ are replaced by random labels. For CIFAR100, the labels $y = 0, \dots, 9$ are replaced by random labels. By doing so, around 10\% of validation/test data is corrupted, with a noise that depends on the class. 
Note that the training data is left unchanged: the goal of this experiment is to model a distribution shift between training and test data, similarly to what is done in~\cite{hendrycks_benchmarking_2019}.

In Table~\ref{fig:tab_ece}, we compare the performance (in ECE and Brier score) of EC and TS with these corrupted datasets. We observe that in this setting, EC outperforms TS by a significant margin on several datasets and architectures.

\begin{table*}
    \centering
    \begin{tabular}{|c|c|ccc|ccc|ccc|}
        \hline
        % TODO : Maybe write the number of parameters / layers in the models 
        Dataset & Model & $\mathcal{E}_g$ & $T_{TS}$ & $T_{EC}$ & $ECE$ & \!\!\!\! $ECE_{TS}$ \!\!\!\!& \!\!\!\!$ECE_{EC}$ \!\!\!\!& \!\!\!\! $BS$ \!\!\!\! & \!\!\!\!$BS_{TS}$ \!\!\!\!& \!\!\!\!$BS_{EC}$ \!\!\!\!\\
        \hline
        SVHN & Resnet20 & 12.5   & 2.69 & 2.23 & 8.3    & 10.7   & 7.5   & 21.9   & 23.4   &  22.1   \\ 
        \hline 
        CIFAR10  & Resnet20 & 20.9   & 2.4 & 2.0 & 12.8   & 4.6   & 4.2   & 34.2   & 32.2   & 32.1   \\
        CIFAR10  & Resnet56 &  21   & 2.58 & 2.15 & 13.8   & 5.4   & 4.9   & 35.2 & 32.9 & 32.8 \\
        CIFAR10  & Densenet121 & 20.4   & 2.76 & 2.54 & 15.8   & 3.6   & 5.0    & 35.9     & 31.8  & 31.9   \\
        \hline
        CIFAR100 & Resnet20 & 38.1   & 2.04 & 1.70 & 16.5   & 9.6   & 5.9   & 57.0   & 54.9    & 53.9  \\
        CIFAR100 & Resnet56 & 34.8   & 2.27 & 2.10 & 21.7   & 7.6   & 7.3   & 56.0   & 50.6   & 50.4   \\ 
        CIFAR100 & VGG19 & 35.5   & 2.6 & 2.1 & 28.34   & 5.2   & 5.1   & 61.8   & 50.1   & 50.1   \\
        CIFAR100 & RepVGG-A2 & 30.5   & 1.44 & 1.40 & 13.7   & 11.6   & 11.7   & 47.2   &  47.1   & 47.0  \\
        \hline
    \end{tabular}
    \caption{Comparison of expected calibration error (ECE) and Brier score (BS) of temperature scaling (TS) and expectation consistency (EC) when part of the validation and test data has been corrupted}
    \label{fig:tab_ece}
\end{table*}
}

\nocite{*}
\bibliography{clarte_449}

\end{document}
