\documentclass[accepted]{uai2024} %
                        

\usepackage[american]{babel}

\usepackage{booktabs}
\usepackage{enumitem}

\RequirePackage{fancyhdr}
\RequirePackage{xcolor} %
\RequirePackage{algorithm}
\RequirePackage{algorithmic}
\RequirePackage{eso-pic} %
\RequirePackage{forloop}
\RequirePackage{url}

\usepackage{natbib} %
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{xfrac}
\usepackage{amsmath, amsfonts, amssymb}
\usepackage{bbm}
\input{macros}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} %

\usepackage{hyperref}
\hypersetup{%
            colorlinks, breaklinks=true,    urlcolor=orange, linkcolor=blue, citecolor=blue
        }
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage[capitalize,noabbrev]{cleveref}

\usepackage{pgfplots}
\pgfplotsset{compat=newest}
\pgfplotsset{scaled y ticks=false}
\usepgfplotslibrary{groupplots}
\usepgfplotslibrary{dateplot}
\usepackage{tikz}



\newcommand{\swap}[3][-]{#3#1#2} %

\title{Analysis of Bootstrap and Subsampling \\ in High-dimensional Regularized Regression}

\author[1]{\vspace{-2em}Lucas Clart\'e}
\author[1,2]{Adrien Vandenbroucque}
\author[1,2,3]{Guillaume Dalle}
\author[4]{Bruno Loureiro}
\author[2]{Florent Krzakala}
\author[1]{Lenka Zdeborov\'a}

\affil[1]{
\'Ecole Polytechnique F\'ed\'erale de Lausanne (EPFL)\\
SPOC laboratory\\
CH-1015 Lausanne, Switzerland
}
\affil[2]{
\'Ecole Polytechnique F\'ed\'erale de Lausanne (EPFL)\\
IdePHICS laboratory\\
CH-1015 Lausanne, Switzerland
}
\affil[3]{
\'Ecole Polytechnique F\'ed\'erale de Lausanne (EPFL)\\
INDY laboratory\\
CH-1015 Lausanne, Switzerland
}
\affil[4]{
D\'epartement d'Informatique, \'Ecole Normale Sup\'erieure - PSL \& CNRS, 45 rue d’Ulm, F-75230 Paris cedex 05, France
}


\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\begin{document}

\maketitle

\begin{abstract}
We investigate popular resampling methods for estimating the uncertainty of statistical models, such as subsampling, bootstrap and the jackknife, and their performance in high-dimensional supervised regression tasks. We provide a tight asymptotic description of the biases and variances estimated by these methods in the context of generalized linear models, such as ridge and logistic regression, taking the limit where the number of samples and dimension of the covariates grow at a comparable fixed rate. Our findings are three-fold: i) resampling methods are fraught with problems in high dimensions and exhibit the double-descent-like behavior typical of these situations; ii) only when the sampling ratio is large enough do they provide consistent and reliable error estimations (we give convergence rates); iii) in the over-parametrized regime relevant to modern machine learning practice, their predictions are not consistent, even with optimal regularization. 
\end{abstract}
\section{Introduction}
\label{sec:intro}
Estimating and quantifying errors is a central aspect of statistical practice. Nevertheless, a solid understanding of how uncertainty can be reliably quantified in modern machine learning practice is largely missing, despite being a key endeavor towards a reliable use of these methods across sensitive applications. This paper delves into a comprehensive mathematical analysis of conventional resampling methods to estimate uncertainty, such as subsampling, the bootstrap and the jackknife, specifically in the context of high-dimensional regression and classification tasks. 

Let $Z_{1},\cdots, Z_{n}\sim p_{\theta}$ denote $n$ independent samples from a parametric probability distribution. Given an estimator $\hat{\theta}$ of $\theta$ (e.g. the maximum likelihood estimator), one is interested not only in the absolute performance of $\hat{\theta}$ but also in estimating how reliable it is, e.g. error bars. In particular, even if the estimator is consistent, i.e. $\hat{\theta}\!\to\!\theta$ when $n\!\to\!\infty$, having access only to a finite amount of data $n$ introduces uncertainty in our estimation of $\theta$. A central question in statistics is \emph{how to quantify this uncertainty} \citep{wasserman2004all}.

A classical family of non-parametric methods developed to address this question are \emph{resampling methods} \citep{tibshirani1993introduction,james2023resampling}, which consist in estimating the statistics of interest from the empirical distribution $p_{n} = \sfrac{1}{n}\sum_{i=1}^{n}\delta_{Z_{i}}$. Our goal is to investigate the statistical properties of three popular resampling methods in the context of the most widespread machine learning task: \emph{supervised learning}. Here the samples are given by pairs $Z_{i} = (\vec{x}_{i}, y_{i})$ from a joint distribution $p_{\theta}(\vec{x},y)$, with $\vec{x}_{i}\in\mathbb{R}^{d}$ being the covariates and $y_{i}\in\mathcal{Y}\subset\mathbb{R}$ the labels. Given the parameter $\hat \theta$ learned by a fitting model, say ridge or logistic regression, the goal is to estimate the actual bias and variance of $\hat \theta$. 

We focus on the {\it high-dimensional} regime, where both the number of samples $n$ and their dimension $d$ are comparatively large, with a fixed ratio $\alpha=n/d$. We provide a tight asymptotic description of the biases and variances estimated by resampling methods for generalized linear models, such as ridge and logistic regression or any M-estimator. We show that resampling methods are fraught with problems in high dimensions, either overestimating or underestimating the mean and variances. Reliable error estimation can only be reached in the regime when $\alpha\gg 1$, for which we provide asymptotic rates of convergence. However, in the overparametrized regime $\alpha < 1$, relevant to modern machine learning practice, the predictions of resampling methods are clearly off, even when optimally regularizing.

\section{Setting \& motivation}
\label{sec:setting}
We consider the class of generalized linear estimation problems, where the goal is to estimate a parameter $\vec{\theta}_{\star}\in\mathbb{R}^{d}$ from $n$ independent samples $\mathcal{D}=\{(\vec{x}_{i},y_{i})_{i\in[n]}\}$ drawn from the following distribution:
\begin{align}
    \label{eq:def_model}
    y_{i}\sim p(\cdot|\vec{\theta}_{\star}^{\top}\vec{x}_{i}), && \vec{x}_{i}\sim\mathcal{N}(0,\sfrac{1}{d}\mat{I}_{d})
\end{align}
for a general likelihood $p(y|z)$. Therefore, in this case, the joint distribution reads $p_{\vec{\theta}_{\star}}(\vec{x},y) = p(y|\vec{\theta}_{\star}^{\top}\vec{x})p(\vec{x})$. For concreteness, we assume $\vec{\theta}_{\star}\sim\mathcal{N}(0,\mat{I}_{d})$. In the following, we focus on the (regularized) maximum likelihood estimator:
\begin{align}
    \label{eq:def_erm}
    \hat{\vec{\theta}}_{\lambda}(\mathcal{D}) = \underset{\vec{\theta}\in\mathbb{R}^{d}}{\rm argmin}\sum\limits_{i=1}^{n}-\log p\left(y_{i}|\vec{\theta}^{\top}\vec{x}_{i}\right) + \frac{\lambda}{2}\|\vec{\theta}\|^{2}_{2}
\end{align}
also known as the \emph{empirical risk minimizer} in the context of supervised machine learning, where the loss function coincides with minus the empirical log-likelihood: ${\ell(y,z) = -\log p(y|z)}$. When it is clear from the context, we omit the dependence of the MLE on the training data $\mathcal{D}$ and write $\hat{\vec{\theta}}_{\lambda}$.

We will focus on two particular examples of generalized linear estimation: ridge and logistic regression. Ridge regression is a regression problem $\mathcal{Y}=\mathbb{R}$, which corresponds to the Gaussian likelihood $p(y|z) = \mathcal{N}(y|z, \Delta)$ of mean $z$ and variance $\Delta$ (or equivalently the square loss function $\ell(y,z)=\frac{1}{2\Delta}(y-z)^{2}$) for $\Delta>0$. Instead, logistic regression is a binary classification problem $\mathcal{Y}=\{-1,+1\}$ which corresponds to a logit likelihood $p(y|z) = \sigma(yz)$ for $\sigma(t) = (1+e^{-t})^{-1}$ the logistic function (this corresponds to the logistic or cross-entropy loss function $\ell(y,z) = \log(1+e^{-yz})$).

Note that the estimation problem introduced above is well-specified, and therefore enjoys strong mathematical guarantees in the classical statistical regime where $n\to\infty$ at fixed $d$. For instance, a well-known result is the asymptotic normality of the MLE for $\lambda=0$ \citep{wasserman2004all}: 
\begin{align}
\label{eq:consistency}
    \sqrt{n}\left(\hat{\vec{\theta}}_{0} - \vec{\theta}_{\star}\right) \overset{(d)}{\to} \mathcal{N}(0, \mathcal{I}^{-1}), && n\to\infty
\end{align}
where $\mathcal{I}\in\mathbb{R}^{d\times d}$ is the Fisher information matrix, in particular implying consistency and calibration of the maximum likelihood estimator. However, those guarantees break down when the number of samples is comparable with the dimension of the covariates, $n=\Theta(d)$. This is precisely the regime of interest in our work, and analyzing resampling methods in this regime will be our goal in the following.

\subsection{What statisticians want}
``Bias'' and ``variance'' depend on the underlying data sampling process, and therefore, different notions co-exist, whether one takes, for instance, a frequentist or Bayesian viewpoint. Below, we define these different quantities, which resampling methods try to approximate.

\paragraph{Frequentist bias and variance --- } In the classical frequentist approach, the statistician seeks to estimate the bias and variance with respect to the data sampling process. This induces the classical \emph{bias-variance decomposition} of the mean squared error for the estimator $\hat{\vec{\theta}}_{\lambda}$:
\begin{equation}
    {\rm MSE}(\werm)\!=\!\frac 1d  \mean{\dataset, \wstar}{\|\werm -\wstar  \|^2} \!\!=\!  \biasOnXY \!+\! \varianceOnXY \nonumber 
\end{equation}
with: 
\begin{align}
    \biasOnXY &=  \frac1d\left\lVert\mean{\dataset, \wstar}{\werm} - \vec{\theta}_{\star}\right\lVert^2 \label{eq:def_true_bias}\\
   \varianceOnXY &= \frac1d\mean{\dataset, \wstar}{\left\lVert\werm-\mean{\dataset, \wstar}{\werm}\right\lVert^{2}}. \label{eq:def_true_variance}
\end{align}
We emphasize that in this case, the expectations are taken with respect to sampling of the full data set ${\mathcal{D} = \{(\vec{x}_{i},y_{i})_{i\in[n]}\}\sim p^{\otimes n}_{\wstar}}$.

\paragraph{Conditional bias and variance --- }
Alternatively, in a supervised learning setting one can define the bias and variance only with respect to the sampling of the labels $y_{i}\sim p(\cdot|\vec{x}_{i}^{\top}\vec{\theta}_{\star})$, i.e. conditionally on the covariates $\vec{x}_{i}$. This is known as a \emph{fixed design} analysis. We will refer to the corresponding notions as {\it conditional} bias and variance:
\begin{align}
    \biasOnY &=  \frac1d\left\lVert\mathbb{E}_\mathcal{D}[\hat{\vec{\theta}}_{\lambda}|\mat{X}] - \vec{\theta}_{\star}\right\lVert^2\label{eq:def_true_cond_bias}\\
   \varianceOnY &= \frac1d\mathbb{E}_{\mathcal{D}}\left\lVert\hat{\vec{\theta}}_{\lambda}-\mathbb{E}[\hat{\vec{\theta}}_{\lambda}|\mat{X}]\right\lVert^{2},\label{eq:def_true_cond_variance}
\end{align}
where for convenience we defined the covariate matrix $\mat{X}\in\mathbb{R}^{n\times d}$ with rows given by the covariates $\vec{x}_{i}\in\mathbb{R}^{d}$.

\paragraph{Bayesian estimator and variance ---}
Finally, it is natural to compare the maximum likelihood estimator above with the best estimator (in mean squared error) conditioned on the full training data $\mathcal{D}$, also known as the \emph{Bayes-optimal} estimator. It requires, however, the knowledge of the {\it a priori} distribution of the ``true'' weights.
\begin{align}
    \hat{\vec{\theta}}_{\rm bo} = \underset{\hat{\vec{\theta}}\in\mathbb{R}^{d}}{\rm argmin}~\mathbb{E}\left[\lVert \hat{\vec{\theta}} - \vec{\theta}_{\star}\rVert^{2}\right] = \mathbb{E}[\vec{\theta}|\mathcal{D}]
\end{align}
where the conditional expectation is taken with respect to the posterior distribution:
\begin{align}
\label{eq:def_bo}
    p(\vec{\theta}|\mathcal{D}) \propto \mathcal{N}(\vec{\theta}|0,\mat{I}_{d})\prod\limits_{i=1}^{n} p(y_{i}|\vec{\theta}^{\top}\vec{x}_{i}) 
\end{align}
Note that, by definition, $\hat{\vec{\theta}}_{\rm bo}$ is an unbiased and calibrated estimator of $\vec{\theta}_{\star}$ \citep{clarte2023theoretical}. Nevertheless, it captures the irreducible variance due to the fact we have a finite sample $\mathcal{D}$ of the population distribution: 

\begin{equation}
    \varianceBO = \frac1d\mathbb{E}\left[\left\lVert \vec{\theta} -\hat{\vec{\theta}}_{\rm bo} \right\rVert^{2}|\mathcal{D}\right]
\end{equation}
where, again, the expectation is taken over the posterior distribution $p(\vec{\theta}|\mathcal{D})$.

\subsection{Resampling estimates} \label{sec:resampling_estimates}
A central problem in statistics is the estimation of the biases \eqref{eq:def_true_bias} \& \eqref{eq:def_true_cond_bias} and variances \eqref{eq:def_true_variance} \& \eqref{eq:def_true_cond_variance}, which involve population expectations, from a finite number of samples $\mathcal{D}=\{(\vec{x}_{i},y_{i})_{i\in[n]}\}$. Resampling methods are a popular class of statistical procedures that fit a family of $B$ estimators $\hatw_{b} \equiv \werm(\mathcal{D}_{b}^{\star})$ from resampled data $\mathcal{D}^{\star}_{b}$ generated from the original samples $\mathcal{D}=\{(\vec{x}_{i},y_{i})_{i\in[n]}\}$, and from which the bias and variance of $\hat{\vec{\theta}}_{\lambda}$ can be estimated:
\begin{align}
    \widehat{\Bias}^{2} &= \frac1d\left\lVert \frac{1}{B}\sum\limits_{b=1}^{B}\hatw_{b} - \werm \right\lVert^{2}, \label{eq:def:bias}\\ 
    \widehat{\Var} &= \frac{1}{dB}\sum\limits_{b=1}^{B}\left\lVert \hatw_{b}-
    \frac{1}{B}\sum\limits_{b=1}^{B}\hatw_{b}\right\lVert^{2}\label{eq:def:var}
\end{align}
In this work, we will focus on the following methods:
\begin{description}[noitemsep,leftmargin=1em,wide=1pt]
    \item[- Pair bootstrap:] Consists in resampling $\mathcal{D}_{b}^{\star}$ from $\mathcal{D}$ with sample replacements, or in other words, sampling ${\mathcal{D}^{\star}_{b} = \{(\vec{x}^{\star}_{b,i},y^{\star}_{b,i})_{i\in[n]}\}\sim p^{\otimes n}_{n}}$ from the empirical distribution. %
    
    \item[- Residual bootstrap:] Akin to the pair bootstrap method, but for the conditional distribution $p(y|z)$. In practice, one first fits an estimator $\hat{\vec{\theta}}_{\lambda}(\mathcal{D})$ on the original samples (the MLE \eqref{eq:def_erm} in our setting), and given a statistical model for $\hat{p}(y|z)$, one resamples only the labels from $\hat{p}(y|\hat{\vec{\theta}}_{\lambda}(\mathcal{D})^\top\vec{x}_i)$, generating new datasets $\dataset^{\star}_b = \{ \Vec{x}_i, y^{\star}_{b,i} \}_{i = 1}^n$. This allows for the estimation of conditional statistical errors. 
    
    \item[- Subsampling:] Consists of generating new datasets $\mathcal{D}_{b}^{\star}$ of a smaller size $\lfloor r  n \rfloor$  by subsampling $\mathcal{D}$ without replacement, where $r\in(0,1)$. While bootstrap creates datasets of the right size but from the wrong distribution (as elements of $\dataset$ are duplicated), subsampling relies on data of the wrong size but from the right distribution.\footnote{Since the $\mathcal{D}_{b}^{\star}$'s are independent conditionally on $\mathcal{D}$.}
    
    \item[- Jackknife:] Consists of creating $B=\!n\!$ datasets $\mathcal{D}^{\star}_{b}\!=\!\{(\vec{x}_{i}, y_{i})_{i\neq b}\}$, each of which leaves a single sample out. Note that when $n\!\to\!\infty$, as in our high-dimensional regime, this is equivalent to subsampling with $r\!\to\!1$.
\end{description}
For notational convenience, we will refer to these statistics  as $\widehat{\Bias^2_t}, \widehat{\Var_t}$ with $t\!\in\!\{\pb, \rb, \Ss, \jk\}$ for  pair (pb) and residual bootstrap (rb), subsampling (ss) and jackknife (jk). \looseness=-1

\section{Contributions \& related work}
The resampling methods above have been widely studied in the classical statistical literature, with whole books dedicated to proving their mathematical soundness \citep{10.1214/aos/1176344552, 10.1214/ss/1177013815, Davison_Hinkley_1997}. However, as discussed in \cref{sec:setting} most of the classical guarantees hold in the regime where the quantity of data $n$ available to the statistician is large in comparison with data dimension $d$ --- a regime that falls short in the context of modern machine learning practice. Of particular importance was the work of \citet{ElKaroui2018} who have pointed out the lack of consistency of the bootstrap method for {\it unregularized} least squares, in the {\it underparametrized regime} $n>d$.  One of our goals in this manuscript is to fill the gap, providing a complete evaluation of the aforementioned methods (beyond bootstrap), including the effect of regularization and over-parametrization. 

More precisely, our \textbf{main contributions} are:
\begin{itemize}[noitemsep,leftmargin=1em,wide=1pt]
    \item We provide a closed-form expression for the biases and variances in the proportional high-dimensional limit where $n,d \to \infty$ at fixed rate $\alpha=n/d$ for all the cases discussed in \cref{sec:setting}: the pair and residual biases and variances and their bootstrap, subsample, and jackknife estimates. Our result holds for generic log-concave likelihoods (corresponding to convex losses) and convex regularizers. 
    
    \item Our formulas are derived from mapping to a Generalized Approximate Message Passing (GAMP) scheme admitting a rigorous asymptotic characterization in terms of \emph{state evolution} equations \citep{bayati2011dynamics,bayati2011lasso,JMLR:v15:javanmard14a,emami2020generalization,loureiro2021learning}. We believe this derivation is of independent interest, as we show how simultaneously tracking {\it coupled} GAMP trajectories provides the biases and variances for all the resampling methods. Our construction is quite generic and can be extended to other variants of interest.
    \item Our examination into the effectiveness and limitations of these methods yields three key insights. Firstly, we demonstrate that resampling techniques face significant challenges in high-dimensional contexts, resulting in a double-descent behavior typical of such scenarios. Secondly, we find that these methods yield consistent and reliable error estimates only when the ratio $\alpha$ is sufficiently large, for which we also present convergence rates. Thirdly, in the overparametrized regime where $\alpha\!<\!1$, the predictions remain inconsistent despite optimal regularization.
\end{itemize}

\paragraph{Further related work ---} Resampling methods are a classical topic in statistics. The jackknife method was introduced in \citet{6c956df0-ca97-3419-9961-dcc097853946}, refined by \citet{10.1214/aoms/1177706647} and analysed by \citet{efron1981jackknife}. Bootstrap was introduced by \citet{10.1214/aos/1176344552}, and studied in the context of least squares estimation in \citet{10.1214/aos/1176345638, 10.1214/aos/1176350142}.


The asymptotic theory of high-dimensional statistical generalized linear problems has witnessed a burst of activity over the last decades. Pioneered by the statistical physics community in the late 80s \citep{Gardner_1989, Opper_1990, NIPS1991_8eefcfdf, PhysRevA.45.6056, 6796373}, it is now an established field of research encompassing applications to machine learning, statistics, and signal processing among others \citep{bayati2011lasso, Karoui2013a, Donoho2016, pmlr-v40-Thrampoulidis15, thrampoulidis2018precise, 10.1214/17-AOS1549, sur_modern_2018, 10.1214/18-AOS1789, pmlr-v125-gerbelot20a, 9745052, loureiro2021learning, Loureiro_2022, Bellec2023, Bellec2023b}. 
Bayes-optimal generalization guarantees for generalized linear models were established by \citet{6566160, PhysRevX.2.021005, doi:10.1073/pnas.1802705116, NEURIPS2020_7ec0dbee}.
\cite{10.1214/18-AOS1789} have shown that, besides not being well-defined when $n<d$, the unregularized maximum likelihood estimator is biased \citep{Karoui2013a, Karoui2013b, Bean2013,sur_modern_2018, Bellec2022b} for $n>d$. One consequence is that the variance of the MLE underestimates the true variance of $\vec{\theta}_{\star}$, leading to an overconfident prediction \citep{bai2021dont, NEURIPS2021_9854d7af, clarte2023theoretical}. Indeed, \citet{clarte2023theoretical, clarte2022overparametrized} highlighted the importance of properly regularizing the MLE in the high-dimensional regime, showing that cross-validation over $\lambda$ can mitigate some of these issues. \citet{clarte2023ec} showed that post-training \textit{temperature scaling} can mitigate overconfidence, regardless of the regularization used.

Bagging (the combination of subsampling with ensembling) has been studied in the high-dimensional regime by \citet{NIPS1995_1019c809, PhysRevE.55.811, pmlr-v108-lejeune20b, JMLR:v24:23-0887, pmlr-v202-du23d, pmlr-v202-chen23am, ando2023highdimensional, patil2023asymptotically}. Ensembling has also been investigated in the context of the random features model as a tool to decouple the different sources of randomness \citep{pmlr-v119-d-ascoli20a, JMLR:v22:20-1211, NEURIPS2020_7d420e2b, Loureiro2022_ensembling}. The performance of bootstrap averaging has been studied in the context of Gaussian Processes and Support Vector Machines using the replica method by \citet{NIPS2002_f12ee973, NIPS2003_2c6ae45a}. A replicated AMP algorithm for computing bootstrap averages of GLMs was proposed by \citet{takahashi2019replicated} and studied in the context of LASSO \citep{JMLR:v20:18-109} and Elastic Net \citep{takahashi2023role}.

Finally, we note that resampling methods in the context of generalized linear models are not just theoretical abstractions but are actually used in machine learning practice. For instance, \citet{Musil2019Fast} use subsampling to estimate the uncertainty in kernel regression for the energy of molecular compounds. Their observation that subsampling yields a better uncertainty estimation than Bootstrap or Gaussian processes is one motivation for the present work.
\begin{figure*}[t]
    \centering
    \def\figwidth{0.33\linewidth}
    \def\figheight{0.25\linewidth}
    
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_variance}
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_variance_2}
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_variance_difference}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_variance}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_variance_2}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_variance_difference}
    \caption{Variances for ridge regression at $\lambda = 10^{-2}$ (Top) and $\lambda = 1$ (Bottom). Left: variance of pair resampling methods and of Bayes-posterior. Middle: variance of conditional resampling and residual bootstrap. Right: difference between the true variances $\varianceOnXY$, $\varianceOnY$ and their estimation. Dots are simulations done at $d = 200$, with $B = 10$ resamples for bootstrap and subsampling.}
    \label{fig:variance_ridge}
\end{figure*}


\section{Main technical results}
\label{sec:technical}
The key observation in the results that follow is that in order to asymptotically characterize the biases and variances associated with any of the resampling methods in \cref{sec:setting}, it is sufficient to characterize only a few correlations. For example, the resampling variance \eqref{eq:def:var} can be rewritten as:
\begin{align}
    \widehat{\Var} = \frac1d\left(\frac{1}{B}\sum\limits_{k=1}^{B}\lVert\hat{\vec{\theta}}_{k}\rVert^{2} - \frac{1}{B^2}\sum\limits_{k,k'=1}^{B}\hat{\vec{\theta}}_{k}^{\top}\hat{\vec{\theta}}_{k'}\right).
\end{align}
Assuming the data sets $\mathcal{D}_{k}^{\star}$ are independently resampled from $\mathcal{D}$, it is then enough to characterize the norm of $\hat{\vec{\theta}}_{1}$ and the correlation between two independent (conditionally on $\mathcal{D}$) resampled estimators $\hat{\vec{\theta}}_{1}^{\top}\hat{\vec{\theta}}_{2}$---with all the rest being statistically similar. The results that follow precisely characterize these quantities asymptotically.
\begin{figure*}[t]
    \centering
    \def\figwidth{0.32\linewidth}
    \def\figheight{0.25\linewidth}
    
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_bias}
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_bias_2}
    \input{icml2024/Figures/ridge/sigma=1lambda=0.01/ridge_regression_lambda=0.01_bias_difference}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_bias}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_bias_2}
    \input{icml2024/Figures/ridge/sigma=1lambda=1/ridge_regression_lambda=1.0_bias_difference}
    
    \caption{Bias of ridge regression and its estimation using pair bootstrap and subsampling at $\lambda = 10^{-2}$ (Top) and $\lambda = 1$ (Bottom). Left: bias of pair resampling methods. Middle: conditional bias and bias of residual bootstrap. Right: difference between the various biases.}
    \label{fig:bias_ridge}
\end{figure*}
Finally, the methods defined in \cref{sec:setting} naturally divide into two categories: estimators for the statistics of the joint distribution $p_{\wstar}(\vec{x},y)$ (we refer to them as \emph{pair resampling}) and for the conditional distribution $p(y|\wstar^{\top}\vec{x})$ (we refer to them as \emph{conditional} or \emph{residual resampling}). Below, we start by discussing our results for the former. 

\subsection{Pair resampling}
The key idea is to reframe the regularized MLE problem \eqref{eq:def_erm} as a \textit{weighted empirical risk minimization} (wERM) problem:
\begin{equation}
    \werm\left(\dataset,\vec{p} \right) = \arg\min_{\Vec{\theta} \in \mathbb{R}^d} \sum_{i=1}^{n} - p_i \log p \left( y_i | \Vec{\theta}^{\top} \Vec{x}_i \right) + \sfrac{\lambda}{2} \| \Vec{\theta} \|^2
    \label{eq:def_weighted_erm}
\end{equation}
where for each sample $(\vec{x}_{i},y_{i})\in\mathcal{D}$, we have introduced a sample weight $p_{i}$. When $p_{i}=1$ for all $i\in[n]$, this reduces to standard MLE \eqref{eq:def_erm}, which we sometimes refer to as full resampling (abbreviated $\fr$). However, by taking the $p_{i}$'s at random from a judiciously chosen distribution, we can asymptotically cover all pair resampling methods from \cref{sec:setting}. 

Indeed, it is immediate to see that by choosing $p_{i}\in\{0,1\}$ at random from a Bernoulli distribution with probability $r\in(0,1]$, the wERM \eqref{eq:def_weighted_erm} asymptotically corresponds to doing subsampling. Intuitively, this can be seen as throwing a coin for each sample $i\in[n]$ in order to decide whether to include it in the subsampled batch $\mathcal{D}^{\star}_{\rm ss}$, which on average will contain $rn$ samples. The jackknife estimator can then be obtained as the $r\to 1^{-}$ limit of subsampling.

Similarly, pair bootstrap is asymptotically equivalent to taking $p_{i}\sim \Pois(1)$ independently. Indeed, for finite $n$, pair bootstrap exactly corresponds to taking $\vec{p}\in\mathbb{R}^{n}$ from the multinomial distribution ${\rm Multinomial}(n,\sfrac{1}{n})$. As $n\to\infty$, this is marginally equivalent to choosing $p_{i}\sim\Pois(1)$ independently~\citep[Section 3.1]{ElKaroui2018}.

To summarize, each resampling method can be thought of as applying sampling weights which are \textit{i.i.d.}, with distributions defined as
\begin{align}
    \begin{cases}
        \mu_{\pb}(p) &\vcentcolon= \frac{1}{ep!}\\
        \mu_{\Ss(r)}(p) &\vcentcolon= r^p(1-r)^{1-p} \text{ for } r\in(0,1).
    \end{cases}
\end{align}
We note that the \textit{i.i.d.}\ nature of the sample weights $p_i$, $i\in[n]$, within a given resampling method is a key assumption that allows us to derive our results. We are now ready to state our first two results for pair resampling. For the sake of clarity, we state our results for ridge regression and refer to~\cref{appendix:se_pair_resampling} for the derivation of our results and a statement for general convex losses and penalties.

In the following, the asymptotic values of correlations needed to compute biases and variances will be referred to as \textit{overlaps}. For $\rm{t}\!\in\!\{\pb, \Ss, \jk\}$, these overlaps read:
\begin{align}
\label{eq:def:overlaps}
    \begin{cases}
        Q_{11}^{\rm t} \!\!\!\! &\vcentcolon= \lim_{n, d\to\infty} \frac{1}{d}\mathbb{E}_{\wstar, \dataset, \Vec{p} } \left[ \| \werm(\dataset, \Vec{p}) \|^2 \right] \\
        Q_{12}^{\rm t}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty}\frac{1}{d}\mathbb{E}_{\wstar, \dataset } \left[ \| \mathbb{E}_{\Vec{p}} [ \werm(\dataset, \Vec{p}) ] \|^2 \right]\\
        Q_{11}^{\fr}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty} \frac{1}{d}\mathbb{E}_{\wstar, \dataset} \left[ \| \werm(\dataset) \|^2 \right] \\
        Q_{12}^{\fr}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty}\frac{1}{d}\mathbb{E}_{\wstar} \left[ \| \mathbb{E}_{\dataset} [ \werm(\dataset) ] \|^2 \right]\\
        Q_{12}^{\fr, \rm{t}}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty}\frac{1}{d}\mathbb{E}_{\wstar, \dataset, \Vec{p}} \left[\werm(\dataset)^\top\werm(\dataset, \Vec{p}) \right]\\
        m_1^{\rm t}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty}\frac{1}{d}\mathbb{E}_{\wstar, \dataset, \Vec{p}} \left[ \werm(\dataset, \Vec{p})^\top \wstar \right]\\
        m_1^{\fr}\!\!\!\! &\vcentcolon= \lim_{n, d\to\infty}\frac{1}{d}\mathbb{E}_{\wstar, \dataset} \left[ \werm(\dataset)^\top \wstar \right]
    \end{cases},
\end{align}
where $\vec{p}=(p_1, \dots, p_n)\stackrel{\text{i.i.d.}}{\sim}\mu_{\rm t}$ and $\fr$ refers to full resampling. In what follows, these overlaps will be written in a matrix and vector form 
\begin{align}
\label{eq:matrix_overlaps}
    \begin{cases}
        \mat{Q}^{\rm t} &= \begin{bmatrix}
    Q_{11}^{\rm t} & Q_{12}^{\rm t} \\ Q_{12}^{\rm t} & Q_{11}^{\rm t}
\end{bmatrix} \\
    \mat{Q}^{\fr, \rm t} &= \begin{bmatrix}
    Q_{11}^{\fr} & Q_{12}^{\fr, \rm {t}} \\ Q_{12}^{\fr, \rm {t}} & Q_{11}^{\rm t}
\end{bmatrix} \\
\mat{Q}^{\fr} &= \begin{bmatrix}
    Q_{11}^{\fr} & Q_{12}^{\fr} \\ Q_{12}^{\fr} & Q_{11}^{\fr}
\end{bmatrix} \\
\Vec{m}^{\rm t} &= \left[ m_1^{\rm t}, m_1^{\rm t} \right]^\top \\
\Vec{m}^{\fr, \rm t} &= \left[ m_1^{\fr}, m_1^{\rm t} \right]^\top
    \end{cases}
\end{align}
Intuitively, for $\rm{t}\!\in\!\{\pb, \Ss, \jk\}$ the matrix $\mat{Q}^{\rm t} \in \mathbb{R}^{2 \times 2}$ represents the Gram matrix of two estimators trained on two independent resamples of the same training data $\mathcal{D}$. Similarly, $\mat{Q}^{\fr}$ is a Gram matrix between two estimators trained on two datasets sampled independently from the same teacher $\wstar$.
Moreover, the vector $\Vec{m}^{\rm t}$ contains the correlation between estimators trained with method ${\rm t}$ and     $\wstar$. Our main technical result is a characterization of these quantities in the high-dimensional limit.


\begin{theorem}[Biases and Variances for pair resampling in ridge regression]\label{thm:pair_resampling}
    Let $\dataset = \{(\Vec{x}_{i}, y_{i})_{i\in[n]}\}$ denote $n$ independent samples drawn from model~\eqref{eq:def_model} with log-concave likelihood $p(y|z)$.
    In the high-dimensional proportional regime $n, d\to\infty$ with $\sfrac{n}{d}=\alpha$, the overlaps of interest \eqref{eq:matrix_overlaps} are given by the unique solution $\Vec{m} \in \mathbb{R}^2$, $\mat{Q} \in \mathbb{R}^{2 \times 2}, \mat{V} \in \mathbb{R}^{2 \times 2}$ to the following set of self-consistent equations:
\begingroup
\allowdisplaybreaks
    \begin{align}
    \begin{cases}
        \Vec{m} &= \left( \lambda \mat{I}_2 + \hat{\mat{V}} \right)^{-1} \hat{\vec{m}} \\
        \mat{Q}    &= \left( \lambda \mat{I}_2 + \hat{\mat{V}} \right)^{-1} \left( \hat{\vec{m}} \hat{\vec{m}}^\top + \hat{\mat{Q}} \right) \left( \lambda \mat{I}_2 + \hat{\mat{V}} \right)^{-1\top} \\
        \mat{V}      &= \left( \lambda \mat{I}_2 + \hat{\mat{V}} \right)^{-1} 
    \end{cases}
    \label{eq:thm_se_overlaps}
    \\
    \begin{cases}
        \hat{\vec{m}}\!\!\!\! &= \alpha \mathbb{E}_{\Vec{p}} \left[ \mat{G}(\Vec{p}) \right] \mathbf{1}_2 \\
        \hat{\mat{Q}}   \!\!\!\!    &= \alpha \mathbb{E}_{\Vec{p}} \left[ \mat{G}(\Vec{p}) \left( (v_{\star} + \Delta) \mathbf{1}_{2 \times 2} + \mat{B Q B}^{\top} \right) \mat{G}(\Vec{p})^{\top} \right]\\
        \hat{\mat{V}}   \!\!\!\!    &= \alpha \mathbb{E}_{\Vec{p}} \left[ \mat{G}(\Vec{p}) \right]\\
    \end{cases}
    \label{eq:thm_se_hat_overlaps}
\end{align}
\endgroup
for a careful choice of the joint distribution of $\Vec{p} = (p_1, p_2)$. 
In the above, $\mat{G}(\Vec{p}) = (\mat{I}_2 + \mat{P V})^{-1} \mat{P}$ with $\mat{P} = \mathrm{Diag}(\Vec{p}), \;\mat{B} = \vec{1}_2\vec{m}^\top \mat{Q}^{-1} - \mat{I}_2$ and $v_{\star} = 1 - \Vec{m}^\top \mat{Q}^{-1} \Vec{m}$.

Then, the following holds:
\begin{itemize}[noitemsep,leftmargin=1em,wide=1pt]
    \item the variance of resampling method $\mathrm{t}\in\{\pb, \Ss, \jk\}$ is given by
    \begin{equation}
        \widehat{\Var_{\mathrm{t}}} = Q_{11}^{\mathrm{t}}-Q_{12}^{\mathrm{t}},
    \end{equation}
    where overlaps with superscript $\mathrm{t}$ are obtained by solving~\eqref{eq:thm_se_overlaps}, \eqref{eq:thm_se_hat_overlaps} using joint distribution $\mu(p_1, p_2) = \mu_{\rm t}(p_1)\cdot\mu_{\rm t}(p_2)$.
    \item the true variance is given by
    \begin{equation}
        \varianceOnXY = Q^{\fr}_{11}-Q^{\fr}_{12},
    \end{equation}
    where overlaps with superscript $\fr$ (indicating full resampling) are obtained by solving~\eqref{eq:thm_se_overlaps}, \eqref{eq:thm_se_hat_overlaps} using joint distribution \[\mu(p_1, p_2)= (\mathbbm{1}(p_1 = 0, p_2 = 1) + \mathbbm{1}(p_1 = 1 , p_2 = 0)).\]
    \item the squared bias of resampling method $\mathrm{t}$ is given by
    \begin{equation}
        \widehat{\Bias^2_{\rm t}} = Q^{\fr}_{11} + Q^{\rm t}_{12} - 2 Q^{\rm fr, \rm t}_{12},
    \end{equation}
    where overlaps with superscript $\mathrm{t}, \fr$ are obtained by solving~\eqref{eq:thm_se_overlaps}, \eqref{eq:thm_se_hat_overlaps} using distribution $\mu(p_1, p_2) = \mu_{\rm t}(p_1)\cdot\mathbbm{1}\{p_2=1\}$ for $p_1, p_2$.
    \item the true squared bias is given by
    \begin{equation}
        \biasOnXY=1 - 2 m^{\rm fr}_1 + Q^{\rm fr}_{12}.
    \end{equation}
\end{itemize}
\end{theorem}
The details for the derivations of~\cref{thm:pair_resampling} are shown in~\cref{appendix:gamp_ridge_regression}.

\paragraph{The specific case of subsampling} To make~\cref{thm:pair_resampling} more concrete, we consider in this paragraph the particular case of subsampling, for which Equations~\eqref{eq:thm_se_overlaps} and \eqref{eq:thm_se_hat_overlaps} can be written in a more succinct form. Indeed, for subsampling with ratio $r$, the overlaps $m_1^{\Ss}$, $Q_{11}^{\Ss}$ and $Q_{12}^{\Ss}$ are given by
\begin{align}
    \begin{cases}
        m_1^{\Ss}    &= 1 - \lambda v \\
        Q_{11}^{\Ss} &= (m_1^{\Ss})^2 \cdot \frac{\alpha r + 1 + \Delta - 2 { m_1^{\Ss} }}{\alpha r - (m_1^{\Ss})^2} \\
        Q_{12}^{\Ss} &= (m_1^{\Ss})^2 \cdot \frac{\alpha + 1 + \Delta - 2 { m_1^{\Ss} }}{\alpha - (m_1^{\Ss})^2}
    \end{cases},
\end{align}
where $v = \frac{1 - \lambda - \alpha r + \sqrt{(\alpha r + \lambda - 1)^2 + 4 \lambda}}{2 \lambda}$, as detailed in~\cref{sec:subsampling_overlaps}. With this representation, the dependency of the overlaps on the different parameters such as $\alpha$ and on the subsampling ratio $r$ becomes much more explicit. We note in particular that the overlap $Q_{11}^{\Ss}$ of one of the subsampling estimators with itself depends only on the subsampled data it has seen, explaining the dependency on $\alpha r$. On the other hand, the overlap $Q_{12}^{\Ss}$ involves both subsampling estimators, so that a dependency on $\alpha$ also appears since all samples are considered.

\looseness=-1
\subsection{Conditional resampling}

Similar to pair resampling, we leverage the fact that the conditional bias and variance, together with the estimates by residual bootstrap, can be written in terms of correlations between estimators. The key difference here is that the covariates $\vec{x}_1, \dots, \vec{x}_n$ remain constant, and only the labels are resampled. Focusing on linear regression, in the case of residual resampling (abbreviated $\rr$), the labels are sampled from the true distribution $y^{\star}_i \sim \mathcal{N}(\wstar^{\top} \Vec{x}_i, \Delta)$, whereas for residual bootstrap, we use the ERM estimator to approximate this distribution and $y^{\star}_i \sim \mathcal{N}(\werm^{\top} \Vec{x}_i, \tilde{\Delta})$ with $\tilde{\Delta}$ an estimator of $\Delta$. 
Similarly to pair bootstrap, we now just need the correlation between $B$ estimators $\hatw_{\lambda, b}$ trained on resampled datasets $\dataset^{\star}_b = \{(\vec{x}_{i},y_{i,b}^{\star})_{i=1}^{n}\}$. This can be done by considering the minimization problem~\eqref{eq:def_erm_residual}.  Although each $\hatw_{\lambda, b}$ is obtained by an independent minimization, the estimators are correlated because they all see the same covariates $\vec{x}_{i}$. In \cref{appendix:residual_bootstrap}, we discuss how this correlation can be exactly captured by designing a particular approximate message passing algorithm, and also provide more details and an extension to more generic losses. As in the previous section, we first define the overlaps of interest 
\begingroup
\allowdisplaybreaks
\begin{align}
\label{eq:def:res:overlaps}
    \begin{cases}
        Q_{11}^{\rb} &\vcentcolon= \lim_{n, d\to\infty} \mathbb{E}_{\wstar, \dataset } \left[ \mean{\vec{y}^\star|\dataset}{\| \werm(\mat{X}, \vec{y}^\star) \|^2} \right] \\
        Q_{12}^{\rb} &\vcentcolon= \lim_{n, d\to\infty}\mathbb{E}_{\wstar, \dataset } \left[ \| \mathbb{E}_{\vec{y}^\star|\dataset} [ \werm(\mat{X}, \vec{y}^\star) ] \|^2 \right]\\
        Q_{11}^{\rr} &\vcentcolon= \lim_{n, d\to\infty} \mathbb{E}_{\wstar, \dataset} \left[ \| \werm \|^2 |\mat{X}\right] \\
        Q_{12}^{\rr} &\vcentcolon= \lim_{n, d\to\infty}\mathbb{E}_{\wstar} \left[ \| \mathbb{E}_{\dataset} [ \werm|\mat{X} ] \|^2 \right]\\
        m_{1}^{\rb} &\vcentcolon= \lim_{n, d\to\infty}\mathbb{E}_{\wstar, \dataset} \left[\werm(\dataset)^\top\mean{\vec{y}^\star|\dataset}{\werm(\mat{X}, \vec{y}^\star)} \right]\\
        m_1^{\rr} &\vcentcolon= \lim_{n, d\to\infty}\mathbb{E}_{\wstar} \left[ \mean{\dataset}{\werm|\mat{X}}^\top \wstar \right]
    \end{cases}
\end{align}
\endgroup
and the minimization problem for conditional resampling
\begin{equation}
    \hatw_{\lambda, b} = \arg\min\limits_{\Vec{\theta}\in\reals^d}\sum_{i=1}^{n} - \log p(y^{\star}_{b,i} | \Vec{\theta}^{\top} \Vec{x}_i) + \sfrac{\lambda}{2} \| \Vec{\theta} \|^2,
    \label{eq:def_erm_residual}
\end{equation}
where $b=1, \dots, B$.

\begin{theorem}[Biases and Variances for conditional resampling in ridge regression]\label{thm:conditional_resampling}
    Let $\dataset = \{(\Vec{x}_{i}, y_{i})_{i\in[n]}\}$ denote $n$ independent samples drawn from model~\eqref{eq:def_model} with log-concave likelihood $p(y|z)$.
    In the high-dimensional proportional regime $n, d\to\infty$ with $\sfrac{n}{d}=\alpha$, the overlaps of interest \eqref{eq:def:res:overlaps} for ${\rm t} \in \{ \rr, \rb \}$ are given by

\begin{align}
    \begin{cases}
        m_1^{\rm t} &= \Tilde{\rho}(1 - \lambda v) \\
        Q_{11}^{\rm t} &= (m_1^{\rm t})^2 \cdot \frac{\alpha \tilde{\rho} + \tilde{\rho} + \tilde{\Delta} - 2 m_1^{\rm t}}{\alpha \Tilde{\rho}^2 - (m_1^{\rm t})^2}\\
        Q_{12}^{\rm t} &= (m_1^{\rm t})^2 \cdot \frac{\alpha \tilde{\rho} + \tilde{\rho} - 2m_1^{\rm t}}{\alpha \Tilde{\rho}^2 - (m_1^{\rm t})^2}
        \label{eq:se_overlaps_conditional}
    \end{cases}
\end{align}

where $v = \frac{1 - \lambda - \alpha + \sqrt{(\alpha + \lambda - 1)^2 + 4 \lambda}}{2\lambda}$. The quantities $\Tilde{\Delta}, \Tilde{\rho}$ take different values depending on whether bootstrap is performed or not, as detailed below.
Then, the following holds:
\begin{itemize}[noitemsep,leftmargin=1em,wide=1pt]
    \item the variance of residual bootstrap is given by 
    \begin{equation}
        \varianceResidualBootstrap = Q_{11}^{\rb} - Q_{12}^{\rb},
    \end{equation}
    where $Q_{11}^{\rb}, Q_{12}^{\rb}$ are obtained by solving \eqref{eq:se_overlaps_conditional} using $\Tilde{\rho} = Q_{11}^{\fr}$ and $\Tilde{\Delta} = \sfrac{(1 + \Delta - 2 m_{1}^{\fr} + Q_{11}^{\fr})}{(1 + V_{11}^{\fr})^2}$. Note that the overlaps with superscript $\fr$ are specified in~\cref{thm:pair_resampling}.
    \item the true variance $\varianceOnY$ is given by
    \begin{equation}
        \varianceOnY = Q^{\rr}_{11}-Q^{\rr}_{12},
    \end{equation}
    where $Q_{11}^{\rr}, Q_{12}^{\rr}$ are obtained by solving \eqref{eq:se_overlaps_conditional}  using $\Tilde{\rho} = 1, \Tilde{\Delta} = \Delta$.
    
    \item the squared bias of residual bootstrap is given by
    \begin{equation}
        \biasResidualBootstrap = Q^{\fr}_{11} + Q^{\rb}_{12} - 2 m^{\rb}_{1}
    \end{equation}
    \item the true conditional squared bias is given by
    \begin{equation}
        \biasOnY = 1 - 2 m^{\rr}_1 + Q^{\rr}_{12}.
    \end{equation}
\end{itemize}
\end{theorem}

The details for the derivations of~\cref{thm:conditional_resampling} are shown in~\cref{appendix:residual_resampling} and~\cref{appendix:overlaps_rates}. Compared to pair resampling, residual resampling does not involve introducing sample weights; only the labels are resampled from a conditional distribution. However, for residual bootstrap, the main idea is that the target weights $\wstar$ are replaced by $\werm$. Moreover, for ridge regression, we approximate the variance $\Delta$ by the averaged squared residual:
\begin{equation}
    \Tilde{\Delta} = \frac{1}{n}\sum_{i = 1}^n (y_i - \werm^{\top} \vec{x}_i)^2
\end{equation}

In the high-dimensional regime, the analytical expression of this training error is given by the overlaps of state-evolution, and $\Tilde{\Delta} = \sfrac{(1 + \Delta - 2 m_{1}^{\fr} + Q_{11}^{\fr})}{(1 + V_{11}^{\fr})^2}$. The derivation of this expression can be found in \citet{loureiro2021learning}. We end this section by observing that so far, we considered only the variance on the weights. However, one could be interested in other types of variances such as \textit{predictive variance}, which we discuss in~\cref{appendix:other_variances}.


\section{Discussions and main findings}
\label{sec:discussions}
\begin{table}[t]
    {Pair resampling rates} {Residual resampling rates}
\fontsize{8pt}{8pt}
\parbox{.49\linewidth}{
        \begin{tabular}{c | c | c }
                 \midrule             
                  & Rate & Error \\
            \midrule
            $\varianceOnXY$ & $\sfrac{1}{\alpha}$ &  --\\
            $\varianceSubsampling$ & $\sfrac{1}{\alpha}$ & $\sfrac{1}{\alpha}$\\
            $\varianceJackknife$ & $\sfrac{1}{\alpha}$ & $\sfrac{1}{\alpha^2}$\\
            $\variancePairBootstrap$ & $\sfrac{1}{\alpha}$ & $\sfrac{1}{\alpha^3}$\\
            \midrule
            $\biasOnXY$ & $\sfrac{1}{\alpha^2}$ &  --\\
            $\biasSubsampling$ & $\sfrac{1}{\alpha^2}$ & $\sfrac{1}{\alpha^2}$\\
            $\biasJackknife$ & $\sfrac{1}{\alpha^2}$ & $\sfrac{1}{\alpha^3}$\\
            $\biasPairBootstrap$ & $\sfrac{1}{\alpha^4}$ & $\sfrac{1}{\alpha^2}$\\
            \bottomrule
        \end{tabular}
        }
 \hfill
 \hspace{-0.4cm}
 \parbox{.49\linewidth}{
    \vspace{-1.8cm}
        \begin{tabular}{c | c | c}
  \midrule             
              & Rate & Error \\
            \midrule
            $\varianceOnY$ & $\sfrac{1}{\alpha}$ & -- \\
            $\varianceResidualBootstrap$ & $\sfrac{1}{\alpha}$ &  $\sfrac{1}{\alpha^2}$\\
            \midrule
            $\biasOnY$ & $\sfrac{1}{\alpha^2}$ & -- \\
            $\biasResidualBootstrap$ & $\sfrac{1}{\alpha^2}$ & $\sfrac{1}{\alpha^3}$ \\
            \bottomrule
        \end{tabular}
        }
 \hfill

    \caption{Summary of large $\alpha$ rates for ridge regression (see~\cref{appendix:large_alpha_rates} for details).}
    \label{table:large_alpha_rates}
\end{table}






In this section we discuss the consequences of the technical results from~\cref{sec:technical} on the performance of resampling methods, and compare with empirical values. We refer to~\cref{appendix:numerics} for more details on the plots.

\subsection{Ridge regression} 
\label{sec:ridge_numerical_results}

\paragraph{Variance --} \cref{fig:variance_ridge} shows the different variances for ridge regression. We consider two important choices of regularization:  $\lambda = 10^{-2}$ to approximate the behavior of unpenalized estimators, and $\lambda = \Delta = 1$ which is the optimal value of $\lambda$: this regularization minimizes the generalization error of $\werm$ and its test error is the same as that of the Bayes-optimal estimator. As explained in~\cref{sec:resampling_estimates}, the variance of Jackknife is approximated by doing subsampling with $r = 0.99$. Note that the subsampling variances with ratio $r$ are rescaled by a factor $1 - r$. We compare our theoretical predictions with numerical experiments on Gaussian data and observe an excellent agreement.
For $\lambda = 10^{-2}$ in the regime where $n > d$, our results are qualitatively consistent with \citet{ElKaroui2018}, who showed that pair (respectively residual) bootstrap overestimates (resp.\ underestimates) the variance. On the other hand, our results allow us to study the variances at $d > n$. In this regime, we observe that both pair and residual bootstrap suffer from under-coverage: for residual bootstrap, it is easy to understand why, as without regularization and for $d > n$ the ERM interpolates the training data. Thus, the residual is exactly~$0$, and the residual bootstrap fatally underestimates the true level of noise in the data. On the other hand, subsampling and Jackknife are closer to $\varianceOnXY$ than pair bootstrap, and as is classically known \citep{efron1981jackknife}, the Jackknife estimate provides an upper bound of the true variance. On the right panel, we see that all variances converge to $0$ with rate $\sfrac{1}{\alpha}$, and pair bootstrap converges to $\varianceOnXY$ the fastest.
On the bottom row of~\cref{fig:variance_ridge}, we observe that optimal regularization greatly mitigates the under-coverage of bootstrapping, most notably for residual bootstrap. 
We thus conclude that for small values of $\sfrac{n}{d}$, bootstrap fails to accurately capture the true variances, and appropriately regularizing partially mitigates this issue. 

Note that conditioned on $\dataset$ and if the data generating process is known, the Bayes-optimal posterior variance $\varianceBO$ is the best estimation of uncertainty on the weights. As in \cref{thm:pair_resampling,thm:conditional_resampling}, this variance can be obtained by solving a corresponding set of self-consistent equations \citep{clarte2023theoretical}. We observe that at large $\alpha$, all variances agree with $\varianceBO$. However, at optimal $\lambda$ and small $\sfrac{n}{d}$, resampling will underestimate the actual posterior variance. 

\paragraph{Bias --} In~\cref{fig:bias_ridge}, we plot the bias of the different resampling methods for ridge regression with regularization $\lambda \in \{ 10^{-2}, 1 \}$. For the Jackknife and subsampling, the estimation of the squared bias is rescaled by a factor $(1 - r)^2$. We observe that as $\alpha \to \infty$, $\biasOnXY$ and $\biasPairBootstrap$ converge to zero, as expected by the consistency of the MLE estimator \eqref{eq:consistency}. However, $\biasPairBootstrap$ converges as $\sfrac{1}{\alpha^4}$, while $\biasOnXY \sim \sfrac{1}{\alpha^2}$, and pair bootstrap underestimates the true bias. We deduce that in our model, subsampling or Jackknife should thus be preferred to estimate $\biasOnXY$.



\begin{figure}[t!]
    \centering
    \def\figwidth{0.52\columnwidth}
    \def\figheight{0.52\columnwidth}    \input{icml2024/Figures/logistic/lambda=0.01/logistic_regression_lambda=0.01_variance}
    \input{icml2024/Figures/logistic/lambda=0.01/logistic_regression_lambda=0.01_variance_2}
    \input{icml2024/Figures/logistic/lambda=1/logistic_regression_lambda=1.0_variance}
    \input{icml2024/Figures/logistic/lambda=1/logistic_regression_lambda=1.0_variance_2}

    \caption{Variance for logistic regression at $\lambda = 10^{-2}$ (Top) and $\lambda = 1$ (Bottom). Left: variance of full resampling, pair bootstrap, subsampling. Right: variance of label resampling, residual bootstrap. See \cref{fig:variance_ridge} for the legend.
    }
    \label{fig:variance_logistic}
\end{figure}
\subsection{Logistic regression}
\label{sec:logistic_numerical_results}

Our results extend beyond ridge regression, and the quantities of interest can be computed for any convex loss. \Cref{fig:variance_logistic} displays the true variances and their estimation for regularized logistic regression with $\lambda \in \{ 10^{-2}, 1 \}$, similarly to~\cref{fig:variance_ridge}. However, contrary to the ridge case, $\lambda = 1$ yields the maximum-a-posteriori estimator but does not minimize the misclassification error. %

Qualitatively, we observe results similar to those for ridge regression: at large $\alpha$, all methods consistently estimate the true variance and the Jackknife provides an upper bound of $\varianceOnXY$. Moreover, at low $\alpha$, regularization improves the estimation of the variance, even though $\lambda$ is not optimal.

Finally, at $\lambda = 0.01$ for both ridge and logistic regression, we observe a local maximum in the true and resampled bias and variance around $d = n$. This behavior is reminiscent of the double-descent behavior observed, e.g., in random features models or neural networks: the test error achieves a local maximum at the interpolation threshold where the model can perfectly fit the training data, then decreases with the number of parameters. Moreover, we see that regularization can mitigate this ``double-descent'' phenomenon.

\section{Conclusion \& Perspectives}
In this work, we have provided an exact asymptotic comparison of the uncertainty estimations provided by different resampling methods, in the context of high-dimensional regularized maximum likelihood with generalized linear models.

Our results highlight the limitations of these methods in the high-dimensional regime relevant to modern machine learning practice and discuss how cross-validation can, to some extent, mitigate some of these limitations. 

Avenues for future work are manifold. For instance, how would our results change in a misspecified scenario? Can structure in the data help or hinder resampling methods? These interesting questions are left for future investigation. 

\section*{Acknowledgements}
This research was supported by the Swiss National Science Foundation grant SNSF OperaGOST (200021\_200390), the NCCR MARVEL, a National Centre of Competence in Research funded by the Swiss National Science Foundation (grant number 205602), and the Choose France - CNRS AI Rising Talents program.

\newpage 

\bibliography{bibliography}

\clearpage
\appendix
\onecolumn

\input{appendix}
\end{document}
