\onecolumn

\section{Approximate Message Passing to approximate leave-one-out residuals}
\label{appendix:amp}

\subsection{Introduction of relaxed-Belief Propagation and Approximate Message Passing}

% The use of GAMP for deriving high-dimensional asymptotics characterization is now a classic rigorous tool, that has been used in many situations \citep{bayati2011lasso,JMLR:v15:javanmard14a,sur2019likelihood,emami2020generalization,loureiro2021learning,Loureiro2022_ensembling,gerbelot2022asymptotic}. 

In this section, we explain how AMP can be used to compute the leave-one-out residuals used in~\cref{eq:scores_fcp}. The naive way to compute these residuals is to fit the leave-one-out estimators $\what_{-i} ( y ) $ for each sample $1 \leqslant i \leqslant n$ and each possible label $y$, which requires $n \times | \mathcal{Y} |$ fits, with $\mathcal{Y}$ the set of candidate labels, typically a discretization of $\mathbb{R}$. We will first see that AMP can be used to compute all the $\what_{-i}$.

To introduce AMP, we first consider the following problem. Consider a dataset $\mathcal{D} = \left( \Vec{x}_i, y_i \right)_{i = 1}^n$ of size $n$.
Assume that the data is generated from the model~\eqref{eq:gaussian_data}, where the input $\vec{x}_i \in \mathbb{R}^d$ are sampled according to $\mathcal{N}(\Vec{0}, \sfrac{I_d}{d})$, and the labels are generated from a \textit{teacher} as $y \sim p(y | \wstar^{\top} \vec{x})$. Our goal is to sample the following distribution 
\begin{equation}
    p(\vectheta) = \frac{1}{Z} \prod_{i = 1}^n P_{out} \left( y_i | \vectheta^{\top} \Vec{x}_i \right) \prod_{\mu = 1}^d P_{\theta}( \vectheta_{\mu} ) 
    \label{eq:def_distribution_amp}
\end{equation}
The empirical risk minimization problem~\eqref{eq:def_erm} introduced in~\cref{sec:setting} is a particular instance of~\cref{eq:def_distribution_amp} where 
\begin{equation}
    P_{out}(y | z) \propto e^{- \beta \ell(y, z)}, \qquad P_{\theta}( z ) \propto e^{- \beta r(z)}
    \label{eq:probas_for_erm}
\end{equation}
in the limit $\beta \to \infty$. The starting point of approximate message passing is the writing of the belief-propagation algorithm for the graph associated with~\cref{eq:def_distribution_amp}, where the variable-nodes of the graph are the coordinates $\vectheta_{\mu}$ and the factor nodes, representing the interaction between the variable-nodes, are the observations $y_i$. The message passing consists in iterating messages $m_{\mu \to i}$ from variable to factor-nodes and $m_{i \to \mu}$ from factor to variable-nodes. These messages read
\begin{align}
    m_{\mu \to i} ( \theta_{\mu} ) &= \frac{1}{z_{i \to \mu}} P_{\theta} ( \theta_{\mu}) \prod_{j \neq i} m_{j \to \mu} ( \theta_{\mu} )  \\ 
    m_{i \to \mu} ( \theta_{\mu} ) &= \frac{1}{z_{\mu \to i}} \int \prod_{\nu \neq \mu} \dd \theta_{\nu} m_{\nu \to i} ( \theta_{\nu} ) P_{out} \left( y_i | \sum_{\nu} \Vec{x}_{i \nu} \theta_{\nu} \right)
    \label{eq:def_messages_bp}
\end{align}
These messages give access to the distribution $p\left( \vec{\theta} \right)$ and in particular to its marginals: indeed, the marginal distribution $p(\theta_{\mu})$ is given by 
\begin{equation}
    p(\theta_{\mu}) = \frac{1}{z_{\mu}} P_{\theta}(\theta_{\mu}) \prod_{i = 1}^n m_{i \to \mu} (\theta_{\mu}) 
\end{equation}
where $z_{\mu}$ is a normalization constant. Iterating~\cref{eq:def_messages_bp} is not tractable, especially in high-dimensions as it involves $(d - 1)$ integrals to update each $m_{i \to \mu}$. To make these equations tractable, one can use relaxed-Belief Propagation (rBP), which relies on the central limit theorem and the projection of the messages on their first two moments. We thus define the \textit{cavity mean} $\hat{\theta}_{\mu \to i}$ and \textit{cavity variance} $\hat{v}_{\mu \to i}$ as 
\begin{align}
    \hat{\theta}_{\mu \to i} &= \int \dd \theta_{\mu} \theta_{\mu} m_{\mu \to i} ( \theta_{\mu} ) \\
    \hat{v}_{\mu \to i}      &= \int \dd \theta_{\mu} \theta_{\mu}^2 m_{\mu \to i} ( \theta_{\mu} ) - \hat{\theta}^2_{\mu \to i}
    \label{eq:cavity_mean_variance}
\end{align}
In particular, the vector $\left( \hat{\theta}_{\mu \to i} \right)_{\mu = 1}^d$ represents the mean of the marginals of distribution~\eqref{eq:def_distribution_amp} in the absence of the $i$-th sample. In the context of empirical risk minimization, this is exactly the leave-one-out estimator $\what_{-i}$ defined as 
\begin{equation}
    \what_{-i} = \arg\min_{\Vec{\theta}} \sum_{j \neq i} \ell \left( y_j, \Vec{\theta}^{\top} \Vec{x}_j \right) + \sum_{\mu = 1}^d r(\theta_{\mu})
\end{equation}
Our goal is thus to compute efficiently the cavity means and use them to compute the leave-one-out residuals. 

\paragraph{rBP} The main idea behind rBP is to iteratively compute the cavity means and variances, to obtain the desired marginal mean and variance of $\Vec{\theta}$. We define $\omega_{i \to \mu}, V_{i \to \mu}$ the mean and variance of the messages $m_{i \to \mu}$ and $\hat{\theta}_{\mu \to i}, \hat{v}_{\mu \to i}$ the mean and variance of $m_{\mu \to i}$. 

We detail rBP in~\cref{alg:rBP}, and refer to~\cite[Chapter VI, Section C]{Zdeborova2016Statistical} for a detailed explanation of the algorithm. In particular, the algorithm makes use of the \textit{channel} and \textit{denoising} functions $\channel$ and $\denoiser$, defined respectively as 
\begin{equation}
    \channel(y, \omega, V) = \frac{\partial \log \mathcal{Z}_y (y, \omega, V)}{\partial \omega}, \qquad \mathcal{Z}_y (y, \omega, V) = \int \dd z P_{out} (y | z) e^{- \frac{1}{2V}(z - \omega)^2}
    \label{eq:def_channel}
\end{equation}
and 
\begin{equation}
    \denoiser(b, A) = \frac{\partial \log \mathcal{Z}_w (b, A)}{\partial b}, \qquad \mathcal{Z}_w (b, A) = \int \dd x P_{\theta} (x) e^{bx - \frac{A}{2}x^2} 
    \label{eq:def_denoiser}
\end{equation}

In the case of empirical risk minimization~\eqref{eq:def_erm}, plugging the prior and likelihood from~\cref{eq:probas_for_erm} into the definitions~\eqref{eq:def_channel} and~\eqref{eq:def_denoiser} and taking the limit $\beta \to \infty$ yields Equation~\eqref{eq:def_channel_denoiser}.

\paragraph{From rBP to AMP} Note that in rBP, we iterate over $n \times d$ means and variances $\omega_{i \to \mu}, V_{i \to \mu}, \hat{\theta}_{\mu \to i}, \hat{v}_{\mu \to i}$, which scales quadratically with the dimension in the high-dimensional limit where $n, d \to \infty$ with a constant sampling ratio $\sfrac{n}{d} = \alpha$. However, a key observation is that the quantities $\hat{\theta}_{\mu \to i}, \hat{v}_{\mu \to i}$ only weakly depend on $i$, and similarly $\omega_{i \to \mu}, V_{i \to \mu}$ only weakly depend on $\mu$. Hence, let us define 
\begin{align}
    \begin{cases}
        \omega_{i} &= \sum_{\mu} \Vec{x}_{i\mu} \hat{\theta}_{\mu \to i}  \\
        V_{i} &= \sum_{\mu} \Vec{x}_{i\mu}^2 \hat{v}_{\mu \to i}
    \end{cases},
    \qquad
    \begin{cases}
        A_{\mu} &= - \sum_{i = 1}^n \partial_{\omega} g_{out} \left( y_i, \omega_i, V_i \right) \vec{x}^2_{i \mu} \\
        b_{\mu} &= \sum_{i = 1}^n g_{out} \left( y_i, \omega_{i \to \mu}, V_{i \to \mu} \right) \vec{x}_{i \mu} \\
    \end{cases}
\end{align}
Note that for all $\mu$ and all $i$, in the high-dimensional limit considered here we have 
\begin{align}
    \omega_i &= \omega_{i \to \mu} + \Vec{x}_{i\mu} \hat{\theta}_{\mu \to i} = \omega_{i \to \mu} + O \left( \sfrac{1}{\sqrt{n}}\right) \\
    V_i      &= V_{i \to \mu} + \vec{x}_{i \mu}^2 \hat{v}_{\mu \to i} =  V_{i \to \mu} + O\left( \sfrac{1}{n} \right)
\end{align}

As a consequence, we have for all $\mu$ and all $i$
\begin{align}
     A_{\mu} &= - \sum_{j = 1}^n \vec{x}^2_{j \mu} \partial_{\omega} g_{out} \left( y_j, \omega_{j}, V_{j} \right) = - \sum_{j = 1}^n \vec{x}^2_{j \mu} \left[ \partial_{\omega} g_{out} \left( y_j, \omega_{j \to \mu}, V_{j \to \mu} \right) + O(\sfrac{1}{\sqrt{n}}) \right] \\
     &= - \sum_{j = 1}^n \vec{x}^2_{j \mu} \partial_{\omega} g_{out} \left( y_j, \omega_{j \to \mu}, V_{j \to \mu} \right) + O(\sfrac{1}{\sqrt{n}}) \\
    &= - \sum_{j \neq i} \vec{x}^2_{j \mu} \partial_{\omega} g_{out} \left( y_j, \omega_{j \to \mu}, V_{j \to \mu} \right) + O(\sfrac{1}{\sqrt{n}}) \\
     &= A_{\mu \to i} +  O \left( \sfrac{1}{\sqrt{n}} \right)
 \end{align}
Similarly, we get 
 \begin{align}
     b_{\mu} &= b_{\mu \to i} + O \left( \sfrac{1}{\sqrt{n}} \right) 
 \end{align}

So that one can simply compute the estimator $\vec{\theta} = \denoiser \left( \vec{b}, \vec{A} \right)$. The challenge is to compute the vectors $\vec{\omega}, \vec{V}, \vec{b}$. To do so, we note that 
\begin{align}
    \channel \left( y_i, \omega_{i \to \mu}, V_{i \to \mu} \right) &= \channel \left( y_i, \omega_{i}, V_{i} \right) - \vec{x}_{i \mu} \hat{\theta}_{\mu \to i} \partial_{\omega} \channel \left( y_i, \omega_{i \to \mu}, V_{i \to \mu} \right) + O \left( \sfrac{1}{n} \right)
\end{align}
such that 
\begin{align}
    b_{\mu} &= \sum_{i = 1}^n \vec{x}_{i \mu} \channel \left( y_i, \omega_{i}, V_{i} \right) - \sum_i \vec{x}^2_{i \mu} \hat{\theta}_{\mu} \partial_{\omega} \channel \left( y_{i}, \omega_i, V_i \right) + O \left( \sfrac{1}{\sqrt{n}} \right)
\end{align}

Moreover, 
\begin{align}
    \omega_i &= \sum_{\mu = 1}^d \Vec{x}_{i\mu} \hat{\theta}_{\mu \to i} = \sum_{\mu} \Vec{x}_{i\mu} \left( \hat{\theta}_{\mu} - \vec{x}_{i \mu} \hat{v}_{\mu} \channel \left( y_i, \omega_i, V_i \right) \right) + O(\sfrac{1}{n} )
\end{align}

These iterative equations are, at leading order, the same as those shown in~\cref{alg:gamp}. In the high-dimensional regime, these iterations coincide with rBP. Going from rBP to AMP, we have reduced the number of variables to iterate on from $O(n \times d)$ to $O(n + d)$, and can still recover the marginal distribution by 
\begin{equation}
    \hat{\theta}_{\mu} = \denoiser \left( b_{\mu}, A_{\mu} \right)
\end{equation}

\subsection{Recovering the leave-one-out estimators from AMP}

For each sample $i$, computing the leave-one-out estimator $\what_{-i}$ means computing the marginals of the distribution
\begin{equation}
    p(\vec{\theta}) = \frac{1}{Z} \prod_{j \neq i} P_{out} \left( y_j | \vec{\theta}^{\top} \vec{x}_j \right) \prod_{\mu = 1}^d P_{\theta} \left( \vec{\theta}_{\mu} \right)
    \label{eq:def_distribution_loo}
\end{equation}
with $P_{out}$ and $P_{\theta}$ defined in~\cref{eq:probas_for_erm} and where the sample $(\vec{x}_i, y_i)$ is removed from the data. Our method leverages the fact that these marginals are computed iteratively by relaxed-BP and stored in the variables $\hat{\theta}_{\mu \to i}$. Indeed, each $\hat{\theta}_{\mu \to i}$ stores the posterior mean of $\theta_{\mu}$ when the interaction node $i$ is removed from the graph associated to~\cref{eq:def_distribution_amp}, which corresponds exactly to the distribution of~\cref{eq:def_distribution_loo}. While rBP explicitly computes these quantities, its computational complexity makes it unusable. Instead, we will recover these estimators from AMP. Indeed, at the leading order we have: 

\begin{align}
    \hat{\theta}_{\mu \to i} &= \denoiser \left( b_{\mu \to i}, A_{\mu \to i} \right) = \denoiser \left( b_{\mu \to i}, A_{\mu} \right) + O \left( \sfrac{1}{n} \right) \\
                &=  \denoiser \left( b_{\mu}, A_{\mu} \right) - b_{i \to \mu} \partial_b \denoiser \left( b_\mu, A_\mu \right) + O \left( \sfrac{1}{n} \right) = \what_\mu - \channel (y_i, \omega_i, V_i) \Vec{x}_{i \mu} \vhat_\mu + O \left( \sfrac{1}{n} \right)
    \label{eq:from_marginal_to_cavity_mean}
\end{align}

The expression on the right-hand side corresponds to the approximation of the leave-one-out estimators $\what_{-i, \gamp}$ used in~\cref{alg:gamp}. 

\paragraph{Convergence of the leave-one-out residuals in high-dimensions} Under the assumptions~\eqref{eq:gaussian_data}, we see from~\cref{eq:from_marginal_to_cavity_mean} that in the high-dimensional limit the leave-one-out estimators computed by AMP will converge to the exact ones at a $O(\sfrac{1}{n})$ rate. As such, for a given test sample $\vec{x}, y$ the approximated residuals $y - \vec{x}^{\top} \vec{\theta}_{-i, \gamp}$ will converge to $y - \vec{x}^{\top} \vec{\theta}_{-i}$ at a $O \left( \sfrac{1}{\sqrt{n}} \right)$ rate. This implies that asymptotically the prediction intervals built using the AMP leave-one-out converge to the prediction intervals with the exact residuals.

\paragraph{Applying AMP without Gaussian assumptions} 
We thus see that from AMP, we get an approximation of the leave-one-out estimator that can be used to compute the residuals in~\cref{eq:scores_fcp}. The derivations performed in this section were done under the assumption that the input data are Gaussian with i.i.d. covariance and $\sfrac{1}{d}$ variance. However, AMP can be applied on any data, with no guarantee a priori on its performance.

\begin{algorithm}[tb]
    \caption{relaxed-Belief Propagation}
    \label{alg:rBP}
    \begin{algorithmic}

        \STATE {\bfseries Input:} Dataset $\dataset = \left( \Vec{x}_i, y_i \right)_{i = 1}^n$
    \REPEAT

        \STATE \begin{align}
            \begin{cases}
                V_{i \to \mu}^{t}      &= \sum_{\nu \neq \mu} \vec{x}^2_{i \nu} \hat{v}_{\nu \to i}^{t-1} \\
                \omega_{i \to \mu}^{t} &= \sum_{\nu \neq \mu} \vec{x}_{i \nu} \hat{\theta}_{\nu \to i}^{t-1}
            \end{cases}   
        \end{align}

        \STATE \begin{align}
        \begin{cases}
            A_{\mu \to i}^{t} &= - \sum_{j \neq i} \partial_{\omega} \channel \left( y_j, \omega_{j \to \mu}^t, V_{j \to \mu}^t \right) \Vec{x}_{j \mu}^2 \\
            b_{\mu \to i}^{t} &= \sum_{j \neq i} \channel \left( y_j, \omega_{j \to \mu}^t, V_{j \to \mu}^t \right) \Vec{x}_{j \mu}
        \end{cases}
        \end{align}

        \STATE \begin{align}
         \hat{\theta}_{\mu \to i}^{t} &= \denoiser \left( b_{\mu \to i}^{t}, A_{\mu \to i}^{t} \right) \\
         \hat{v}_{\mu \to i}^{t}      &= \partial_b \denoiser \left( b_{\mu \to i}^{t}, A_{\mu \to i}^{t} \right)
        \end{align}
        
    \UNTIL{Convergence  of $\hat{\theta}_{\mu \to i}, \hat{v}_{\mu \to i}$}
        \STATE {\bfseries Return} $\what, \vhat$ \textbf{ such that :}  
        \begin{align}
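            \hat{\theta}_{\mu} &= \denoiser \left( b_{\mu}, A_{\mu} \right), \qquad b_{\mu} = \sum_{i = 1}^n \channel \left( y_i, \omega_{i \to \mu}, V_{i \to \mu} \right) \Vec{x}_{i \mu} \\
            \hat{v}_{\mu} &= \partial_b \denoiser \left( b_{\mu}, A_{\mu} \right), \qquad A_{\mu} = - \sum_{i = 1}^n \partial_{\omega} \channel \left( y_i, \omega_{i \to \mu}, V_{i \to \mu} \right) \Vec{x}_{i \mu}^2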
        \end{align} 
    \end{algorithmic}
\end{algorithm}

\section{Derivation of \taylorgamp}
\label{appendix:taylorgamp}

In this section, we derive the \taylorgamp algorithm. Our starting point is AMP, derived in Appendix~\ref{appendix:amp}. In what follows, we consider a dataset $\mathcal{D}$ of size $n+1$ to stay consistent with the notation of the main text. Our goal is to compute the variation of the $\what_{-i}$ to the first order with respect to the last label $y_{n+1}$. To this end, we will write the vectors defined in AMP $\what(y), \vhat(y), \Vec{g}(y), \partial\Vec{g}(y), \Vec{b}(y), \vec{A}(y), \Vec{\omega}(y), \Vec{V}(y)$ as functions of $y_{n+1} = y$.

For the sake of conciseness, let us define the vector 
\begin{equation}
    \Omega \left( y \right) = \left( \what(y), \vhat(y), \Vec{\omega}(y), \Vec{V}(y), \Vec{g}(y), \partial\Vec{g}(y), \Vec{b}(y), \vec{A}(y) \right) \in \mathbb{R}^{4 \times (d + n)}
    \label{eq:def_omega}
\end{equation}

Then, $\Omega (y)$ is the fixed point of the equation 
\[
\Omega (y) = \fgamp ( \Omega(y), y)
\]
where the function $\fgamp(\Omega, y) = \left( f_{\gamp}^{\what}, f_{\gamp}^{\vhat}, f_{\gamp}^{\Vec{\omega}}, f_{\gamp}^{\Vec{V}}, f_{\gamp}^{\Vec{g}}, f_{\gamp}^{\partial \Vec{g}}, f_{\gamp}^{\Vec{b}}, f_{\gamp}^{\Vec{A}} \right)$ is defined as 
\begin{align}
    \begin{cases}
        \fgamp^{\what} &= \denoiser(\Vec{b}, \vec{A})\\
        \fgamp^{\vhat} &= \partial_b \denoiser(\Vec{b}, \vec{A})\\
        \fgamp^{\Vec{\omega}} &= X\what - \Vec{V} \odot \Vec{g} \\
        \fgamp^{\Vec{V}} &= X^2 \vhat \\
        \fgamp^{\Vec{g}} &= \channel \left( \Vec{y}, \Vec{\omega}, \Vec{V} \right) \\
        \fgamp^{\partial \Vec{g}} &= \partial_{\omega} \channel \left( \Vec{y}, \Vec{\omega}, \Vec{V} \right) \\
        \fgamp^{\Vec{b}} &= X^{\top} \Vec{g} + \Vec{A} \odot \what \\
        \fgamp^{\Vec{A}} &= - X^{2 \top} \partial \Vec{g} \\
    \end{cases}
    \label{eq:def_f_gamp}
\end{align}

Equivalently, we have $\Omega(y) - \fgamp(\Omega (y), y) = \Vec{0}$. Under the assumption that the function $\Omega(y)$ is differentiable, one can use the implicit function theorem around a value $\hat{y}$ to write 
\begin{align}
    \frac{\partial \Omega}{\partial y}\left( \hat{y} \right)                 &= \left( \mathbf{I} - Jac \left( \fgamp \right) \right)^{-1} \frac{\partial \fgamp}{\partial y} \left( \hat{y} \right) \\
    \Leftrightarrow \frac{\partial \Omega}{\partial y}\left( \hat{y} \right) &=  Jac \left( \fgamp \right) \left( \frac{\partial \Omega}{\partial y}\left( \hat{y} \right) \right) + \frac{\partial \fgamp}{\partial y} \left( \hat{y} \right)
\end{align}

From the last equality we find that we can compute the derivative $\frac{\partial \Omega}{\partial y}\left( \hat{y} \right)$ by iterating the following system of linear equations over a vector $\Delta \Omega^t$ :
\begin{equation}
    \Delta \Omega^{t+1} =  Jac \left( \fgamp \right) \left( \Delta \Omega^{t} \right) + \frac{\partial \fgamp}{\partial y} \left( \hat{y} \right)
    \label{eq:iteration_delta_omega}
\end{equation}

The jacobian of the function $\fgamp$ is written 
\begin{align}
    \begin{cases}
        Jac f_{\amp}^{\what}            &= \left( \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \partial_b \denoiser(\Vec{b}, \Vec{A}), \partial_A \denoiser(\Vec{b}, \Vec{A}) \right) \\
        Jac f_{\amp}^{\vhat}            &= \left( \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \partial_b \partial_b \denoiser(\Vec{b}, \Vec{A}), \partial_A \partial_b \denoiser(\Vec{b}, \Vec{A}) \right) \\
        Jac f_{\amp}^{\Vec{\omega}}     &= \left( X, \Vec{0}, \Vec{0}, - Diag(\Vec{g}), - Diag(\Vec{V}), \Vec{0}, \Vec{0}, \Vec{0} \right) \\
        Jac f_{\amp}^{\Vec{V}}          &= \left( \Vec{0}, X^2, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0} \right) \\
        Jac f_{\amp}^{\Vec{g}}          &= \left( \Vec{0}, \Vec{0}, \partial_{\omega} \Vec{g}, \partial_{V} \Vec{g}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0} \right) \\
        Jac f_{\amp}^{\partial \Vec{g}} &= \left( \Vec{0}, \Vec{0}, \partial_{\omega} \partial_{\omega} \Vec{g}, \partial_{V} \partial_{\omega} \Vec{g}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0} \right) \\
        Jac f_{\amp}^{\Vec{b}}          &= \left( Diag(\Vec{A}), \Vec{0}, \Vec{0}, \Vec{0}, X^{\top}, \Vec{0}, \Vec{0}, Diag(\what) \right) \\
        Jac f_{\amp}^{\Vec{A}}          &= \left( \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, \Vec{0}, - X^{2\top}, \Vec{0}, \Vec{0} \right)\\
    \end{cases}
    \label{eq:jacobian_f_gamp}
\end{align}

and the derivative $\frac{\partial \fgamp}{\partial y}$ with respect to the last label is 
\begin{align*}
    \begin{cases}
        \partial_y f_{\amp}^{\Vec{\theta}} &= \Vec{0} \\
        \partial_y f_{\amp}^{\Vec{v}} &= \Vec{0} \\
        \partial_y f_{\amp}^{\Vec{\omega}} &= \Vec{0}\\
        \partial_y f_{\amp}^{\Vec{V}} &= \Vec{0} \\
        \partial_y f_{\amp}^{\Vec{g}} &= \left( 0, \cdots, 0, \partial_y g(y_{n+1}, \omega_{n+1}, V_{n+1}) \right) \\
        \partial_y f_{\amp}^{\partial \Vec{g}} &= \left( 0, \cdots, 0, \partial_y \partial_{\omega} g(y_{n+1}, \omega_{n+1}, V_{n+1}) \right) \\
        \partial_y f_{\amp}^{\Vec{b}} &= \Vec{0} \\
        \partial_y f_{\amp}^{\Vec{A}} &= \Vec{0} \\
    \end{cases}
\end{align*}

When writing Equation~\eqref{eq:iteration_delta_omega} with the expression of the Jacobian of Equation~\eqref{eq:jacobian_f_gamp}, one obtains the iterations of \taylorgamp in Algorithm~\ref{alg:gamp_order_one}.

\subsection{Justification of \taylorgamp}
\label{appendix:taylorgamp_justification}

% Intuitively, the goal of \taylorgamp is to avoid repeting computations and only run the AMP algorithm once. Indeed, in high-dimensions, the label of the test samples should only weakly affect the estimator, such that 
As stated in the previous subsection, \taylorgamp is based on the assumption that the function $y \to \Omega \left( y \right)$ is differentiable. Our underlying assumption behind \taylorgamp is that the leave-one-out residuals only weakly depend on the last label in high-dimensions. We numerically justify this assumption in~\cref{fig:residuals_gamp_erm}. In this Figure, we compare the leave-one-out residuals obtained by computing the estimators $\what_{-i}$ exactly and with \taylorgamp for different settings. To do so, we sample a dataset $\dataset$ at random. We use~\cref{alg:gamp} and \cref{alg:gamp_order_one} to compute the $\what_{-i, \gamp}(y_n)$ and $\Delta \what_{-i, \gamp}(y_n)$ as prescribed above. Then, we change the last label $y_n \to y_n + \delta y$ with $\delta y = 5$. After this change we compute the leave-one-out estimators exactly $\what_{-i} (y_n + \delta y)$ and use our linear approximation $\what_{-i, \gamp} (y_n + \delta y) = \what_{-i, \gamp} ( y_n ) + \delta y \Delta \what_{-i, \gamp} (y_n)$. We then compare $\what_{-i} (y_n + \delta y)^{\top} \vec{x}_i$ and our approximation $\what_{-i, \gamp} (y_n + \delta y)^{\top} \vec{x}_i$ that is used to compute our conformity scores. As we observe in the figure, at high dimensions $d = 1000$, our approximations are very close to the true values, meaning that \taylorgamp will accurately estimate the scores (hence the prediction intervals) of FCP.

We note however from the lower-left plot that at moderate dimension, \taylorgamp does not precisely approximate the leave-one-out residuals for the LASSO, which partly explains the mediocre results obtained by \taylorgamp on real data in~\cref{tab:comparison_with_homotopy_real} in the main text.

\begin{figure}
    \centering
    \includegraphics[width=0.45\textwidth]{Figures/residuals_comparison_Ridge_λ = 0.01_α = 0.5.pdf}
    \includegraphics[width=0.45\textwidth]{Figures/residuals_comparison_Ridge_λ = 1_α = 0.5.pdf}
    \includegraphics[width=0.45\textwidth]{Figures/residuals_comparison_Lasso_λ = 0.01_α = 0.5.pdf}
    \includegraphics[width=0.45\textwidth]{Figures/residuals_comparison_Lasso_λ = 1_α = 0.5.pdf}
    \caption{Comparison of the leave-one-out estimators computed exactly by solving~\cref{eq:argmin_loo} and by \taylorgamp, for Ridge (top row) and Lasso (bottom row), with $\lambda = 0.01$ (left column) and $\lambda = 1$ (right column). All plots are at $\sfrac{n}{d} = 0.5$.}
    \label{fig:residuals_gamp_erm}
\end{figure}

\section{Coverage guarantee for AMP}
\label{appendix:amp_coverage}

First, we show that AMP is symmetric: indeed, consider a permutation $s : [1, n] \rightarrow [1, n]$ and $S$ the corresponding permutation matrix defined as $S_{ij} = \delta \left( j = s(i) \right)$. Then, consider running AMP on the permuted data $\Tilde{X} = SX$ and labels $\Tilde{y} = SY$. At each iteration $t$, the channel vectors $\Tilde{\Vec{g}}^t, \Tilde{\partial{\Vec{g}}}^t$ satisfy
$\Tilde{\Vec{g}}^t = S \Vec{g}^t$ and $\Tilde{\partial \Vec{g}^t} = S \partial \Vec{g}^t$. Then, the vectors $\Vec{b}^t, \Vec{A}^t$ now become 
\begin{align}
\begin{cases}
\Tilde{\vec{A}}^t &= - \Tilde{X^{2\top}} \Tilde{\partial \vec{g}^t} = - X^{2\top} S^{\top} S \partial \Vec{g}^t = \Vec{A}^t \\
\Tilde{\vec{b}}^t &= \Tilde{X^{\top}} \Tilde{\vec{g}^t} + \Tilde{\Vec{A}^t} \odot \what^t = X^{\top} S^{\top} S \vec{g}^t + \Vec{A}^t \odot \what^t = \Vec{b}^t \\
\end{cases}
\end{align}
and by recursion we deduce that the estimator of AMP $(\what, \vhat)$ given after convergence is invariant under permutation. Then, the scores computed from~\cref{eq:leave_one_out_from_amp} are symmetric. Then, under the assumption that the data $(\vec{x}_i, y_i)$ is exchangeable, we obtain~\cref{prop:coverage} : in expectation over the training and test data
\begin{equation}
    \mathbb{P}_{\mathcal{D}, \vec{x}} \left( y \in \interval(\vec{x})  \right) \geqslant 1 - \kappa
\end{equation}

\section{Details on real datasets}
\label{app:numerical_details}

In this section, we provide details on the datasets used in~\cref{tab:comparison_with_homotopy_real}. We use :
\begin{enumerate}
    \item The wine quality dataset~\citep{Cortez2009ModelingWP}, with 1143 samples at dimension 11, containing a rating of the wine quality on a 1--5 scale as a function of different physical quantities. In our experiments, we split the data into training and test sets with a 90\% / 10\% proportion.
    \item The Boston housing dataset containing 506 samples at dimension 14, with a training / test split of 80 \% / 20 \%.
    \item The Riboflavin dataset~\cite{Buhlmann2014HighDimensional} of 71 samples at dimension 4088
\end{enumerate}
All datasets, with dimension noted as $d$, were normalized such that the standard deviation of the output $y$ is 1 and the standard deviation of each input dimension is $1 / \sqrt{d}$.

For~\cref{tab:comparison_with_homotopy_real}, approximate homotopy and exact homotopy were used with the default parameters provided by the authors.

\subsection{Additional target coverages}
\label{app:additional_coverages}

For the sake of completeness, we reproduce the experiments of~\cref{tab:comparison_with_homotopy_real} at other target coverages for the Boston and the Riboflavin datasets. We plot the empirical coverage for GAMP and \taylorgamp as a function of the target coverage. The solid line and shaded area are respectively the mean and standard deviation over different train / test splits. We observe that both methods achieve the correct coverage on both datasets.

\begin{figure}
    \centering
    \includegraphics[width=0.45\linewidth]{Figures/other_coverages/boston_coverages_GAMP.png}
    \includegraphics[width=0.45\linewidth]{Figures/other_coverages/boston_coverages_Taylor-AMP.png}
    \caption{Coverage of AMP (Left) and Taylor-AMP (Right) on the Boston dataset as a function of the target coverage. Line and shaded area are respectively the mean and standard deviation of the coverage over 100 random training / test splits. Black dashed line corresponds to a valid coverage that matches the target.}
    \label{fig:additional_coverages_boston}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.45\linewidth]{Figures/other_coverages/riboflavin_coverages_GAMP.png}
    \includegraphics[width=0.45\linewidth]{Figures/other_coverages/riboflavin_coverages_Taylor-AMP.png}
    \caption{Coverage of AMP (Left) and Taylor-AMP (Right) on the Riboflavin dataset as a function of the target coverage. Line and shaded area are respectively the mean and standard deviation of the coverage over 20 random training / test splits. Black dashed line corresponds to a valid coverage that matches the target.}
    \label{fig:additional_coverages_riboflavin}
\end{figure}

Note that all experiments in the paper were carried out on an Apple M1 Pro laptop with 16 GB of memory. The prediction intervals are obtained by selecting potential labels over a grid. 

The code and data used for the experiments are available at \url{github.com/lclarte/ConformalAmp.jl}

\section{Extension to generalized linear models}

\paragraph{Robust regression and quantile regression}
\label{app:quantile_robust}

Numerical experiments in~\cref{sec:numerics} were focused on the square loss. However, our method can be extended to other regression problems. In this section, we consider the pinball loss, also known as quantile loss, defined as 
\begin{equation}
    \ell(y, \hat{y}) = q \times \max(y - \hat{y}, 0) + (1 - q) \times \max(\hat{y} - y, 0)
\end{equation}
and used to estimate the quantile function $q$ of the data. The AMP can be applied to this loss with the channel 
\begin{align}
    \prox_{\ell}(y, \omega, V) = \arg\min_z \ell(y, z) + \frac{1}{2V} (\omega - z)^2 = \begin{cases}
         &\omega + (q - 1) V \text{ if } \omega > y - (q - 1) V \\
         &\omega + qV \text{ if } \omega < y - qV \\
         &y \text{ otherwise }
    \end{cases}
\end{align}
and $\channel(y, \omega, V) = \frac{\prox_{\ell}(y, \omega, V) - \omega}{V}$. For $q = \sfrac{1}{2}$, this loss is equal (up to a factor $2$ scaling) to the absolute value loss, as it reduces to
\begin{equation}
    \ell(y, \hat{y}) = \frac{1}{2}|y - \hat{y}| 
\end{equation}
which is notably used for robust regression in the presence of outliers. 

\paragraph{Binary classification}
\label{appendix:classification}

Conformal prediction has been successfully applied for classification tasks~\cite{angelopoulos2021learn, angelopoulos2022gentle}. Consider a classification task with $K$ classes, where a predictor estimates the probabilities $p_1 (\vec{x} ), \cdots, p_K ( \vec{x} )$. Then, the conformity scores are defined as 
\begin{equation}
    \score_i = \sum_{k = 1}^{\pi^{-1}(y_i)} p_{\pi(k)} ( \vec{x}_i )
    \label{eq:score_classification}
\end{equation}
where $\pi$ is a permutation that ranks the classes by decreasing order of probability, i.e.\ $p_{\pi(1)} > \cdots > p_{\pi(K)}$. In words, the score is the sum of the probabilities of all the classes whose probability is higher than or equal to that of the true observed class.

In the context of generalized linear model, one might train an estimator using the cross entropy loss with an $L_2$ regularizer. For $K = 2$ classes, this is logistic regression 
\begin{equation}
    \what = \arg\min_{\vectheta} \sum_{i = 1}^n \log \left( 1 + e^{- y_i \times \vec{x}_i^{\top} \vectheta} \right) + \sfrac{\lambda}{2} \| \vectheta \|^2
\end{equation}

As for regression, one can use AMP and \taylorgamp with the adequate channel and denoising function to estimate $\what$, and compute the leave-one-out estimators using~\cref{eq:leave_one_out_from_amp}. For the logistic loss, the channel function is defined as 
\begin{align}
    \channel(y, \omega, V) = \frac{{\rm prox} \ell_{\omega, V}(y, \cdot) - \omega}{V}, \qquad {\rm prox} \ell_{\omega, V}(y, \cdot) = \arg\min_z \ell(y, z) + \frac{1}{2V} \left( z - \omega \right)^2
\end{align}

\section{Asymptotic of the prediction interval sizes under Gaussian assumption}
\label{app:replica_sizes}

Our method leverages the state-evolution equations of AMP. In fact, using the state evolution equations of AMP, we can sharply compute the size of the prediction intervals in the high-dimensional limit, under the assumption of~\cref{eq:gaussian_data}. First, consider the leave-one-out residuals $r_i$
\begin{equation}
    r_i :=  \what_{-i} \left( y \right)^{\top} \Vec{x}_i - y_i
\end{equation}
such that $\sigma_i = | r_i |$. These residuals can be computed using r-BP as explained in~\cref{appendix:amp}. Indeed, in the high-dimensional limit 
\begin{equation}
    r_i = \what_{-i, rBP} \left( y \right)^{\top} \Vec{x}_i - y_i
\end{equation}
where the value of the vector $\what_{-i, rBP}$ at the index $\mu$ is the cavity mean $\hat{\theta}_{\mu \to i}$ defined in~\cref{eq:cavity_mean_variance}. Now, note that the distribution of the $r_i$ can be easily computed under the Gaussian data assumption: by definition, the vector $\what_{-i}$ is uncorrelated with $\vec{x}_i$. Hence, $\what_{-i}^{\top} \vec{x}_i - y_i = \left( \what_{-i} - \wstar \right)^{\top} \vec{x}_i + \varepsilon$ follows a Gaussian distribution with mean $0$ and variance $\frac{1}{d} \left( \| \wstar \|^2 - 2 \times \wstar^{\top} \what_{-i} + \| \what_{-i} \|^2 \right) + \Delta$. Here, $\rho  = \frac{1}{d} \| \wstar \|^2$ is given by the prior on $\wstar$, and is for instance equal to $1$ when $\wstar_i \sim \mathcal{N}(0, 1)$. In the high-dimensional limit, the scalar products $\frac{1}{d} \wstar^{\top} \what_{-i} (y)$ converge to a common value $m = \lim_{d \to \infty} \frac{1}{d} \wstar^{\top} \what$ for all $i$ and all $y$. Similarly, the square norms of the leave-one-out estimators converge to the same value $q = \lim_{d \to \infty} \frac{1}{d} \| \what \|^2$. 

To summarize, as $n, d \to \infty$ the residuals $r_i$ follow the distribution
\begin{equation}
    r_i \sim \mathcal{N} \left( 0, \rho - 2 m + q + \Delta \right)
\end{equation}
with
\begin{equation}
    m = \lim_{d \to \infty} \frac{1}{d} \wstar^{\top} \what, \qquad  q = \lim_{d \to \infty} \frac{1}{d} \| \what \|^2
    \label{eq:def_m_q}
\end{equation}

\paragraph{From the distribution of the residuals to the prediction interval} Since the asymptotic distribution of the $\left( r_i \right)_i$ is Gaussian, one obtains the $1 - \kappa$ quantile of the scores $\score_i$ by computing the $1 - \kappa / 2$ and the $\kappa / 2$ quantiles of this Gaussian distribution. By definition of full conformal prediction, a label $y$ will be included in the prediction set if and only if $| y - \what_{-(n+1)}^{\top} \vec{x} | < q_{1 - \kappa} ( (\score_i)_i )$, but since these scores asymptotically follow the distribution of the absolute value of a centered Gaussian variable, their $1 - \kappa$ quantile is equal to the $1 - \kappa / 2$ quantile of the corresponding Gaussian distribution. In conclusion, asymptotically, the prediction interval will be 
\begin{equation}
    \mathcal{S} (\vec{x}) = [ \what^{\top} \vec{x} \pm \sqrt{\rho - 2m + q + \Delta} \times q_{1 - \sfrac{\kappa}{2}} \left( Z \right)  ], \qquad Z \sim \mathcal{N}(0, 1)
    \label{eq:asymptotic_prediction_interval}
\end{equation}
where $m, q$ are given by the state-evolution equations of AMP that we detail in~\cref{app:se_amp}. Note that $\rho - 2m + q + \Delta$ is exactly equal to the generalization error (for the mean square error) of the ERM estimator. Thus, \cref{eq:asymptotic_prediction_interval} directly links the generalization error of the estimator with the size of the prediction intervals and shows that the best estimator also has the tightest intervals.

\subsection{State-evolution equations of AMP}
\label{app:se_amp}

As explained in the previous section, one only needs the value of the overlaps $m$ and $q$ \eqref{eq:def_m_q} to compute the size of the prediction intervals in high-dimension. To do so, it is useful to go back to relaxed-BP, which is asymptotically equivalent to AMP and thus has the same overlaps. 

The rBP equations are written,

\begin{align}
    \begin{cases}
        {\omega}^{(t)}_{\mu \to i} &= \sum_{j \neq i} X_{\mu, j} \hat{{\theta}}^{(t)}_{j \to \mu} \\
        {V}^{(t)}_{\mu \to i} &= \sum_{j \neq i} X_{\mu, j}^2 \hat{{C}}^{(t)}_{j \to \mu}
    \end{cases},\quad
    \begin{cases}
        \channel{}_{\mu \to i}^{(t)} &= \channel(y_{\mu}, {\omega}_{\mu \to i}^{(t)}, {V}^{(t)}_{\mu \to i} ) \\
        \partial\channel{}_{\mu \to i}^{(t)} &= \partial_{{\omega}}\channel(y_{\mu}, {\omega}_{\mu \to i}^{(t)}, {V}^{(t)}_{\mu \to i} )
    \end{cases}
\end{align} 

\begin{align}
    \begin{cases}
        {b}_{i \to \mu}^{(t)} &= \sum_{\nu \neq \mu} X_{\nu, i} \channel^{(t)}{}_{\nu \to i} \\
        {A}_{i \to \mu}^{(t)} &= - \sum_{\nu \neq \mu} X_{\nu, i}^2 \partial \channel^{(t)}{}_{\nu \to i} 
    \end{cases},\quad
    \begin{cases}
        \hat{{\theta}}^{(t)}_{i \to \mu} &= \denoiser({b}^{(t)}_{i \to \mu}, {A}_{i \to \mu}^{(t)}) \\
        \hat{{C}}^{(t)}_{i \to \mu} &= \partial_{b}\denoiser({b}^{(t)}_{i \to \mu}, {A}_{i \to \mu}^{(t)}).
    \end{cases}
\end{align}

It turns out that the average asymptotic behavior of these equations can be tracked with some overlap parameters defined as follows:
\begin{align}\label{eq:rbp_se_overlaps}
    {m}^{(t)} &\equiv \lim_{d\to\infty}\frac1d\sum_{i=1}^d \hat{\theta}^{(t)}_i \theta_{\star i}, \quad &{q}^{(t)} &\equiv \lim_{d\to\infty}\frac1d\sum_{i=1}^d \left( \hat{\theta}^{(t)}_i \right)^2\\
    {V}^{(t)} &\equiv \lim_{d\to\infty}\frac1d\sum_{i=1}^d \hat{{C}}^{(t)}_i, \quad &\rho&= \lim_{d\to\infty} \frac{\|\wstar\|^2}{d}.
\end{align}
To derive the asymptotic behavior of these overlap parameters, we compute the overlap distributions starting from the rBP equations above.

\subsubsection{Messages Distribution}
For convenience, let us define $z_{\mu} \equiv \vec{x}_\mu^\top\wstar$ and $z_{\mu\to i} \equiv \sum_{j\neq i} X_{\mu, j}\theta_{\star j}$, so that $z_{\mu} = z_{\mu \to i} + \theta_{\star i} X_{\mu, i}$.

\paragraph{Distribution of $(z_{\mu}, {\omega}^{(t)}_{\mu \to i})$}
By the Central Limit Theorem, since $(z_{\mu}, {\omega}^{(t)}_{\mu \to i})$ are the sum of independent variables, they follow Gaussian distributions in the $d\to\infty$ limit. Therefore, we only need to compute their means, variances, and cross-correlation. Recall that from our assumptions, the random variables $X_{\mu, j}$ are i.i.d. zero-mean Gaussian with variance $\sfrac1d$. Hence, the first and second-order statistics read

\begin{align}
    \mathbb{E} \left[ z_{\mu} \right] &= \wstar^\top\mathbb{E}[\vec{X}_\mu] =  0 \\
    \mathbb{E} \left[ z_{\mu}^2 \right] &= \sum_{i, j=1}^d \mathbb{E}[X_{\mu, i}X_{\mu, j}]\theta_{\star i}\theta_{\star j} = \sum_{i, j=1}^d \frac1d\delta_{ij}\theta_{\star i}\theta_{\star j} =  \frac{\| \wstar \|^2}{d} \stackrel{d\to\infty}{\longrightarrow} \rho \\
    \mathbb{E} \left[ {\omega}^{(t)}_{\mu \to i} \right] &= \sum_{j \neq i} \mathbb{E}[X_{\mu, j}]\hat{\vec{\theta}}^{(t)}_{j \to \mu} = {0} \\
    \mathbb{E} \left[ {\omega}^{(t)}_{\mu \to i}({\omega}^{(t)}_{\mu \to i})^\top \right] &= \sum_{j \neq i}^{d}\sum_{k \neq i}^{d}\mathbb{E}[X_{\mu, j}X_{\mu, k}]\hat{\Vec{\theta}}^{(t)}_{j \to \mu}\hat{\Vec{\theta}}^{(t) \top}_{k \to \mu} = \frac1d\sum_{j \neq i}^{d}\hat{\Vec{\theta}}^{(t)}_{j \to \mu}\hat{\Vec{\theta}}^{(t) \top}_{j \to \mu} \\
    &= \frac{1}{d} \sum_{j=1}^{d} \hat{\Vec{\theta}}^{(t)}_{j \to \mu} \hat{\Vec{\theta}}^{(t) \top}_{j \to \mu} - \frac1d\hat{\Vec{\theta}}^{(t)}_{i \to \mu} \hat{\Vec{\theta}}^{(t) \top}_{i \to \mu}  \stackrel{d\to\infty}{\longrightarrow} {q}^{(t)}\\
    \mathbb{E} \left[ z_{\mu} {\omega}^{(t)}_{\mu \to i} \right] &= \sum_{j=1}^d \sum_{k\neq i}^d\mathbb{E}[X_{\mu, j}X_{\mu, k}] \hat{\Vec{\theta}}^{(t)}_{k \to \mu} \theta_{\star j} = \frac{1}{d} \sum_{j \neq i} \hat{\Vec{\theta}}^{(t)}_{j \to \mu} \theta_{\star j}\\
    &= \frac{1}{d} \sum_{j=1}^d \hat{\Vec{\theta}}^{(t)}_{j \to \mu} \theta_{\star j} - \frac{1}{d}\hat{\Vec{\theta}}^{(t)}_{i \to \mu} \theta_{\star i}\stackrel{d\to\infty}{\longrightarrow} {m}^{(t)}
\end{align}

In summary, in the $d \to \infty$ limit : 
\begin{equation}\label{eq:joint_distribution_z_omega}
    \left( z_{\mu}, {\omega}^{(t)}_{\mu \to i} \right) \sim \mathcal{N}\left( 0, \begin{bmatrix}
        \rho & {m}^{(t) \top} \\
        {m}^{(t)} & {q}^{(t)}
    \end{bmatrix}
    \right)
\end{equation}

\paragraph{Concentration of ${V}^{(t)}_{\mu \to i}$}

In the asymptotic limit, the variances ${V}^{(t)}_{\mu \to i}$ concentrate around their mean, which equals
\begin{equation}
    \mathbb{E} \left[ {V}^{(t)}_{\mu \to i} \right] = \sum_{j \neq i}^d \mathbb{E} \left[ X_{\mu, j}^2 \right] \hat{{C}}^{(t)}_{j \to \mu} = \frac{1}{d} \sum_{j \neq i} \hat{{C}}^{(t)}_{j \to \mu} = \frac{1}{d} \sum_{j=1}^d \hat{{C}}^{(t)}_{j \to \mu} - \frac1d \hat{{C}}^{(t)}_{i \to \mu} \stackrel{d\to\infty}{\longrightarrow} {V}^{(t)}
\end{equation}

\paragraph{Distribution of ${b}^{(t)}_{\mu \to i}$}
Recall from our setting that for a given input $\vec{x}_\mu$, the corresponding label is distributed as $y_\mu\sim p(\cdot|z_\mu)$. In fact, one can equivalently write $y_\mu=\varphi_0(z_\mu)$ for some (random) function $\varphi_0$. For example, the choice $\varphi_0(x)=x+\sqrt{\Delta}\xi$ corresponds to linear regression, where $\xi\sim\mathcal{N}(0, 1)$ is standard Gaussian noise and $\Delta\geq 0$ is the noise variance.
With this representation for $y_\mu$, we have
% We have
% \begin{equation}
% \Vec{b}^{(t)}_{\mu \to i} = \sum_{\nu \neq \mu} X_{\nu, i} \channel(y_{\nu}, \Vec{\omega}^{(t)}_{\nu \to i}, \Vec{V}_{\nu \to i} )
% \end{equation}
% $y_{\nu}$ is correlated to $X_{\nu, i}$, we thus write it as 
% \begin{equation}
%     y_{\nu} = \varphi_0 \left( z_{\mu \to i} + \wstar{}_i X_{\nu, i}, \varepsilon_{\nu} \right), z_{\mu \to i} = \sum_{j \neq i} \wstar{}_j X_{\nu, j}
% \end{equation}
% where the random variable $\varepsilon_{\nu}$ accounts for the stochasticity in the data-generating process. We have : 
\begin{align}
    {b}^{(t)}_{\mu \to i} &= \sum_{\nu \neq \mu} X_{\nu, i} \channel( \varphi_0 \left( z_\nu \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \\
    &= \sum_{\nu \neq \mu} X_{\nu, i} \channel( \varphi_0 \left( z_{\nu \to i} + \theta_{\star i} X_{\nu, i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \\
    &= \sum_{\nu \neq \mu} \left[ X_{\nu, i} \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) + X_{\nu, i}^2 \theta_{\star i} \partial_z \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \right] + O(d^{-3/2}),
\end{align}
where in the last equality we have expanded the channel function $\channel$ to leading order in $X_{\nu, i}$. Taking expectation on both sides yields
\begin{align}
    \mathbb{E}[{b}^{(t)}_{\mu \to i}] &= \frac{\theta_{\star i}}{d}  \sum_{\nu \neq \mu}\partial_z \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) + O(d^{-3/2})\\
    &= \frac{\theta_{\star i}}{d}  \sum_{\nu =1}^n \partial_z \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) - \frac{\theta_{\star i}}{d}\partial_z \channel( \varphi_0 \left( z_{\mu \to i} \right) , {\omega}^{(t)}_{\mu \to i}, {V}^{(t)}_{\mu \to i} ) + O(d^{-3/2}).
\end{align}
Note that as $d\to\infty$, it follows from our computations above that for all $\nu$, $(z_{\nu\to i}, {\omega}^{(t)}_{\nu\to i})$ are identically distributed according to~\cref{eq:joint_distribution_z_omega}. Consequently, by the Law of Large Numbers,
\begin{equation}
    \frac{n}{d}\cdot \frac1n\sum_{\nu =1}^n \partial_z \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \stackrel{n, d\to\infty}{\longrightarrow} \alpha\mean{(z, \omega)  }{\partial_z \channel( \varphi_0 \left( z \right) , {\omega}, {V}^{(t)}  )} \equiv \hat{{m}}^{(t)},
\end{equation}
from which we find that
\begin{equation}
    \mathbb{E}[{b}^{(t)}_{\mu \to i}] \stackrel{n, d\to\infty}{\longrightarrow} \theta_{\star i}\hat{{m}}^{(t)}.
\end{equation}
The second moment can be computed in a similar fashion:
\begin{align}
    \mathbb{E}[{b}^{(t)}_{\mu \to i}{b}^{(t)\top}_{\mu \to i}] &= \sum_{\nu \neq \mu}\sum_{\kappa \neq \mu} \mathbb{E}[X_{\nu, i}X_{\kappa, i}] \channel( \varphi_0 \left( z_\nu \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} )\channel( \varphi_0 \left( z_\kappa \right) , {\omega}^{(t)}_{\kappa \to i}, {V}^{(t)}_{\kappa \to i} )^\top\\
    &= \frac1d \sum_{\nu \neq \mu} \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} )^\top + O(d^{-2})\\
    &= \frac1d \sum_{\nu = 1}^n \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} ) \channel( \varphi_0 \left( z_{\nu \to i} \right) , {\omega}^{(t)}_{\nu \to i}, {V}^{(t)}_{\nu \to i} )^\top + O(d^{-2})\\
    &\stackrel{n, d\to\infty}{\longrightarrow} \alpha\mean{(z, {\omega}^{(t)})  }{\channel( \varphi_0 \left( z \right) , {\omega}^{(t)}, {V}^{(t)}  )\channel( \varphi_0 \left( z \right) , {\omega}^{(t)}, {V}^{(t)}  )^\top} \equiv \hat{{q}}^{(t)}.
\end{align}
Hence, ${b}^{(t)}_{\mu \to i} = \theta_{\star i}\hat{{m}}^{(t)} + \left(\hat{{q}}^{(t)}\right)^{\sfrac12}{\xi}$ with ${\xi}\sim\mathcal{N}({0}, 1)$.

\paragraph{Concentration of ${A}^{(t)}_{\mu \to i}$}
It remains to show that the covariances ${A}^{(t)}_{\mu \to i}$ concentrate. We have
\begin{align}
    {A}^{(t)}_{\mu \to i} &= - \sum_{\nu \neq \mu} X_{\nu, i}^2 \partial_{{\omega}} \channel(y_{\nu}, {\omega}_{\nu \to i}^{(t)}, {V}^{(t)}_{\nu \to i} )\\
    &= - \sum_{\nu \neq \mu} X_{\nu, i}^2 \partial_{{\omega}} \channel(\varphi_0(z_\nu), {\omega}_{\nu \to i}^{(t)}, {V}^{(t)}_{\nu \to i} )\\
    &= - \sum_{\nu \neq \mu} X_{\nu, i}^2 \partial_{{\omega}} \channel(\varphi_0(z_{\nu\to i}), {\omega}_{\nu \to i}^{(t)}, {V}^{(t)}_{\nu \to i} ) + O(d^{-3/2}).
\end{align}
Taking the expectation gives
\begin{align}
    \mathbb{E}[{A}^{(t)}_{\mu \to i}] &= -\frac1d \sum_{\nu \neq \mu} \partial_{{\omega}} \channel(\varphi_0(z_{\nu\to i}), {\omega}_{\nu \to i}^{(t)}, {V}^{(t)}_{\nu \to i} ) + O(d^{-3/2}) \\
    &= -\frac1d \sum_{\nu = 1}^n \partial_{{\omega}} \channel(\varphi_0(z_{\nu\to i}), {\omega}_{\nu \to i}^{(t)}, {V}^{(t)}_{\nu \to i} )+\frac1d\partial_{{\omega}}\channel(\varphi_0(z_{\mu\to i}), {\omega}_{\mu \to i}^{(t)}, {V}^{(t)}_{\mu \to i} ) + O(d^{-3/2}) \\
    &\stackrel{n, d\to\infty}{\longrightarrow} -\alpha\mean{(z, {\omega}^{(t)})  }{\partial_{{\omega}}\channel( \varphi_0 \left( z \right) , {\omega}^{(t)}, {V}^{(t)}  )} \equiv \hat{{V}}^{(t)}
\end{align}

\paragraph{State-evolution equations} From the previous computations, we deduce that asymptotically the coordinates of the estimator are distributed as 
\begin{equation}
    \hat{\theta}^{(t)}_i \sim \denoiser \left( \theta_{\star i} \hat{m}^{(t)} + \sqrt{\hat{q}^{(t)}} \varepsilon, \hat{V}^{(t)} \right), \quad \varepsilon \sim \mathcal{N}(0, 1)
\end{equation}

And finally, we get that the overlaps $m, q$ are the solutions of the following state-evolution equations 
\begin{align}
    \begin{cases}
        m &= \mathbb{E}_{\theta_{\star}, \varepsilon} \left[ \denoiser(\hat{m} \theta_{\star} + \sqrt{\hat{q}} \varepsilon, \hat{V}) \theta_{\star} \right] \\
        q &= \mathbb{E}_{\theta_{\star}, \varepsilon} \left[ \denoiser(\hat{m} \theta_{\star} + \sqrt{\hat{q}} \varepsilon, \hat{V})^2 \right] \\
        V &= \mathbb{E}_{\theta_{\star}, \varepsilon} \left[ \partial_b \denoiser(\hat{m} \theta_{\star} + \sqrt{\hat{q}} \varepsilon, \hat{V}) \right] 
    \end{cases}
\end{align} for $\varepsilon \sim \mathcal{N}(0, 1)$ and 

\begin{align}
    \begin{cases}
        \hat{m} &= \alpha \mathbb{E}_{z, \omega} \left[ \partial_z \channel( \varphi_0(z), \omega, V) \right] \\
        \hat{q} &= \alpha \mathbb{E}_{z, \omega} \left[ \channel( \varphi_0(z), \omega, V)^2 \right] \\
        \hat{V} &=-\alpha \mathbb{E}_{z, \omega} \left[ \partial_{\omega} \channel( \varphi_0(z), \omega, V) \right] \\
    \end{cases}
\end{align}

Solving these equations, we deduce the values of $m, q$ that we can plug into~\cref{eq:asymptotic_prediction_interval} to compute the size of the prediction intervals in the high-dimensional limit.