\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{multirow}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \makeatletter
% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
%   \typeout{(#1)}% latexmk will find this if $recorder=0 (however, in that case, it will ignore #1 if it is a .aux or .pdf file etc and it exists! if it doesn't exist, it will appear in the list of dependents regardless)
%   \@addtofilelist{#1}% if you want it to appear in \listfiles, not really necessary and latexmk doesn't use this
%   \IfFileExists{#1}{}{\typeout{No file #1.}}% latexmk will find this message if #1 doesn't exist (yet)
% }
% \makeatother
% \addFileDependency{main.tex}
% \addFileDependency{main.aux}
\externaldocument{wang_665}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\usepackage{amsmath,amsthm,amsfonts,amssymb,soul,cancel,enumitem}
\usepackage{cleveref}
\usepackage{mdframed}

\usepackage[T3,T1]{fontenc}
\DeclareSymbolFont{tipa}{T3}{cmr}{m}{n}
\DeclareMathAccent{\invbreve}{\mathalpha}{tipa}{16}


\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
\theoremstyle{remark}
\newtheorem{remark}{Remark}

\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\mred}[1]{{\color{red} #1}}
\newcommand{\mblue}[1]{{\color{blue} #1}}
\newcommand{\mgray}[1]{{\color{gray} #1}}
\newcommand{\pmodel}[1]{q(#1;\theta)}
\newcommand{\pdata}[1]{p(#1)}
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
\newcommand{\KL}[2]{\mathrm{KL}(#1\,\Vert\, #2)}
\newcommand{\mbf}[1]{\mathbf{#1}}
\newcommand{\mb}[1]{\mathbb{#1}}
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\mrm}[1]{\mathrm{#1}}
\newcommand{\msf}[1]{\mathsf{#1}}
\newcommand{\nablazj}[1]{\nabla_{\theta^{(#1)}}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\bracket}[1]{\langle #1 \rangle}
\newcommand{\grad}{\mathrm{grad}}
\newcommand{\<}{\langle}
\let\oldket\>
\renewcommand{\>}{\rangle}

% \ba ... \bZ
\def\mydefb#1{\expandafter\def\csname b#1\endcsname{\mathbf{#1}}}
\def\mydefallb#1{\ifx#1\mydefallb\else\mydefb#1\expandafter\mydefallb\fi}
\mydefallb abcdeghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\mydefallb
\let\oldbf\bf
%\renewcommand{\bf}{
%    \ifmmode \mathbf{f} \else \oldbf
%}

% \cA ... \cZ
\def\mydefc#1{\expandafter\def\csname c#1\endcsname{\mathcal{#1}}}
\def\mydefallc#1{\ifx#1\mydefallc\else\mydefc#1\expandafter\mydefallc\fi}
\mydefallc ABCDEFGHIJKLMNOPQRSTUVWXYZ\mydefallc

\newcommand{\bzero}{\mathbf{0}}
\newcommand{\bone}{\mathbf{1}}
\newcommand{\balpha}{\bm{\alpha}}
\newcommand{\bbeta}{\bm{\beta}}
\newcommand{\bpsi}{\bm{\psi}}
\newcommand{\bphi}{\bm{\phi}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\train}{\mathrm{train}}
\newcommand{\Xtrain}[0]{\mbf{X}_{\mathrm{train}}}
\newcommand{\Krr}{K_{rr}}
\newcommand{\Ker}{K_{er}}
\newcommand{\Kre}{K_{re}}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\diag}{diag}

\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\newcommand{\parenfrac}[2]{\biggl(\frac{#1}{#2}\biggr)}
\DeclareMathOperator{\erf}{erf}
\DeclareMathOperator{\Unif}{Unif}
\DeclareMathOperator{\Acc}{Acc}

\newcommand{\zw}[1]{{\color{blue}[[\textbf{ZW:} #1]]}}
\newcommand{\todo}[1]{{\color{blue}[[\textbf{TODO:} #1]]}}


\title{A Constrained Bayesian Approach to Out-of-Distribution Prediction\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<wzy196@gmail.com>?Subject=Your UAI 2023 paper}{Ziyu Wang}{}\textsuperscript{*}}
\author[1]{Binjie Yuan\textsuperscript{*}}
\author[2]{Jiaxun Lu}
\author[3]{Bowen Ding}
\author[2]{Yunfeng Shao}
\author[3]{Qibin Wu}
\author[1]{Jun Zhu\textsuperscript{\#}}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Comp. Sci. \& Tech., BNRist Center, Tsinghua-Huawei Joint Center for AI, THBI Lab, Tsinghua University
}
\affil[2]{%
    Huawei Noah's Ark Lab
}
\affil[3]{%
    Huawei Technologies Co., Ltd.
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\renewcommand{\thesection}{S\arabic{section}}
\renewcommand{\thetheorem}{S\arabic{theorem}}
\renewcommand{\theexample}{S\arabic{example}}
\renewcommand{\thefigure}{S\arabic{figure}}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand\theequation{S\arabic{equation}}

\newcommand{\invParam}{{\bar\theta_{inv}}}
\newcommand{\bayesParam}[1][e]{{\bar\theta_{spu}^{#1}}}
\newcommand{\invVec}{{\bar\beta_{inv}}}
\newcommand{\spuVec}[1][e]{{\bar\beta_{spu}^{#1}}}
\newcommand{\Flocal}{{\partial\cF_{\delta}}}
\newcommand{\spuCollection}{{\cS_{tr}}}



\paragraph{Additional notations and conventions} The following conventions are used throughout the appendix: the denotation of constants ($c_1,c_2,\ldots, C_1,C_2,\ldots$) may change from line to line. $\|\cdot\|_2$ denotes the Euclidean norm for vectors, or the $L_2(P_x^*)$ norm for functions of $x$. $\<\cdot,\cdot\>_2$ denotes the respective inner product. $\|\cdot\|_F$ denotes the Frobenius norm. 
$\phi_z,\Phi_z$ denote the PDF and CDF of the standard Gaussian distribution. 
Recall the inequality 
$
1-\Phi_z(x) \le e^{-x^2/2}
$
for $x>0$. 


\section{Proof for Lemma~\ref{lem:classif} and Additional Remarks}

\paragraph{Proof for the lemma}
The first two claims in the lemma are implied by the following two lemmas:
\begin{lemma}\label{lem:classif-claim}
When $\alpha>0,m \ll \min\{1,\sigma_s^{-2}\tau_s^2\}d_{spu}$, the classifier parameterized by 
$
\tilde\theta = (0, \tilde\theta_s) = (0, \alpha\sum_{e\in\cE_{tr}} \spuVec[e])
$
satisfies
\begin{equation}\label{eq:ex-classif-claim}
R_{e,01}(f_{\tilde\theta}) \le 
c_1 \exp(-c_2 \sigma_s^{-2}\tau_s^2 d_{spu}/m + o_p(1)) \to 0, ~~\forall e\in\cE_{tr}.
\end{equation}
Moreover, the same bound holds for the logistic loss, if in addition 
$\alpha \le \cO(\mrm{poly}(d_{spu}/m)), \alpha\tau_s^2\gtrsim 1/\sigma_s^2 m$. 
\end{lemma}

% Secondly, the following lemma shows that the aggregated spurious feature $\tilde\theta_s^\top \bx_{spu}$ can become ``nearly indistinguishable'' from invariant features, as we explain in the following:
\begin{lemma}\label{lem:classif-claim-2}
Denote by $\bx_i\in\RR^{d_{inv}}, \bx_s\in\RR^{d_{spu}}$ the invariant and spurious components of the input $\bx$. For any $e_1,e_2\in\cE_{tr}$, let 
$$
\text{KL}_{ij} := \KL{p_{marg,e_1}}{p_{marg,e_2}} = 
\KL{p_{e_1}(\by, \tilde\theta_s^\top\bx_s, \bx_i)}{
p_{e_2}(\by, \tilde\theta_s^\top\bx_s, \bx_i)}
$$
denote the KL divergence between the marginal distributions. 
When $\alpha>0, m<d_{spu}/16$, with probability $\ge 1-e^{-m/18}$ there exists an $\ell\in [m]$, determined by $\{\spuVec[e]: e\in\cE_{tr}\}$ s.t. 
\begin{align}
\mrm{KL}\Bigl(
\bigotimes_{e\in\cE_{tr}} p_{marg,e}
 \Bigm\Vert
p_{marg,e_\ell}^{\otimes m}
 \Bigr) =
\sum_{j=1}^m \mrm{KL}_{j \ell} \le \frac{256m}{\sigma_s^2 d_{spu}}, \label{eq:classif-claim-21}
\end{align}
%Moreover, on the same event we have 
%\begin{equation}
%\max_{i,j\in [m]} \mrm{KL}_{ij} \le \frac{8m}{\sigma_s^2 d_{spu}}. \label{eq:classif-claim-22}
%\end{equation}
\end{lemma}
The last claim in the text, Eq.~\ref{eq:sample-size-threshold}, follows by a standard argument following 
\eqref{eq:classif-claim-21}: consider the scenario $n_e \equiv n$ for simplicity. Denote by 
$
\cD_\top := ( ( (y_k^{e_j}, \tilde\theta_s^\top x_{k,s}^{e_j}, x_{k,i}^{e_j}): j\in [m] ): k\in [n] )
$ the transposed dataset which contains the same information as $\cD_{tr}$. 
Any test on $\cD_{tr}$ for the null hypothesis 
``$\tilde\theta_s^\top\bx_s\text{ is an invariant feature}$'' 
always provides a two-sample test for 
$$
H_0': \cD_\top \sim \bigl((p_{marg,e_\ell})^{\otimes m}\bigr)^{\otimes n} =: P_{0,n}, ~~
H_1': \cD_\top \sim \Bigl(\bigotimes_{e\in\cE_{tr}} p_{marg,e} \Bigr)^{\otimes n} =: P_{1,n},
$$
with the same size and power in the two scenarios. However, any such test must have its combined error lower bounded by 
\begin{align*}
1 - D_{TV}(P_{0,n}, P_{1,n}) &\ge 1 - \sqrt{\tfrac{1}{2} \KL{P_{1,n}}{P_{0,n}}} 
\ge 1 - \sqrt{n\cdot \frac{128m}{\sigma_s^2 d_{spu}}}.
\end{align*}
This completes the proof.\hfill\qedsymbol

\begin{remark}[IRM and GDRO]
From \cref{lem:classif-claim} it is clear that $\tilde\theta$ constitutes an approximate optimum for ERM and GDRO, as well as the variance-penalty based approaches such as \citet{krueger2021out}. 
It also shows that the predictor $f = w\circ\Phi$, where $\Phi = \mrm{id}$ and $w(h) = \tilde\theta^\top h$, approximately satisfies the hard constraints in \eqref{eq:irm} and constitutes an approximate optimum in this sense.\footnote{
In practice, soft constraints based on gradient penalty are used for IRM. But it is easy to adapt our proof to show that the gradient norm $\|\nabla_w R_{e,log}(w\circ\Phi)\|_2 = \|\nabla_{\tilde\theta} R_{e,log}(f_{\tilde\theta})\|_2$ vanishes at a similar exponential rate. Thus, such a $(w,\Phi)$ pair is also an approximate optimum for the soft constraint-based formulation.
} 
\end{remark}

\begin{remark}[interpretations of the KL bound, additional error from feature learning]
In addition to the testing-based interpretation as in Eq.~\ref{eq:sample-size-threshold}, \eqref{eq:classif-claim-21} can also be interpreted directly if we restrict to the family of tests based on a conditional KL divergence, since we have, for all $j\in [m]$,
$$
\KL{p_{e_j}(y\mid \tilde\theta^\top x_s)}{p_{e_\ell}(y\mid \tilde\theta^\top x_s)} \le 
\KL{p_{e_j}(y\mid x_i, \tilde\theta^\top x_s)}{
p_{e_\ell}(y\mid x_i, \tilde\theta^\top x_s)} \le \mrm{KL}_{j\ell} \le \frac{256 m}{\sigma_s^2 d_{spu}}.
$$
Such tests can be viewed as restricting to the validation of the definition of invariant features, which concerns such conditionals; they should reject $H_0$ if the estimated KL divergence becomes larger than a threshold $\delta_n = o_n(1)$, which at least needs to cover the estimation error for the KL divergence. 
It is thus clear that in the feature learning scenario we must use a threshold $\delta_n' \gg \delta_n$, since even for the truly invariant part of the model, we can only learn approximately invariant features which will inevitably violate the KL bound by an extra margin. 
This makes for a larger threshold than \eqref{eq:sample-size-threshold}: 
for example, 
if the feature learning process is such that $\delta_n' \gtrsim d_{spu} / n$, we would have the indistinguishability result as long as $d_{spu} / n \gg m / \sigma_s^2 d_{spu}$, i.e., 
$$
\max_{e\in\cE_{tr}} n_e \ll \frac{\sigma_s^2 d_{spu}^2}{m}.
$$
Another issue that exists for \emph{any possible test} is that in the feature learning scenario, we need to apply the test to a collection of feature extractors; 
we thus need more stringent requirements on the power of the test, rather than merely requiring it to be $1-o(1)$. For $|\cM|$ feature extractors with independent failure probabilities, we would require 
$
n\gtrsim |\cM| \sigma_s^2 d_{spu} / m
$ for reliable learning of invariant features. 
Note how these regimes allow for successful fitting of an in-distribution predictor.
\end{remark}


% based on an M statistic will be unable to distinguish between $\bx_s$ and an exactly invariant feature if $m\sqrt{m/\sigma_s^2 d_{spu}} \lesssim (m n_e)^{-1/2}$.
%\footnote{
%While the requirement of $n_e\lesssim d_{spu}$ appear to imply we cannot learn an in-distribution predictor well, this is not necessarily true: the feature $x\mapsto \tilde\theta_s^\top x_s$ may have been learned on external data, and when feature learning is conducted on the same training set, the error in the learning process, or the need for testing on multiple hypotheses, 
%will restrict us to testing for approximate invariance, with a much larger critical value for any test. 
%}
\subsection{Proof for Auxiliary Lemmas}

\newcommand{\ones}{\vec{\mbf{1}}}
\newcommand{\Mat}{\Sigma_{\cS}}
\newcommand{\PopuMat}{\bar\Sigma_{\cS}}

We first introduce the following notations: $\cE_{tr} =: \{e_1,\ldots,e_m\}$, $\ones := \{1,\ldots,1\} \in \RR^m$, and define 
the $m\times m$ matrix $(\Mat)_{ij} = (\spuVec[e_i])^\top (\spuVec[e_j])$, so that 
$
\|\tilde\theta_s\|_2^2 = \alpha^2 \ones^\top \Sigma_{\cS} \ones.
$ Define 
$
\PopuMat := \EE_{\{\spuVec\}} \Mat = \tau_s^2 d_{spu} I.
$ Note that by covariance concentration \citep[Ch.~6]{wainwright2019high}, we have, when $m\le d_{spu}$, 
\begin{equation}\label{eq:Mat-event}
\PP_{{\{\spuVec}\}} \left(\frac{1}{\tau_s\sqrt{d_{spu}}}\|\PopuMat - \Mat\| \le 3\sqrt{\frac{m}{d_{spu}}} + \delta\right) 
\ge 1 - e^{-d_{spu} \delta^2/18}.
\end{equation}

\begin{proof}[Proof for \cref{lem:classif-claim}] 
We first derive the 0-1 loss for $\tilde\theta$. 
Note that in the setting of the example, we have, for any $\theta = (\theta_i, \theta_s)$, 
\begin{align*}
R_{e,01}(f_\theta) &:= \EE_{\bx^e,\by^e} \mbf{1}\{\mrm{sgn}(\theta^\top \bx^e) \ne \by^e\} = \Phi_z\left(
    -\frac{\theta^\top\EE(\bx^e\mid\by^e=1)}{\sqrt{\theta^\top\mrm{Cov}_e(\bx^e(\bx^e)^\top)\theta}}
\right) = 
\Phi_z\left(
    -\frac{\theta_i^\top \invVec + \theta_s^\top \spuVec[e]}{
        \sqrt{\sigma_i^2 \|\theta_i\|_2^2 + \sigma_s^2 \|\theta_s\|_2^2}
    }
\right).
\end{align*}
Thus when $m<d_{spu}$, we have, by central limit theorem and \eqref{eq:Mat-event},
% NOTE: E \Sigma_s \in R^{m*m} = d_{spu} \tau_s^2 I, so its F-norm is \sqrt{d_{spu} m} tau_s.
\begin{align}
\|\tilde\theta_s\|_2 &= \alpha \sqrt{\ones^\top \Mat \ones} 
\le \alpha\sqrt{m(\|\PopuMat\| + \|\PopuMat - \Mat\|)} 
= \alpha\tau_s \sqrt{m d_{spu}}(1 + \cO_p((m/d_{spu})^{1/4})),  \label{eq:norm-bound}\\ 
\tilde\theta_s^\top \spuVec[e] &= \alpha\biggl(\|\spuVec[e]\|_2^2 + \sum_{e'\in\cE_{tr}, e'\ne e} \<\spuVec[e'],\spuVec[e]\>_2
\biggr) = \alpha \tau_s^2 (d_{spu} + \cO_p(\sqrt{m d_{spu}})).\label{eq:innerp-bound}
\end{align}
Thus, when $m \ll \min\{1, \sigma_s^{-2}\tau_s^2\} d_{spu}$, we have 
$$
R_{e,01}(f_{\tilde\theta}) = \Phi_z(-\sigma_s^{-1}\tau_s(\sqrt{d_{spu} / m} + o_p(1))) \le 
\exp(-\sigma_s^{-2}\tau_s^2 d_{spu}/2m +o_p(1)) \overset{p}{\to} 0, ~~~~\forall e\in\cE_{tr}.
$$

Now we consider the logistic loss. Fix any $e\in\cE_{tr}$, and recall $\bar\theta_e := \spuVec[e]/2\sigma_s^2$ defines the Bayes classifier given the \emph{spurious features} $\bx^e_{spu}\in\RR^{d_{spu}}$. 
Introduce the random variables 
$$
\bx \sim\cN(\spuVec[e], \sigma_s^2 I), ~~
\bz_1 = \bar\theta_e^\top \bx, ~~
\bz_2 = \tilde\theta_s^\top \bx,
$$
so that 
\begin{align*}
R_{e,log}(\tilde\theta) &= \EE_{\bx^e_{spu}} \sum_{y\in\{\pm 1\}}
    p(\by^e=y\mid\bx^e_{spu})\log (1 + e^{-y\tilde\theta^\top \bx^e_{spu}})  \\ 
&= 
\EE_{\bz_1,\bz_2} \mbf{1}\{\bz_1\ge 0\} \log(1+e^{-\bz_2}) + 
    \mbf{1}\{\bz_1 < 0\} \log (1+e^{\bz_2}) \\ 
&\le 
\EE \log(1+e^{-\bz_2}) + 
    \EE(\mbf{1}\{\bz_1 < 0\}  \max\{1, 2\bz_2\}) \\ 
&\le 
\underbrace{
\EE \log(1+e^{-\bz_2})}_{(I)} + 
    \underbrace{\EE\mbf{1}\{\bz_1 < 0\}}_{(II)} + 
    \underbrace{\EE \mbf{1}\{\bz_1<0, \bz_2> 1/2\} 2\bz_2}_{(III)}.
\end{align*} 
We will bound the three terms in turn. Before that, we first derive the joint distribution of $(\bz_1,\bz_2)$. 
By \eqref{eq:norm-bound}-\eqref{eq:innerp-bound}, we have 
\begin{align*}
    \mu_2 := \EE\bz_2 = \alpha\tau_s^2d_{spu}(1+o_p(1)), ~
    \sigma_2^2 := \Var(\bz_2) = \sigma_s^2 \alpha^2\tau_s^2 m d_{spu}(1+o_p(1)), ~
    \mrm{Cov}(\bz_1,\bz_2) % = \sigma_s^2 \<\bar\theta_e,\tilde\theta_s\>_2 
    = \frac{1+o_p(1)}{2\sigma_s^2}
        \alpha\tau_s^2d_{spu}.
\end{align*}
Moreover, we have 
\begin{align*}
\mu_1 := \EE\bz_1 = \frac{\|\spuVec[e]\|_2^2}{2\sigma_s^2} = \frac{\tau_s^2 (d_{spu} + \cO_p(\sqrt{d_{spu}}))}{2\sigma_s^2}, ~~
\sigma_1^2 := \mrm{Var}(\bz_1) = \frac{1}{2}\mu_1.
\end{align*}
We now return to the three terms for $R_{e,log}$. For (I), consider the decomposition
\begin{align*}
(I) &= \EE \log(1 + e^{-\bz_2}) \mbf{1}\{\bz_2 < -1/2\} + 
\EE \log(1 + e^{-\bz_2}) \mbf{1}\{-1/2 < \bz_2 < A\} + 
\EE \log(1 + e^{-\bz_2}) \mbf{1}\{\bz_2 > A\} \\ 
&\le 
\EE(-2\bz_2 \mid \bz_2 < -1/2)\PP(\bz_2<-1/2) + 
\PP(-1/2 < \bz_2 < A) + 
\log(1 + e^{-A}) \\ 
&\overset{(i)}{\lesssim}
e^{-\mu_2^2/2\sigma_2^2} \cdot \left(1 + \frac{\sigma_2^2}{\mu_2}\right) + 
e^{-(\mu_2-A)^2/2\sigma_2^2} +
\log(1 + e^{-A}) \\ 
&\overset{(ii)}{\lesssim}
e^{-\mu_2^2/3\sigma_2^2}  + 
e^{-(\mu_2-A)^2/2\sigma_2^2} +
e^{-A} \overset{(iii)}{\lesssim} e^{-\mu_2^2 / 3\sigma_2^2}.
\end{align*}
In the above, (i) follows by the Gaussian CDF bound and \cref{lem:trunc-normal} below, (ii) follows since $\mu_2^2 \gg \sigma_2^2 \gg 1, \alpha=\cO(\mrm{poly}(d_{spu}/m))$, and (iii) sets $A=\mu_2/10$. Now, 
\begin{align*}
(II) &\le e^{-\mu_1^2/2\sigma_1^2}, 
\\
(III) &= \int_{1/2}^\infty 2t \PP(\bz_1<0\mid\bz_2=t) \PP_{\bz_2}(dt) \le \PP(\bz_1<0) \EE(2\bz_2\mid \bz_2>1/2) 
\lesssim e^{-\mu_1^2/2\sigma_1^2}(\mu_2+2\sigma_2 e^{-\mu_2^2/2\sigma_2^2}),
\end{align*}
where the first inequality follows because $\bz_1$ and $\bz_2$ are positively correlated, and $\mu_2>0$, and the second follows by an application of \cref{lem:trunc-normal}. 
Combining, we can see that in the regime $m\ll \sigma_s^{-2}\tau_s^2 d_{spu}$, 
there exists $c_1,c_2>0$ s.t.~
$$
R_{e,\log}(\tilde\theta) \le (I) + (II) + (III) \le c_1 e^{-c_2 \tau_s^2 d_{spu}/m\sigma_s^2}.
$$
This proves \eqref{eq:ex-classif-claim}.
\end{proof}

\begin{proof}[Proof for \cref{lem:classif-claim-2}]
For any $e_1,e_2\in\cE_{tr}$ and fixed realizations of $\{\spuVec[e_1], \spuVec[e_2]\}$ (i.e., the following display implicitly conditions on the two), we have 
\begin{align*}
\text{KL}_{ij} = 
\KL{p_{e_1}(\by, \tilde\theta_s^\top\bx_s, \bx_i)}{
p_{e_2}(\by, \tilde\theta_s^\top\bx_s, \bx_i)} &= 
\EE_{p_{e_1}} \log \frac{p_{e_1}(\by, \tilde\theta_s^\top\bx_s, \bx_i)}{
p_{e_2}(\by, \tilde\theta_s^\top\bx_s, \bx_i)
} =%\overset{(i)}{=} 
\EE_{p_{e_1}} \log \frac{p_{e_1}(\tilde\theta_s^\top\bx_s\mid \by,\cancel{\bx_i})p_{e_1}(\by,\bx_i)}{
p_{e_2}(\tilde\theta_s^\top\bx_s\mid\by,\cancel{\bx_i}) p_{e_2}(\by,\bx_i)
} \\ 
&= \EE_{p_{e_1}} \log \frac{p_{e_1}(\tilde\theta_s^\top\bx_s\mid \by)}{
p_{e_2}(\tilde\theta_s^\top\bx_s\mid \by)
} = \KL{
    \cN(\mu_{21}, \sigma^2_2)
}{
    \cN(\mu_{22}, \sigma^2_2)
}
\end{align*} where 
$
\mu_{2j} := \<\spuVec[e_j], \tilde\theta_s\>_2 ~\forall j\in [m],$ and $
\sigma_2^2 := \sigma_s^2 \|\tilde\theta_s\|_2^2.
$
Plugging in the expression for KL divergence between Gaussian distributions, we find, \emph{for all $i,j\in [m]$,}
\begin{align*}
&\phantom{=}
\text{KL}_{ij} = 
\frac{(\mu_{2i} - \mu_{2j})^2}{2\sigma_2^2} 
= \frac{\alpha^2}{2\sigma_2^2} 
    \Bigl(\sum_{k=1}^m (\Mat)_{ik} - (\Mat)_{jk} \Bigr)^2 \\ 
&= 
\frac{\alpha^2}{2\sigma_2^2} 
    \Bigl(\sum_{k=1}^m (\Mat)_{ik} - (\PopuMat)_{ik} + 
        \sum_{k=1}^m (\PopuMat)_{jk} - (\Mat)_{jk} + 
        \cancel{\sum_{k=1}^m(\PopuMat)_{ik} - (\PopuMat)_{jk}}
    \Bigr)^2  \\ 
&\le 
\frac{4\alpha^2}{2\sigma_2^2} \biggl(
    \Bigl(\sum_{k=1}^m (\Mat)_{ik} - (\PopuMat)_{ik} \Bigr)^2  +
    \Bigl(\sum_{k=1}^m (\Mat)_{jk} - (\PopuMat)_{jk} \Bigr)^2
\biggr) \\
&= \frac{2\alpha^2}{\sigma_2^2}\bigl((\ones^\top (\Mat - \PopuMat) \mbf{e}_i)^2 + 
(\ones^\top (\Mat - \PopuMat) \mbf{e}_j)^2\bigr), \numberthis\label{eq:kl-ij-bound}
% \le \frac{9m\alpha^2}{2\sigma_2^2}\|\Mat - \PopuMat\|^2.
\end{align*}
where $\mbf{e}_i = (\underbrace{\ldots,0}_{(i-1)\text{ zeros}},1,0,\ldots)$ denotes the $i$-th Euclidean basis.
We thus have, by symmetry and the trace formula,  
\begin{align*}
\sum_{i=1}^m \sum_{j=1}^m \mrm{KL}_{ij} &\le 2m\cdot \sum_{l=1}^m 
\frac{2\alpha^2}{\sigma_2^2}(\ones^\top (\Mat - \PopuMat) \mbf{e}_l)^2 
= 2m\cdot 
\frac{2\alpha^2}{\sigma_2^2}\ones^\top (\Mat - \PopuMat)^2 \ones. 
\end{align*}
Plugging in \eqref{eq:Mat-event} with $\delta\gets \sqrt{m/d_{spu}}$ we find, when $d\ge 64m$, on that event we have  
\begin{align*}
\sum_{i=1}^m \sum_{j=1}^m \mrm{KL}_{ij}
&\le 
\frac{4\alpha^2 m\|\ones\|_2^2\|\Mat - \PopuMat\|^2}{\sigma_2^2} 
\overset{\eqref{eq:Mat-event}}{\le} 
\frac{4\alpha^2 m^2\cdot (4\tau_s\sqrt{m})^2}{\sigma_2^2}
=
\frac{64\alpha^2 \tau_s^2 m^3}{\sigma_s^2\|\tilde\theta_s\|_2^2}  \\ 
&\overset{\eqref{eq:norm-bound}}{\le}
\frac{
    64\alpha^2 \tau_s^2 m^3
}{
    \sigma_s^2 \alpha^2\tau_s^2 m d_{spu}/4
} 
= 
\frac{
    256 m^2
}{
    \sigma_s^2 d_{spu}
},
\end{align*}
where we note that \eqref{eq:norm-bound} holds on the same event. Therefore, on that event there always exists some $\ell \in [m]$ which is determined by $\{\spuVec[e]\}$ s.t.~
$$
\sum_{j=1}^m \mrm{KL}_{\ell j} \le \frac{256 m}{\sigma_s^2 d_{spu}}.
$$
%This proves \eqref{eq:classif-claim-21}. Moreover, applying to \eqref{eq:kl-ij-bound}
%the inequality 
%$ \ones^\top (\Mat - \PopuMat) \mbf{e}_i \le \sqrt{m}\cdot \|\Mat - \PopuMat\| \cdot 1$ 
%shows that \eqref{eq:classif-claim-22} holds on the same event. 
This completes the proof.
\end{proof}

\begin{lemma}\label{lem:trunc-normal}
Let $\bz\sim\cN(\mu, \sigma^2)$, $b<\mu$. Then we have 
$$
\EE(\bz\mid \bz<b) \overset{(i)}{=} \mu - \sigma \frac{\phi_z((b-\mu)/\sigma)}{\Phi_z((b-\mu)/\sigma)} 
\overset{(ii)}{\ge} b - \frac{\sigma^2}{\mu-b}, ~~
\EE(\bz\mid \bz>b) \overset{(i)}{=} \mu + \sigma \frac{\phi_z((b-\mu)/\sigma)}{1-\Phi_z((b-\mu)/\sigma)} \le \mu + 2\sigma \phi_z((b-\mu)/\sigma).
$$
\end{lemma}
\begin{proof}
(i) is a known property of truncated normal distribution \citep{greene2003econometric}. (ii) follows by the bound 
$
\Phi_z(-x) \ge \frac{x}{x^2+1} \phi_z(-x)
$  \citep{abramowitz1988handbook}.
\end{proof}




\section{Proof for Proposition~\ref{prop:regr}}

\emph{Throughout the proof we will work with the transformed data and parameters:} 
 $\bx^e \gets M^{-1} \bx^e, \theta\gets M^\top\theta$. This allows us to ignore the presence of $M$, as long as we replace $U$ with $U' := \|M\| U$. 
We also introduce the following notations:
\begin{align*}
    d := d_{inv} + d_{spu}, ~
    X &:= \{x_1^*,\ldots,x_{n_*}^*\} \in \RR^{n_*\times d}, ~
    Y := \{y_1^*,\ldots,y_{n_*}^*\} \in \RR^{n_*\times 1}, ~
    % \invParam := (\invVec, 0), ~ 
\\ 
    \bayesParam[e] &:= (\invVec, \spuVec[e]), ~
    \spuCollection := \{\spuVec[e]: e\in\cE_{tr}\}. 
    % ~\text{ and } 
    % f_\theta(\cdot) := \<\theta,\cdot\>_2 ~\text{ for all }
    % \theta\in \RR^d.
\end{align*}
Note that given the above transformation, $\bayesParam$ now parameterizes the Bayes predictor for environment $e$. And 
across all environments $e$, we will also have
$$
R^e(\theta) = \EE_{\bx^e, \by^e} (\theta^\top \bx^e - \by^e)^2 = \|\theta - \bayesParam\|_2^2 + \sigma^2,
$$
and the constraint set, for which we have fixed $\rho+\varepsilon_n=0$, reduces to 
\begin{align*}
\cC_{tr} &= \{\theta\in\RR^d: \|\theta - \bayesParam\|_2^2 \le \|\invParam - \bayesParam\|_2^2  ~~\forall e\in\cE_{tr}\}  \\ 
&= \{\theta=(\beta_i,\beta_s)\in\RR^d: \|\beta_i - \invVec\|_2^2 + \|\beta_s - \spuVec\|_2^2 \le \|\spuVec\|_2^2 ~~\forall e\in\cE_{tr}\}  \\ 
&\subset \Bigl\{\theta = (\beta_i,\beta_s) \in\RR^d: \<\beta_s, \spuVec\>_2 \ge \frac{1}{2} \|\beta_s\|_2^2 ~~\forall e\in\cE_{tr}\Bigr\}.
\end{align*}
The \emph{constrained} parameter space is 
$$
\cF = \{f_\theta: \|\theta\|_2^2 \le U', \theta\in\cC_{tr}\}.
$$
Note that by construction, $\invParam \in \cF$ always holds. 

Our main task is to establish improved metric entropy bounds for the following ``localized'' space:
$$
\Flocal := \biggl\{f_\theta-f_{\theta'}: f_\theta,f_{\theta'}\in\cF, \|f_\theta - f_{\theta'}\|_{n,2}^2 := \frac{1}{n_*}\sum_{i=1}^{n_*}(f_\theta(x^*_i)-f_{\theta'}(x^*_i))^2 \le \delta^2
\biggr\}.
$$
We first note that 
\begin{lemma}\label{lem:norm-equiv}
When $n_*\ge 5d$, there exists $c>0$ s.t.
\begin{align}\label{eq:norm-event}
\PP_{X}\Bigl(\forall \theta,\theta'\in\RR^d, ~
\frac{1}{2}\|\theta - \theta'\|_2^2 \le \|f_\theta - f_{\theta'}\|_{n,2}^2 \le 2\|\theta-\theta'\|_2^2
\Bigr) \ge 1-e^{-c n_*}.
\end{align}
\end{lemma}
\begin{proof}
By the fact that 
$\|f_\theta - f_{\theta'}\|_{n,2} = \|n_*^{-1/2} X(\theta-\theta')\|_2$, and the concentration of Gaussian covariance matrices \citep[Theorem 6.1]{wainwright2019high}.
\end{proof}

\begin{proposition}[entropy bound]\label{lem:ent}
On the event \eqref{eq:norm-event} we have, for any $\zeta,\delta,t,Z>0$, with $\spuCollection$-probability $\ge 1-\zeta$,
\begin{align*}
\log N(\Flocal, \|\cdot\|_{n,2}, t) &\le 
\min\biggl\{
    d_{spu}\log\biggl(1+\frac{c'\delta}{e^{m Z^2\delta^2/2}\,t}\biggr) +
    d_{spu}\log\biggl(1+\frac{c'Z\delta}{t}\biggr), %\\&\hspace{5em} 
\\ & \hspace{6em}
    d_{spu}\log\biggl(1+\frac{c'\,2^{-m/d_{spu}}\delta}{t}\biggr) %\\&\hspace{5em} 
\biggr\}
+ d_{inv} \log\biggl(1+\frac{c'\delta}{t}\biggr) +
    \log\zeta^{-1}.
\end{align*}
\end{proposition}

\begin{proof}
We condition on the event \eqref{eq:norm-event} throughout the proof.
Then, any $\theta=(\beta_i,\beta_s)$ s.t.~$f_{\theta} \in\Flocal$ must satisfy 
$$
    \|\beta_i - \invVec\|_2^2 + \|\beta_s\|_2^2 = \|\theta - \invParam\|_2^2 \overset{\eqref{eq:norm-event}}{\le} 
    2\|f_\theta - f_{\invParam}\|_{n,2}^2 \le
    2\delta^2 ~~\Rightarrow~~ 
\max\{\|\beta_i - \invVec\|_2, \|\beta_s\|_2\} \le \sqrt{2}\delta,
$$
and 
$$
\beta_s \in \cC_s := \{\beta_s: 
\<\beta_s, \bar\beta^e_{spu}\>_2 \ge \frac{1}{2}\|\beta_s\|_2^2 ~~~\forall e\in\cE_{tr}\}.
$$
By \eqref{eq:norm-event}, we also know that 
a $(\|\cdot\|_{n,2},t)$-covering for $\Flocal$ can be constructed as the Cartesian product of 
    a $(\|\cdot\|_2,t/3)$-covering for the ball $\mb{B}_{i,\delta} := \{\beta_i\in\RR^{d_{inv}}: \|\beta_i - \invVec\|_2\le\sqrt{2}\delta\}$, and
    a $(\|\cdot\|_2,t/3)$-covering for $\cC_s\cap \mb{B}_{s,\delta}$, where 
    $\mb{B}_{s,\delta} := \{
        \beta_s\in\RR^{d_{spu}}:~
            \|\beta_s\|_2\le\sqrt{2}\delta\}$. Thus, we have, for some $c,c'>0$, 
\begin{align*}
\log N(\Flocal, \|\cdot\|_{n,2}, t) &\le 
    \log N(\mb{B}_{i,\delta}, \|\cdot\|_2, ct) + 
    \log N(\mb{B}_{s,\delta}\cap\cC_s, \|\cdot\|_2, ct)  \\ &\le 
    d_{inv} \log(1 + c'\delta/t) + 
    \log N(\mb{B}_{s,\delta}\cap\cC_s, \|\cdot\|_2, ct), \numberthis\label{eq:logN-decomp}
\end{align*}
where the second inequality can be found as \citet[Example 5.8]{wainwright2019high}. 
It remains to bound 
$
\log N(\mb{B}_{s,\delta}\cap \cC_s, \|\cdot\|_2, ct). 
$ 

For this purpose, first note that, 
for any fixed $\beta\ne 0$, we have 
\begin{align*}
\PP_{\spuCollection}(\beta\in\cC_s) &= \prod_{e\in\cE_{tr}} \PP_{\spuVec[e]\sim\cN(0, d_{spu}^{-1}I)}\Bigl(
    \<\beta, \spuVec[e]\>_2 \ge \frac{1}{2}\|\beta\|_2^2
\Bigr) \\ 
&= \prod_{j=1}^m \PP\Bigl(\cN(0, d_{spu}^{-1}\|\beta\|_2^2) \ge \frac{1}{2}\|\beta\|_2^2\Bigr) 
%= \biggl(1 - \Phi_z\biggl(\frac{\sqrt{d_{spu}}\|\beta\|_2}{2}\biggr)\biggr)^m
\le \min\{e^{-m d_{spu} \|\beta\|_2^2 / 4},~2^{-m}\}. \numberthis\label{eq:fix-beta-prob-bound}
\end{align*}
Introduce
$$
B_{s1} := \{\beta_s\in\RR^{d_{spu}}:\|\beta_s\|_2\le \sqrt{2}Z\delta\}, ~~
B_{s2} := \mb{B}_{s,\delta} \backslash B_{s1},
$$
and $\mrm{C}_{s1}, \mrm{C}_{s2}$ be $(\|\cdot\|,ct)$-coverings for $B_{s1},B_{s2}$ with the optimal cardinality. 
Then 
\begin{align*}
\EE_{\cE_{tr}} N(\mb{B}_{s,\delta}\cap\cC_s, \|\cdot\|_2, ct) &\le 
\sum_{\beta\in\mrm{C}_{s1}} \PP(\beta\in\cC_s) + 
\sum_{\beta\in\mrm{C}_{s2}} \PP(\beta\in\cC_s)
\\ 
&\overset{\eqref{eq:fix-beta-prob-bound}}{\le} 
N(B_{s1}, \|\cdot\|, ct) + 
N(\mb{B}_{s,\delta}, \|\cdot\|, ct) e^{-m d_{spu} Z^2\delta^2/2} 
% NOTE: This is correct regardless of whether Z<=1, as the Z>1 case is covered by the first term
\\ 
&\le (1+cZ\delta/t)^{d_{spu}} + (1+c'\delta/t)^{d_{spu}} e^{-m d_{spu} Z^2\delta^2/2}.
\end{align*}
By Markov's inequality and the monotonicity of $\log(\cdot)$, we find that for all $\zeta\in (0,1)$, with $\spuCollection$-probability $\ge 1-\zeta$, 
\begin{align*}
\log N(\mb{B}_{s,\delta}\cap\cC_s,\|\cdot\|_2,ct) &\le 
    d_{spu}\log(1+c'Z\delta/t) + 
    d_{spu}\log\biggl(\frac{1+c'\delta/t}{e^{m Z^2\delta^2/2}}\biggr) + \log\zeta^{-1} \\ 
&< 
    d_{spu}\log(1+c'Z\delta/t) + 
    d_{spu}\log\biggl(1+\frac{c'\delta}{e^{m Z^2\delta^2/2}\cdot t}\biggr) + \log\zeta^{-1}.
\end{align*}
Plugging back to \eqref{eq:logN-decomp} proves the first claim. The second claim (involving $2^{-m/d_{spu}}$) can be proved similarly, by using the second case in \eqref{eq:fix-beta-prob-bound} (involving $2^{-m}$). 
\end{proof}

This allows us to prove our main result:
\begin{proof}[Proof for Proposition~\ref{prop:regr}]
It suffices to bound the critical radius $\hat\delta_n^2$ of the Gaussian complexity of $\Flocal$ \citep[Eq.~(13.42a)]{wainwright2019high}; given a high-probability bound $\hat\delta_n^2 \le \delta_n^2$ that holds w.~p.~$1-\zeta'$, we will have, with probability $\ge 1-\zeta'-c_1 e^{-c_2 n\delta_n^2}$, 
\begin{equation}\label{eq:rad-to-rate}
\|f_{\hat\theta} - f_{\invParam}\|_2^2 \overset{\eqref{eq:norm-event}}{\le} 
2\|f_{\hat\theta} - f_{\invParam}\|_{n,2}^2 \le 
c_0 \|f_{\invParam} - f_{\bayesParam}\|_2^2 + c_1 \delta_n^2,
\end{equation}
where the last inequality is \citet[Theorem 13.13]{wainwright2019high}. 

By \citet[Corollary 13.7]{wainwright2019high}, any solution $\delta$ to the following inequality will bound $\hat\delta_n^2$:
\begin{equation}\label{eq:ent-integral}
\frac{16}{\sqrt{n}} \int_{\delta^2}^{\delta} \sqrt{\log N(\Flocal,\|\cdot\|_{n,2}, t)}\, dt  \le \frac{\delta^2}{4\sigma}.
\end{equation}
Let us restrict to $\delta\ge n^{-1/2}$ and bound the LHS. Define $t_j = 2^{-j}\delta$ for $j\le J := \lceil\log n^{1/2}\delta\rceil$. Then 
\begin{align*}
\int_{\delta^2}^{\delta} \sqrt{\log N(\Flocal,\|\cdot\|_{n,2}, t)}\, dt &\le 
    \sum_{j=0}^{J} \sqrt{\log N(\Flocal, \|\cdot\|_{n,2}, t_j)} (t_j-t_{j+1}). 
% <  2\int_{\delta^2}^{\delta} \sqrt{\log N(\Flocal,\|\cdot\|_{n,2}, t)}\, dt.
\end{align*}
For any fixed $Z>0,\delta\ge n^{-1/2}$, a union bound over $J$ applications of \cref{lem:ent} to $(\zeta\gets n^{-10}, \delta, t\gets t_j, Z)$ shows that, with $\spuCollection$-probability $\ge 1-n^{-10}\log(n^{1/2}\delta)$, 
\begin{align*}
\sum_{j=0}^{J} \sqrt{\log N(\Flocal, \|\cdot\|_{n,2}, t_j)} (t_j-t_{j+1}) &< 
    \sqrt{10 \log n}\delta + \sum_{j=0}^J (t_j - t_{j+1})\cdot \Biggl(
    d_{inv} \log\biggl(1+\frac{c'\delta}{t}\biggr) + \\ 
&\hspace{5em}    d_{spu}\log\biggl(1+\frac{c'\delta}{e^{m Z^2\delta^2/2}\cdot t}\biggr) +
    d_{spu}\log\biggl(1+\frac{c'Z\delta}{t}\biggr) %\\&\hspace{5em}  
   \Biggr)^{1/2} \\ 
&\le 
    \sqrt{10\log n}\delta + \sum_{j=0}^J (t_j - t_{j+1})\cdot \Biggl(
    \sqrt{d_{inv} \log\biggl(1+\frac{c'\delta}{t}\biggr)} + \\
&\hspace{5em}    \sqrt{d_{spu}\log\biggl(1+\frac{c'\delta}{e^{m Z^2\delta^2/2}\cdot t}\biggr)} +
    \sqrt{d_{spu}\log\biggl(1+\frac{c'Z\delta}{t}\biggr)} %\\&\hspace{5em}  
   \Biggr), \numberthis\label{eq:integral-bound-1}
\end{align*}
and that 
\begin{align*}
\sum_{j=0}^{J} \sqrt{\log N(\Flocal, \|\cdot\|_{n,2}, t_j)} (t_j-t_{j+1}) &< 
    \sqrt{10\log n}\delta +
\sum_{j=0}^J (t_j - t_{j+1})\Biggl( \\
&\hspace{5em}    
    \sqrt{d_{inv} \log\biggl(1+\frac{c'\delta}{t}\biggr)} + 
\sqrt{d_{spu}\log\biggl(1+\frac{c'\delta}{2^{m/d_{spu}}\cdot t}\biggr)} \Biggr).
     \numberthis\label{eq:integral-bound-2}
\end{align*}
By basic calculus and a scaling argument as in \citet[p.~427]{wainwright2019high}, % applied to the last three terms, 
we find the summation is bounded by 
$$
\sqrt{10\log n}\,\delta + c''(\sqrt{d_{inv}} + \min\{
    Z + e^{-mZ^2\delta^2/2}, 2^{-m/d_{spu}} 
\}\sqrt{d_{spu}})\delta.
$$
Therefore, for any $Z,\delta_{0,n}>0$, any solution to 
$$
\delta\ge \max\{\delta_{0,n}, n^{-1/2}\}, ~
\sqrt{10\log n}\,\delta +
c''(\sqrt{d_{inv}} + \min\{Z + e^{-mZ^2\delta_{0,n}^2/2}, 2^{-m/d_{spu}}\}\sqrt{d_{spu}}) \delta \le \frac{\sqrt{n}\delta^2}{64\sigma}
$$
always solves \eqref{eq:ent-integral} with the claimed probability. Choosing $
\delta_{0,n} = \sqrt{Z^2 d_{spu} / n}, Z = \left(\frac{2n\log m}{m d_{spu}}\right)^{1/4}
$ 
yields 
$$
\delta_n^2 \le c'''\biggl(\frac{\log n + d_{inv}}{n} + \sqrt{\frac{d_{spu} \log m}{n m}} \biggr),
$$
while considering the second argument of $\min$ yields 
$$
\delta_n^2 \le c'''\frac{\log n + d_{inv} + 2^{-2m/d_{spu}} d_{spu}}{n}.
$$
Both bounds hold with the aforementioned $\spuCollection$-probability. Plugging this back into \eqref{eq:rad-to-rate} completes the proof.
\end{proof}

% NOTE: our result does not depend on U or U', which is to be expected for nonparametric least squares in the $n \gtrsim d$ regime (see also the example in Wainwright).





\section{Experiment Setup and Full Results}

\subsection{Full Results for Section~\ref{sec:exp-synth}}

Full results for all methods in the setting of \Cref{fig:sim-main} are reported in \Cref{fig:sim-ma}, where we add the results for \texttt{BLR-LC} for $N := 0.3 n_*$, \texttt{BLR-Prior} for $\alpha\in\{1, 3\}$, and \texttt{CBLR} for $\rho=0.2$. The results are consistent with the discussion in the text. 
We have also conducted experiments in a larger-scale setting, with 
$n_e\gets 12000$ for classification and 
$n_e \gets 18000, d_{inv} \gets 80, d_{spu} \gets 160, m \gets 5$ for regression. As shown in \Cref{fig:sim-ml}, the results are qualitatively similar, with our methods performing slightly better for the regression task.  

We further experimented with generalized variants of both the regression and classification experiments, where we replace the definition of $\spuVec[*]$ with 
$$
\spuVec[*] \sim \cN\Bigl(\frac{2\alpha}{m}\sum_{e\in\cE_{tr}} \spuVec[e], (1-\alpha^2) \tau_*^2 I
    \Bigr),
$$
where $\tau_* = 1$ for classification and $d_{spu}^{-1/2}$ for regression. 
For regression, we also introduce environment-specific correlations between the invariant and spurious features, by replacing the generating process of $x_{spu,i}^e$ with 
$$
x_{spu,i}^e \sim \cN(\beta A^e x_{inv,i}^e, (1-\beta^2) I), 
$$
where $A^e \in \RR^{d_{spu}\times d_{inv}}$ is a random matrix with i.i.d.~$\cN(0, d_{inv}^{-1})$ components. The data generating process in the text thus corresponds to $\alpha=1,\beta=0$. 
Results for other choices of $(\alpha,\beta)$ are reported in \Cref{fig:sim-classif-rd} and \Cref{fig:sim-regr-rd}, where we plot the distribution of test losses across 32 independent samples of $\{\invVec, \spuVec[\cdot]\}$. As we can see, our method remains competitive across all settings. 

\begin{figure}[hbt]
%%% This and fixdist-large-full.pdf: g47:notebooks/notebooks/drodg-sim-proc.ipynb
    \centering 
    \includegraphics[width=\linewidth]{figs/fixdist-6k-20-full.pdf} 
    \caption{Synthetic experiment: results for all methods in the setting of \Cref{fig:sim-main}.}\label{fig:sim-ma}
\end{figure}

\begin{figure}[htb]
    \centering 
    \includegraphics[width=0.95\linewidth]{figs/fixdist-large-full.pdf} 
    \caption{Synthetic experiment: results for all methods at a larger scale. 
}\label{fig:sim-ml}
\end{figure}

\begin{figure}[htb]
%%% This and the regression experiment: g35:notebooks/dro-sim-proc-initial.ipynb
    \centering
    \includegraphics[width=0.32\linewidth]{figs/classif-6k-randdist-0-full.pdf}
    \includegraphics[width=0.32\linewidth]{figs/classif-6k-randdist-0.5-full.pdf}
    \includegraphics[width=0.32\linewidth]{figs/classif-6k-randdist-1-full.pdf}
    \caption{Synthetic classification experiment: violin plot of classification errors, across independently sampled environments, in the setting of \Cref{fig:sim-main}. From left to right: results for $\alpha\in\{0, 0.5, 1\}$. From top to bottom: results for $n_*\in \{4, 32, 256\}$. Best viewed when zoomed.
    }\label{fig:sim-classif-rd}
\end{figure}

\begin{figure}[htb]
    \centering 
    \includegraphics[width=0.32\linewidth]{figs/regr-18k-s0-randdist-0-full.pdf}
    \includegraphics[width=0.32\linewidth]{figs/regr-18k-s0-randdist-1-full.pdf}
    \includegraphics[width=0.32\linewidth]{figs/regr-18k-s05-randdist-1-full.pdf}
    \caption{Synthetic regression experiment: violin plot of test MSE across independently sampled environments in the larger-scale setting ($n_e=18000, m=5, d=240$). From left to right: results for $(\alpha,\beta)\in\{(0, 0), (1, 0), (1, 0.5)\}$. 
}\label{fig:sim-regr-rd}
\end{figure}


\subsection{Setup and Results for Section~\ref{sec:exp-db}}


Full results for all methods in the setting of \Cref{tab:cmnist} and \Cref{tab:pacs} are reported in \Cref{tab:cmnist_full} and \Cref{tab:pacs_full}, respectively, where we also report the results for our method with $\rho\in\{0.05, 0.2\}$. 
As we can see, our method achieves robust performance across all settings. 
In order to maintain a high acceptance rate for the M-H test of the Langevin Monte Carlo steps, we set the step-size upper bound $\bar\eta_k = 0.001$ for Colored MNIST and $\bar\eta_k = 0.0025$ for PACS using binary search. To guarantee convergence, we run $2 \times 10^4$ steps with $50$ parallel chains for each method. In accordance with the ERM baseline, the batch size for the BLR-LC method is set to $32$ for each domain. For the BLR-LC-N\_train method, because of the relatively large number of training examples $n_{e}$ on Colored MNIST, we set $N := 0.02 n_{e}$ for Colored MNIST, while $N := n_{e}$ for PACS.
The training-domain validation set selection approach of \citet{gulrajani_search_2020} is used to select the ERM baseline among $20$ hyperparameter configurations $\times$ 3 trials, which is important for ERM to establish itself as a strong baseline. 

% Please add the following required packages to your document preamble:
% \usepackage{multirow}
% Please add the following required packages to your document preamble:
% \usepackage{multirow}
% Please add the following required packages to your document preamble:
% \usepackage{multirow}

\begin{table}[!ht]
    \centering
    \caption{
Colored MNIST: test accuracy for all methods on all domains. We report the 
        $20\textrm{th}$ percentile / mean / $80\textrm{th}$ percentile across 20 independent runs. 
    }
    \begin{adjustbox}{max width=\linewidth}
        \begin{tabular}{cccccc}
        \toprule
        $n_*$ & Method / $e_*$ & $ 0.1 $ & $ 0.2 $ & $ 0.9 $ &  \\ \midrule
        $ 0 $ & ERM & $ 88.5 $ & $ 87.2 $ & $ 71.5 $ &  \\ \midrule
        \multirow{7}{*}{$ 4 $} & CBLR & $ 87.8 $ / $ 88.2 $ / $ 88.5 $ & $ 86.7 $ / $ 87.0 $ / $ 87.3 $ & $ 81.5 $ / $ 81.9 $ / $ 85.6 $ &  \\
        ~ & CBLR\_0.05 & $ 87.8 $ / $ 87.9 $ / $ 88.0 $ & $ 87.1 $ / $ 87.3 $ / $ 87.7 $ & $ 79.8 $ / $ 81.9 $ / $ 85.9 $ &  \\
        ~ & CBLR\_0.20 & $ 85.0 $ / $ 85.7 $ / $ 87.2 $ & $ 86.0 $ / $ 86.6 $ / $ 87.6 $ & $ 79.3 $ / $ 76.4 $ / $ 86.9 $ &  \\
        ~ & BLR & $ 75.2 $ / $ 80.6 $ / $ 88.1 $ & $ 77.5 $ / $ 79.3 $ / $ 85.8 $ & $ 88.4 $ / $ 87.5 $ / $ 90.0 $ &  \\ 
        ~ & BLR-LC-N\_adapt & $ 88.1 $ / $ 88.0 $ / $ 88.7 $ & $ 86.6 $ / $ 87.0 $ / $ 87.4 $ & $ 80.5 $ / $ 81.8 $ / $ 85.6 $ &  \\ 
        ~ & BLR-LC-N\_train & $ 88.3 $ / $ 88.4 $ / $ 88.5 $ & $ 87.2 $ / $ 87.3 $ / $ 87.4 $ & $ 67.9 $ / $ 69.3 $ / $ 71.1 $ &  \\
        ~ & DivDis & $ 88.4 $ / $ 88.4 $ / $ 88.5 $ & $ 87.8 $ / $ 87.8 $ / $ 87.9 $ & $ 70.7 $ / $ 70.7 $ / $ 70.9 $ &  \\
        \midrule
        \multirow{7}{*}{$ 8 $} & CBLR & $ 87.8 $ / $ 88.2 $ / $ 88.5 $ & $ 86.5 $ / $ 87.0 $ / $ 87.4 $ & $ 85.0 $ / $ 85.7 $ / $ 87.1 $ &  \\ 
        ~ & CBLR\_0.05 & $ 87.8 $ / $ 88.1 $ / $ 88.6 $ & $ 87.0 $ / $ 87.2 $ / $ 87.5 $ & $ 83.9 $ / $ 85.1 $ / $ 86.8 $ &  \\
        ~ & CBLR\_0.20 & $ 85.4 $ / $ 86.1 $ / $ 88.0 $ & $ 85.1 $ / $ 86.0 $ / $ 87.1 $ & $ 85.3 $ / $ 86.7 $ / $ 88.9 $ &  \\
        ~ & BLR & $ 83.0 $ / $ 86.1 $ / $ 88.4 $ & $ 80.9 $ / $ 81.7 $ / $ 86.5 $ & $ 89.2 $ / $ 89.3 $ / $ 90.0 $ &  \\ 
        ~ & BLR-LC-N\_adapt & $ 87.9 $ / $ 88.3 $ / $ 88.7 $ & $ 87.0 $ / $ 87.1 $ / $ 87.5 $ & $ 82.9 $ / $ 83.5 $ / $ 86.0 $ &  \\ 
        ~ & BLR-LC-N\_train & $ 88.3 $ / $ 88.4 $ / $ 88.5 $ & $ 87.2 $ / $ 87.3 $ / $ 87.4 $ & $ 68.3 $ / $ 69.9 $ / $ 71.8 $ &  \\
        ~ & DivDis & $ 88.4 $ / $ 88.4 $ / $ 88.5 $ & $ 87.8 $ / $ 87.8 $ / $ 87.8 $ & $ 70.6 $ / $ 70.7 $ / $ 70.8 $ &  \\
        \midrule
        \multirow{7}{*}{$ 16 $} & CBLR & $ 87.6 $ / $ 88.2 $ / $ 88.7 $ & $ 86.8 $ / $ 87.0 $ / $ 87.4 $ & $ 87.1 $ / $ 87.7 $ / $ 88.5 $ &  \\ 
        ~ & CBLR\_0.05 & $ 87.9 $ / $ 88.1 $ / $ 88.6 $ & $ 86.9 $ / $ 87.2 $ / $ 87.5 $ & $ 86.4 $ / $ 87.0 $ / $ 87.9 $ &  \\
        ~ & CBLR\_0.20 & $ 85.7 $ / $ 86.3 $ / $ 88.5 $ & $ 86.0 $ / $ 86.5 $ / $ 87.3 $ & $ 87.9 $ / $ 88.6 $ / $ 89.4 $ &  \\
        ~ & BLR & $ 86.0 $ / $ 87.4 $ / $ 88.5 $ & $ 85.4 $ / $ 85.5 $ / $ 87.0 $ & $ 89.1 $ / $ 89.4 $ / $ 90.0 $ &  \\ 
        ~ & BLR-LC-N\_adapt & $ 88.3 $ / $ 88.5 $ / $ 88.7 $ & $ 87.1 $ / $ 87.3 $ / $ 87.5 $ & $ 83.9 $ / $ 84.9 $ / $ 86.9 $ &  \\ 
        ~ & BLR-LC-N\_train & $ 88.2 $ / $ 88.4 $ / $ 88.6 $ & $ 87.2 $ / $ 87.3 $ / $ 87.4 $ & $ 70.1 $ / $ 71.4 $ / $ 73.4 $ &  \\
        ~ & DivDis & $ 88.4 $ / $ 88.4 $ / $ 88.5 $ & $ 87.8 $ / $ 87.8 $ / $ 87.9 $ & $ 70.7 $ / $ 70.7 $ / $ 70.8 $ &  \\
        \midrule
        \multirow{7}{*}{$ 32 $} & CBLR & $ 88.2 $ / $ 88.5 $ / $ 88.8 $ & $ 86.7 $ / $ 87.0 $ / $ 87.4 $ & $ 88.6 $ / $ 88.8 $ / $ 89.1 $ &  \\ 
        ~ & CBLR\_0.05 & $ 88.1 $ / $ 88.4 $ / $ 88.7 $ & $ 87.0 $ / $ 87.2 $ / $ 87.5 $ & $ 87.5 $ / $ 88.0 $ / $ 88.4 $ &  \\
        ~ & CBLR\_0.20 & $ 87.2 $ / $ 87.7 $ / $ 88.5 $ & $ 86.2 $ / $ 86.8 $ / $ 87.4 $ & $ 89.1 $ / $ 89.4 $ / $ 89.7 $ &  \\
        ~ & BLR & $ 87.8 $ / $ 88.2 $ / $ 88.8 $ & $ 86.3 $ / $ 86.8 $ / $ 87.3 $ & $ 89.6 $ / $ 89.8 $ / $ 90.0 $ &  \\ 
        ~ & BLR-LC-N\_adapt & $ 88.5 $ / $ 88.6 $ / $ 88.7 $ & $ 87.2 $ / $ 87.3 $ / $ 87.5 $ & $ 84.8 $ / $ 85.6 $ / $ 86.4 $ &  \\ 
        ~ & BLR-LC-N\_train & $ 88.2 $ / $ 88.4 $ / $ 88.6 $ & $ 87.2 $ / $ 87.3 $ / $ 87.4 $ & $ 72.7 $ / $ 73.6 $ / $ 75.4 $ &  \\
        ~ & DivDis & $ 88.4 $ / $ 88.4 $ / $ 88.5 $ & $ 87.8 $ / $ 87.8 $ / $ 87.9 $ & $ 70.7 $ / $ 70.7 $ / $ 70.8 $ &  \\
        \bottomrule
        \end{tabular}

    \end{adjustbox}
\label{tab:cmnist_full}
\end{table}


\begin{table}[!ht]
    \centering
    \caption{
PACS: test accuracy for all methods on all domains. We report the 
$20\textrm{th}$ percentile / mean / $80\textrm{th}$ percentile across 20 independent runs. }
    \begin{adjustbox}{max width=\linewidth}
        \begin{tabular}{cccccc}
        \toprule
        $n_*$ & Method / $e_*$ & A & C & P & S \\ \midrule
        $ 0 $ & ERM & $ 87.8 $ & $ 72.6 $ & $ 96.1 $ & $ 76.3 $ \\ \midrule
        \multirow{7}{*}{$ 16 $} & CBLR & $ 87.8 $ / $ 88.5 $ / $ 89.5 $ & $ 80.8 $ / $ 81.7 $ / $ 82.7 $ & $ 96.7 $ / $ 97.1 $ / $ 97.6 $ & $ 77.6 $ / $ 78.3 $ / $ 79.5 $ \\ 
        ~ & CBLR\_0.05 & $ 86.6 $ / $ 87.4 $ / $ 88.8 $ & $ 79.5 $ / $ 80.2 $ / $ 81.6 $ & $ 96.1 $ / $ 96.7 $ / $ 97.3 $ & $ 76.1 $ / $ 76.9 $ / $ 78.0 $ \\ 
        ~ & CBLR\_0.20 & $ 83.9 $ / $ 85.4 $ / $ 87.0 $ & $ 78.2 $ / $ 79.3 $ / $ 81.0 $ & $ 95.8 $ / $ 96.3 $ / $ 96.7 $ & $ 73.8 $ / $ 74.9 $ / $ 76.4 $ \\ 
        ~ & BLR & $ 86.8 $ / $ 87.7 $ / $ 89.2 $ & $ 76.1 $ / $ 78.8 $ / $ 82.9 $ & $ 95.5 $ / $ 96.1 $ / $ 97.0 $ & $ 70.1 $ / $ 72.5 $ / $ 75.5 $ \\ 
        ~ & BLR-LC-N\_adapt & $ 88.3 $ / $ 88.7 $ / $ 89.5 $ & $ 81.8 $ / $ 82.6 $ / $ 83.5 $ & $ 97.0 $ / $ 97.3 $ / $ 97.6 $ & $ 77.8 $ / $ 78.7 $ / $ 79.9 $ \\ 
        ~ & BLR-LC-N\_train & $ 86.1 $ / $ 86.8 $ / $ 87.5 $ & $ 79.7 $ / $ 80.0 $ / $ 80.6 $ & $ 96.7 $ / $ 96.9 $ / $ 97.0 $ & $ 76.1 $ / $ 76.4 $ / $ 77.6 $ \\ 
        ~ & DivDis & $ 85.1 $ / $ 85.6 $ / $ 86.3 $ & $ 79.3 $ / $ 79.3 $ / $ 79.7 $ & $ 96.4 $ / $ 96.8 $ / $ 97.6 $ & $ 77.7 $ / $ 78.0 $ / $ 79.1 $ \\ 
        \midrule
        \multirow{7}{*}{$ 32 $} & CBLR & $ 88.8 $ / $ 89.8 $ / $ 90.7 $ & $ 82.7 $ / $ 83.3 $ / $ 84.2 $ & $ 96.7 $ / $ 97.1 $ / $ 97.3 $ & $ 78.6 $ / $ 79.2 $ / $ 80.1 $ \\ 
        ~ & CBLR\_0.05 & $ 87.5 $ / $ 88.5 $ / $ 89.5 $ & $ 80.3 $ / $ 81.6 $ / $ 82.7 $ & $ 96.1 $ / $ 96.6 $ / $ 97.0 $ & $ 77.2 $ / $ 78.3 $ / $ 79.5 $ \\ 
        ~ & CBLR\_0.20 & $ 86.1 $ / $ 87.0 $ / $ 88.5 $ & $ 79.5 $ / $ 80.8 $ / $ 82.1 $ & $ 96.1 $ / $ 96.4 $ / $ 97.0 $ & $ 75.4 $ / $ 76.8 $ / $ 78.2 $ \\ 
        ~ & BLR & $ 88.8 $ / $ 89.6 $ / $ 90.7 $ & $ 82.1 $ / $ 82.9 $ / $ 84.4 $ & $ 96.4 $ / $ 96.9 $ / $ 97.6 $ & $ 76.4 $ / $ 77.6 $ / $ 79.2 $ \\ 
        ~ & BLR-LC-N\_adapt & $ 89.0 $ / $ 89.5 $ / $ 90.2 $ & $ 81.6 $ / $ 82.5 $ / $ 83.8 $ & $ 97.0 $ / $ 97.3 $ / $ 97.6 $ & $ 76.4 $ / $ 77.0 $ / $ 79.9 $ \\ 
        ~ & BLR-LC-N\_train & $ 86.1 $ / $ 86.9 $ / $ 87.5 $ & $ 79.7 $ / $ 80.3 $ / $ 81.0 $ & $ 96.7 $ / $ 97.0 $ / $ 97.3 $ & $ 75.9 $ / $ 76.4 $ / $ 77.5 $ \\ 
        ~ & DivDis & $ 85.1 $ / $ 85.1 $ / $ 85.6 $ & $ 79.1 $ / $ 79.6 $ / $ 81.2 $ & $ 96.4 $ / $ 96.8 $ / $ 97.3 $ & $ 77.1 $ / $ 77.3 $ / $ 78.5 $ \\ 
        \midrule
        \multirow{7}{*}{$ 64 $} & CBLR & $ 89.7 $ / $ 90.8 $ / $ 91.7 $ & $ 84.2 $ / $ 85.0 $ / $ 85.7 $ & $ 97.0 $ / $ 97.4 $ / $ 97.9 $ & $ 79.9 $ / $ 80.5 $ / $ 81.4 $ \\ 
        ~ & CBLR\_0.05 & $ 89.0 $ / $ 89.4 $ / $ 90.2 $ & $ 82.3 $ / $ 83.3 $ / $ 84.2 $ & $ 96.4 $ / $ 96.9 $ / $ 97.3 $ & $ 78.5 $ / $ 79.4 $ / $ 80.4 $ \\ 
        ~ & CBLR\_0.20 & $ 88.0 $ / $ 88.8 $ / $ 90.0 $ & $ 82.7 $ / $ 83.1 $ / $ 83.8 $ & $ 96.4 $ / $ 96.8 $ / $ 97.3 $ & $ 76.7 $ / $ 78.3 $ / $ 79.6 $ \\ 
        ~ & BLR & $ 90.0 $ / $ 90.5 $ / $ 91.4 $ & $ 84.2 $ / $ 85.3 $ / $ 86.1 $ & $ 97.0 $ / $ 97.3 $ / $ 97.9 $ & $ 79.4 $ / $ 80.0 $ / $ 81.3 $ \\ 
        ~ & BLR-LC-N\_adapt & $ 89.2 $ / $ 90.0 $ / $ 90.7 $ & $ 82.3 $ / $ 82.7 $ / $ 84.4 $ & $ 97.0 $ / $ 97.2 $ / $ 97.6 $ & $ 77.6 $ / $ 79.0 $ / $ 80.5 $ \\ 
        ~ & BLR-LC-N\_train & $ 87.0 $ / $ 87.4 $ / $ 88.0 $ & $ 79.3 $ / $ 79.9 $ / $ 80.6 $ & $ 97.0 $ / $ 97.0 $ / $ 97.3 $ & $ 75.5 $ / $ 76.3 $ / $ 77.2 $ \\
        ~ & DivDis & $ 85.6 $ / $ 85.9 $ / $ 86.8 $ & $ 78.6 $ / $ 79.1 $ / $ 79.9 $ & $ 96.4 $ / $ 96.8 $ / $ 97.3 $ & $ 77.2 $ / $ 77.6 $ / $ 78.6 $ \\ 
        \midrule
        \multirow{7}{*}{$ 128 $} & CBLR & $ 90.7 $ / $ 91.5 $ / $ 92.2 $ & $ 86.3 $ / $ 86.9 $ / $ 87.8 $ & $ 97.3 $ / $ 97.6 $ / $ 98.2 $ & $ 81.5 $ / $ 82.4 $ / $ 83.7 $ \\ 
        ~ & CBLR\_0.05 & $ 90.0 $ / $ 90.4 $ / $ 91.4 $ & $ 84.2 $ / $ 85.5 $ / $ 87.6 $ & $ 97.3 $ / $ 97.3 $ / $ 97.6 $ & $ 80.8 $ / $ 81.4 $ / $ 83.4 $ \\ 
        ~ & CBLR\_0.20 & $ 89.5 $ / $ 90.6 $ / $ 92.4 $ & $ 84.0 $ / $ 84.8 $ / $ 87.4 $ & $ 96.7 $ / $ 96.9 $ / $ 97.3 $ & $ 79.2 $ / $ 79.7 $ / $ 81.3 $ \\ 
        ~ & BLR & $ 90.5 $ / $ 91.1 $ / $ 91.7 $ & $ 86.1 $ / $ 86.6 $ / $ 87.4 $ & $ 96.7 $ / $ 97.4 $ / $ 97.9 $ & $ 80.1 $ / $ 81.2 $ / $ 82.5 $ \\ 
        ~ & BLR-LC-N\_adapt & $ 89.2 $ / $ 89.9 $ / $ 90.7 $ & $ 82.7 $ / $ 83.2 $ / $ 84.2 $ & $ 97.0 $ / $ 97.3 $ / $ 97.9 $ & $ 79.1 $ / $ 79.8 $ / $ 80.6 $ \\ 
        ~ & BLR-LC-N\_train & $ 86.6 $ / $ 87.2 $ / $ 88.0 $ & $ 79.5 $ / $ 80.4 $ / $ 81.2 $ & $ 96.7 $ / $ 97.0 $ / $ 97.3 $ & $ 75.7 $ / $ 76.4 $ / $ 77.6 $ \\
        ~ & DivDis & $ 84.8 $ / $ 85.4 $ / $ 86.1 $ & $ 78.8 $ / $ 79.7 $ / $ 80.6 $ & $ 96.4 $ / $ 96.9 $ / $ 97.3 $ & $ 76.7 $ / $ 77.2 $ / $ 78.1 $ \\ 
        \midrule
        \multirow{7}{*}{$ 256 $} & CBLR & $ 91.9 $ / $ 92.5 $ / $ 92.9 $ & $ 86.3 $ / $ 86.7 $ / $ 88.5 $ & $ 97.6 $ / $ 97.9 $ / $ 98.2 $ & $ 83.7 $ / $ 84.2 $ / $ 85.1 $ \\ 
        ~ & CBLR\_0.05 & $ 91.9 $ / $ 92.3 $ / $ 92.7 $ & $ 78.4 $ / $ 82.9 $ / $ 87.6 $ & $ 97.3 $ / $ 97.5 $ / $ 97.9 $ & $ 82.9 $ / $ 83.5 $ / $ 84.2 $ \\ 
        ~ & CBLR\_0.20 & $ 91.4 $ / $ 91.9 $ / $ 92.4 $ & $ 85.7 $ / $ 86.9 $ / $ 88.7 $ & $ 97.3 $ / $ 97.5 $ / $ 97.9 $ & $ 81.8 $ / $ 82.9 $ / $ 83.8 $ \\ 
        ~ & BLR & $ 91.4 $ / $ 91.8 $ / $ 92.4 $ & $ 85.9 $ / $ 86.5 $ / $ 87.4 $ & $ 97.3 $ / $ 97.6 $ / $ 98.2 $ & $ 80.6 $ / $ 81.7 $ / $ 82.8 $ \\ 
        ~ & BLR-LC-N\_adapt & $ 89.7 $ / $ 90.0 $ / $ 90.5 $ & $ 80.8 $ / $ 82.5 $ / $ 84.2 $ & $ 97.0 $ / $ 97.4 $ / $ 97.9 $ & $ 77.2 $ / $ 78.3 $ / $ 80.1 $ \\ 
        ~ & BLR-LC-N\_train & $ 87.5 $ / $ 87.9 $ / $ 88.5 $ & $ 79.9 $ / $ 80.4 $ / $ 81.0 $ & $ 96.7 $ / $ 96.8 $ / $ 97.0 $ & $ 76.9 $ / $ 77.2 $ / $ 78.5 $ \\ 
        ~ & DivDis & $ 85.3 $ / $ 85.8 $ / $ 87.0 $ & $ 80.6 $ / $ 80.6 $ / $ 80.8 $ & $ 96.7 $ / $ 96.9 $ / $ 97.3 $ & $ 76.3 $ / $ 77.0 $ / $ 78.1 $ \\ 
        \bottomrule
        \end{tabular}

    \end{adjustbox}
\label{tab:pacs_full}
\end{table}



\bibliography{wang_665-supp}

\end{document}
