% \documentclass{uai2023} % for initial submission
\def\forConf{1}  % UAI camera-ready requires separating the text and appendix
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%\usepackage[dvipsnames]{xcolor}
\usepackage{multirow}
\usepackage{adjustbox}

\definecolor{mblc}{RGB}{25,25,152}
\definecolor{vlt}{RGB}{138,0,136}
\definecolor{fusc}{RGB}{202,44,146}
\hypersetup{colorlinks=true,linkcolor=black,urlcolor=black,citecolor=mblc,allbordercolors={1 1 1}}

\usepackage{amsmath,amsthm,amsfonts,amssymb,soul,cancel,enumitem}
\usepackage{mdframed}
\usepackage{cleveref}
\usepackage{algorithm,algpseudocode}

% Load the T3 font encoding in addition to T1. fontenc makes the last
% option listed (T1) the default text encoding; T3 is only needed for the
% symbol font declared below.
\usepackage[T3,T1]{fontenc}
% Register the T3-encoded cmr font as a math symbol font named `tipa'.
\DeclareSymbolFont{tipa}{T3}{cmr}{m}{n}
% \invbreve: math accent drawn from slot 16 of the `tipa' symbol font
% (presumably an inverted breve, going by the macro name -- confirm against
% the T3 encoding table).
\DeclareMathAccent{\invbreve}{\mathalpha}{tipa}{16}


% Theorem-like environments (amsthm).
% No \theoremstyle is selected before this first group, so it uses whatever
% style is active at this point (amsthm's default is `plain').
% lemma, claim, proposition and corollary share the `theorem' counter.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
% `definition' style; each environment below gets its own counter.
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
% `remark' style, with its own counter.
\theoremstyle{remark}
\newtheorem{remark}{Remark}

% ---- General-purpose shorthand macros ----
% \indep: two \perp symbols pulled together with negative thin spaces.
\newcommand{\indep}{\perp \!\!\! \perp}
% Colored text helpers; the extra brace group keeps \color local to the argument.
\newcommand{\mred}[1]{{\color{red} #1}}
\newcommand{\mblue}[1]{{\color{blue} #1}}
\newcommand{\mgray}[1]{{\color{gray} #1}}
% \pmodel{x} -> q(x;\theta), \pdata{x} -> p(x).
\newcommand{\pmodel}[1]{q(#1;\theta)}
\newcommand{\pdata}[1]{p(#1)}
% \norm{x}: double-bar delimiters at fixed (non-scaling) size.
\newcommand{\norm}[1]{\lVert #1 \rVert}
% Margin markers reading FIX / NEW.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
% \KL{p}{q} -> KL(p || q) with an upright `KL'.
\newcommand{\KL}[2]{\mathrm{KL}(#1\,\Vert\, #2)}
% Short aliases for the math alphabets.
\newcommand{\mbf}[1]{\mathbf{#1}}
\newcommand{\mb}[1]{\mathbb{#1}}
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\mrm}[1]{\mathrm{#1}}
\newcommand{\msf}[1]{\mathsf{#1}}
% \nablazj{j}: \nabla subscripted by \theta^{(j)}.
\newcommand{\nablazj}[1]{\nabla_{\theta^{(#1)}}}
% Blackboard-bold E and P.
\newcommand{\EE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
% \bracket{x}: x enclosed in angle brackets.
\newcommand{\bracket}[1]{\langle #1 \rangle}
% Upright `grad'. Defined with \newcommand rather than \DeclareMathOperator,
% so it does not receive operator spacing.
\newcommand{\grad}{\mathrm{grad}}
% \< and \> as angle-bracket shorthands. \> already has a meaning in LaTeX,
% so the previous definition is saved as \oldket before being overridden.
\newcommand{\<}{\langle}
\let\oldket\>
\renewcommand{\>}{\rangle}

% \ba ... \bZ: bold upright letters, one macro per letter.
% \mydefb{x} defines \bx as \mathbf{x}. \mydefallb consumes the following
% tokens one at a time, calling \mydefb on each, and stops when it meets
% the sentinel token \mydefallb that terminates the list.
% The letter `f' is deliberately missing from the list, so \bf is left alone.
% NOTE(review): the list DOES contain `m', and \mydefb uses \def (no
% redefinition check), so this silently defines \bm as \mathbf{m}. Any \bm
% provided by the bm package (if the class loads it) is overwritten from
% this point on -- confirm that no macro below relies on the package's \bm.
\def\mydefb#1{\expandafter\def\csname b#1\endcsname{\mathbf{#1}}}
\def\mydefallb#1{\ifx#1\mydefallb\else\mydefb#1\expandafter\mydefallb\fi}
\mydefallb abcdeghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\mydefallb
% Keep a copy of the original \bf; the override below is commented out, so
% \bf itself is currently unchanged.
\let\oldbf\bf
%\renewcommand{\bf}{
%    \ifmmode \mathbf{f} \else \oldbf
%}

% \cA ... \cZ: calligraphic capitals, one macro per letter.
% \mydefc{X} defines \cX as \mathcal{X}. \mydefallc walks the following
% tokens one at a time and stops at the sentinel token \mydefallc that
% terminates the list. Only upper-case letters are listed.
\def\mydefc#1{\expandafter\def\csname c#1\endcsname{\mathcal{#1}}}
\def\mydefallc#1{\ifx#1\mydefallc\else\mydefc#1\expandafter\mydefallc\fi}
\mydefallc ABCDEFGHIJKLMNOPQRSTUVWXYZ\mydefallc

% Bold zero / one vectors and bold Greek letters.
% The Greek macros use \boldsymbol (from amsbsy, loaded through
% amsmath/mathtools) instead of \bm: the \mydefallb loop earlier in this
% preamble iterates over the letter `m' and thereby redefines \bm as
% \mathbf{m}, so \bm{\alpha} would typeset a bold ``m'' followed by a
% non-bold \alpha rather than a bold \alpha.
\newcommand{\bzero}{\mathbf{0}}
\newcommand{\bone}{\mathbf{1}}
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bpsi}{\boldsymbol{\psi}}
\newcommand{\bphi}{\boldsymbol{\phi}}
% ---- Paper-specific notation ----
% Blackboard-bold R.
\newcommand{\RR}{\mathbb{R}}
% Upright `train' label and bold X with that label as subscript.
\newcommand{\train}{\mathrm{train}}
\newcommand{\Xtrain}[0]{\mbf{X}_{\mathrm{train}}}
% K with subscripts rr / er / re.
\newcommand{\Krr}{K_{rr}}
\newcommand{\Ker}{K_{er}}
\newcommand{\Kre}{K_{re}}
% Upright operator names (limits are set as ordinary sub/superscripts).
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\diag}{diag}

% \numberthis: steps the equation counter and tags the current line with it;
% intended for numbering a single line inside a starred amsmath display.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
% Starred form: limits are placed underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% \parenfrac{a}{b}: fraction wrapped in fixed-size \bigg parentheses.
\newcommand{\parenfrac}[2]{\biggl(\frac{#1}{#2}\biggr)}
\DeclareMathOperator{\erf}{erf}
\DeclareMathOperator{\Unif}{Unif}
% Barred parameter / coefficient symbols with `inv' and `spu' subscripts.
% \bayesParam and \spuVec take an optional superscript argument that
% defaults to `e', e.g. \spuVec[e'].
% The outer braces make each macro expand to a single group, so it can be
% used directly as a sub- or superscript.
\newcommand{\invParam}{{\bar\theta_{inv}}}
\newcommand{\bayesParam}[1][e]{{\bar\theta_{spu}^{#1}}}
\newcommand{\invVec}{{\bar\beta_{inv}}}
\newcommand{\spuVec}[1][e]{{\bar\beta_{spu}^{#1}}}
% These two rely on \cF and \cS, which are generated by the \mydefallc loop.
\newcommand{\Flocal}{{\partial\cF_{\delta}}}
\newcommand{\spuCollection}{{\cS_{tr}}}

% Draft toggle for inline author notes.
% \if expands \draft and compares the resulting character with `1': while
% \draft is defined as 1 the notes are typeset, and with any other single
% character they expand to nothing.
% NOTE(review): \smaller is provided by the relsize package, which is not
% loaded in the visible preamble -- confirm the class provides it before
% using \zw or \todo in the body.
\def\draft{1}
\if\draft1
% Draft mode: bracketed, colored notes tagged ZW / TODO.
\newcommand{\zw}[1]{{[\color{blue}\smaller\textbf{ZW}: {#1}]}}
\newcommand{\todo}[1]{{[\color{red}\smaller\textbf{TODO}: {#1}]}}
\else
% Final mode: notes are swallowed.
\newcommand{\zw}[1]{}
\newcommand{\todo}[1]{}
\fi



\title{A Constrained Bayesian Approach to Out-of-Distribution Prediction}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<wzy196@gmail.com>?Subject=Your UAI 2023 paper}{Ziyu Wang}{}\textsuperscript{*}}
\author[1]{Binjie Yuan\textsuperscript{*}}
\author[2]{Jiaxun Lu}
\author[3]{Bowen Ding}
\author[2]{Yunfeng Shao}
\author[3]{Qibin Wu}
\author[1]{\href{mailto:<dcszj@mail.tsinghua.edu.cn>?Subject=Your UAI 2023 paper}{Jun Zhu}{}\textsuperscript{\#}}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Comp. Sci. \& Tech., BNRist Center, Tsinghua-Huawei Joint Center for AI, THBI Lab, Tsinghua University
}
\affil[2]{%
    Huawei Noah's Ark Lab
}
\affil[3]{%
    Huawei Technologies Co., Ltd.
  }
  
\begin{document}
\maketitle


\begin{abstract}
Consider the problem of out-of-distribution prediction given data from multiple environments. While a sufficiently diverse collection of training environments will facilitate the identification of an invariant predictor, with an optimal generalization performance, 
many applications only provide us with a % more 
limited number of environments. 
It is thus necessary to consider adapting to distribution shift using a handful of labeled test samples. 
We propose a constrained Bayesian approach for this task, 
which restricts % the prior 
to models whose worst-group training loss is below a prespecified threshold. 
Our method avoids a pathology % pathological % inefficiency 
% behavior 
of the standard Bayesian posterior, 
% in the presence of spurious correlations that 
which occurs when spurious correlations improve in-distribution prediction. We also show that 
on certain high-dimensional linear problems, 
constrained modeling 
improves the sample efficiency of adaptation.  
Synthetic and real-world experiments demonstrate the robust performance of our approach.
\end{abstract}

%% INTRODUCTION

\section{Introduction}

\renewcommand*{\thefootnote}{*}
\footnotetext{Equal contribution. \textsuperscript{\#}Corresponding author.}
\setcounter{footnote}{0}
\renewcommand*{\thefootnote}{\arabic{footnote}}

A crucial challenge in machine learning applications is to make predictions in % to deploy models to 
a novel %\emph{out-of-distribution} (OOD) 
environment, with a data distribution different from those of the training environments \citep{quinonero2008dataset,blanchard2011generalizing}. 
In such scenarios, there often % NOTE: must limit the scope, since there's also covariate shift
exist \emph{spurious features} \citep{sagawa_distributionally_2020} that % appear predictive in training data, but have a correlation structure that is environment-specific and should not be utilized in prediction. 
exhibit environment-specific correlation structures with the target variable, which can be drastically different between training and test data.  
% should not be utilized for reliable prediction. 
% that appear informative for the target variable, but in an environment-specific way. 
% Deploying machine learning models to changed environments is a common need in applications. 
% A common cause for distribution shift is the presence of % \emph{spurious features}, % factors with 
% \emph{spurious correlations} in training data; as an example, 
For example, in aggregated medical imaging datasets, factors such as radiographic positioning or projection often appear predictive about the diagnostic outcome, % within training data, 
but only because both factors are correlated with the data source; 
machine learning models may thus learn ``shortcuts'' based on such features, leading to poor generalization \citep{degrave_ai_2021}. 

A diverse body of literature is dedicated to this issue of out-of-distribution (OOD) prediction, with different assumptions introduced on the forms of distribution shift and the information available to aid generalization. %  \citep{wang_generalizing_2022}. 
We are primarily interested in scenarios where % we assume the existence of 
% interested in a common \zw{reword?} scenarios where we 
% a set of \emph{invariant features} exist and induce 
a causally \emph{invariant predictor} \citep{buhlmann_invariance_2018,rojas-carulla_invariant_2018} exists, and is reasonably performant across environments. In such cases, 
its recovery can be possible given a sufficiently diverse collection of training environments \citep{peters2016causal,arjovsky2019invariant}. 
% Training data from multiple environments is often available, 
It is often possible to have training data from multiple environments, % the training samples grouped into multiple, distinct environments; 
such as in medical applications where hospitals form distinct environments; %  forms a different environment. 
we can then apply algorithms such as invariant risk minimization [IRM, \citealp{arjovsky2019invariant}] and group distributionally-robust optimization [GDRO, \citealp{sagawa_distributionally_2020}]. 

Unfortunately, % it is recognized that 
identification of the invariant predictor may require an excessive number of training environments: for $d$-dimensional linear models this may amount to $\cO(d)$ environments \citep{rosenfeld_risks_2021}. 
Thus, in a large proportion of practical applications, we will find ourselves in an underidentified regime with an insufficient number of training environments, in which case the benefits of existing methods % domain generalization algorithms 
are far less clear. 
Indeed, \citet{gulrajani_search_2020} showed that across a number of benchmarks with a smaller number of % 3-10 
environments, methods such as IRM and GDRO consistently fail to outperform an ERM baseline, even though the latter does not account for distribution shift. 
The challenge of underspecification 
can be fundamental; 
as we demonstrate in \Cref{lem:classif}, in seemingly benign scenarios with $o(d)$ environments, there may exist spurious features that are indistinguishable from invariant features, by \emph{any} statistical procedure working on finite data. 

In light of the practical need for OOD prediction given few environments and the inherent difficulties of generalization, % in such a regime, 
it is thus necessary to take a step back and consider 
\emph{adapting} a learned model to a target environment, using a handful of labeled samples. Such samples are often available, for example if 
% In many applications such samples may come ``for free'', if 
our deployment process involves first testing the model in the target environment;  
in such cases, we can simply set aside a few samples for adaptation.

It is natural to consider a Bayesian approach 
given the underidentified nature of our problem, as is also advocated by % the recent work of 
\citet{lee_diversify_2022} in a connected but different setting. 
The Bayesian formulation may also appear desirable due to the interpretation as % performing 
sequential belief updates --- % \zw{rephrase?} 
the posterior % (or its possible generalization, see \cref{sec:gb}) 
given training data ``naturally'' serves as a prior during adaptation. 
Unfortunately, the Bayesian approach can be inherently flawed for our purpose, as long as there exists a % small but 
non-negligible gap between the in-distribution performance of the invariant and non-invariant models. As we shall discuss in \Cref{sec:gb}, % a scaling of evidence, required in (generalized) Bayesian modeling, will necessarily amplify this gap and lead to inefficient adaptation: 
this gap will get amplified by a scaling of evidence that is required in (generalized) Bayesian modeling, and cause the posterior to remain concentrated on non-invariant models until a very large number of adaptation samples have been seen. 
This is concerning due to the prevalence of such \emph{performance gaps}; indeed, they are % often considered as 
part of the reason for the failure of domain generalization algorithms \citep[e.g.,][]{rosenfeld_risks_2021,sagawa_investigation_2020}. %\zw{find someone who talked about label noise}

In this work, we attempt to address this issue by proposing a principled approach for the adaptation task. We assume the knowledge of a lower bound of the invariant predictor's performance. 
Such knowledge is often possible, % available 
given our implicit assumption that the invariant predictor has an acceptable performance. 
% g., for classification problems we may know a bound on the level of ``spurious label noise'' presented in data. 
We then use the training environments to define \emph{constraints}: we restrict to the subset of models that do not perform significantly worse than the lower bound, across all training environments. 
This % The constrained formulation 
% allows for efficient adaptation, as it 
ensures the invariant predictor is present in the constraint set with high probability, and is weighted similarly to the non-invariant predictors, even though the latter would have induced a much better likelihood on training data. % in the Bayesian formulation. 
Consequently, % we can avoid the inefficiency of the Bayesian formulation, and achieve 
efficient adaptation can be achieved. 

Our method can be justified in many ways, by considering its behavior in the presence of performance gaps as sketched above, or by relating it to a relaxed formulation of GDRO. 
% As we sketched above, it avoids a pathological behavior of the Bayesian formulation % when the adaptation samples are few and the distribution shift is drastic. 
% in the presence of performance gaps. 
% It can be naturally viewed as a relaxed, (constrained) Bayesian counterpart of GDRO, % in that any predictor in the constraint set can be chosen % -- in principle -- 
% as an approximate optima to the GRO problem. 
% and thus inherits its benefits when the training data is informative. 
We complement these justifications with an asymptotic analysis, showing that in certain asymptotics for high-dimensional % regimes for linear regression problems, 
linear models, \emph{adaptation with constrained models may achieve a vanishing estimation error with
a relatively small number of training environments, % in the adaptation process, 
whereas using either training or adaptation data alone cannot guarantee convergence.} 
This result improves the understanding of OOD learning, %c in general, 
by showing that a smaller number of training environments can still be useful. 

We evaluate the proposed method through synthetic and real-world experiments. 
% synthetic datasets, the standard benchmarks of ColorMNIST \citep{arjovsky2019invariant} and PACS \citep{li2017deeper}, and a real-world application. 
On several image classification tasks where off-the-shelf domain generalization algorithms struggle to improve over ERM, our method delivers significant improvement, with only a handful of adaptation samples. 
Moreover, among all the adaptation algorithms evaluated, our method is the only one with reliable performance across all settings; in contrast, the baseline procedures fail intermittently, in different settings, which can be attributed to their less principled nature. 
% Our approach demonstrates robust performance, and exhibiting being the only method with reliable performance across all settings.

The rest of this paper is structured as follows: in Sec.~\ref{sec:bg} we review the setup of OOD generalization and discuss its hardness. Sec.~\ref{sec:method} discusses the pitfall of standard Bayesian modeling and introduces our method, which is further justified in Sec.~\ref{sec:theory} % further 
through asymptotic analyses. 
We review related work in Sec.~\ref{sec:related-work}, present empirical evaluations in Sec.~\ref{sec:exp}, and provide concluding remarks in Sec.~\ref{sec:conclusions}.


\section{OOD Generalization and its Hardness}\label{sec:bg}

\paragraph{Notations} We adopt the following notations in the paper: $[n] := \{1,\ldots,n\}$. 
$\asymp,\lesssim,\gtrsim$ denote (in)equality up to constants. $c_1,c_2,\ldots$ denote universal constants. For finite-dimensional vectors, $\|\cdot\|_2$ denotes the Euclidean norm.

% \paragraph{Out-of-distribution learning} % In domain generalization tasks we wish to guarantee performance on 
\paragraph{Invariant models and OOD generalization}
Consider a prediction task with training data from $m$ environments: $
\cD_{tr} := \{\{(x_i^e,y_i^e)\sim P_e:i\in [n_e]\}:e\in\cE_{tr}\}, 
$ where $|\cE_{tr}| = m$. 
We are interested in % the performance on 
an out-of-distribution test environment where the data comes from a different $P_*$. 
% Various formulations exist that impose different assumptions on the distributional difference across environments; 
% We are interested in a line of literature that assumes 
We assume the existence of an \emph{invariant predictor} that only depends on the input $x$ through some $\Phi_{inv}(x)$, such that $p_e(y\mid\Phi_{inv}(x)) \equiv p(y\mid\Phi_{inv}(x))$ is invariant across all environments. 
We also assume that $\Phi_{inv}(x)$ can be reasonably informative about $y$. Such $\Phi_{inv}(x)$ are named \emph{invariant features}, in contrast to the \emph{spurious features} $\Phi_{spu}(x)$ which induce different $p_e(y\mid \Phi_{spu}(x))$ across environments, and  
% The shift of conditional distributions 
hinder generalization when they are included in a predictor. 

A variety of approaches have been proposed for learning the invariant predictor. Of particular interest is the method of \emph{group distributionally-robust optimization} (GDRO), which minimizes the worst-case risk across training environments: 
\begin{equation}\label{eq:gro}\tag{GDRO}
\min_{f\in\cH} \max_{e\in\cE_{tr}} \hat R_{e}(f), 
\end{equation}
and invariant risk minimization (IRM): 
\begin{equation}\label{eq:irm}\tag{IRM}
\begin{aligned}
\min_{f=w\circ\Phi\in\cH}~~ &\sum_{e\in\cE_{tr}} \hat R_{e}(w\circ \Phi), \\ 
\text{subject to}~~~ &w\in\argmin_{w'} \hat R_e(w'\circ \Phi), ~~\forall e\in\cE_{tr}.
\end{aligned}
\end{equation}
In the above, $\hat R_e(f) := \frac{1}{n_e}\sum_{i=1}^{n_e} \ell(y_i^e, f(x_i^e))$ denotes the empirical risk for an environment $e$, $\ell$ denotes a suitable loss function, and $\cH$ is our hypothesis space. For IRM, $\Phi$ and $w$ denote the learned invariant features and the optimal predictor atop them. 

\paragraph{Hardness of OOD generalization}
It is intuitive that an invariant predictor may be recovered, given a large and % large number of 
diverse collection of training environments. % may enable the recovery of 
% methods such as GDRO and IRM to learn an invariant predictor. 
For IRM and certain linear models with dimensionality $d$, this amounts to having $m\asymp d$ environments that are independent in a certain sense \citep{arjovsky2019invariant}. Unfortunately, such requirements can be unrealistic for high-dimensional data, and/or nonlinear models, and with a smaller $m$ the empirical performance of domain generalization algorithms can often be disappointing: \citet{gulrajani_search_2020} show that a wide range of methods may fail to match the performance of an empirical risk minimization (ERM) baseline. 

Let us illustrate the hardness of invariant prediction using the following example, adapted from \citet{rosenfeld_risks_2021}: 

\begin{example}\label{ex:classif}
Consider a classification problem with data generated as follows:
\begin{align*}
&\spuVec[e]\sim \cN(0, \tau_s^2 I) \in \RR^{d_{spu}}, ~
y^e_i\sim\mrm{Unif}\{\pm 1\},
\\
x^e_i &= \begin{bmatrix}
    x^e_{i,inv} \\ x^e_{i,spu}
\end{bmatrix}\sim \cN\biggl(y^e_i \begin{bmatrix}
\invVec \\ \spuVec[e]
\end{bmatrix}, \begin{bmatrix}
    \sigma_i^2 I & 0 \\ 0 & \sigma_s^2 I
\end{bmatrix}\biggr),
\end{align*} where $\tau_s,\sigma_s,\sigma_i>0$, 
and $\invVec\in\RR^{d_{inv}}$ is fixed. When $m < d_{spu}/4$, the vectors $\{\spuVec[e]\}$ are linearly independent with high probability \citep[chapter 6]{wainwright2019high}. Thus, by Theorems 5.1 and 5.3 in \citet{rosenfeld_risks_2021}, % apply and show that 
all of ERM, IRM and GDRO will learn a non-invariant predictor. 
We provide further insights through the following:
\end{example}

\begin{lemma}\label{lem:classif}
In the setting of \Cref{ex:classif}, let 
$$\textstyle
x_{pe,i}^e := \alpha \sum_{e'\in\cE_{tr}} (\spuVec[e'])^\top x_{i,spu}^e,\text{ with }\alpha\ne 0,
$$
be a ``purely environmental'' feature. 
Then, 
\begin{enumerate}[leftmargin=*,topsep=0pt,label=(\roman*)]
\item\label{it:informative} A classifier based on $x_{pe}$ alone will achieve a vanishing error, if % an accuracy of $1-o_p(1)$ if 
$m\ll d_{spu} \min\{1,(\tau_s/\sigma_s)^2\}$. 
\item\label{it:hardness} For all $e\in\cE_{tr}$, denote the marginal distribution of $(y,x_{inv}^e,x_{pe}^e)$ by $p_{e,marg}$. Then, 
 w.p.~$\ge 1-e^{-m/18}$ w.r.t.~$\{\spuVec[e]\}$ there exists some $\tilde e\in\cE_{tr}$ s.t.~
\begin{equation}\label{eq:kl-bound}
\mrm{KL}\Bigl(\bigotimes_{e\in\cE_{tr}} p_{e,marg} \Bigm\Vert 
p_{\tilde e,marg}^{\otimes m} 
\Bigr)
\le \frac{256 m}{\sigma_s^2 d_{spu}}.
\end{equation}
Consequently, given a training sample with size 
\begin{equation}\label{eq:sample-size-threshold}
\max_{e\in\cE_{tr}} n_e \ll \sigma_s^2 d_{spu} / m,
\end{equation}
\underline{no} statistical test with a size of % a false positive rate of 
$o(1)$ could reject $x_{pe}$ as a non-invariant feature w.p.~$\ge o(1)$. 
\end{enumerate}
%    \item when $m\ll d_{spu} \min\{1,(\tau_s/\sigma_s)^2\}$, the ``purely environmental'' predictor $\textstyle
%\tilde f(x) := \tilde\theta^\top x, \text{ with }
%\tilde\theta \propto (0, \sum_{e\in\cE_{tr}}\spuVec[e]), 
%$
%can achieve fast vanishing error across $e\in\cE_{tr}$, and become an approximate solution to % \eqref{eq:irm} and 
%\eqref{eq:gro} and \eqref{eq:irm}; % and allows it to outperform invariant predictors by a large margin.
%\item\label{it:hardness} when $m\ll d_{spu} \sigma_s^2$, the ``purely environmental'' feature 
%$
%x_{pe} := \tilde\theta^\top x
%$ will satisfy, with high probability, %will become statistically indistinguishable from an invariant feature; with high probability we have
%\begin{align*}
%\!\!&\phantom{=}\max_{e,e'\in\cE_{tr}}\KL{p_e(y\mid x_{inv})}{p_{e'}(y\mid x_{inv})}  \\ 
%\!\!&\le\max_{e,e'\in\cE_{tr}}\KL{p_e(y,x_{pe},x_{inv})}{p_{e'}(y,x_{pe},x_{inv})} \to 0,
%\end{align*} 
%implying that $x_{pe}$ becomes hard to distinguish from invariant features for \emph{any} statistical test given finite data.
%\end{enumerate}
\end{lemma}
\begin{proof}
See the supplementary material. 
\end{proof}
The above result highlights the hardness of OOD generalization in high dimensions. It shows in the $m\ll d_{spu}$ regime the existence of a spurious feature that has an arbitrarily high predictive power across $e\in\cE_{tr}$, yet can be \emph{indistinguishable from invariant features} given finite samples. 
In reality, the sample size threshold will be much higher than \eqref{eq:sample-size-threshold}, since for features learned from finite data \emph{it is only valid to % we can only % afford to 
test for approximate invariance}; see the supplementary material for a detailed discussion.%, which amounts to more significant deviations from \eqref{eq:kl-bound}.
\footnote{
% As a more extreme example, 
It is also clear from the proof that 
if $\{\spuVec[e]\}$ are exactly orthonormal, indistinguishability will hold for all finite $n_e$.
} 
It should be noted that quantitatively similar results do not always hold, across all linear models: \citet{chen2022iterative} showed that under certain data generating processes, identification may become possible when $m=\cO(\log d)$. 
Still, % such a requirement this can still be a difficult requirement, and 
it remains concerning that such a pathology arises from a seemingly benign setting, with i.i.d.~training environments and $x_{pe}$ constructed by a simple averaging. % operation.
Also note that while past works have studied adaptation based on unlabeled test samples \citep{zhang2021adaptive}, such an approach would be ineffective in this setup, since the input has the same distribution across all environments. 

We note that multiple %mechanisms exist for 
mechanisms exist that may explain % the emergence of 
the hardness of OOD generalization, %learning under spurious correlations, 
and the possible (in-distribution) \emph{performance gap} between the invariant and non-invariant predictors: 
they may be inherent to the data distribution %generating/collection process 
as demonstrated above, or they can arise from % arise from undesirable inductive bias 
inappropriate model specifications, which may lead  % the model specification being not fully appropriate, leading 
to the memorization of data \citep{sagawa_investigation_2020}, undesirable margin-maximization behavior \citep{nagarajan2020understanding,wald2023malign}, or simply a larger approximation error for the invariant predictors. 
% However, as demonstrated by our point \ref{it:hardness} above, there are at least some settings % in some scenarios 
% Our method will not rely on assumptions on the specific cause of spurious correlations, although part of our analysis restricts to a certain setting. What is important is that 
We take an agnostic view to the cause, % of spurious correlations, 
but stress the ubiquity of hard-to-learn % OOD 
problems: as exemplified by claim~\ref{it:hardness} above, there are many scenarios where 
% it is unwise to expect 
generalization to completely unseen environments is fundamentally difficult. Instead, we may have to take a step back, and seek additional information about the target environment. 

\section{Adapting to Environment Shift with Constrained Bayesian Models}\label{sec:method}

% As we have seen, OOD generalization given few training environments can be fundamentally difficult, yet such problems often arise in applications. However, 
In many applications, it is possible to collect a handful of labeled samples from the test environment before deploying the model; for example, such samples may come ``for free'' if the deployment process involves first evaluating the model in the test environment. % , in which case we can set out a few samples from the evaluation set. 
In light of the % aforementioned 
inherent difficulties of generalization to unseen environments, 
it is reasonable to % consider the possibility of using 
study the use of such samples to adapt our model to the shifted environment. 

\subsection{Why not (generalized) Bayes?}\label{sec:gb}

Before presenting our method, % however, 
let us first consider a na\"ive alternative which employs as the prior for adaptation 
% It may appear that (generalized) Bayesian updating provides a natural solution to this problem; we would then 
a (generalized/Gibbs) posterior from training data, % as the prior for adaptation, 
which is then updated with samples from the test environment. 
Let $\cD_{ad} := \{(x^*_i,y^*_i)\sim P_*:i\in [n_*]\}$ denote the \emph{adaptation samples}, 
and $\theta\in\Theta$ denote the parameters % in a predictor %
of a predictor $f_\theta$. 
The updated posterior is then 
$$
p_{GB}(d\theta\mid \cD_{tr}, \cD_{ad}) \propto 
    \pi(d\theta) e^{-\cL(\theta;\cD_{tr})} \prod_{i=1}^{n_*} e^{-\ell(y_i^*, f_\theta(x_i^*))}.
$$
In the above, 
the ``initial prior'' $\pi$ represents our subjective belief before seeing any data, 
$\cL(\theta;\cD_{tr})$ can be any \textbf{properly scaled} training objective,  
and $\ell(y_i^*, f_\theta(x_i^*))$ denotes an arbitrary loss. 
% For example, 
With $
\ell(y_i^*,f_\theta(x_i^*)) \gets -\log p(y_i^*\mid f_\theta(x_i^*)), ~
\cL(\theta;\cD_{tr}) \gets \sum_{e\in\cE_{tr}}\sum_{i=1}^{n_e} \ell(y_i^e,f_\theta(x_i^e))
$
we recover the standard Bayesian posterior, while using \eqref{eq:gro} or \eqref{eq:irm} for $\cL$, or using a different % choices of 
$\ell$, will lead to different generalized posteriors \citep{zhang2006varepsilon,bissiri_general_2016}. Note that this generalized posterior can also be understood from a variational perspective with proper posterior regularization~\citep{zhu2014regbayes}.

Importantly, 
in (generalized) Bayesian modeling, the scale of $\cL$ should be proportional to, or at least increasing w.r.t.~the training sample size, 
as otherwise the ``adaptation-time prior'' $
\pi_{ad,GB}(d\theta) \propto \pi(d\theta) e^{-\cL(\theta;\cD_{tr})}
$ would be equivalent to the original $\pi$, rendering the training data useless. With an additive $\cL$ such as in ERM or \eqref{eq:irm}, the linear scaling is also desirable because it allows us to maintain a coherence property of sequential Bayesian updates \citep{bissiri_general_2016}. 

It is precisely this necessary scaling that makes the generalized Bayesian approach unsuitable %in our setting. 
for our adaptation goal. 
The problem is that in many OOD problems, there exists a small but non-negligible gap between the in-distribution performance of the invariant predictor and a non-invariant predictor, as we discussed at the end of \Cref{sec:bg};  
% such a \emph{performance gap} may be inherent to the data generation/collection process, or it may arise from inappropriate inductive bias in our model. 
and such a gap gets amplified by the scaling of the objective:
\begin{example}%[Bayesian adaptation in the presence of performance gap]
    \label{ex:2p}
As a pedagogical example, consider a classification task with $\ell$ being the 0/1 loss, $n_e\equiv 10^5$, and a two-point prior $\pi$ supported on the invariant predictor and a non-invariant predictor: $
\pi = \mrm{Unif}\{\theta_{non-inv}, \theta_{inv}\}, 
$ where 
\begin{align*}
R_*&(\theta_{non-inv}) - R_*(\theta_{inv}) \ge 0.99, \\ 
\min_{e\in\cE_{tr}}&(\hat R_e(\theta_{inv}) - \hat R_e(\theta_{non-inv})) \ge 0.01. 
\end{align*}
(Note the shorthand notation $R_{(\cdot)}(\theta) := R_{(\cdot)}(f_\theta)$, and $R_*$ denotes the population risk on the test environment.) 
% (As we have discussed, such a gap on $\hat R_e$ can occur easily, for example for linear models with $m \ll d_{spu}$.) 
Let $\cL$ be scaled by $n_e$. Then we have $
\frac{\pi_{ad,GB}(\{\theta_{non-inv}\})}{\pi_{ad,GB}(\{\theta_{inv}\})} = e^{10^5\times 0.01}
$ for the adaptation-time prior, and the log posterior mass ratio is approximately 
$
10^3 - 0.99 n_*.
$ Therefore, 
even though $\theta_{non-inv}$ % the non-invariant predictor 
has catastrophic performance on the test environment, it would take more than $10^3$ adaptation samples for $p_{GB}$ to concentrate to the right parameter.\footnote{
Note that while the example concerns generalized Bayesian posteriors, a similar pathology exists for the respective point estimators, due to the exponential concentration of the loss functions. 
}
\end{example}

While it is certainly possible to alleviate this issue with more heuristics, e.g., by switching to a smaller scaling, %more slowly-growing scaling, 
it is difficult to determine a sensible scheme that facilitates efficient adaptation; a slower scaling also discounts the training data ``as a whole'', making them less useful for % modeling 
the invariant features, and for identifying part of the spurious correlations that could have been identified from training data. % \emph{Fundamentally, }
The awkward situation reflects the inherently different roles of 
training and adaptation samples, % in out-of-distribution tasks, 
which necessitates a different treatment for the distinct forms of evidence they provide.

\subsection{Constrained Bayesian modeling} 

In light of the pathological inefficiency of the generalized Bayesian approach, we propose an alternative which % uses % 
is to use the training environments to define constraints. % suppose we know of an upper bound for the risk of the invariant predictor, 
Concretely, let $\rho \ge R_{e}(f_{inv})$ be a prespecified upper bound for the risk of the invariant predictor. % of interest. 
We define our predictive distribution using the constrained posterior 
\begin{align*}
p_{C}(d\theta\mid\cD_{tr},\cD_{ad}) &\propto\,\, \pi(d\theta) 
    \mbf{1}_{\{\theta\in\cC_{tr}\}} \prod_{i=1}^{n_*} e^{-\ell(y_i^*, f_\theta(x_i^*))}, \\ 
\text{where}~~~ \cC_{tr} &:=
    \bigl\{\theta: \max_{e\in\cE_{tr}}\hat R_e(f_\theta) \le \rho + \varepsilon_n
    \bigr\}
\end{align*}
is the constraint set, 
and $\varepsilon_n\to 0$ covers the small sampling error $
|\hat R_e(f_{inv}) - R_e(f_{inv})|
$ so that we can have 
$\theta_{inv}\in\cC_{tr}$ with high probability.\footnote{For subgaussian loss we can choose $\varepsilon_{n,e}\propto n_e^{-1/2}\sqrt{\log m}$.} 
In many applications we have 
knowledge of a good choice for $\rho$, due to the implicit assumption that the %there exists an 
invariant predictor has an acceptable performance;  
e.g., in classification problems where the performance gap between the invariant and non-invariant classifiers can be attributed to various types of label noise, 
%\footnote{\zw{discuss eaxmples here or in Sec 2?}} 
we can often upper bound the noise level based on our 
domain knowledge. It is also possible % in principle 
to utilize less reliable sources of information about $\rho$, by viewing $\rho$ as a model parameter and equipping it with a prior. Alternatively, we may simply % , and conducting Bayesian inference over it.
set $\rho$ to be larger than the risk of the ERM to achieve a better trade-off between in-distribution and OOD performance; this approach will be evaluated in \cref{sec:exp-db}. 

When $\rho$ is small, any $\theta\in\cC_{tr}$ will correspond to an approximate optimum for \eqref{eq:gro}. Thus, 
\emph{the constrained posterior is % can be viewed as 
a natural generalization of % a relaxed version of 
GDRO}, and will not perform significantly worse, which is useful if the training data turns out to be informative. 
In the underidentified regime, the constrained posterior allows for more efficient adaptation, by relaxing the optimization problem and modeling the uncertainty in training data. % by tracking all predictors that are possibly useful in view of the data. 
% In contrast to 
Comparing with the na\"ive Bayesian approach, % discussed before, 
the constrained posterior is based on an adaptation-time prior $\pi_{ad,C}(d\theta)\propto \pi(d\theta) \mbf{1}_{\theta\in\cC_{tr}}$ that does not introduce additional weighting % further discrimination 
to models in the constraint set; %, in contrast to % the adaptation-time prior 
% $\pi_{ad,GB}$ in $p_{GB}$. 
% the (generalized) posterior. % $p_{GB}$. 
this allows us to avoid the pathological behavior of the former: % in the $m<d_{spu}$ regime; 
returning to Example~\ref{ex:2p}, we can see that the constrained posterior only requires $\cO_p(1)$ samples to converge to the correct prediction. 

\subsection{Algorithm Implementation}

We draw approximate samples from the constrained posterior using a simple algorithm that augments Langevin Monte Carlo (LMC) with line search: at each iteration, we choose within a prescribed range 
the largest step-size s.t.~the LMC update could stay in the constraint set. The process is listed as \Cref{alg:main}. 
We run multiple LMC chains in parallel, and use the obtained samples $\{\theta_K^{(j)}: j\in [J]\}$ to define the % mean 
predictor 
$
\tilde p_{C}(y^*\mid x^*) = \frac{1}{J}\sum_{j=1}^J p(y^*\mid f_{\theta_K^{(j)}}(x^*)).
$

\begin{algorithm}[h]
\caption{Approximate inference for the constrained posterior.}\label{alg:main}
\begin{algorithmic}[1]
    \Require{Training and adaptation samples $(\cD_{tr}, \cD_{ad})$, loss $\ell$, prior $\pi(d\theta)$, $K,\rho,\varepsilon_n,\varepsilon_b>0,\{\bar\eta_k:k\in[K]\}$}
    \Ensure{Approximate sample $\theta_K\sim \tilde p_C\approx p_C$}
\State initialize $\theta_0$ using, e.g., ERM on $\cD_{tr}$ \Comment{{\color{gray}proper choices for $(\rho,\varepsilon_n)$ will ensure $\theta_0\in\cC_{tr}$}}
\For{$k\gets 1,\ldots,K$}
    \State draw $z_k\sim \cN(0, I)$
    \State $g_k \gets \nabla_\theta \sum_{i=1}^{n_*}\ell(y_i^*,f_{\theta}(x_i^*)) |_{\theta=\theta_{k-1}}$
    \State $
    \theta_k \gets \theta_{k-1} - \eta_k g_k + \sqrt{2\eta_k} z_k,
$
where $\eta_k\in [0, \bar\eta_k]$ is the largest number s.t.~$\theta_k\in\cC_{tr}$ \Comment{{\color{gray}$\eta_k$ is determined (up to an error of $\varepsilon_b$) using binary search}}
\EndFor
\State \Return $\theta_K$
\end{algorithmic}
\end{algorithm}
Intuitively, the algorithm can be viewed as simulating %implements a crude simulation for 
a reflected Langevin equation \citep{lions1984stochastic,bubeck2018sampling}, which is the constrained counterpart to the standard Langevin dynamics. Note that 
refined numerical schemes exist if we can compute the boundary of $\cC_{tr}$ efficiently \citep{bubeck2018sampling,sato2022convergence}, which is possible in settings like linear models with a convex $\ell$. Alternative constrained sampling algorithms, such as \citet{zhang2022sampling}, can also be utilized; 
we opt for \cref{alg:main} merely for its simplicity. If needed, 
we can improve its computational efficiency through standard means, by introducing preconditioning, stochastic gradients, or by replacing the training set with a uniformly random or curated subset \citep[e.g.,][]{bachem2017practical}. %, or by introducing preconditioning to the dynamics. 


\section{Theoretical Analysis}\label{sec:theory}

We have motivated our method by connecting it to a relaxation of GDRO, and by considering its small-sample behavior in simple settings. % such as \cref{ex:2p}. 
We now provide further justifications, by showing that on a family of linear models, constrained modeling in general can improve the sample efficiency even when the adaptation sample size is large. 

\paragraph{Analysis setup} %\todo{add lax?}
We consider a regression setup with data generated as follows:
\begin{align*}
\spuVec[e] &\sim \cN(0, d_{spu}^{-1} I), ~
\bx^e_i = 
    M\begin{bmatrix}\bx^e_{inv,i}\\ \bx^e_{spu,i}\end{bmatrix}\sim \cN(0, I), \\
\by^e_i&\sim \cN(\invVec^\top \bx^e_{inv,i} + (\spuVec[e])^\top \bx^e_{spu,i}, \sigma_y^2).
\numberthis\label{eq:lg-dgp}
\end{align*}
In the above, $e\in\cE_{tr}$ indexes the training domain, 
$\invVec$ is an arbitrary, fixed vector with norm $\cO(1)$, 
$\bx^e_{inv,i}\in\RR^{d_{inv}}, \bx^e_{spu,i}\in\RR^{d_{spu}}$ are the invariant and spurious features, 
and $M$ is a mixing matrix assumed to be invertible and well-conditioned. % We write $d_{spu} := \dim\spuVec[e], d = d_{inv} + d_{spu}$. 
Test data $(\bx^*,\by^*)$ is generated similarly using $\spuVec[*]$ in place of $\spuVec[e]$, which we assume is an \emph{arbitrary, fixed} vector. 
We use the square loss $\ell(s,t) = (s-t)^2$, and assume 
access to infinite training samples % from each of the training domains 
for simplicity. The invariant predictor is parameterized by $\invParam = M^{-\top}(\invVec, 0)$. 

% We assume access to $m$ training domains, and $n_*$ adaptation samples from the test domain. 

As discussed in Section~\ref{sec:bg}, 
on similar linear problems, identification of $\invParam$ may require $m=\cO(d)$ domains. To understand the necessity % of $m=\cO(d)$ 
on this %particular 
setup, observe that when $m\ll d$, the vectors $\{\spuVec[e]:e\in\cE_{tr}\}$ are approximately orthonormal \citep[Ch.~6]{wainwright2019high}, and that when they are exactly orthonormal, ERM and GDRO will both identify the parameter $\tilde\theta = M^{-\top} (\invVec, \frac{1}{m}\sum_{e\in\cE_{tr}} \spuVec[e])$ which leads to 
$$
R_e(\tilde\theta) \equiv \sigma_y^2 + \frac{m-1}{m} \le \sigma_y^2 + 1 \equiv R_e(\invParam), ~~\forall e\in\cE_{tr}.
$$ 
\eqref{eq:irm} learns the same $f_{\tilde\theta}$, which can fulfill its constraint using 
% $f_{\tilde\theta}$ %factorizes as %
%  admits the factorization $f_{\tilde\theta} = w\circ\Phi$ for 
$
\Phi(x) = (\invVec^\top M^{-1} x, \frac{1}{m} \sum_{e\in\cE_{tr}}(\spuVec[e])^\top M^{-1} x).
$ % which fulfills the IRM constraints. % \eqref{eq:irm}. 
By the arbitrariness of $\spuVec[*]$, the predictor
$f_{\tilde\theta}$ may incur an arbitrarily high error on new environments. 
% Thus, we can understand that when $m\ll d_{spu}$, it is difficult to learn a predictor with performance guarantees without utilizing samples from the test domain.

\paragraph{Improved convergence of a constrained estimator}
We now present our analysis. For technical simplicity, we study a constrained \emph{point estimator}:
$$
\hat\theta := \argmin_{\theta\in \cC_{tr}\cap\Theta} \sum_{i=1}^{n_*} \ell(y_i^*,\theta^\top x_i^*), \text{where}~
\Theta := \{\theta: \|\theta\|_2 \le U\}
$$
parameterizes our hypothesis space, and $U >\|M\|^{-1}$ is a constant. We then have the following:

\begin{proposition}\label{prop:regr}
Suppose the data is generated as above, \emph{$\spuVec[*]\in\RR^{d_{spu}}$ is arbitrary}, and $\hat\theta$ is defined as above. Let $\bar f^*$ be the Bayes predictor on the test domain. Then there exist universal constants $c_1,c_2,c_3>0$ s.t.~when $n_*\ge 3d$ we have, with probability $\ge 1-n_*^{-9}$, 
\begin{align*}
R_*(f_{\hat\theta}) - R_*(\bar f^*) &\le 
c_1 \inf_{\theta'\in\Theta\cap\cC_{tr}}(R_*(f_{\theta'}) - R_*(\bar f^*)) + \epsilon_{n_*}^2  \\ 
&\le c_1(R_*(f_{\invParam}) - R_*(\bar f^*)) + \epsilon_{n_*}^2, 
\end{align*}
\begin{align*}
\text{where}~~\epsilon_n^2 &=  c_2\frac{\sigma_y d_{inv} + \log n_*}{n_*} + \\ &\hspace{2em} 
    c_3\sigma_y\min\biggl\{
    \sqrt{\frac{d_{spu}\log m}{n_* m}}, \frac{2^{-m/d_{spu}} d_{spu}}{n_*}
    \biggr\}.
\end{align*}
\end{proposition}
\begin{proof}
The full proof is in supplementary material. Its main idea is that for any $\theta=M^{-\top}(\beta_i,\beta_s)$ with $\beta_s\ne 0$, we have 
\begin{align*}
\PP_{\cD_{tr}}(\theta\in\cC_{tr}) &\le \min\{e^{-m d_{spu}\|\beta_s\|_2^2}, 2^{-m}\}.
\end{align*}
This allows us to derive high-probability bounds on the reduced complexity of $\Theta\cap\cC_{tr}$.
\end{proof}

Proposition~\ref{prop:regr} establishes an oracle inequality, which % quantifies the prediction error $R_*(\hat\theta)$ using an approximation error 
allows us to compare the % test 
performance of the constrained estimator with the invariant predictor. At the claimed probability, the unconstrained maximum likelihood estimate (MLE) that does not utilize 
training data achieves an estimation error of
$$
\epsilon_n'^2 = c_4\frac{\sigma_y(d_{inv} + d_{spu}) + \log n}{n}.
$$
Therefore, we can see that the constrained formulation 
(at least) improves the efficiency in estimating the spurious component of the model. The improvement is most interesting when $d_{spu} \gg d_{inv}$; in particular, observe that % two types of benefits can be read out:
\begin{enumerate}[leftmargin=*,topsep=0pt,label=(\roman*)]
    \item When $n_*\asymp d_{spu}$, unconstrained MLE will fail to converge as we have $\epsilon'_n\asymp 1$. In contrast, the constrained estimator satisfies $\epsilon_n^2 = \tilde\cO(m^{-1/2})$. This is useful on high-dimensional problems when we only have a moderate number of environments, i.e., when $1\ll m \ll d_{spu}$: % as we mentioned, in this regime IRM and GDRO based on training data alone will also fail to converge. 
    given the previous discussion on % in light of the aforementioned behaviors of 
    IRM and GDRO, we can see that \emph{using either the training or adaptation samples alone cannot guarantee convergence} in this regime, which demonstrates the efficacy of constrained modeling. 
    \item\label{it:dof} Even as $n_*\gg d_{spu}$ becomes larger, the training data still remains useful, as it improves the estimation error for the spurious component by a factor of $2^{-m / d_{spu}}$. When $m/d_{spu}$ is small, % compared to $d_{spu}$, 
    the expansion 
% $$
% \frac{2^{-m / d_{spu}} d_{spu}}{n} \approx \frac{(1- m/d_{spu}\log 2)d_{spu}}{n}% = \frac{d_{spu}-m}{n}
% $$ 
$
2^{-m / d_{spu}} d_{spu} \approx (1- m/d_{spu}\log 2)d_{spu}% = \frac{d_{spu}-m}{n}
$
shows that each % training 
environment roughly removes one ``degree of freedom'' from the adaptation process. 
\end{enumerate}

Our choice to analyze high-dimensional linear problems follows previous works in this area \citep[e.g.,][]{arjovsky2019invariant,sagawa_investigation_2020,rosenfeld_risks_2021}. % , although 
The linear setup is also justified by the observation that the last layer of DNN models often retains sufficient information about the invariant features \citep{kirichenko_last_2022}, even though our algorithm is not restricted to linear models. 
% For nonlinear problems which are common in practice, such results can be viewed as justifying the practice of last-layer fine-tuning as they apply to the feature space.
The regression setup is adapted from \citet{arjovsky2019invariant}; 
our assumption of i.i.d.~training environments is stronger. 
However, our setup remains non-trivial, as existing domain generalization approaches still underperform the invariant predictor by a notable margin. (Also note that we did not impose any restrictions on the test environment.) 
It may be possible to demonstrate similar sample efficiency gains in other scenarios, but they need to be established on a case-by-case basis. 
% ; the supplementary material includes additional discussions on a few such setups \todo{write}. 
% e.g., for the classification task in Ex.~\ref{ex:classif} we can establish results similar to \ref{it:dof} above. However, they need to be established on a case-by-case basis. 
Another limitation % in our analysis 
is that for simplicity, we did not analyze the efficiency gain in estimating the invariant component; 
numerical simulations 
will provide a more complete understanding on the benefits of our method. 


\section{Related Work}\label{sec:related-work}

Our work is motivated by the practical need of deploying machine learning models to OOD environments, given data from a % diverse but 
small collection of training environments and assuming the presence of spurious correlations. 
Our setup is thus connected to, but different from, a few lines of works on % that studies % out-of-distribution generalization, 
% out-of-distribution generalization, 
spurious correlations and/or transfer learning; given the vast literature, we refer readers to \citet{wilson_survey_2020,wang_generalizing_2022,jiang_transferability_2022}
for a detailed review. 
Comparing with most works on %out-of-distribution generalization and 
spurious correlations, we do not assume % it is possible at all to recover an invariant predictor solely based on the training data. 
the training data contains sufficient information for learning an invariant predictor, a common situation as discussed in \Cref{sec:bg}. 
Comparing with the transfer learning literature, we have a specific focus on spurious correlations, 
as is also noted in \citet{kirichenko_last_2022}. 

% Most related are the works of 
The recent works of 
\citet{kirichenko_last_2022,ye_freeze_2022,lee_diversify_2022} operate in a similar underspecified setting and also 
utilize adaptation samples, but all of them assume a single training environment. 
We have demonstrated how environment annotations % the availability of multiple training environments 
can be utilized to improve adaptation performance. Empirically, our method also outperforms the adaptation procedures in \citet{kirichenko_last_2022,lee_diversify_2022} in a multi-environment setup (\cref{sec:exp-db}). 
Still, our general idea may also be interesting for single-environment problems, for which we may define constraints using alternative characterizations for the invariant predictor (e.g., as in IRM) to address the issue of possible performance gaps.
% In such cases, 
It 
may be interesting for future work to % for future work to study uncertainty modeling for adaptation in single-environment problems, and to bring 
combine the development in \citet{ye_freeze_2022,lee_diversify_2022} with our framework. 
% Comparing to standard transfer learning, we assume the presence of spurious features which cannot 

\citet{lin2022bayesian,lee_diversify_2022} have investigated % the benefits of 
uncertainty modeling for OOD generalization and are broadly related to our work in this aspect, but both have a different focus: 
\citet{lin2022bayesian} on finite-sample estimation error of the IRM objective, and \citet{lee_diversify_2022} on the computational cost of Bayesian inference. As such, neither work addresses the issue of potential performance gaps between the invariant and non-invariant predictors, which, as we have discussed in \Cref{sec:gb}, requires a careful treatment.
Finally, \citet{wald2023malign} studied OOD learning in the presence of similar performance gaps, but focused on scenarios where the invariant predictor is \emph{identifiable} by alternative strategies (e.g., by matching the class-conditional distributions of features across environments). As we discussed in \Cref{sec:bg}, identifiability is not always a realistic assumption.


\begin{figure*}[t]
%%% g47:notebooks/notebooks/drodg-sim-proc.ipynb
  \centering
  \includegraphics[width=\linewidth,clip,trim={0 0.25cm 0 0.25cm}]{figs/fixdist-6k-20.pdf}
  \caption{Synthetic experiment: test error vs.~adaptation sample size for classification and regression. We report the median and $(20\textrm{th},80\textrm{th})$ percentile across 32 independently sampled adaptation sets. Plots are slightly shifted for visibility.}\label{fig:sim-main}
\end{figure*}

\section{Experiments}\label{sec:exp}

In this section we evaluate our method empirically, on synthetic data, benchmark datasets and a real-world application. 
Code for the experiments can be found at \url{https://gitee.com/mindspore/models/tree/master/research/cv/ConstrainedBayesian}.

\subsection{Synthetic Experiments}\label{sec:exp-synth}

For the synthetic experiments, we consider the classification setup in \Cref{ex:classif} and the regression setup in \Cref{sec:theory}. 

\paragraph{Experiment setup} The data generating processes are instantiated as follows: 
% For classification, we set 
we set 
$\sigma_i = 7.5, \sigma_s = 3, \tau_s = 1$ for classification, and $\sigma_y=0.5$ for regression. 
For both sets of experiments we use $m=3, n_e = 6000, d_{inv} = 20, d_{spu} = 50$, 
$\invVec\sim\cN(0, 4\sigma_v^2 I)$, and $
\spuVec[*] := \frac{1}{2m} \sum_{e\in\cE_{tr}} \spuVec[e].
$
% where $\sigma_v = 1$ for classification and $1/\sqrt{d_{spu}}$ for regression. 
% The parameter $\nu\in [0,1]$ interpolates the test environment from an adversarial choice that maximizes the effect of spurious correlations ($\nu=1$), to being identically distributed to the training environments ($\nu=0$). 
The supplementary material includes additional experiments covering different parameters and choices of $\spuVec[*]$. %, as well as an additional setting with correlation between the invariant and spurious correlations. 

We employ a Bayesian linear model with a Gaussian prior; the prior variance is set to % determined by
match the norm of the empirical risk minimizer. We use a correctly specified likelihood, i.e., normal for regression and logistic for classification. For our method, 
we define the constraint set using the 0/1 loss for classification and the square loss for regression, and set 
$
\rho + \varepsilon_n := \max_{e\in\cE_{tr}}\hat R_e(\invParam) + \delta, 
$ where 
$\delta\in \{0.05, 0.1, 0.4\}$ models our imprecise knowledge about $R_e(\invParam)$.

We compare our method (%denoted as 
\texttt{CBLR}-$\delta$) to the following: % baselines: 
\begin{itemize}[leftmargin=*,topsep=0pt]
  \item \texttt{BLR}: Bayesian inference using the same Gaussian prior, and only the adaptation samples for the likelihood. %linear/logistic regression using adaptation samples only.
  \item \texttt{BLR-LC}: the (generalized) Bayesian approach discussed in \Cref{sec:gb}. The method involves scaling the empirical risk by a factor $N$; in the text we consider $N := n_e$, which corresponds to standard Bayesian modeling, and $N := n_*$, a heuristic that may allow for faster adaptation. 
  \item \texttt{BLR-Prior-}$\alpha$: another heuristic approach that replaces the prior mean with the empirical risk minimizer $\hat\theta_{ERM}$ on training data, and scales the prior variance by $\alpha^{-2}$. % In the text we report results for $\alpha := 3$.
\end{itemize}
For all baselines, we run the Metropolis-adjusted Langevin algorithm (MALA) using $10^4$ iterations and $50$ parallel chains. Based on the MALA acceptance rate, we set the step-size to $\bar\eta_{k,u} \equiv 0.016 / n_*$. 
For our method we set the step-size upper bound to $\bar\eta_k := \bar\eta_{k,u}/4$ and use $4\times 10^4$ iterations. 
The Markov chains are initialized at $\hat\theta_{ERM}$ for our method and \texttt{BLR-Prior}, and the minimizer of an interpolated empirical risk for \texttt{BLR-LC}. 
We also report the performance of $\hat\theta_{ERM}$ for reference. 

\paragraph{Results and discussion}
The results are plotted in \cref{fig:sim-main}. % As we can see, 
Our method demonstrates competitive performance across all choices of adaptation sample size, 
and is reasonably insensitive to the choice of the performance bound hyperparameter. 
% as long as $\rho$ is not significantly overestimated, i.e., when $\delta$ is not too large. 
All baselines have less reliable performance: 
\texttt{BLR} performs notably worse when $n_*$ is small. 
Adaptation of the standard posterior (\texttt{BLR-LC-N\_train}) is extremely slow since we have $n_* \ll n_e$, in line with our discussion in \Cref{sec:gb}. 
With % When switching to 
the heuristic scaling in \texttt{BLR-LC-N\_adapt}, the performance becomes better at moderate sample sizes, but still not as good at smaller or larger $n_*$; the former is because the variance of the adaptation likelihood dominates, and the latter may be related to % the method having 
an asymptotic bias. 
\texttt{BLR-Prior} demonstrates generally worse performance with $\alpha=3$. 
The supplementary material includes results for additional choices of $\alpha$, % , as well as different scalings for \texttt{BLR-LC}, 
which are omitted from \cref{fig:sim-main} for visibility. We find that a larger $\alpha$ improves the performance on regression, but at a significant cost for classification performance. % For \texttt{BLR-LR}, switching to 

Importantly, \emph{none of the baselines consistently match the performance of our method}, across both problems and all choices of $n_*$. Moreover, they involve hyperparameters that are difficult to determine \emph{a priori}, in contrast to our method where the hyperparameter $\rho$ has a clear interpretation. Also note that only our method has the appealing property of never underperforming the ERM baseline. 

\subsection{Benchmark Datasets}\label{sec:exp-db}

We now turn to two datasets adapted from the DomainBed benchmark \citep{gulrajani_search_2020}: 
Colored MNIST \citep{arjovsky2019invariant} and PACS \citep{li2017deeper}. 

\paragraph{Background and setup} The PACS dataset consists of 9991 images from 4 domains and 7 categories, with images from different domains having different stylistic features. 
The Colored MNIST % describes a binary classification task with label noise and anticausal features: 
dataset is defined as follows:
let $(\bar x_{e,i},\bar y_{e,i})\in \RR^{784}\times \{0,\ldots,9\}$ be an MNIST sample, and sample 
$
y_i^e := \mbf{1}\{\bar y_{e,i} < 5\} \oplus \mrm{Bern}(\alpha), 
c_i^e := y_i^e \oplus \mrm{Bern}(\beta_e), 
$
where $\beta_e \in \{0.1,0.2,0.9\}$ depends on the environment, and $\oplus$ denotes the XOR operation. The input 
$x_i^e$ is obtained by coloring $\bar x_{e,i}$ to green or red based on $c_i^e$. The original dataset \citep{arjovsky2019invariant} uses $\alpha := 0.25$, implying an accuracy of $75\%$ for the invariant predictor. As we are interested in scenarios where the invariant predictor is more performant,  
\emph{we use a modified value of $\alpha := 0.1$}.  % and refer to the modified dataset as \texttt{Colored MNIST-0.1}. 
We note the different natures of the two datasets: by construction, on Colored MNIST there is an unavoidable trade-off between the in-distribution and test performance, whereas on PACS there may exist an invariant predictor with near-perfect accuracy, even though its recovery can still be hindered by the inductive bias of a neural network model. 

We compare the proposed method with \texttt{BLR} and \texttt{BLR-LC} baselines in the preceding section, as well as the \texttt{DivDis} method by \citet{lee_diversify_2022}. \texttt{DivDis} builds a finite collection of candidate predictors based on a diversity criterion, and selects the predictor with the best performance on the adaptation samples. 
\emph{All adaptation algorithms are applied to the last linear layer of a ConvNet model}, which is initialized at the ERM. 
% For all methods, 
% We use the ConvNet models described in \citet{gulrajani_search_2020} and initialize them at the ERM; %  with hyperparameters the training-domain validation procedure therein. 
% we then \emph{freeze 
% the feature extraction layers and apply the adaptation algorithm to the last linear layer. 
% } 
Note this is not a limitation of our algorithm (or the baselines), but is adopted for simplicity; still, this strategy is also advocated by recent works such as \citet{kirichenko_last_2022}, and the \texttt{BLR} baseline recovers the procedure in their work. % in this setting. 

% Somewhat
Contrary to many applications, on these datasets it is unclear whether there exists a near-perfect invariant predictor %with a near perfect accuracy 
within our hypothesis space. In such scenarios, it is not necessarily reasonable to assume good prior knowledge of a performance lower bound for the (best) invariant predictor. Therefore, to ensure a realistic setup, we determine the lower bound hyperparameter in our method \emph{based on the ERM}, using a possibly misspecified choice of $\rho+\epsilon_n := \max_{e\in\cE_{tr}} \hat R_e(\hat\theta_{ERM}) + \delta$. 
We report the results for $\delta=0.1$ in the main text and defer the results for alternative choices to the appendix. 

We use the ConvNet architecture in \citet{gulrajani_search_2020} and follow the training protocol therein. 
The number of learnable parameters is thus $(1024+1)\times 2$ for Colored MNIST, and $(2048+1)\times 7$ for PACS.  
For the \texttt{DivDis} baseline, we implement the method on the same adaptation samples, using 50 predictor heads; we vary its hyperparameters $(\lambda_{mi},\lambda_{reg})\in \{0.1, 1.0, 10\}^2$ and report the configuration with the best \emph{test} accuracy. 
The rest of the setup largely follows the preceding section and is deferred to the appendix. 

\begin{table}[!ht]
    \centering
    \caption{Average accuracy and a lower estimate of performance on the modified Colored MNIST dataset; the latter is defined as the $20\mathrm{th}$ percentile of accuracy across $20$ replications, for the worst train/test environment split. \texttt{CBLR} denotes the proposed method.
}\label{tab:cmnist}
\begin{adjustbox}{max width=\linewidth}
    \begin{tabular}{cccccc}
    \toprule
    $n_*$ & $0$ (ERM) & $4$ & $8$ & $16$ & $32$\\ \midrule
	BLR	& \multirow{4}{*}{$82.4$ / $71.5$} &  $82.5$ / $75.2$	&	$85.7$ / $80.9$	&	$87.4$ / $85.4$	& $88.3$ / $86.3$\\
    BLR-LC-$N_{\mrm{adapt}}$ &  &	                 $85.6$ / $80.5$	&	$86.3$ / $82.9$	&	$86.9$ / $83.9$	& $87.2$ / $84.8$\\	
    BLR-LC-$N_{\mrm{train}}$ &  &	                 $81.7$ / $67.9$	&	$81.9$ / $68.3$	&	$82.4$ / $70.1$	& $83.1$ / $72.7$\\	
        DivDis&  & $82.3$ / $70.7$	&	$82.3$ / $70.6$	&	$82.3$ / $70.7$	& $82.3$ / $70.7$\\
        CBLR&  & $85.7$ / $81.5$	&	$87.0$ / $85.0$	&	$87.6$ / $86.8$	& $88.1$ / $86.7$\\
        \bottomrule
    \end{tabular}
\end{adjustbox}
\end{table}

\begin{table}[!ht]
    \centering
    \caption{Average accuracy and a lower estimate of performance on PACS. The latter is defined as in \Cref{tab:cmnist}. 
}\label{tab:pacs}
\begin{adjustbox}{max width=\linewidth}
    \begin{tabular}{cccccc}
    \toprule
    $n_*$ & $0$ (ERM) & $16$ & $32$ & $64$ & $256$\\ \midrule
	BLR	& \multirow{4}{*}{$83.2$ / $72.6$} &$83.8$ / $70.1$ & $86.8$ / $76.4$ & $88.3$ / $79.4$ & $89.4$ / $80.6$\\
    BLR-LC-$N_{\mrm{adapt}}$ &  &	                $86.8$ / $77.8$ & $86.6$ / $76.4$ & $87.2$ / $77.6$ & $87.1$ / $77.2$\\	
    BLR-LC-$N_{\mrm{train}}$ &  &	                $85.0$ / $76.1$ & $85.1$ / $75.9$ & $85.2$ / $75.5$ & $85.6$ / $76.9$\\	
    DivDis & & $85.0$ / $77.6$  & $84.8$ / $77.1$ & $84.8$ / $76.8$ & $85.0$ / $76.9$\\
    CBLR&  &	$86.4$ / $77.6$ & $87.4$ / $78.6$ & $88.4$ / $79.9$ & $90.3$ / $83.7$\\
        \bottomrule
    \end{tabular}
\end{adjustbox}
\end{table}

\paragraph{Results and discussion} For space reasons, we only report aggregated results in the text, deferring full results to the supplementary material. \Cref{tab:cmnist,tab:pacs} present the average accuracy across environments, as well as a pessimistic performance estimate that provides intuition on % the performance in 
unfavorable scenarios; % e; 
the latter can be particularly important for domain generalization applications \citep{eastwood2022probable}. 
As we can see, our method demonstrates excellent performance across all settings. 
In contrast, \texttt{BLR} has unstable performance at small sample sizes, and \texttt{BLR-LC} becomes less competitive as sample size increases, which is consistent with the synthetic experiments. 
\texttt{DivDis} is generally less competitive on the modified Colored MNIST dataset, % whereas on PACS it demonstrates competitive performance at smaller sample sizes, but underperforms % the other baselines 
and on PACS at larger sample sizes, indicating insufficient coverage of its candidate solution set. 
A possible reason is that \texttt{DivDis} does not account for the performance gap between the invariant and non-invariant predictors, which is notably larger on Colored MNIST. However, 
its performance may also be improved if a larger number of unlabeled test samples are available and can be selectively labeled, as is done in the experiments of \citet{lee_diversify_2022}. 

On both datasets, there is a rapid improvement over the ERM baseline after a handful of adaptation samples, which is important because \emph{ERM is a strong baseline} on these benchmarks, having outperformed all algorithms tested in \citet{gulrajani_search_2020}. 
We note the slower improvement of worst-case performance for PACS is because one domain exhibits significant label shift, which is generally at odds with % label shift is generally incompatible with 
the assumption that an invariant predictor exists \citep{arjovsky2019invariant}. 
It is in principle possible to adapt our method to label shift scenarios, by redefining the constraint set to use a reweighted accuracy, but we will not explore this for simplicity. 


\subsection{Real-World Experiment}\label{sec:re}

Finally, we illustrate our method on a real-world application of out-of-distribution prediction. 

\paragraph{Background and setup}
The task concerns the classification of acoustic array data,
which are spatio-temporal signals that can be viewed as images. 
The input consists of % describes % class-dependent 
certain ``primary signals'' that induce approximately invariant conditionals, %  across environments, 
superimposed with environment-specific responses. 
The latter induce spurious correlations and are consistently picked up by % and proved to be easier to model using 
ConvNet models. 
There are 4 classes; we have data from 4 environments, each with $n_e\sim 10^5$ samples. 
Domain knowledge suggests that on class-balanced data, an invariant classifier should have an error rate lower than $5\%$.

Past experiments suggest that the training data do not contain sufficient information to guarantee OOD generalization: in leave-one-domain-out evaluation, a generalization gap of up to $20\%$ shows up, and off-the-shelf algorithms including IRM, GDRO and domain-adversarial training all fail to improve over ERM. Thus, test-time adaptation appears necessary. % In practice, up to 100 adaptation samples can be collected. 

The experiment setup largely follows the last subsection: we perform adaptation on the last linear layer, and conduct leave-one-domain-out evaluation with repeatedly sampled adaptation sets. We set $\rho=0.05$. 
Due to the large training sample size, we subsample $10^3$ samples from each environment in defining our constraint set, and set $\epsilon_n$ accordingly. (We find that the results are generally insensitive to the choice of $\rho+\epsilon_n\in [0.03,0.1]$.)
We compare with \texttt{BLR} and \texttt{BLR-LC}. 
For the latter, we experiment with scaling the training loss using a factor of $N\in \{1,2,4,8\}\times 100$, and provide an optimistic estimate for its performance by setting $N$ based on test performance. 

\newcommand{\sdd}[2]{$#1$ {\scriptsize $\pm #2$}}
\begin{table}[h]\footnotesize\centering
\caption{
    Results for \Cref{sec:re}. For each train/test domain split, we report the mean and standard deviation of test accuracy across $20$ trials. \texttt{LB} denotes an estimate of performance in unfavorable scenarios, defined as in \Cref{tab:cmnist}. 
}\label{tab:re}
\begin{adjustbox}{max width=\linewidth}
    \begin{tabular}{cccccc} \toprule
$e_*$ & $n_*$ & (ERM) & $20$ & $80$ & $320$ \\ \midrule  

\multirow{3}{*}{ 1 } &
BLR & \multirow{3}{*}{ 81.9 } 
	 & \sdd{ 83.5 }{ 1.3 } & \sdd{ 91.3 }{ 0.7 } & \sdd{ 94.1 }{ 0.3 }  \\
&BLR-LC	&& \sdd{ 86.9 }{ 1.7 } & \sdd{ 91.3 }{ 0.9 } & \sdd{ 93.6 }{ 0.3 }  \\
&CBLR	&& \sdd{ 86.7 }{ 1.4 } & \sdd{ 91.7 }{ 0.5 } & \sdd{ 93.9 }{ 0.2 }  \\
\midrule
\multirow{3}{*}{ 2 } &
BLR & \multirow{3}{*}{ 83.1 } 
	 & \sdd{ 85.6 }{ 2.5 } & \sdd{ 91.1 }{ 1.3 } & \sdd{ 93.3 }{ 0.1 }  \\
&BLR-LC	&& \sdd{ 89.2 }{ 0.2 } & \sdd{ 92.4 }{ 0.6 } & \sdd{ 93.8 }{ 0.1 }  \\
&CBLR	&& \sdd{ 89.2 }{ 0.1 } & \sdd{ 92.1 }{ 0.1 } & \sdd{ 93.9 }{ 0.1 }  \\
\midrule
\multirow{3}{*}{ 3 } &
BLR & \multirow{3}{*}{ 92.6 } 
	 & \sdd{ 91.9 }{ 1.3 } & \sdd{ 92.8 }{ 1.1 } & \sdd{ 95.3 }{ 0.1 }  \\
&BLR-LC	&& \sdd{ 93.2 }{ 0.1 } & \sdd{ 94.8 }{ 0.1 } & \sdd{ 95.5 }{ 0.1 }  \\
&CBLR	&& \sdd{ 94.0 }{ 0.1 } & \sdd{ 95.0 }{ 0.1 } & \sdd{ 95.7 }{ 0.1 }  \\
\midrule
\multirow{3}{*}{ 4 } &
BLR & \multirow{3}{*}{ 82.5 } 
	 & \sdd{ 88.9 }{ 1.8 } & \sdd{ 92.8 }{ 1.1 } & \sdd{ 96.0 }{ 0.3 }  \\
&BLR-LC	&& \sdd{ 87.8 }{ 1.1 } & \sdd{ 92.6 }{ 0.8 } & \sdd{ 96.3 }{ 0.5 }  \\
&CBLR	&& \sdd{ 86.2 }{ 1.3 } & \sdd{ 92.4 }{ 1.3 } & \sdd{ 96.3 }{ 0.2 }  \\
\midrule
\multirow{3}{*}{ LB } &
BLR & \multirow{3}{*}{ 81.9 } 
	 & $82.4$ & $90.0$ & $93.2$  \\
&BLR-LC	&& $85.4$ & $90.5$ & $93.4$  \\
&CBLR	&& $85.1$ & $91.3$ & $93.7$  \\
\bottomrule

    \end{tabular}
\end{adjustbox}
\end{table}

\paragraph{Results and discussion}
The results are shown in \Cref{tab:re}. As we can see, test-time adaptation delivers significant improvements over the ERM baseline, which has not been possible in the past experiments with domain generalization algorithms. 
Our method and \texttt{BLR-LC} have similar performance, whereas \texttt{BLR} has less stable performance at smaller sample sizes. 
Our method can be preferable, because its hyperparameter can be easily determined using domain knowledge in a principled way. % apart from the parameter $\rho$ which was determined by domain knowledge, it does not involve any tunable hyperparameter. 


\section{Conclusion}\label{sec:conclusions}

In this work we study the problem of %consider OOD learning in the underidentified regime. Motivated by the inherent difficulties of generalization given a relatively small number of environments, we turn to the problem of 
adaptation to distribution shift, given a small % but diverse 
collection of training environments and a handful of test samples. 
We reveal a pathological behavior of the standard Bayesian posterior % in the presence of certain performance gaps, 
and address it with % by proposing 
a 
constrained Bayesian formulation. %  which is connected to the group DRO method. 
We prove that constrained modeling may lead to sample efficiency gains in certain settings,  
and demonstrate the robust 
performance of our method on synthetic, benchmark and real-world tasks. 

Our work addresses OOD prediction in the underidentified regime, which can be inherently challenging. It is thus necessary to introduce 
additional information or assumptions. 
We note our core assumptions: the existence of an invariant predictor, % with a reasonable performance, 
some knowledge about its performance, and access to adaptation samples. 
While these assumptions are satisfied in many problems and can be relaxed to various extents, there are inevitably scenarios where alternative assumptions are more appropriate. 
It would be interesting future work to study adaptation and uncertainty modeling in such settings. 

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported by the National Key Research and Development Program of China (2020AAA0106302), NSFC Projects (Nos. 62061136001, 62076145, 62076147, U19B2034, U1811461, U19A2081, 61972224), Tsinghua Institute for Guo Qiang, and the High Performance Computing Center, Tsinghua University. J.Z. is also supported by the New Cornerstone Science Foundation through the XPLORER PRIZE.
\end{acknowledgements}

% References
\bibliography{wang_665}

\end{document}
