%!TEX root = ../paper.tex

\paragraph{Invariant Risk Minimization} Invariance as an indicator of causality was introduced by \citet{peters2015}, who outline the goal of seeking a subset of features that are causal for a target variable or label. The features are generated by structural equation models (SEMs), where interventions on different features create different environments. 
They suggest that with access to a sufficient number of independent environmental interventions, or environments, the invariant features can be recovered.
The IRM paradigm \citep{arjovsky2020invariant} applies this idea to learning causal features across a number of training domains. 
% They continue to use the linear generative model
% \abcomment{using `the'; not self contained exposition, generative model (or SEM) has not been discussed}, 
% and construct the original bilevel optimization problem: 
Because the bilevel optimization problem underlying IRM is computationally intractable, they propose the IRMv1 variant, which uses the gradient norm as a constraining penalty for invariance.

A number of followup works propose variants that implement the paradigm, including IRM games \citep{pmlr-v119-ahuja20a_games}, IRM with information bottlenecking \citep{ahuja_2021_irm_ib_bottleneck, liInvariantInformationBottleneck2022a}, risk extrapolation \citep{pmlr-v139-krueger21a}, and learning spurious features without environment index \citep{pmlr-v202-tan23b-tiva}.
% 2023 Neurips papers
% Spuriosity Didn’t Kill the Classifier: Using Invariant Predictions to Harness Spurious Features \citep{eastwood-spuriositydidnt-2023}
%  Locally invariant explanations\citep{dhurandhar-locallyinvariant-2023}
%  Invariant Learning via Probability of Sufficient and Necessary Causes \citep{yang-invariantlearningppsn-2023}
Theoretical works on the IRM paradigm largely analyze linear models \citep{arjovsky2020invariant, rosenfeld2020risks,wangProvableDomainGeneralization2022}, although analyses of nonlinear models exist to varying degrees of generality \citep{rosenfeld2020risks,lai_2024_tvmodel}. Some of these works also highlight simple failure cases of IRM \citep{rosenfeld2020risks, ahuja2020empirical}. 
The data generation model introduced by \citet{arjovsky2020invariant} has also been extended to overparameterized models \citep{zhouSparseInvariantRisk2022}, or to cover different types of environmental variables \citep{kaur2022modeling, rosenfeld2020risks}.



% \jdcomment{Move that remark discussing classification and regression here? classification examples are \citep{eastwood-spuriositydidnt-2023, rosenfeld2020risks, wangProvableDomainGeneralization2022}}
% \begin{remark}
% We remark that 

% \abcomment{we want to remark on how earlier work has considered classification version of such generative models, we can do that conditional Bernoulli or conditional multinomial models} 
% \end{remark}
% \paragraph{Multi-task learning}
% Other related works in meta-learning, Domain Adaptation \jdcomment{cite here}, transfer learning, have access to the environment indicator at testing time.
\paragraph{Domain Generalization}
IRM is closely related to other methods that tackle Domain Generalization (DG), which broadly targets good OOD generalization on unseen environments after training on more than one training domain. 
Similar lines of work include distributionally robust optimization \citep{sagawa2020distributionally, volpi_generalzeunseen_2018}, which aims to improve overparameterized models by minimizing the worst-case training loss over different data groups.
Domain adaptation covers methods which also leverage information about the test domain to best capture distributional shift \citep{bendavid_2006,sun2016adaptation,ganin2016_dann}. 
 % In addition, multi-task learning \citep{Caruana1997MultitaskL,goncalves_2016_sparsemtl}, metalearning \citep{maml_finn_2017}, and federated learning \citep{pmlr-v54-mcmahan17a_fedlearn,fed_learning_zhang_2021} are all learning tasks that involve generalization across multiple domain distributions.

\paragraph{Sparse Representation}
Highly overparameterized DNNs are prevalent in modern machine learning, and many works have developed techniques to eliminate unnecessary weights or find sparse representations \citep{handeepcompression_2015, Li2016PruningFF,Hinton2015DistillingTK}. 
A simple and popular technique is to enforce sparsity with an $L_0$-norm constraint, or with its convex relaxation, the $L_1$ penalty used in LASSO.
Alternatively, projected gradient descent (PGD) methods are fast, efficient, and provably recover the optimal parameter with low estimation error \citep{loh_2013_regularized_mest, negahban_2009_higdim_mestimators, agarwal_2010_fastconvergence, banerjee2015estimation}.
% These methods are general and can find a number of structural solutions, including low-rank 
Other paradigms explored to induce sparsity include 
% the Lottery Ticket Hypothesis \citep{lotteryfrankle2019}, which suggests that dense neural networks contain subnetworks that can be trained faster and achieve higher test accuracy when trained in isolation.
probability-based methods for pruning \citep{Louizos2017LearningSN, srinivas_2017_trainingsparsenn,Molchanov2017VariationalDS}, which have shown empirical success in this regime as well. 
Finally, \citet{zhouSparseInvariantRisk2022, fan2024eills} provide evidence that combining sparsity with IRM can improve generalizability across domains, but their approaches require knowledge of the number of sparse, invariant features and combinatorially iterate through all feature subsets to find the causal subset.

% presents methods that prune dense feed-forward Neural Networks in order to find a subnetwork that, when trained independently, achieves the same test accuracy of the original network. 

% On the other hand, \jdcomment{discussion about sparse linear regression/ridge regression?}
% \jdcomment{A section for random analysis of regression problems/general linear models? Like hsu 2012, norm reg. 2014 paper, etc.?}
% LASSO in regression for finding sparse solutions is a classical method analyzed in depth
% % \jdcomment{cite}
% The analysis of regularization can be extended to other norms to find estimation error \citep{banerjee2015estimation}. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% other deleted snippets %%%%%%%%%%%%%%%%%%%%

% \abcomment{notation or variables have not been introduced}
% \jddelete{\begin{equation}
% \label{eqn:irm}
% \begin{aligned}
%   &\min _{\substack{\Phi: \mathcal{X} \rightarrow \mathcal{H} \\ w: \mathcal{H} \rightarrow \mathcal{Y}}} 
%   & & \sum_{e \in \mathcal{E}_{\mathrm{tr}}} R^e(w \circ \Phi) \\
%   &\text{subject to} 
%   & & w \in \underset{\bar{w}: \mathcal{H} \rightarrow \mathcal{Y}}{\arg \min } R^e(\bar{w} \circ \Phi),
%    \quad \forall e \in \mathcal{E}_{\mathrm{tr}}.
% \end{aligned}
% \end{equation}}
% % \abcomment{in general, using math in "Related Work" is unusual, this should be part of Problem Formulation}

% \jddelete{\begin{equation}
% \label{eqn:irmv1}
% \min _{\Phi: \mathcal{X} \rightarrow \mathcal{Y}} \sum_{e \in \mathcal{E}_{\mathrm{tr}}} R^e(\Phi)+\lambda \cdot\left\|\nabla_{w \mid w=1.0} R^e(w \cdot \Phi)\right\|^2.
% \end{equation}}