%!TEX root = ../sublime-text.tex
\label{sec:technical}
% \abcomment{reflecting on the flow ... high level: the current draft is not flowing well. Specifics (a) we start with what will be desirable and show that Theorem 1 accomplishes that; also show that Theorem 1 can be rewritten as an optimization problem; (b) discuss our proof sketch, on parts that we bound, highlight difference between population and sample; (c) discuss prior work (Zhou et al.), how they assumed something will hold for sample, but it will not and then present our Lemma 1, discuss their Lemma 1, discuss the difference will be larger is $c_s^e$ are large, discuss the benefits of uniform spread of the parameters and sparsity. These will be a series of specific points (later ones will be remarks), technically written. }

% \subsection{Information-Theoretic}
% \jdcomment{AB: (a) we start with what will be desirable and show that Theorem 1 accomplishes that; also show that Theorem 1 can be rewritten as an optimization problem;}
We want to show sample complexity bounds under which we can guarantee, with high probability, recovery of the invariant feature subset $S_\inv$  by minimizing \Cref{eqn:irm-minimax-empirical}.
In \Cref{sec:implementation}, we will examine both the use of the IRMv1 penalty, and the minimax penalty, the latter of which provides an additional result that demonstrates the optimality of the population parameter on even the empirical loss.
We then provide an analysis of computationally efficient methods for maintaining sparsity. This involves fast projected gradient methods like Iterative Hard Thresholding, and we address this in terms of the gradient norm penalty in \Cref{eqn:irmv1}, which is more commonly used in practice.
\subsection{Theoretical Results}
\label{sec:technical-info-theory}
We first establish that, although IRM methods already aim to eliminate spurious features, they fail in the overparameterized regime, motivating the need for sparsity-constrained IRM methods.
\begin{proposition}
IRM fails in the overparameterized setting. Assume that $d > n_{\text{tot}} = \sum_{e\in \cE} n_e \ge d_\inv$. Then,
\begin{equation}
    \hat \cL (S_\inv) \ge \hat \cL(S), \quad |S| > n_{\text{tot}}.
\end{equation}
\end{proposition}
\begin{proof}
Note $\min_{|S| > d_\inv}\hat \cL(S)=0$ in the linear setting.  
Indeed, the set $S_\inv$ belongs to the set of footprints with cardinality $|S| > n_\text{tot}$, so $\min_{|S| > n_\text{tot} } \hat \cL(S)$ is necessarily a lower bound.
\end{proof} 

Empirically, the IRM paradigm alone struggles to eliminate the spurious features $\vx_s$ and random features $\vx_r$, which together constitute the majority of features input to the linear classifier.
% Experiments demonstratin experiments are provided and discussed in more in more detail in \Cref{sec:experiments}.
% The IRM paradigm suggests that by maintaining the constraint, $\Phi$ will find the invariant features $S_\inv$. 
Then, the natural starting point is the formulation of $\hat \cL(\vv)$ as an IRM minimax loss function from \Cref{eqn:irm-minimax-empirical} with an explicit $L_0$ constraint, 
\begin{equation}
\label{eqn:info-theory-optimization}
\min_{\vv \in \RR^d}
\hat \cL(\vv)
\subt
\Ds{\vv}_0 \le d_\inv
% \Ds{\vv}_0 \le d_\inv
.
\end{equation}
% Options include the IRMv1 loss in \Cref{ir}, or the IRM minimax loss in \Cref{eqn:irm-minimax}. minimax loss in \Cref{eqn:irm-minimax-vspecific}, 
% \begin{equation} 
% \label{eqn:info-theory-optimization}
% % \min_{S \in 2^d}
% \min_{S, \vv \in \Sp(S)}
% \hat \cL(\vv)
% % \sum_{e \in \cE}  
% % \hat \cR^e(\vv) 
% % + \rho
% % \sum_{e\in \cE} \Ds { \nabla_{\vv} \cR ^e (\vv)}^2_2
% % \max _{\vv^e \in \Sp (S)} 
% % \quad 
% % \max _{\vv^e \in \Sp (S)} 
% % \left[\cR^e(\vv)- \cR^e\left(\vv^e \right)\right]
% \subt
% |S| = d_\inv
% % \Ds{\vv}_0 \le d_\inv
% .
% \end{equation}
In this setting, we provide a guarantee of invariant feature recovery with finite samples on the minimax penalty.
\begin{theorem}[Informal: sample complexity of optimizing Eqn.~\ref{eqn:irm-minimax-empirical}]
\label{thm:info-theory}
Assume at least $n$ samples per environment $e\in \cE$, for a total of $N = |\cE| n$ across the whole training set. If 
\[n \ge O\ps{\operatorname{poly}(d_\inv) \log\ps{\frac{ |\cE|d}{\delta}}},\]
together with assumptions in \Cref{sec:assumptions}, with probability at least $(1-\delta)$, the following holds:
\begin{equation}
\hat {\cL} (S_\inv) < \hat {\cL} (S),
\quad \forall S \text{ with } \ds{S} \le d_\inv, ~~~ S \ne S_\inv~.
\end{equation} 
% This  implies that $\hat {\cL} (\hat \beta_S) < \hat {\cL} (\hat \beta_\inv)$. 
\end{theorem}
\begin{remark}
The formal statement and a more detailed treatment of the constants in the sample complexity are provided in \Cref{par:theorem_1}.
\end{remark}
\Cref{thm:info-theory} provides a sample complexity under which we guarantee that the resulting model depends on exactly the invariant features $S_\inv$. 
With the definitions in \Cref{eqn:empirical-optima}, we see that it is equivalent to the statement $\hat \cL(\hat \beta_\inv) < \hat\cL(\hat \beta_S)$ for all $|S| \le d_\inv$ with $S \ne S_\inv$.
% \abcomment{how can we say this using $\cL$ ... typo?}.
Informally, this implies that a parameter using any non-invariant features incurs a large enough penalty that it will have higher loss than $ \hat \cL (\hat \beta_\inv)$. 
Our result applies to $|\cE|$ environments, noting that the minimum number of samples per environment scales with $\log(|\cE|d/\delta)$, logarithmic in both the number of environments and the ambient dimensionality. In practice, this is easy to satisfy and is
% and is generally a much simpler task than collecting sufficient samples per environment. This is 
reflected in standard benchmarks Colored MNIST \citep{arjovsky2020invariant}, ColoredObject \citep{coco_dataset, zhouSparseInvariantRisk2022}, and MNISTCIFAR \citep{shah2020-simplicitybias}.
% \abcomment{need a build up to the next result. Also, why is it a proposition?}
% \jdcomment{I thought it was too incremental from Theorem 1, and somehow Corollary feels too diminutive...}


% \abcomment{last sentence is difficult to parse, for me}

% A consequence of \Cref{thm:info-theory-popn} is that the IRM penalty, 
% \jdcomment{Setting up theorem 2}
The next result, \Cref{thm:info-theory-popn}, shows that the empirical loss $\hat \cL$ is also able to differentiate the invariant optimal predictor $ \beta^*_\inv$ from the population optimizers on non-invariant footprints $S \ne S_\inv$.
{%\color{blue} 
This unusual connection between empirical loss and population minimizer is a consequence of the structure of the IRM penalty in \Cref{eqn:irm-minimax}, and we are able to achieve this result}
with only mildly higher sample complexity: a multiplicative factor $O(\text{poly}(d_{\inv}))$ more than the sample complexity in Theorem~\ref{thm:info-theory}.
% Even with the empirical losses, we will see that 
% is affects a given parameter $\vv_S$ because spurious features are likely to induce different 
% The penalty 
% % incurred by spurious features differing across environments, 
% impacts population minima $\beta^*_S$ enough to differentiate it from the the invariant feature subset $\beta^*_\inv$, even on empirical losses. 
% If $\valpha^{e_1}_s$ is different enough from $\valpha^{e_2}_s$, the 
% the inequality also holds for $\beta_\inv^*$ and $\beta^*_S$.
\begin{theorem}[Sample complexity for sparse IRM with population optima]
\label{thm:info-theory-popn}
For population minimizers as defined in \Cref{eqn:population-optima}, and $n$ samples per environment $e\in \cE$, for a total of $N = |\cE| n$ across the whole training set, we have, with probability at least $1-\delta$,
\begin{equation}
\hat \cL(\beta^*) < \hat  \cL (\beta^*_S), \quad \ds{S} \le d_\inv,  S \ne S_\inv,
% \ \forall \hat \vv \ne \beta^* \text{ and } 
% \Ds {\hat \vv} _0 \le d_\inv, 
\end{equation}
if $n > O\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |\cE|}{\delta}}}$ with constants specified in \Cref{proof:prop1-popn-minimzer}.
\end{theorem}
% \begin{remark}
% Note that $\hat \cL(\beta^*_S) \ne \hat \cL(S) =  \hat \cL(\hat \beta_S)$.
% \end{remark}
% \jdcomment{How to discuss the difference between \Cref{thm:info-theory-popn} and \Cref{thm:info-theory}?}
% In other words, the minimax loss provides a strong enough penalty that can identify invariant feature selection.
% , not only on the empirical minimizers $\hat \beta_\inv$ and $\hat \beta_S$ for some non-invariant $S$, but also the population minimizers $ \beta^*_\inv$ and $\beta_S^*$.
% Intuitively, the loss function $\hat \cL(\cdot)$ is formulated in a way that penalizes any parameter that uses non-invariant features.
% Specifically, $\hat \cL(\beta^*_\inv)$ and $\cL(\beta^e_\inv)$ because 
% $\cJ(\beta$
Details that characterize this further are found in the proofs of \Cref{thm:info-theory} and \Cref{thm:info-theory-popn} in \Cref{appx:thm-1}.

\begin{remark}
If we assume $\vzeta_s = \vone^{d_s}$ and $\vzeta_r = \vone^{d_r}$, we get the original linear model by \citet{zhouSparseInvariantRisk2022}. 
However, this will yield sample complexity and estimation error bounds which are dimension-dependent, i.e., dependent on $d_{\inv}, d_s$, and $d_r$. 
% \jdcomment{Not exactly. This dependency came from $\err(1/\delta, n)$, which is part of the $\le 0$ from our analysis now.}
{%\color{blue} 
To motivate variable $\vzeta_s$ as an example, consider that for $d_s$ features, the size of the data $\|\vx^e\|_2$ is $O(\sqrt{d_s})$ when $\vzeta_s = \vone$.
If we instead let the scaling parameter $\vzeta_s$ be changed, we allow different spurious features to correlate differently with labels. 
In addition to being a substantially more realistic assumption on the data, it allows us to create scale-dependent bounds. 
}
Then, the scale may be as low as $O(\frac{d_\inv}{d_s + d_r})$  when instead generating the data with a fixed $\|\vzeta_s^e\|_2^2$. \Cref{cor:missing-emp-uniform} compares this case.
\end{remark}

% The intuition is that the minimax penalty captures the difference between environmental optima, and that this penalty sufficiently differentiates  $\hat \cL(\beta^*_\inv)$ and $\cL(\beta^e_\inv)$.
% Specifically, spurious features only depend on the environment through $\valpha^e_s$.
% \jdcomment{Not sure how to build this up! I don't know why the result \Cref{thm:info-theory-popn} is significant exactly.}

% \abcomment{what is `this'? in general, always avoid `this', `that', `it' etc., in technical writing} arises from spurious features $x^e_i \in S | x^e_i \notin S_\inv$ 
% \abcomment{dont understand what this notation means, first use} that induce differences in the parameter learned on that feature. 
% \begin{remark}
% Using Generalized Linear Models, we can extend these regression results to a wide class of problems, including, e.g., binary classification on a conditional Bernoulli model. Further detail can be found in \Cref{sec:glm}.
%     \qed
% \end{remark}

\subsection{Proof Sketches}
% \jdcomment{the sketch is similar for both, think about how to write this}
% \begin{sproof}
To prove \Cref{thm:info-theory}, our analysis follows an approach similar to \citet{zhouSparseInvariantRisk2022}, but avoids the several errors in that analysis required to show \Cref{eqn:irm-minimax-empirical}. Our approach is sketched in this section with full details in \Cref{appx:thm-1}.
% Theorem~\ref{thm:info-theory-popn} corrects their analysis by adding missing terms and bounding them suitably. 

%We extend their results to apply to our more general model in \Cref{eqn:problem-setting}, and we make note of several corrections to their analysis in \Cref{sec:corrections}.
\begin{sproof}
First, we break down the minimax penalty, defined in \Cref{eqn:irm-minimax-vspecific}, into a sum of three error components. In other words, 
{
% \color{blue} 
$\cJ(\hat \beta_S) = \xi_a(S) + \xi_b(S) + \xi_c(S)$.}
We let $c_1, c_2, c_3 >0$ be constants, $S_{\text{spu}}$ be the set of spurious features, and $\alpha_i^2 = \frac{1}{|\cE|} \sum_{e \in \cE} (\alpha_i^e)^2$ be the average value of the squared $\alpha_i^e$ scaling for a spurious feature $i$ across environments.
\begin{equation}
    \label{eqn: xi a def main}
   \xi_a(S) =  
   \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta_S)-\cR^e\left(\beta^*_S \right)}
   \le c_1\sqrt{\frac{\log (\frac{1}{\delta})}{\ds{\cE}n} }
   ,
\end{equation}
\begin{equation}
\label{eqn: xi b def main}
   \xi_b(S) =  
   \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta^e_S)-\cR^e\left(\beta^e_S \right)}
    \le c_2 \sqrt{\frac{\log (\frac{\ds{\cE}}{\delta})}{n} }
   .
\end{equation}
{
% \color{blue}
These two intermediate quantities $\xi_a(S)$ and $\xi_b(S)$ 
bound similar gaps, but $\xi_a(S)$ works with the across-environment minimizers $\hat \beta_S$ and $\beta^*_S$, and $\xi_b(S)$ bounds the environment-specific  $\hat \beta^e_S$ and $\beta^e_S$. Both sum the gap across all environments, and the generalization-style bound is tighter for $\xi_a(S)$ because it involves a single classifier fit on the pooled $\ds{\cE} n$ samples.}
\begin{align}
\label{eqn: xi c def main}
   \xi_c(S)
   & =  
   \sum_ {e\in \cE} \bs{ \cR^e(\beta^*_S)-\cR^e\left(\beta^e_S \right)}
   \nonumber
   \\
    &\ge c_3 \min_{i \in S_{\text{spu}}} \ds{ \alpha_i^2 - (\alpha_i^e)^2 }
   .
\end{align}
Note that \Cref{eqn: xi a def main} and \Cref{eqn: xi b def main} are not a result of directly applying Hoeffding's inequality for sub-Gaussian random variables, as the different errors are not independent. Instead, we apply the triangle inequality and use
$\hat \cR^e(\hat \beta_S) - \hat \cR^e(\beta^*) \le 0$,
which holds by definition of $\hat \beta_S$. We may then apply Hoeffding's inequality on the errors incurred on $\beta^*$. Thus, $\xi_a(S)$ and $\xi_b(S)$ decrease as the number of samples $n$ grows.

% They are: $\xi_a(S)$, which is the difference  $\hat \cR(\beta^*_S) - \hat \cR(\hat \beta_S)$ for population and empirical optima on subset $S$; $\xi_b(S)$, which is the same, but for all environments $e \in \cE$, $\hat \cR^e(\beta^e_S) - \hat \cR(\hat \beta^e_S)$; and finally, $\xi_c(S)$, which is the gap between population minimizers $\cR^e(\beta^*_S)-\cR^e\left(\beta^e_S \right)$. \abcomment{using equations, as align will be much better ... we are describing math in English.}
% The sums in \Cref{eqn: xi a def main} and \Cref{eqn: xi b def main}  both reduce to an application of a Hoeffding-type inequality for sub-Gaussian random variables.  
% \jdcomment{Get order complexity}
The quantity $\xi_c(S)$ in \Cref{eqn: xi c def main} can be computed directly; 
its lower bound can be derived under reasonable assumptions of the environmental parameter $\valpha^e$, detailed in \Cref{sec:assumptions}. Intuitively, the more $\alpha_i$ varies across environments, the better the bound. 
With these quantities, we compute the number of samples $n$ required for any non-invariant footprint $S \ne S_\inv$ to elicit a higher loss $\hat \cL(S)$, provided that $|S| \le d_\inv$. Critically, $\xi_c(S_\inv) = 0$, and $\xi_c(S) > 0$ for all $S \ne S_\inv$.
% $\xi_a(S) =  \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta_S)-\cR^e\left(\beta^*_S \right)}$,
% $\xi_b(S) =  \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta^e_S)-\cR^e\left(\beta^e_S \right)}$, and
% $\xi_c(S) =  \sum_ {e\in \cE} \bs{ \cR^e(\beta^*_S)-\cR^e\left(\beta^e_S \right)}$.
The full proof can be found in \Cref{proof:info-theoretic-thm}.
\end{sproof}


\begin{remark}
The quantity $\xi_c(S)$ is positive as long as there exist environments $e_1, e_2 \in \cE$ and some spurious feature $i \in S$ such that $\alpha^{e_1}_i \ne \alpha^{e_2}_i$. Intuitively, $\xi_c(S)$ captures the difference between environmental distributions, when only accessing features in $S$. As a result, $\xi_c(S_\inv) = 0$, since $S_\inv$ contains only features that remain invariant across environments. 
Then, it is possible to lower bound $\xi_c(S)$ for $S\ne S_\inv$ by leveraging environmental separation of the underlying distributions. The bound benefits from more ``widely-ranging'' values of $\valpha^e$. In this way, it links back to previous works like \citet{ahuja_2021_irm_ib_bottleneck}, which impose requirements on differences in environment to present general sample complexity results. \qed 
\end{remark}

The proof of \Cref{thm:info-theory-popn} follows the same structure as that of \Cref{thm:info-theory}, 
with an additional $\text{poly}(d_{\inv})$ term
% $O(n^{-0.5})$ term \abcomment{do not follow this, theorem shows extra $\text{poly}(d_{\inv})$ term} 
incurred by $\ds{\hat \cR(\beta^*_S) - \hat \cR(\hat \beta_S)}$ and $\ds{\hat \cR(\beta^e_S) - \hat \cR(\hat \beta^e_S)}$.
% , both resolved with Hoeffding's inequality.
The full proof can be found in \Cref{proof:prop1-popn-minimzer}.

\begin{remark}
\citet{zhouSparseInvariantRisk2022} provide a bound for $\xi_b(\Phi) = \xi_b(S)$ which requires
$\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) \le \ds{\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) } = \Ds{\hat \beta^e_S-\beta^e_S}_{\hat \Sigma^e} $. The equality is formally stated in Lemma 1 of \citet{zhou2021effective} but is in general untrue for arbitrary feature subset $S$ in the non-asymptotic setting.
As a result, their final claim that this term is $O(n^{-0.5})$ is incorrect as well; they are missing an important term that arises from the misspecified model.
We provide a corrected analysis in \Cref{lemma:missing err-appx} in our appendix. \qed
\end{remark}

Under the generative model introduced in \Cref{eqn:problem-setting}, it is impossible for ERM and Sparse ERM to recover only the invariant features in the asymptotic case; see \Cref{sec:setting-props}. In this setting, both IRM and IRM with sparsity constraints can recover the optimal invariant predictor.
For the non-asymptotic case, we provide sample complexity bounds for Sparse IRM that leverage the invariant feature dimensionality. The result follows in \Cref{thm:info-theory}, and the full proof is found in \Cref{proof:info-theoretic-thm}. 
% Without knowledge of $d_\inv$, we treat it as a hyperparameter, which is a common approach for works in high-dimensional statistics \citep{Wainwright2019-tb}. 

% \subsection{Errors in Prior Work}
% \label{sec:corrections}
% There is a notable error in the analysis in \citet{zhouSparseInvariantRisk2022}; we discuss this briefly in this section and provide a different way to do parts of their analysis to avoid those errors. We provide more details in the appendix.

% % The first error is an assumption required to prove their non-asymptotic result; it is a statement of bounded approximation error for ordinary least squares, as detailed in Condition 3 by \citet{hsu2014random}. 
% % However, it concerns the inverse of the covariance of sparse feature vectors,  $(\Sigma^e)^{-1/2} = \EE^e[\vx (\vx^e)^\top ]$, which are singular matrices that have sparse rows and columns by definition\abcomment{I do not undertand this: why is there an inverse? and this matrix is full-rang, by construction, e.g., using our ICML'20 result (which they did not use).}. As such, the standard ordinary least squares analysis cannot be directly applied to the masked features in their analysis. We do not make use of this assumption in our work. 
% In addition, their bound for $\xi_b(\Phi) = \xi_b(S)$ requires that
% $\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) \le \ds{\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) } = \Ds{\hat \beta^e_S-\beta^e_S}_{\hat \Sigma^e} $. The equality is formally stated in Lemma 1 of \citet{zhou2021effective} but is in general untrue for arbitrary feature subset $S$ in the non-asymptotic setting.
% As a result, their final claim that this term is $O(n^{-0.5})$ is incorrect as well; they are missing an important term that arises from the misspecified model.
% We provide a corrected analysis in \Cref{lemma:missing err-appx} in our appendix. 

% \begin{remark}
%     Because $\hat \beta^e_S$ is the empirical minimizer, we have by definition  $\hat \cR^e (\hat \beta^e_S) \le \hat\cR^e ( \beta^e_S)  $. Then, $\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S)\le 0$, and the constant dependency is resolved. From this, we can achieve a bound on $|\xi_b(S)|$ , and the remainder of the analysis holds, which we detail in \Cref{xib lemma} in the appendix.\qed
% \end{remark}
% % Additionally, the evaluation of 
%     % They use a sample complexity result from \citep{hsu2014random} to bound the quantity 
%     % $\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) $, 
%   % located immediately following equation (21) in their proof.


% \abcomment{which `above' bound ... also this is our proof technique, not an error in their analysis} 
% Finally, we observe that the above bound is unnecessary. Because $\hat \beta^e_S$ is the empirical minimizer, we have by definition  $\hat \cR^e (\hat \beta^e_S) \le \hat\cR^e ( \beta^e_S)  $. Then, $\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S)\le 0$, and the constant dependency is resolved. From this, we can achieve a bound on $|\xi_b(S)|$ , and the remainder of the analysis holds, which we detail in \Cref{xib lemma} in the appendix.

% \abcomment{Inclined to say ... there is only one main error we are pointing out here, para starting with "In addition". Perhaps that can be included as a remark in Section 4.2, and we can drop the current Section 4.3. That will also save space for other stuff.}

% \abcomment{if there is additional space after dropping this, you can expand the proof sketch a bit, by using a few equations}


\subsection{Efficient Algorithms}
\label{sec:implementation}
The loss formulation in \Cref{thm:info-theory} uses an $L_0$ constraint, which is not computationally practical. 
We refer to the rich line of work proving sharp convergence rates and bounds on estimation error under constraints for regression problems \citep{negahban_2009_higdim_mestimators, agarwal_2010_fastconvergence, banerjee2015estimation}. 
% With the assumptions in \Cref{sec:assumptions}, including RSC loss in \Cref{assn: rsc}, we have by Theorem 3 of 
% We describe
% This corresponds to Sparse IRM being insensitive to choice of constraint $K > d_\inv $ while still p
% \paragraph{$L_1$ constraints.} 
We later leverage a subset of these works \citep{loh_2013_regularized_mest, jain_iterativehardthreshold_2014} which show the same guarantees for methods in the family of Projected Gradient Descent (PGD) or Iterative Hard Thresholding (IHT) algorithms, which provide bounds for high-dimensional statistical settings. % This popular class of methods involves projecting the parameter to a feasible space after the gradient update.  
% These methods project the gradient descent update onto the desired feasible set, which can be non-convex.
% The first will be a $L_1$ ball, and the second is the $L_0$ ball. 
% Solving for the constrained optimization problem is challenging to 
We apply IHT to solve \Cref{eqn:irm-minimax-empirical} and
show that the sparse invariant feature recovery is possible with these fast, space-efficient methods.

% To begin tackling the nonconvex, non-smooth $L_0$ constraint, the natural starting point is the convex relaxation, LASSO for linear regression. 
% The $L_1$ norm is the closest norm in the $L_p$ family to the strict sparsity norm \citep{Wainwright2019-tb}.
% We first extend \Cref{thm:info-theory} to the result in \Cref{thm:convex-relaxation}.
% \begin{theorem}[Sparse IRM with LASSO]
% \label{thm:convex-relaxation}
% Assume we are given $n$ samples per training environment,
% % with at least $n > Q\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |E|}{\delta}}}$ per training environment, 
% together with assumptions in \Cref{sec:assumptions}, for $n > Q\ps{\textnormal{poly}(d_\inv)\log(d)\log\ps{\frac{ |E|}{\delta}}}$ and weight $\eta > 0$.
% Then we can say with probability at least $1-\delta$,
% \begin{equation}
% \label{eq:irm-l1pgd}
% \begin{gathered}
% \tilde \beta = 
% \min _{\vv} \hat \cL(\vv) + \eta \Ds{\vv}_1,
% % \text { s.t. } 
% % \vv \in\bbR^{d}, 
% % \leq K_1, %\Ds{\vv'}_1 
% \end{gathered}
% \end{equation}
% returns a parameter $\tilde\beta$ with low estimation error $\Ds{\tilde\beta - \beta^*_\inv}_2 \le O(\sqrt{\frac{d_\inv\log d} {n}})$.
% \end{theorem}
% \abcomment{(1) What is $K$?}
% \abcomment{(2) estimation error should have a dependence on $d_{\inv}$, say $\sqrt{d_{inv} \log d}$;}
% \abcomment{(3) Any such result will have a sample complexity, below which the estimation error bound will not work. There is a good chance the sample complexity is the same as the info theoretic result, with an extra multiplicative $\log d$ factor;}
% \abcomment{(4) The linear dependence on $|E|$ is problematic, but I have not checked the proof yet}
% The proof is found in  \Cref{proof:l1}. 
% This can then be implemented by applying PGD to the IRM loss
% The relaxation allows us to approximate the information-theoretic version of the problem with a piecewise linear and convex optimization. 
% Convex relaxations are a possible solution and can be implemented straightforwardly with a regularizer. 
% However, the speed of convex relaxations with non-smooth penalties like $L_1$ can cause issues in training \citep{jain_iterativehardthreshold_2014}. 
% Instead of computing this constraint by formulating the Lagrangian, which involves computing a gradient on the non-differentiable $L_1$ penalty, we can instead use a PGD update, which projects the gradient update directly onto a $L_1$ ball. 

\begin{algorithm}
\caption{Sparse IRM with Iterative Hard-Thresholding}
\label{algorithm}
    \begin{algorithmic}[1]
    \State \textbf{Input:} target nonzero features $d_\inv < d$, $\cD = \{\cD^e\}_{e\in \cE}$ and $\cD^e \coloneqq \{(\vx^e_i, y_i )\}^{n_e}_{i=1}$.
    \State Initialize weights $\vv^1$.
    \For {training iteration $t = 1, 2, \cdots, T$}
        % \If {$t \mod P = 0$}ll l
        \State $\vv^{t+1} \gets \text{proj}_{s}(\vv^t - \eta \nabla_{\vv} \hat \cL(\vv^t))$
        % \State $\Phi^{t+1} \gets \Phi^t - \eta \nabla_{\Phi} \hat \cL(\Phi^t) $
        % \State $t = t+1$ % redundant: the enclosing \For loop already advances t
        % \Else
        %     $\vv^t \gets Proj_{d_\inv}(\vv^t - \eta \nabla_{\vv} \hat \cL(\vv^t))$, $t = t+1$
        % \EndIf
    \EndFor
    % \Procedure {SparseIRM+IHT}{$\vx$}
    % \If {$i\geq maxval$}
    %     \State $i\gets 0$
    % \Else
    %     \If {$i+k\leq maxval$}
    %         \State $i\gets i+k$
    %     \EndIf
    % \EndIf
    \end{algorithmic}
\end{algorithm}
% We define $\operatorname{proj}_{s}(\cdot) $ to be the IHT project, 
% namely sorting elements of the parameter by absolute value and keeping only the top $s$ for sparsity parameter as defined in \Cref{thm:iht}.
Let \( s \in \mathbb{N} \) be the sparsity level. Then, the \emph{hard thresholding projection operator} \( \text{proj}_{s} : \mathbb{R}^d \rightarrow \mathbb{R}^d \) is defined as:
\begin{equation}
\label{eqn:iht}
\text{proj}_{s}(\vv) := \arg\min_{\vu \in \mathbb{R}^d} \left\{ \|\vv - \vu\|_2^2 \;\middle|\; \|\vu\|_0 \le s \right\},
\end{equation}
where \( \|\vu\|_0 \) denotes the number of nonzero entries in \( \vu \).
\Cref{algorithm} directly projects the gradient descent update onto the non-convex feasible set. 
% This can be performed effectively by sorting the parameter elements by magnitude.
Previous works \citep{jain_iterativehardthreshold_2014} have shown that, despite the non-convexity of the problem, tight, minimax lower bounds can be achieved on the learned parameter,
% an estimation error bound for linear regression with constrained sparsity in high probability. 
% As detailed in \jdcomment{add to appendix}, we can apply this result to get a bound in high probability.
and we use constants from Theorem 3 in \citet{jain_iterativehardthreshold_2014} for sparse linear regression. 
Details are included in \Cref{proof:IHT}.
% We demonstrate that the properties required of the loss function are satisfied in \Cref{proof:IHT}.
% This result will show that projecting at sparsity level $s + d_\inv$ is still sufficient to find low estimation error, $\Ds{\beta_t - \beta^*_\inv}_2$. 
% we can show that our loss satisfies a general notion of Restricted Strong Convexity (RSC), based on Lemma 3 in \citet{pmlr-v119-sivakumar20a}.
\begin{theorem}[Sparse IRM with IHT]
\label{thm:iht}
Assume $n$ samples per training environment, for $n > Q\ps{\textnormal{poly}(d_\inv)\log(d)\log\ps{\frac{ |\cE|}{\delta}}}$.
% with at least $n > Q\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |E|}{\delta}}}$ per training environment, 
Together with assumptions in \Cref{sec:assumptions}, 
using the IRMv1 penalty as defined in \Cref{eqn:irmv1},
\Cref{algorithm}
% \begin{equation}
% \label{eq:basic-setup}
% \begin{gathered}
% \tilde \beta = 
% \min _{\vv} \hat \cL_{IHT}(\vv)
% \text { s.t. } 
% \vv \in\bbR^{d}
% \end{gathered}
% \end{equation}
returns a parameter $\tilde\beta \in \RR^d$.
% via the IHT algorithm with projection operator $\operatorname{proj}_s(\cdot)$. 
With $s$ chosen to be $O(d_\inv)$, we have with probability at least $1-\delta$, a bound on the estimation error,
% $\Ds{\tilde\beta - \beta^*_\inv}_2 \le O(\sqrt{\frac{d_\inv} {n}})$.
\begin{equation}
    \Ds{\tilde\beta - \beta^*_\inv}_2 = O \ps{\lambda_{\max}^2 \sqrt {\frac{d_\inv \log d}{n}}
    +\frac{\sigma_\inv}{\kappa_s}} .
\end{equation}
\end{theorem}
% \abcomment{(1) What is $K$? (2) estimation error should have a dependence on $d_{\inv}$, say $\sqrt{d_{inv} \log d}$; (3) Any such result will have a sample complexity, below which the estimation error bound will not work. There is a good chance the sample complexity is the same as the info theoretic result, with an extra multiplicative $\log d$ factor; (4) The linear dependence on $|E|$ is problematic, but I have not checked the proof yet}
The full proof and definitions for constants $Q$, $\sigma_\inv$, and $\kappa_s$ are provided in \Cref{proof:IHT}.
Because we do not know $\Ds{\beta^*}_0 = d_\inv$ beforehand, we discuss tuning $s$ as a hyperparameter in \Cref{sec:experiments}.
Overall, both methods provide guarantees of low estimation error in high probability, while being fast and having low memory cost, scaling to much larger models and datasets.

% \begin{remark}
% This is in contrast to the probability-based sparsification used in \citet{zhouSparseInvariantRisk2022}, in which every weight of the model has a corresponding likelihood of being kept for test-time. Weights that are not sampled are then zeroed out, leaving a smaller, distributionally-robust subnet. The downside of this method is that the speed of training and storage cost both scale poorly with larger models and datasets. 
% \end{remark}

% There is a rich line of work that investigates the challenge of efficient estimation of a sparse solution in an overparameterized setting. 


 
