%!TEX root = ../paper.tex
\label{appx:thm-1}

This appendix includes the proof and supporting analysis for \Cref{thm:info-theory}. We restate it below for clarity.
% \begin{theorem}[Sample complexity of our Sparse IRM]
% \label{thm:info-theory}
\paragraph{Theorem 1} 
\label{par:theorem_1}
Assume at least $n$ samples per environment $e\in \cE$, for a total of $N = |\cE| n$ across the whole training set. If $n > O\ps{\log\ps{\frac{d \cdot |\cE|}{\delta}}}$, together with assumptions in \Cref{sec:assumptions}, with probability at least $(1-\delta)$, we have:
\begin{equation}
\hat {\cL} (S_\inv) < \hat {\cL} (S),
\quad \forall \ds{S} \le d_\inv, ~~~ S \ne S_\inv~,
\end{equation}
with constants in sample complexity further specified below. 
% \end{theorem}
\begin{proof}
\label{proof:info-theoretic-thm}
To sketch the proof, we first analyze $\hat \cL(\hat \vv_S)$ where $\vv_S \in \Sp(S)$ for $S\in 2^d$, breaking down its IRM penalty term into three error components 
$\cJ_\mm(S) = \xi_a(S) - \xi_b(S) + \xi_c(S)$. This is then used to bound 
$\hat \cL(\hat \vv_S) - \hat \cL(\beta^*)$. When $S\ne S_\inv$, we show this gap to be positive.
\subsection{Components of penalty}
The penalty term from $\hat \cL(\hat \vv) $ is 
\begin{equation}
\label{eqn: xi breakdown}
\cJ_\mm(S)  
= \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta_S)-\hat\cR^e\left(\hat \beta^e_S\right)}
= \xi_a(S) - \xi_b(S) + \xi_c(S),
\end{equation}
with $\hat \beta_S$ and $\hat \beta_S^e$ as defined in \Cref{sec:preliminaries}, and where the three components are
\begin{equation}
    \label{eqn: xi a def}
   \xi_a(S) =  \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta_S)-\cR^e\left(\beta^*_S \right)},
\end{equation}
\begin{equation}
\label{eqn: xi b def}
   \xi_b(S) =  \sum_ {e\in \cE} \bs{\hat \cR^e(\hat \beta^e_S)-\cR^e\left(\beta^e_S \right)},
\end{equation}
\begin{equation}
\label{eqn: xi c def}
   \xi_c(S) =  \sum_ {e\in \cE} \bs{ \cR^e(\beta^*_S)-\cR^e\left(\beta^e_S \right)}.
\end{equation}
% Additionally, let $\xi_a = \max_S \xi_a(S)$, $\xi_b = \max_S \xi_b(S)$ , $\xi_c = \max_S \xi_a(S)$




We bound $\ds{\xi_b(S)}$ in \Cref{xib corollary} and $\ds{\xi_a(S)}$ in \Cref{xi a corollary}, followed by an analysis of $\ds{\xi_c(S)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% xi a, xi b lemma
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}[Penalty component]
\label{xib lemma}
For a given environment $e\in \cE$ and feature subset $S \in 2^d$, we can bound $\ds{
\hat \cR^e (\hat \beta^e_S) - \cR^e (\beta^e_S)
}$ with probability $1-\delta$, given a sample size per environment of $n\ge cw^2(A) = 
O(d_\inv)$:
\begin{equation}
\label{eqn: lemma xib l1}
\ds{
\hat \cR^e (\hat \beta^e_S) - \cR^e (\beta^e_S)
} 
% \le O\ps{\frac{ d_\inv^{1.5} \lambda^e_{\max} \log\ps{\frac{1}{\delta}}}{n} }
% + \err(1/\delta, n)
\le O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{1}{\delta})}{n} }}
\end{equation} 
where 
$\lambda_{\max} \coloneqq \max_{e\in \cE} (\lambda_{\max}(\Sigma^e))$ 
and 
$\err(1/\delta, n) 
% \coloneqq
% O \ps{
% \frac{d_\inv}{\sqrt n}\log\frac{{1}}{{\delta}}}
% \coloneqq O \ps{
%     d_\inv \sqrt{ \frac{\log (1/\delta)}{n}} 
%         \ps{
%         1 + \sqrt{ \frac{\log (1/\delta)}{n}}
%         }
%     }
$ as defined in \Cref{prop:missing-emp-general}.
\end{lemma}

\begin{proof}
\label{prf: xi_b}
% We want to bound the following expression with high probability ($1-\delta_b$): 
%
First, by the triangle inequality, we have for all $S$,
\begin{equation}
\label{lemma 2 start}
\ds{
\hat \cR^e (\hat \beta^e_S) - \cR^e (\beta^e_S)
}
\le 
	\ds{
		\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) 
	}
	+ \ds{
		\hat \cR^e ( \beta^e_S) - \cR^e (\beta^e_S) 
	}.
\end{equation}
% The two error terms in \cref{lemma 2 start} can be bounded separately. 
The second term in \Cref{lemma 2 start} can be bounded by the generalized Hoeffding inequality for unbounded sub-Gaussian random variables, as stated in Proposition 5.10 of \citet{vershyninIntroductionNonasymptoticAnalysis2011}. 
With probability $1-\delta$,
\begin{equation}
\label{eqn:xib-term2}
\ds{
    \hat \cR^e ( \beta^e_S) - \cR^e (\beta^e_S) 
}   
\le 
O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{1}{\delta})}{n} }}
\end{equation}

For the first term with $\hat \cR^e, \hat \beta^e_S, \beta^e_S$ as defined above, the difference is necessarily nonpositive by the definition of $\hat \beta^e_S$ as the minimizer of the empirical risk $\hat \cR^e$ among classifiers supported on $S$:
\begin{equation}
		\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) 
	\le 0.
\end{equation}

% We will see later that this is dominated by the first term. Then, for the second term with $\hat \cR^e, \hat \beta^e_S, \beta^e$ as defined above, 
% we apply \Cref{lemma:missing err-appx} to get a bound with probability $1-\delta$:
% \begin{equation}
% \label{eqn: v diff hat sigma}
%     \ds{
% 	\hat \cR^e (\hat \beta^e_S) - \hat \cR^e ( \beta^e_S) 
%     } 
%     \le \Ds{
% 	\hat \beta ^e_S -\beta^e_S
%     }_{\hat \Sigma^e}^2 
%     + \err(\frac{1}{\delta}, n )
%     \le  2 \ds{\err(\frac{1}{\delta}, n )}.
%     \end{equation}
% for
% $\err(\frac{1}{\delta}, n)$ as defined in \Cref{prop:missing-emp-general}. The second inequality comes from \Cref{cor:lessthanzero}.


\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% xi b
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{corollary}[Second penalty term]
\label{xib corollary}
The following bound holds with probability $1-\delta_b$ for all $S \in 2^d$:
\begin{equation}
\ds {\xi_b(S)}
\le
O\ps{\ds{\cE}\kappa_\inv\sqrt{\frac{\log (\frac{\ds{\cE}}{\delta_b})}{n} }} 
% O\ps{
% \ds{\cE}
%     \frac{ %\kappa_\inv^2 
%             d_\inv^{1.5}
%             \lambda_{\max}^e
%             \log \ps{ \frac{1}{\delta_b}}
%     }
%     {n} 
% }
% + \ds{\cE}\err\ps{\frac{\ds{\cE}}{\delta_b}, n}
\eqqcolon \ds{\xi_b}
\end{equation}
where $\err(\frac{1}{\delta}, n)$ is as defined in \Cref{prop:missing-emp-general}.
The RHS is independent of $S$, so we name the upper bound $\ds{\xi_b}$.
\end{corollary}

\begin{proof}
We expand $\xi_b(S)$ to get 
\begin{equation}
\ds { \xi_b(S) }
\le
\sum_{e \in \cE }\ds{
\hat \cR^e (\hat \beta^e_S) - \cR^e (\beta^e_S)
}
 \label{xi_b_start}
\end{equation}
\Cref{xib lemma} can directly be applied on each of the different environments $e\in\cE$. Applying the union bound for the environments produces the desired result. 
% The RHS is independent of $S$, and we will sometimes refer to that upper bound as $\ds{\xi_b}$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% xi a
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{corollary}[First penalty component]
\label{xi a corollary}
The following bound holds with probability $1-\delta_a$ for all $S\in 2^d$:
\begin{equation}
\label{xi_a}
\ds { \xi_a(S)}
\le 
O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{1}{\delta_a})}{\ds{\cE}n} }} 
% O\ps{    
%     \frac{ 
%             d_\inv^{1.5}
%             \lambda_{\max}^e
%             \log \ps{ \frac{1}{\delta_a}}}
%     {  n\ds{\cE} }
% }
% + \err(\log \frac{1}{\delta_a}, n\ds{\cE})
\eqqcolon \ds{\xi_a}
\end{equation}
where $\err(\frac{1}{\delta}, n)$ is as defined in \Cref{prop:missing-emp-general}. Again, the RHS is independent of $S$, and we name the upper bound $\ds{\xi_a}$.
% where
% $ \lambda_{\max}\coloneqq \max_{e\in \cE} (\lambda_{\max}(\Sigma^e))$ and 
% $\err(1/\delta_a, n\ds{\cE}) =  O \ps{
% \ps{K_e+\sigma_\inv\sqrt{\log 1/\delta}}
%     \sqrt{ s\log d /n   }
%     }$.
\end{corollary}

\begin{proof}
Recall that $\cR (\vv) = \sum_{e\in \cE} \cR^e(\vv)$ for all classifiers $\vv \in \RR^d$. 
Additionally, 
$\beta^*_S = \argmin_{\vv\in V_S} \cR(\vv)$ 
and $\hat \beta_S = \argmin_{\vv \in V_S} \hat \cR(\vv)$. 
We want to bound the following expression with high probability ($1-\delta_a$): 
\begin{equation}
\ds { \xi_a(S)}
=
\left | \sum_{e \in \cE }
    \bs{
    \hat \cR^e (\hat \beta_S) - \cR^e (\beta^*_S)
    }
\right |
= \left | 
    \hat \cR (\hat \beta_S) - \cR (\beta^*_S)
\right |
\label{xi_a_start}
\end{equation}
We may apply the analysis of \Cref{xib lemma} to the whole dataset, which can be treated as one environment with $n\ds{\cE}$ points as is done in  \cite{zhouSparseInvariantRisk2022}.
% Here, the sparsity constraint for the population optimum is still $ d_\inv$.
% Specifically, $\hat \beta_S$ is learned over a set of $n \ds{\cE}$ points, and its risk is computed by summing over all domains $ e \in \cE$; in other words, $\cR (\beta) = \sum_{e\in \cE} \cR^e(\beta)$ for any classifier $\beta$. 
 % with probability $1-\delta_a$,
 % which results in 
\end{proof}

\begin{proposition}
\label{prop:xic bound}
    For $D,\Delta$ as defined in \Cref{assn: spurious diff followup xic},
\begin{equation}
\label{eqn: xic final}
	\xi_c(S)
	\begin{cases}
		=0, 
        & \text { if } S \subseteq S_\inv \cup S_r\\
		\geq \frac{D^2 \Delta^2}{\lambda_m^3}, 
		& \text { otherwise. }
	\end{cases}
\end{equation}
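where $\lambda_{m} = \max \ps{\bigcup_{e \in \cE}\{\lambda_i^e\}^d_{i=1} \cup \{\lambda_i\}^d_{i=1}}$ is the largest eigenvalue among the per-environment covariances $\Sigma^e$ and the pooled covariance $\Sigma$.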
\end{proposition}
To bound $\ds{\xi_c(S)}$, we reference the analysis of Equation (29) in \citep{zhouSparseInvariantRisk2022}. 

% with differences as noted in \cref{obs: xic lambdamax}.

Let $\{\lambda_i^e\}^d_{i=1}$ and $\{\lambda_i\}^d_{i=1}$ be the eigenvalues of $\Sigma^e$ and $\Sigma$ respectively with corresponding eigenvectors $\{\vu_i^e\}^d_{i=1}$ and $\{\vu_i\}^d_{i=1}$. 

Define $\Sigma^e_S\coloneqq \EE^e[(\vx_S^e) (\vx^e_S)^\top]$ and $\Sigma_S\coloneqq \EE[(\vx_S^e) (\vx_S^e)^\top]$, where $\vx_S^e \in \RR^{|S|}$ is the pruned ``projection'' of $\vx^e$ that keeps only the indices $i \in S$. 
Note that this differs from the analysis in \citep{zhouSparseInvariantRisk2022}, particularly Assumption 1, which involves taking the inverse of the analogous matrix $\Sigma_\Phi^e$ that is not full rank.
\begin{align}
\label{eqn:c1}
\xi_c(S) 
&= \sum_{e\in \cE} \Ds{\beta^*_S - \beta^e_S}_{\Sigma^e_S}^2\\
&=  \sum_{e\in \cE} \ps{
    \Sigma_S^{-1} \E[\vx^e_S y]- (\Sigma_S^e)^{-1} \E^e[\vx^e_S y]
}^{\top} 
\Sigma^e_S
\ps{
    \Sigma_S^{-1} \E[\vx^e_S y] - (\Sigma_S^e)^{-1} \E^e[\vx^e_S y]
}\\
&= \sum_e \sum_{i \in S}\ps{
	\E^e[\vx^e_S y]^{\top} \vu_i
}^2
\lambda_i \ps{
	\frac{1}{\lambda_i^e}-\frac{1}{\lambda_i}
}^2 
\end{align}
% \jdcomment{I replaced/fixed the uninvertible $\Sigma$ from the original paper}

For any classifier footprint $S\in 2^d$, we know that 
$\Sigma^e_S = \Sigma_S + \text{Diag}(0, \cdots, 0, (\valpha^e_i)^2 - \valpha_i^2, 0, \cdots, 0)$. From this, at most $d_s$ eigenvalues have a nonzero difference $\lambda_i^e - \lambda_i = (\alpha_i^e)^2 - \alpha_i^2$. This is bounded by \Cref{assn: spurious diff}, and $\ds{\E^e[\vx^e y]^{\top} \vu_i}$ is bounded in \Cref{assn: spurious diff followup xic}.
Note also that
\begin{equation}
\lambda_i     \ps{\frac{1}{\lambda_i^e }- \frac{1}{\lambda_i }}^2
\ge 
\frac{\lambda_i(\lambda_i - \lambda_i^e )^2}
{(\lambda_i^e )^2 \lambda_i^2} 
\ge 
\frac{\Delta^2}{\lambda_m^3},
\end{equation}
for $\lambda_{m} = \max \ps{\bigcup_{e \in \cE}\{\lambda_i^e\}^d_{i=1} \cup \{\lambda_i\}^d_{i=1}}$. Then the overall bound for $\xi_c(S)$ is
\begin{equation}
% \label{eqn: xic final}
	\xi_c(S)
 % \ge \frac{C^2 \Delta^2}{\lambda_{\max }^3}
	\begin{cases}
		=0, 
		% & \text { if } \vv \text{ is zero on spurious features } [d_s] \\ 
        & \text { if } S \subseteq S_\inv \cup S_r\\
		% \geq \frac{d_s(\vv') C^2 \lambda_{\min } \Delta^2}{\lambda_{\max }^4} 
		\geq \frac{D^2 \Delta^2}{\lambda_m^3}, 
		& \text { otherwise. }
	\end{cases}
\end{equation}

This is a simpler and slightly tighter lower bound than provided previously.


% where $\lambda_{\min}, \lambda_{\max}$ are the maximum and minimum across $\{\lambda_i^e\}^d_{i=1} \cup \{\lambda_i\}^d_{i=1}$. 

\subsection{Analyzing empirical risk}
\label{sec: combine xi a b c}
% So far, we have defined our error breakdown $\xi_a, \xi_b, \xi_c$ in terms of their L0 sparsity constraint $d_\inv$. 
% There are several categories of possible learned classifiers:
% \begin{align}
%     \hat \vv _\text{\{inv,r,s\}} &\coloneqq \argmin  \hat \cR (\vv), 
%     &
%     \vv \in \RR^d,
%     \Ds{\vv}_0 \le d_\inv,
%     \\
%     \hat \vv_{\text{\{inv\}}} &\coloneqq \argmin \hat \cR (\vv),
%     &
%     \vv = [\vv', \bfzero^{d_{\text r}}, \bfzero^{d_{\text s}}]^\top, 
%     \vv' \in \RR^{d_\inv},
%     \Ds{\vv}_0 \le d_\inv,
%     \\
%      \hat \vv _\text{\{inv,r\}} &\coloneqq 
%     \argmin \hat \cR (\vv),
%     &     
%         \vv = [\vv', \bfzero^{d_{\text s}}]^\top ,
%         \vv' \in\RR^{d_\inv + d_r},
%         \Ds{\vv}_0\le d_\inv,
%     \\
%     \label{eqn: inv s vs inv}
%     \hat \vv _\text{\{inv,s\}}&\coloneqq \argmin \hat \cR (\vv),
%     &
%         \vv = [\vv', \bfzero^{d_{\text r}}]^\top ,
%         \vv' \in\RR^{d_\inv + d_s},
%         \Ds{\vv}_0 \le d_\inv.
%     \\
%     \hat \vv _\text{\{r\}}\\
%     \hat \vv _\text{\{r,s\}}\\
%     \hat \vv _\text{\{s\}}.
% \end{align}
% These categories can be indicated by their footprints

We want to show that $\hat \cL(S_\inv) < \hat \cL(S)$ for footprint $S \in 2^d$ such that $\Ds{S}_0 \le d_\inv$. 
% We will find that only $|S| \le d_\inv$ are relevant, 
Borrowing the analysis in \citep{zhouSparseInvariantRisk2022}, we observe that there are two categories of footprints: those that include at least one spurious feature ($\cS_\text{with-spu}$) and those that do not ($\cS_\text{no-spu}$). 
% If $S_\inv, S_s, S_r$ are the subsets of all invariant, spurious, and random features respectively, 
% let $S_\text{no-spu} = S_\inv \cup S_r$ and $S_\text{with-spu} = S_\inv \cup S_r \cup S_s$. 

We will first show that $\hat \cL(S_\inv) < \hat \cL(S)$ for all $S \in \cS_\text{with-spu}$. 
Then, we will show the same for the footprints without spurious features, that is, $\hat \cL(S_\inv) < \hat \cL(S)$ for all $S \in \cS_\text{no-spu}$ with $S \ne S_\inv$. We will use $\beta_\inv = \beta_{S_\inv}$ as a shorthand.

% we need to consider the classifiers' footprints, namely, whether the nonzero elements occur on invariant, random, or spurious features. 
% As a result, we look at whether the footprint $S$ contains features from $S_\inv, S_s, S_r$, or the invariant, random, and spurious sets respectively.


% Using $S = S_\inv \cup S_r$ as an example,
% \[\xi_a(S) = \sum_e\bs{\hat \cR^e(\hat \beta_{\text{inv,r}}) - \cR^e(\beta^*_{\text{inv,r}})},\]
% %
% using
% % \[\hat\beta_{\text{inv,r,s}} = \vvtwohat,\] 
% %
% \[\beta^*_{\text{inv,r}} =  \argmin \cR (\vv) \subt 
%         \vv = [\vv', \bfzero^{d_{\text s}}]^\top ,
%         \vv' \in\RR^{d_\inv + d_r},
%         \Ds{\vv}_0\le d_\inv,\] 
% %
% \[\beta^e_{\text{inv,r}} = \vvallhat \subt 
%         \vv = [\vv', \bfzero^{d_{\text s}}]^\top ,
%         \vv' \in\RR^{d_\inv + d_r},
%         \Ds{\vv}_0\le d_\inv,\]
% and with corresponding  classifiers $\hat \beta_{\text{inv,r}}$ and $\hat \beta^e_{\text{inv,r}}$ defined in the same way but with empirical risks $\hat \cR(\cdot)$ and $\hat \cR^e(\cdot)$. Then, we have $\hat \beta_S = \hat \vv_S$ for a given footprint $S$, and we may also write $\beta^* = \beta^*_\inv$.


% The results bounding $\ds{\xi_a}, \ds{\xi_b}, \ds{\xi_c}$, namely \Cref{xi a corollary,xib corollary} and \Cref{eqn: xic final}, still hold for $\xi_i(m)$ for $m\in M, i\in\{a,b,c\}$. 

% Note that this holds because we have $\Ds{\beta^*_m}_0\le d_\inv$, $\Ds{\beta^e_m}_0\le d_\inv$

% \jdcomment{Should I prove this more formally -- basically rewrite everything with $\beta^e_S$ instead of $\beta^e$ etc.?} 
 
% \Cref{xi a corollary} holds from \cref{xib lemma}. This requires that \Cref{prop: v diff hat sigma} and \Cref{prop:missing-emp-general} hold true. \Cref{prop: v diff hat sigma} is algebraic manipulation and \Cref{prop:missing-emp-general} holds, keeping in mind that in \cref{eqn:define-omegae} we do not replace $\beta^*$ with $\beta^*_m$. In other words, we use $\omega^e = y - (\beta^e)^\top \vx = (\beta^*_\inv - \beta^e_m)^\top \vx +  \epsilon_\inv$. 
 
% So the problem is then to bound $\Ds{\beta^*_\inv - \beta^e_m}_2$, where we can leverage the fact that both $\beta^*_\inv$ $ \beta^e_m$ are $d_\inv$-sparse. 
% \jdcomment{Right now there's a todo in bounding the original $\Ds{\beta^*_\inv - \beta^e}_2$. This seems like it should follow from that.}  

% We tackle the different $d_\inv$ sparsity footprints one by one. 
% \subsection{Sparse IRM loss on different footprints}

First, for $S \subseteq \cS_\text{with-spu}$, we have
\begin{align}
\label{eqn: inv,r,s vs inv}
\hat \cL (S) - \hat \cL (S_{\inv})
% \nonumber\\
%
&= \sum _e \bs{
\hat \cR^e(\hat \beta_S) + \rho \ps{ \hat \cR^e(\hat \beta_S) - \hat \cR^e (\hat \beta_S^e)} 
}\nonumber\\
&\quad - 
    \sum _e \bs{\hat \cR^e(\hat \beta_\inv) + \rho \ps{ \hat \cR^e(\hat \beta_\inv) - \hat \cR^e (\hat \beta_\inv^e)}
}   
\\
%
&= \sum _e \bs{
\hat \cR^e (\hat \beta_S)}
+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S)
\nonumber \\
&\quad - \xi_a(S_\inv) - \ds{\cE}\cdot\sigma_\inv^2
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv)
\end{align}
For the second equality, note the definition of $\xi_a(S_\inv)$, using $\sigma_\inv^2$ as the variance of $\epsilon_\inv$: 
\[\xi_a(S_\inv) = \sum_e\bs{\hat \cR^e(\hat \beta_\inv) - \cR^e( \beta^*)} 
% = \sum_e \bs{\hat \cR^e(\hat \beta_\inv)} - \cR(\beta^*) 
= \sum_e \bs{\hat \cR^e(\hat \beta_\inv)} -  \sum_e \sigma_\inv^2 .\]
By dropping error terms $\sum_e\hat \cR^e( \cdot)  \ge 0$ and using $\ds{\xi_a(S)} \le \ds{\xi_b(S)}$ for all $S\in 2^d$, we get a lower bound.
\begin{align}
\hat \cL (S) - \hat \cL (S_{\inv})
&\ge - \rho\xi_b(S) + \rho \xi_c(S) 
\nonumber \\ &\quad
- \xi_a(S_\inv) - \ds{\cE}\cdot \sigma_\inv^2
- \rho\xi_a(S_\inv)  
- \rho \xi_c(S_\inv)\\
\intertext{Additionally, from \cref{eqn: xic final}, $\xi_c(S_\inv) = 0$.}
&\ge -(2\rho + 1)\ds{\xi_b} - \ds{\cE}\cdot \sigma_\inv^2
+  \rho \xi_c(S) 
\end{align}
We then select $\rho$ to eliminate the $\xi_c(S) $ term, while also producing a positive term in the RHS. Specifically, let $\rho \xi_c(S) \ge 2 \ds{\cE}\cdot \sigma_\inv^2$, getting 
\begin{equation}
\label{eqn:rho}
    \rho 
    = \frac{2\ds{\cE}\sigma^2_\inv}{D^2\Delta^2/\lambda_{m}^3}
    \ge\frac{2\ds{\cE}\sigma^2_\inv}{\xi_c(S)}.
\end{equation}

Setting the weight $\rho$ as in \Cref{eqn:rho}, we can write the gap as
\begin{align}
    \label{eqn: all features loss diff}
    &\hat \cL (S) - \hat \cL ( S_\inv)
    \nonumber \\
    &\ge
    -(2\rho + 1)
    O\ps{
        \ds{\cE}
        \sqrt{\frac{ \log \frac{|\cE|}{\delta}
        }
        {n} }
        % + \ds{\cE}\err\ps{\frac{\ds{\cE}}{\delta_b}, n}
    }
    + \ds{\cE} \cdot \sigma^2_\inv.
    % &\ge
    % -(2\rho + 1)
    % O\ps{
    %     \ds{\cE}
    %     \frac{ %\sigma^2_\inv
    %             d_\inv^{1.5}
    %             \lambda_{\max}^e
    %             \log \ps{\frac{1}{\delta_b}}
    %     }
    %     {n} 
    %     + \ds{\cE}\err\ps{\frac{\ds{\cE}}{\delta_b}, n}
    % }
    % + \ds{\cE} \cdot \sigma^2_\inv.
    % \\ & > 0,
\end{align}
Finally, we solve for the sample complexity required to 
% The resulting sample complexity required to 
differentiate $ S_\inv$ and $S$. \jdcomment{review this after fix to lemma with the xi b}
\begin{equation} 
    \label{eqn:first-complexity}
    n >      
        \frac{(2 \rho + 1) ^2  
            \log\ps { \frac{\ds{\cE}}{\delta_b}}
        }
        {\sigma^4_\inv} 
     % \max\cs{
     %    \frac{(2 \rho + 1)  %\Ds{\epsilon_\inv}^2_{\psi_2} 
     %            c_\total
     %            d_\inv^{1.5}
     %            \lambda_{\max}^e
     %            \log \ps{\frac{1}{\delta_b}}
     %    }
     %    {\sigma^2_\inv} 
     %    ,
     %    \frac{(2 \rho + 1) ^2  
     %        c_\total^2
     %        d_\inv ^2
     %        % \ps{
     %        %     d_\inv +% \Ds{\epsilon_\inv}_{\psi_2} ^2
     %        %     \log\ps { \frac{1}{\delta_b}}
     %        % }
     %        \ps{\log\ps { \frac{\ds{\cE}}{\delta_b}}}^2
     %    }
     %    {\ds{\cE}\sigma^4_\inv} 
    % }
\end{equation}
Likewise, to analyze the classifiers that include random features but not spurious features, we now consider $S \in \cS_\text{no-spu}$. Note that $S_r$ refers to the set of features $i\in[d]$ such that $\vx^e_i$ is a random feature; $S\cap S_r$ is the set of random features in $S$. 

\begin{align}
\hat \cL (S) - \hat \cL (S_{\inv})
&= \sum _e \bs{
\hat \cR^e (\hat \beta_S)}
+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S)
\nonumber \\
&\quad - \xi_a(S_\inv) - \ds{\cE}\cdot \sigma_\inv^2
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv)
\end{align}
From \citep{zhouSparseInvariantRisk2022}, we have $\cR^e(\beta^*_S) - \cR^e(\beta^*) = \sum_{i \in S_\inv \setminus S} \gamma_i x_{\inv, i} ^2$, so
\[\xi_a(S) 
= \sum _e \bs{\hat \cR^e (\hat \beta_S)  - \cR^e(\beta^*) - \cR^e(\beta^*_S) +  \cR^e(\beta^*)}
= \sum _e \hat \cR^e (\hat \beta_S) - \ds{\cE}\cdot \sigma_\inv^2 - \ds{\cE}\sum_{i \in S_\inv \setminus S} \gamma_i x_{\inv, i} ^2 .\]
We also use the definitions in \Cref{assn: gamma} to lower bound $\gamma_i$ and $x_{\inv,i}$ with high probability.
\begin{remark}
    We need a \textbf{lower bound} specifically to provide the subsequent sample complexity result. In other words, we use the sub-Gaussian norm to ``upper bound'' the features, and we additionally need a lower bound on the label noise variance $\sigma_\inv^2$ and the smallest feature variance $\sigma_0^2$.
\end{remark}
Proceeding to compare the empirical losses, 
\begin{align}
\hat \cL (S) - \hat \cL (S_{\inv})
&\ge \ps{\xi_a(S) + \ds{\cE}\cdot \sigma_\inv^2
+ \ds{\cE} \sum_{i \in S_\inv \setminus S}\gamma_i x_{\inv,i}^2} \nonumber \\
&\quad+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S) \nonumber \\
&\quad- \xi_a(S_\inv) - \ds{\cE}\cdot \sigma_\inv^2
\nonumber \\
&\quad
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv),
\\
&\ge -(2\rho + 1) \ds{\xi_b} + \ds{\cE} \cdot \bar \gamma \cdot \sigma_0^2.
\end{align}
Again, we eliminate positive terms; additionally, $\xi_c(S) = \xi_c(S_\inv) = 0$.
The resulting sample complexity for differentiating $\hat S_\inv$ from $S \in \cS_\text{no-spu}$ is then the following:
\begin{equation}
    \label{eqn:second-complexity}
    n >
	\frac{(2\rho + 1)^2 \log \frac{\ds{\cE}}{\delta_b}}{\bar \gamma^2\sigma_0^4}          % \max\cs{
     %    \frac{(2\rho + 1) % \sgnorm
     %            % c_\total
     %            % d_\inv^{1.5}
     %            % \lambda_{\max}^e
     %            % \log \ps{\frac{1}{\delta_b}}
     %            \ds{\cE}
	% 	        \sqrt{\frac{ \log \frac{|\cE|}{\delta}
     %    }
     %    {\bar \gamma \cdot \sigma_0^2} 
     %    ,
     %    \frac{(2\rho + 1) ^2 
     %        c_\total^2
     %        d_\inv^2
     %        \ps{\log\ps { \frac{\ds{\cE}}{\delta_b}}}^2
     %    }
     %    {\bar \gamma^2 \cdot \sigma_0^4} 
    % }
\end{equation}
Together, \Cref{eqn:first-complexity} and \Cref{eqn:second-complexity} form the sample complexity; we take the maximum of the two. We note that both are $O\ps{\log \frac{\ds{\cE}}{\delta_b}}$.

Because $\ds{\cS_{\text{with-spu}} \cup \cS_{\text{no-spu}}} = \binom {d}{d_\inv}\le d^{d_\inv}$, 
we can set $\delta_b = \frac{\delta}{d^{d_\inv}}$ 
before taking the union bound, 
incurring a $\log \frac{\ds{\cE}}{\delta_b} = d_\inv \log d + \log \ds{\cE} - \log \delta \le d_\inv \log \frac{d\ds{\cE}}{\delta}$.
Under this sample complexity, we have $\hat \cL (S) - \hat \cL (S_{\inv}) > 0$ for all $\ds{S} \le d_\inv, S \ne S_\inv$.
% $\sum_{i=1}^{d^{d_\inv}}\delta_b \le \delta $ for taking the union bound over all possible footprints. 
% Then let the probability for each candidate footprint $S$ be $\frac{\delta}{d}. $
% incurs an extra $ O(d_\inv \log \frac{d}{\delta})$.

% Combining \cref{eqn: first complexity } and \cref{second complexity} results in a bound $n > Q_1 + Q_2 \log (d/\delta)$. 
% % \jdcomment{Double check this part. }
% \begin{equation}
%     \label{q1}
%     Q_1 = O\ps{
%         \rho^2  
%         \ps{  \frac{1}{\sigma_\inv^4} + 
%         \frac{1}{\bar \gamma^2 \cdot \sigma_0^4} } 
%     }.
% \end{equation}
% \begin{equation}
%     \label{q2}
%     Q_2 = O\ps{
%         \ps{
%         \rho c_\total^2 d_\inv^{1.5} \lambda_{\max}^e
%         \ps{  
%              % \frac{1}{\sgnorm} 
%             1
%             + \frac{\sigma_\inv^2}{\bar \gamma \cdot  \sigma_0^2} 
%         } 
%         % \log \ps{\frac{1}{\delta_b}}
%         + 
%         \rho^2 c_\total^4 d_\inv  ^2
%         \ps{  
%             \frac{1}{\sigma_\inv^2} 
%             % 1
%             + \frac{\sigma_\inv^2}{\bar \gamma^2 \cdot  \sigma_0^4} 
%         }
%         }
%     }.
% \end{equation}
% % from the derivation in \cite{zhouSparseInvariantRisk2022}.
% The final complexity in terms of the dimensions is $O \ps {c_\total^4 d_\inv^{3} (\log \frac{d}{\delta})^2}$.
\end{proof}

\subsection{Comparison: Empirical Loss with Population Minima}
\label{proof:prop1-popn-minimzer}
We provide a proof for \Cref{thm:info-theory-popn} of the main paper, restated below for reference:

% \begin{theorem}[Sample complexity for sparse IRM with population optima]
% \label{prop:emp-loss-popn-minimizer}
\paragraph{Theorem 2}
For population minimizers as defined in \Cref{eqn:population-optima}, and $n$ samples per environment $e\in \cE$, for a total of $N = |\cE| n$ across the whole training set, we have
\begin{equation}
\hat \cL(\beta^*) < \hat  \cL (\beta^*_S), \quad \ds{S} \le d_\inv,  S \ne S_\inv,
% \ \forall \hat \vv \ne \beta^* \text{ and } 
% \Ds {\hat \vv} _0 \le d_\inv, 
\end{equation}
if $n > O\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |\cE|}{\delta}}}$ with constants specified below.
% \end{theorem}
\begin{proof}
We want to show that 
\begin{equation}
    \hat \cL (\beta^*_\inv) < \hat \cL(\beta^*_S)
\end{equation}
for $\ds{S} \le d_\inv$ and $S \ne S_\inv$. We will use the notation $\beta^*_\inv \coloneqq \beta^* = \beta^*_{S_\inv}$ for the invariant optimal predictor.

First note that with high probability $1-\delta_1$,
\begin{align}
    \label{eqn:lhat-betainv1}
    \hat \cL(\beta^*_\inv) 
    % = \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv) 
    % + \rho \sum_{e\in \cE} \bs{ \hat \cR^e(\beta^*_\inv) - \hat \cR^e (\hat \beta^e_\inv)}
    &= \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv) 
    + \rho \sum_{e\in \cE} \bs{ 
    \hat \cR^e(\beta^*_\inv)  - \hat \cR^e(\beta^e_\inv)
    + \hat \cR^e(\beta^e_\inv)   - \hat \cR^e (\hat \beta^e_\inv)}\\
    &= \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv) 
    + \rho \left (
        0 
        % + O\ps{\kappa_{\inv}^2\sqrt{ \frac{\log (\frac{1}{\delta})}{\ds{\cE}n}}}
        % + O\ps{\kappa_{\inv} \sqrt{\frac{\log (\frac{1}{\delta}) {\ds{\cE}n} }}}
        + 
        % \ds{\cE} \cdot 
        \xi_b(S_\inv)
        \right )
    \label{eqn:lhat-betainv2}\\
    &\le \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv) 
    + \rho \ds{\cE} O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{\ds{\cE}}{\delta_1})}{n} }}
\end{align}
The second equality uses two definitions. First, $\beta^{e}_\inv = \beta^*_\inv$ for $e \in \cE$, so $\hat \cR^e(\beta^e_\inv) - \hat \cR^e(\beta^*_\inv) = 0$. 
% Then, $|\hat \cR^e(\beta^*_\inv) - \cR^e(\beta^*_\inv)| = |\hat \EE^e[\epsilon_\inv^2] - \EE^e[\epsilon_\inv^2]|$ can be bound in high probability with Hoeffding's inequality. 

Then, we use the
% $\hat \cR^e(\beta^*_\inv)  - \cR^e(\beta^*_\inv) \le O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{1}{\delta})}{n} }}$
definition of $\xi_b(S)$ from \Cref{eqn: xi b def}, which is upper bounded in \Cref{xib corollary}.
Also, the equality $\beta^{e_1}_\inv = \beta^{e_2} _\inv$ holds for all $e_1, e_2 \in \cE$, because the subset selects only the invariant features that are shared between all environments. 

Next, with high probability $1-\delta_2$,
\begin{align}
    \label{eqn:lhat-betaS}
    \hat \cL(\beta^*_S) 
    &= \sum_{e\in \cE} \hat \cR^e(\beta^*_S) 
    + \rho \sum_{e\in \cE} \bs{ 
    \hat \cR^e(\beta^*_S)  - \hat \cR^e(\beta^e_S)
    + \hat \cR^e(\beta^e_S)   - \hat \cR^e (\hat \beta^e_S)}\\
     &\ge \sum_{e\in \cE} \hat \cR^e(\beta^*_S) 
    + \rho \xi_c(S)
    + \rho \sum_{e\in \cE} \bs{ 
     - 2O\ps{c_\total \sqrt{\frac{\log(\frac{\ds{\cE}}{\delta_2})}{n}} }
    } - \rho|\xi_b(S)|\\
    &\ge
    \sum_{e\in \cE} \hat \cR^e(\beta^*_S) 
    % + \rho \sum_{e\in \cE} \bs{
    % \hat \cR^e(\beta^*_S)  - \hat \cR^e(\beta^e_S)
    % }
    + \rho \ds{\cE} O \ps{
    \frac{D^2\Delta^2}{\lambda_m^3}
    -c_\total\sqrt{\frac{\log \frac{\ds{\cE}}{\delta_2}}{n}}
    }
    - \rho \ds{\cE} 
    O\ps{\kappa_\inv\sqrt{\frac{\log (\frac{\ds{\cE}}{\delta_2})}{n} }}
    % O \ps{ \frac{d_\inv^{1.5}\lambda^e_{\max}\log{\frac{1}{\delta_2}}}{n}},
\end{align}
where $c_\total ^2 = c_0 \max \{ (c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)^2, \kappa_\inv^4\}$ for some constant $c_0 > 0$.
To get the first inequality, we want to bound $|\hat \cR(\beta^*_S)  - \cR(\beta^*_S) |$ and $|\hat \cR(\beta^e_S)  - \cR(\beta^e_S) |$. 
Hoeffding's inequality may be used, but we need the sub-Gaussian norm of the least squares error; let this be  $Z_j \coloneqq (y_j -(\beta^*_S)^\top \vx_j^e)^2$ for $j \in [n]$:
\begin{align}
% \cR^e(\beta^*_S) 
% = 
Z_j = (y_j - (\beta^*_S)^\top \vx_j^e)^2 
=   ((\beta^*_\inv - \beta^*_S)^\top \vx_j^e + \epsilon_{\inv,j} )^2
\le (\Ds{\beta^*_\inv - \beta^*_S}_2 \Ds{\vx_j^e}_2 + \epsilon_{\inv,j} )^2.
\end{align}
By the assumption that classifiers $\beta^*_\inv, \beta^*_S$ are normalized, we have $\Ds{\beta^*_\inv - \beta^*_S}_2\le 2$, and $\Ds{\vx^e}_2$ is sub-Gaussian; from \Cref{lemma:norm-x} with probability $1-\delta$, we get
\begin{equation}
\EE^e[Z_i] = O (\max \{(c_s + c_a \kappa_s + c_r \kappa_r)^2, \kappa_\inv^2\}),
\end{equation} 
and 
\begin{equation}
c_\total  =
\Ds{Z_i}_{\psi_2} 
\le O (\max \{
4 \Ds{\vx^e}_2^2 + 4\Ds{\vx^e}_2\epsilon_\inv + \epsilon_\inv^2
\})
% \Ds{Z_i - \EE^e[Z_i]}_{\psi_2} 
% \le O (\max \{
% \Ds{\Ds{\vx^e}_2- \EE^e[\Ds{\vx^e}_2]}_{\psi_2}^2,
% \Ds{\epsilon_{\inv}}_{\psi_2}^2
% \})
% =O ( \max \{ (c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)^2, \kappa_\inv^4\})
=O(\max \{
(c_s + c_a \kappa_s + c_r \kappa_r )^2, \kappa_\inv^4
\})
.
\end{equation}

Applying Hoeffding's inequality to $|\hat \cR^e(\beta^*_S)  - \cR^e(\beta^*_S) |$ then gives the bound 
$O\ps{c_\total \sqrt{\frac{\log(\frac{\ds{\cE}}{\delta_2})}{n}} }$ with probability $1-\delta_2$.
Likewise, 
$|\hat \cR^e(\beta^e_S)  - \cR^e(\beta^e_S) | \le O\ps{c_\total \sqrt{\frac{\log(\frac{\ds{\cE}}{\delta_2})}{n}} }$ 
with probability $1-\delta_2$.
We apply these inequalities over $\ds{\cE}$ environments, so we set $\delta_1 = \delta_2 = \frac{\delta}{2\ds{\cE}}$.
% We remark that this bound can be quite catastrophic if, as in the original setting in \citet{zhouSparseInvariantRisk2022}, in which $\vzeta_s = \vone^{d_s}$ and therefore $c_s^2 = d_s$, making $c_\total^2 = O(d_s^2)$.
% To complete the first inequality, we use the upper bound on $|\xi_b(S)|$ in \Cref{xib corollary}.
% To evaluate $\xi_c(S)$, we use the lower bound for $\xi_c(S) > \frac{D^2 \Delta^2}{\lambda_m^3} $ as defined in \Cref{prop:xic bound}
% .
To complete the first inequality, we use $\xi_b(S) = \sum_{e\in \cE}[\hat \cR^e(\beta^e_S)   - \hat \cR^e (\hat \beta^e_S) ]$ and $\xi_c(S) = \sum_{e\in \cE}[\cR^e(\beta^*_S)  - \cR^e(\beta^e_S)]$.

We use the upper bound on $|\xi_b(S)|$ in \Cref{xib corollary} and the lower bound on $\xi_c(S)$ from \Cref{prop:xic bound}, noting that $\xi_c(S)$ is greater than some constant $c>0$  with high probability provided that there exists at least one feature $i$ such that $\alpha^{e_1}_i \ne \alpha^{e_2}_i$, for $e_1, e_2 \in \cE, e_1 \ne e_2$.
% This arises from \Cref{prop:xic bound}.
% \abmargincomment{check}
% \begin{align}
%     \sum_{e\in \cE} \bs{
% \hat \cR^e(\beta^*_S)  - \hat \cR^e(\beta^e_S)
% }
% &= \sum_{e\in \cE} \bs{
% \hat \cR^e(\beta^*_S)  
% - \cR^e(\beta^*_S)  
% + \cR^e(\beta^*_S)  
% -  \cR^e(\beta^e_S)
% +  \cR^e(\beta^e_S)
% - \hat \cR^e(\beta^e_S)
% }\\
% &=
% \sum_{e\in \cE} \bs{
% O \ps {\sqrt{\frac{\log\frac{1}{\delta}} {n}}}
%  + \xi_c(S)
% + O \ps {\sqrt{\frac{\log\frac{1}{\delta}} {n}}}
% }\\
% &\ge|\cE|O \ps {\sqrt{\frac{\log\frac{1}{\delta}} {n}}}
% \end{align}
% The second equality depends on the definition $\cR^e(\beta^e_S) = \min_{\vv \in \Sp(S)} 
% \cR^e(\vv)$, making $\cR^e(\beta^*_S)  - \cR^e(\beta^e_S) \ge 0 $.


Next, we demonstrate that $\hat \cL(\beta^*_S) - \hat \cL(\beta^*_\inv)>  0$.
% Taking this difference, we can denote the part controlled by sample complexity by $G(\frac{1}{\delta}, n)$:
% \begin{equation}
%    G (\frac{1}{\delta}, n)\coloneqq  \rho \ds{\cE} O\ps{ \frac{d_\inv^{1.5}\lambda^e_{\max}\log{\frac{1}{\delta}}}{n}
%     +\sqrt{\frac{\log \frac{1}{\delta_2}}{n}}
%     + \frac{d_\inv^{1.5}\lambda^e_{\max}\log{\frac{1}{\delta_2}}}{n}}
% \end{equation}
% we have with probability $1-\delta_3$,
\begin{align}
 \hat \cL(\beta^*_S) -\hat \cL(\beta^*_\inv)
&\ge
 \sum_{e\in \cE} \hat \cR^e(\beta^*_S) -
    \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv) 
    + \rho \ds{\cE} O \ps{
    \frac{D^2\Delta^2}{\lambda_m^3}
    -
	c_\total \sqrt\frac{\log \ds{\cE}/\delta_3}{n}
    } 
    \\
    &= -\ds{\cE}\sigma_\inv^2 
    + \rho \ds{\cE} \ds{\xi_c(S)} 
    - \rho \ds{\cE} c_\total \sqrt\frac{\log \ds{\cE}/\delta_3}{n}
    % - G (\frac{1}{\delta}, n)
    % \\
% &\ge
%  % \sum_{e\in \cE} \hat \cR^e(\beta^*_S) 
%  -  \ds{\cE}\sigma_\inv^2
%     + \ds{\cE} O \ps{\sqrt\frac{\log 1/\delta_3}{n}}
%     + \rho \ds{\cE} O \ps{
%     \frac{D^2\Delta^2}{\lambda_m^3}} 
    % - G (\frac{1}{\delta}, n)
\end{align}
We observe that $\sum_{e\in \cE} \hat \cR^e(\beta^*_S) -
    \sum_{e\in \cE} \hat \cR^e(\beta^*_\inv)  \ge 0 - \ds{\cE}\sigma_\inv^2$. 
   First, we set $\rho$ such that $\rho \ds{\cE} \ds{\xi_c(S)} -\ds{\cE}\sigma_\inv^2 >\sigma_\inv^2> 0$. This can be satisfied by setting 
   % the hyperparameter $\rho$, 
\begin{equation}
    \rho 
    > \frac{\sigma^2_\inv + \ds{\cE} \sigma^2_\inv }{ \ds{\cE} \xi_c(S)},
    % > \frac{\sigma^2_\inv}{D^2\Delta^2/\lambda_{\max}^3}
    % \ge\frac{\sigma^2_\inv}{\xi_c(S)}.
\end{equation}
which, given the lower bound on $\xi_c(S)$ from \Cref{prop:xic bound}, holds whenever
% \begin{equation}
$
\rho > \frac{(1 + \ds{\cE})\sigma^2_\inv}{\ds{\cE}D^2\Delta^2/\lambda_{\max}^3}
$.
% \end{equation}
Then, we can guarantee that $\hat \cL(\beta^*_S) - \hat \cL(\beta^*_\inv)>  0$, with the sample complexity computed in the proof of \Cref{thm:info-theory}:
\begin{equation} 
    \label{eqn: popn complex }
    n > 
     \max\cs{
		\frac{(2\rho + 1)^2 c_\total^2 \log \frac{\ds{\cE}}{\delta_b}}
		{\sigma_\inv^4},
		\frac{(2\rho + 1)^2  c_\total^2\log \frac{\ds{\cE}}{\delta_b}}
		{\bar \gamma^2\sigma_0^4}
        % \frac{(2 \rho + 1)  %\Ds{\epsilon_\inv}^2_{\psi_2} 
        %         c_\total
        %         d_\inv^{1.5}
        %         \lambda_{\max}^e
        %         \log \ps{\frac{1}{\delta_b}}
        % }
        % {\sigma^2_\inv} 
        % ,
        % \frac{(2 \rho + 1) ^2  
        %     c_\total^2
        %     d_\inv ^2
        %     \ps{\log\ps { \frac{\ds{\cE}}{\delta_b}}}^2
        % }
        % {\ds{\cE}\sigma^4_\inv} 
    }
\end{equation}
Again, we need to solve $\binom {d}{d_\inv}\le d^{d_\inv}$ optimization problems, so we set $\delta_b = \frac{\delta}{d^{d_\inv}}$ before taking the union bound.

With $n > c_\total^2 d_\inv \log \frac{d\ds{\cE}}{\delta}$, we achieve the desired result.


We note that in the general case, $c_\total = 
% O ( \max \{ (c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)^2, \kappa_\inv^4\})
O(\max \{
(c_s + c_a \kappa_s + c_r \kappa_r )^2, \kappa_\inv^4
\})
$. 
With the original setting in \citet{zhouSparseInvariantRisk2022}, $\vzeta_s = \vone^{d_s}$. This leads to $c_s = \sqrt{d_s}$. Then, with $c_\total^2 = O(c_s^4) = O(d_s^2)$, we get a polynomial dependency on $d_s$ without a more refined analysis.
% In general, if we assume $d_s + d_r \gg d_\inv$, then $\Ds{\vzeta_s^e}_2 = c_s^2 = O(d)$. This can cause catastrophic error in overparameterized settings, which motivates the use of sparsity constraints.
\end{proof}
\paragraph{Remark:}
It is also unlikely that spurious features are all equally correlated with the label, as assumed when we take $\vzeta_s^e$ to be all ones. Even with sparse feature selection, it is possible for the predictor to pick up the largest elements of $\vx^e$. An example of a ``heavy-hitter'' would be a spurious feature $x^e_j$ that has a strong correlation with the label through a high $\zeta_j^e$, contributing to $c_s^2$. 
In the case that $j \in S$, we end up with $c_\total \propto c_s^2$. This is explored further in \Cref{cor:missing-emp-ones}.

On the other end of the spectrum, a more evenly distributed feature vector can demonstrate even tighter bounds. This is explored further in \Cref{cor:missing-emp-uniform}.
\qed

\subsection{Comparison: Sparse IRM vs ERM and Sparse ERM}
\label{sec:setting-props}
The following propositions give a characterization of this data generation model to motivate the use of IRM. 
% \Cref{prop: vs only,prop: environmental optimal not sparse} show that ERM on the global population risk is unable to find the invariant features. 
\Cref{prop: sparse erm fails too} shows that ERM with sparsity constraints on the global population is also unable to find the invariant features. 

We can represent a given classifier $\vv$ in its three parts $\vv = [\vv_\inv, \vv_s, \vv_r]^\top$. 

\begin{proposition}[Invariant Optimal Classifier is ground truth]
\label{prop:invariant-optimal-classifier}
In the problem setting defined by \Cref{eqn:problem-setting}, $\beta^* = [\gamma^\top, (\vzero ^{d_s})^\top, (\vzero^{d_r})^\top]$, and is also a solution to \Cref{eqn:irm}. \abcomment{can this be easily shown, can we cite}
\end{proposition}
\begin{proof}
First, $\cR(\beta^*) = \Var (\epsilon_\inv)$. Let a comparison be made to candidate parameters $\beta \in \Sp(S)$ with $|S| \le d_\inv$. If $\beta$ places weight on a random feature, that is, $\beta_i \ne 0$ for some $i\in S_{r}$, we lose information and $\cR^e(\beta) \ge \cR^e(\beta^*)$. Likewise, $\cJ^e(\beta)  = \cR^e(\beta) - \min_{\beta' \in \Sp(S)} \cR^e(\beta') \ge 0 = \cJ^e(\beta^*)$. So, any parameter $\beta$ with random features will not be a solution to \Cref{eqn:irm}.

Next, we consider $\beta_i \ne 0$ for $i\in S_{sp}$, potentially including spurious features. In this case, we want to show that $\cL(\beta^*) - \cL(\beta)$ is negative:
\begin{align*}
\cL(\beta^*) - \cL(\beta)
&=   \sum_{e\in \cE} \bs{\cR^e(\beta^*) - \cR^e(\beta) + \rho\cJ^e(\beta^*) -  \rho\cJ^e(\beta)} \\
&= \sum_{e\in \cE} \bs{\Var (\epsilon_\inv) - \cR^e(\beta)-  \rho \cJ^e(\beta)}
\\
& =   \sum_{e\in \cE} \bs{\Var (\epsilon_\inv) - (1+\rho)  \cR^e(\beta) + \rho \min_{\beta' \in \Sp(S)} \cR^e(\beta')}.
\end{align*}
We know that $\cR^e(\beta) - \min_{\beta' \in \Sp(S)} \cR^e(\beta') = \cJ^e(\beta) \ge 0$. With an appropriately selected $\rho$, a non-invariant $\beta$ therefore incurs a greater IRM population loss than the optimum $\beta^*$.\jdcomment{Really should do a precise analysis with $\alpha$ required for this}
\end{proof}


\begin{proposition}[ERM does not overfit on random features]
\label{prop: ignore noise}
Because $\vx^e_r$ is independent of the other features and zero-mean, we can guarantee that $\vv^e$ has no nonzero entries on the random noise features $\vx_r$. 
In other words,
% \abcomment{why? prove these results, as propositions}
\begin{equation}
    \cR^e([\vv_\inv^\top, \vv_s^\top, \vv_r^\top]^\top) \ge \cR^e([\vv_\inv^\top, \vv_s^\top, \vzero^\top]^\top)
\end{equation}
\end{proposition}
% So, we have a sparsity guarantee on the environmental optimum, $\Ds{\vv^e}_0 \le s = d_\inv + d_s$. 
\begin{proof}
% We can let $\vv_{\inv,s} \in \RR^{d_\inv + d_s}$ be the concatenation $[\vv_\inv, \vs]^\top$. 
We can see that
\begin{align*}
    \cR^e([\vv_\inv^\top, \vv_s^\top, \vv_r^\top]^\top)
    &= \EE^e\bs{\ps{
        y - [\vv_\inv, \vv_s] [\vx_\inv^e, \vx^e_s] - \vv_r^\top \vx_r
    }^2}\\
    &= \EE^e\bs{
        (y - [\vv_\inv, \vv_s] [\vx_\inv^e, \vx^e_s])^2
        - 2(y - [\vv_\inv, \vv_s] [\vx_\inv^e, \vx^e_s])(\vv_r^\top \vepsilon_r)
        + (\vv_r^\top \vepsilon_r)^2
    }\\
    &= \cR^e([\vv_\inv^\top, \vv_s^\top, \vzero^\top]^\top) + \EE[(\vv_r^\top \vepsilon_r)^2]\\
    &\ge \cR^e([\vv_\inv^\top, \vv_s^\top, \vzero^\top]^\top).
\end{align*}
The difference $\EE[(\vv_r^\top \vepsilon_r)^2]$ vanishes if and only if $\vv_r^\top \vepsilon_r = 0$ almost surely, in which case $\vv_r = \vzero$.
\end{proof}
\begin{proposition}[ERM overfits on spurious features]
When there exists a spurious feature $i\in [d]$ such that $\alpha_i^2 < \sigma_\inv^2$, then 
\begin{equation}
    \beta^*= [\gamma,0,0]^\top \notin \argmin_{\vv\in \RR^d} \cR(\vv).
\end{equation}
Thus, unconstrained ERM on the even mixture of environments will not be able to detect the ground truth.
\end{proposition}
\begin{proof}
This can be seen by setting $\vv = \frac{\ve_i}{\zeta_i}$, where $\ve_i$ is the standard basis vector with a 1 on the spurious feature $i$:
\begin{equation*}
    \cR(\frac{\ve_i}{\zeta_i})
    % = \EE[(y- \ve_i^\top \vx^e)^2] 
    = \EE\bs{\EE^e[(y - y - \alpha_i^e \epsilon_{s,i})^2]}
    = \EE\bs{(\alpha_i^e)^2} = \alpha_i^2 
    < \sigma_\inv^2 = \cR([\gamma,\vzero, \vzero]).
\end{equation*}
While $\frac{\ve_i}{\zeta_i}$ is not generally the minimizer of the global population loss, it does show that $[\gamma,\vzero, \vzero]$ does not achieve minimum loss when there are no restrictions on footprint/L0 norm.
\end{proof}


\begin{proposition}[Sparse ERM doesn't find invariant features]
\label{prop: sparse erm fails too}
With population risk, ERM with a constrained L0 norm does not find the invariant features
when there exists a spurious feature $i$ with $\alpha_i^2 < \sigma_\inv^2$: 
\begin{equation}
    % \vv_{erm} = \argmin_{\Ds{\vv}_0 \le d_\inv} \cR(\vv)
    \min_{\Ds{\vv}_0 \le d_\inv} \cR(\vv) < \cR(\beta^*)
    % \min_{\vv'} \cR(\vv') \text{ s.t. } \vv' = [\vv'_\inv, \vzero, \vzero].
\end{equation}
\end{proposition}

\begin{proof}
We can observe that
\begin{equation}
     \min_{\Ds{\vv}_0 \le d_\inv} \cR(\vv) \le \cR(\frac{\ve_i}{\zeta_i}) = \alpha_i^2 < \sigma^2_\inv = \cR(\beta^*)
     % = \min_{\vv'} \cR(\vv') \text{ s.t. } \vv' = [\vv'_\inv, \vzero, \vzero].
\end{equation}
In other words, ERM with sparsity constraints is not guaranteed to find the exact invariant footprint in the population case.
\end{proof}

\begin{proposition}[Sparse IRM finds invariant features]
% Minimizing $ \cL (\vv)$ gets $[\vv_\inv, \vzero, \vzero]$, even when $\alpha_i^2 < \sigma_\inv^2$.
IRM with sparsity can find the invariant optimal classifier, where
\begin{equation}
    \beta^* = \argmin_{\vv} \cL (\vv) \subt \Ds{\vv}_0 \le d_\inv 
\end{equation}
\end{proposition}
\begin{proof}
With the assumptions in \Cref{sec:assumptions}, we can analyze \cref{eqn: xi c def} to see that the penalty term in $\cL(\vv)$, which is $\cR^e(\vv) - \cR^e(\vv^e)$ in the population case, is zero only when the learned classifier has nonzero elements on the invariant features only.
% \jdcomment{incoherence stuff comes into play here probably. This proposition is a little bit of a big deal and is not discussed in \cite{zhouSparseInvariantRisk2022}.}
\end{proof}

