\label{sec:assumptions}
\begin{assumption}
\label{assn: subgaussian}
The noise on the invariant features is subgaussian. Specifically, there exists $C>0$ such that
\[\EE\bs{ \exp (t\epsilon_\inv)} \le \exp(t^2 C^2)\quad \forall t\in \RR.\]
% Then, we can say that the variance of the invariant feature is bounded by some positive constant $\sigma_\inv > 0$.
% \todo{does the spurious feature also have bounded variance?}
\end{assumption}

\begin{assumption}
\label{assn: gamma}
There exist positive constants $\sigma_0$ and $\bar \gamma$ such that 
\[\forall i \in [d_\inv], |\gamma_i|\ge \bar \gamma, \ \Var[x_{\inv,i}] \ge \sigma_0^2.\]
This states that each invariant feature $x_{\inv,i}$ and its corresponding ground truth weight $\gamma_i$ must sufficiently contribute to the explanation of the label. 
% \jdcomment{The variance needs to be lower bounded for the L0 analysis to work}
\end{assumption}

\begin{assumption}
\label{assn: spurious diff}
For each spurious feature $i \in [d_s]$, let $\alpha_i^2 = \frac{1}{\ds{\cE}}\sum_{e\in \cE}(\alpha_i^e)^2$. There exists a constant $\Delta > 0$ such that, for every spurious feature $i \in [d_s]$ and every environment $e \in \cE$,
\[\ds{ \alpha_i^2 - (\alpha_i^e)^2 } \ge \Delta.\]
% This is altered from \cite{zhouSparseInvariantRisk2022}; see \Cref{obs: alpha def}.
\end{assumption}

% \begin{assumption}
% \jdcomment{Compare $\alpha_i^2$ with $\sigma_\inv^2$}
% \jdcomment{Is that alpha stuff somehow captured by \cref{assn: spurious diff followup xic}?}
% \end{assumption}

% \begin{assumption}
% \label{assn:boundedl2}
%     The optimal classifiers in this problem are bounded. In other words, 
%     \begin{align*}
%         &\Ds{\gamma}_2  = \Ds{\beta^*}_2= 1,\\
%         &\Ds{\beta^e_S}_2 = O(d_\inv) \forall S \in 2^d, e\in \cE.\\
%     \end{align*}
%     % for some constant $c_1 \in \RR_+$.
% \end{assumption}

\begin{assumption}
\label{assn: spurious diff followup xic}
Let $\{\lambda_i^e\}^d_{i=1}$ and $\{\lambda_i\}^d_{i=1}$ be the eigenvalues of $\Sigma^e$ and $\Sigma$ respectively with corresponding eigenvectors $\{\vu_i^e\}^d_{i=1}$ and $\{\vu_i\}^d_{i=1}$. 

Then for those $\vu_i$ such that $\lambda_i^e - \lambda_i = \alpha_i^2 - (\alpha_i^e)^2 > 0$, \abcomment{the last inequality need not be true for all $i$ based on the assumptions, we should be able to show that $\exists i$ for which this is true} we have a constant $D$ such that
% For each subset of the nonzero elements in $\vx^e$ that is size $d_\inv$ or smaller, 
\begin{equation}
\ds{\EE[\vx^e y^e]^\top\vu_i} \ge D > 0
\end{equation}
\abcomment{why?} 
\jdcomment{Kind of an assumption just to ``make things work,'' inherited from \citet{zhouSparseInvariantRisk2022}}
% \jdcomment{Assumption 5 in the original paper. todo: figure this out ``ensures that the coefficients of a spurious feature can not be always 0, otherwise IRM can not differentiate betwee the spurious and invariant features, neither."}
% \abcomment{what is $\vx, y$? defined these clearly before we start any analysis}
\end{assumption}

\begin{assumption}
\label{assn: low eigval}
The invariant features must not be rank deficient. That is,
\begin{equation}
    \min_i \lambda_i \ge c_{\min} > 0 
\end{equation}
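where $\lambda_i$ denotes the $i$th eigenvalue of $\hat \Sigma$ (cf.\ \Cref{assn: spurious diff followup xic}).
% NOTE(review): \Cref{assn: spurious diff followup xic} defines $\lambda_i$ as the eigenvalues of $\Sigma$, not $\hat \Sigma$ --- confirm which matrix is intended here.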
\end{assumption}

\begin{assumption}
\label{assn: rsc}
The loss function satisfies restricted strong convexity (RSC). Specifically, $\cL$ satisfies $\alpha$-restricted strong convexity ($\alpha$-RSC).
% with respect to the tuple $($
% \begin{equation}
%     \min_i \lambda_i \ge c_{\min} > 0 
% \end{equation}
% for $\lambda_i$ (defined in \Cref{assn: spurious diff followup xic}) as an eigenvalue of $\hat \Sigma$.
\end{assumption}
% \begin{assumption}
% \jdcomment{Not finalized. I think this is the right direction, though.}
% \label{assn: mutual incoherence}
% There exists $\zeta \in [0,1)$ such that
% \begin{equation}
%     \max_{j \notin S_{\inv}} \min_{e\in \cE} \Ds{((X^e_\inv)^\top X^e_\inv)^{-1} (X^e_\inv)^\top X^e_j}_1 \le \zeta,
% \end{equation}
% where $X^e_\inv \in \RR^{n \times d_\inv}$ is comprised of the invariant features in the data matrix $X^e$ and  $X^e_j \in \RR^n$ is defined to be the column of some non-invariant feature $j$. 

% In other words, for any spurious or random feature $j$, there must exist an environment in which it is not highly correlated with the submatrix of invariant features. 
% \end{assumption}