\subsection{Sparse IRM with LASSO}
\label{proof:l1}
Below is the proof of \Cref{thm:convex-relaxation}. We restate it for reference:
\paragraph{Theorem 3}
Assume $n$ samples per training environment, for $n > O\ps{\textnormal{poly}(d_\inv)\log(d)\log\ps{\frac{ |E|}{\delta}}}$.
% with at least $n > Q\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |E|}{\delta}}}$ per training environment, 
Then, under the assumptions in \Cref{sec:assumptions}, 
with probability at least $1-\delta$, the program
\begin{equation}
\label{eq:basic-setup}
\begin{gathered}
\tilde \beta = 
\arg\min _{\vv} \hat \cL(\vv)
\text { s.t. } 
\vv \in\bbR^{d}, \Ds{\vv}_1\leq d_\inv, %\Ds{\vv'}_0
\end{gathered}
\end{equation}
returns a parameter $\tilde\beta$ with low estimation error $\Ds{\tilde\beta - \beta^*_\inv}_2 \le O(\sqrt{\frac{d_\inv} {n}})$.

Denoting $\hat{\cL}(\vv) := \hat{\cL}_{\mm}(\vv)$ and $c^e_S = \min_{\vv^e \in \Sp (S)} \hat{\cR}^e(\vv^e)$, we note that $\hat \cL (\vv)$ is a discontinuous function. To apply results from the high-dimensional statistics literature \citep{negahban_2009_higdim_mestimators, banerjee2015estimation, Wainwright2019-tb}, we work with the smooth approximation of this function.
\begin{align*}
\hat{\cL}(\vv) + \gamma_n  \| \vv \|_1 & = 
\sum_{e \in \cE_{tr}}  \hat \cR^e(\vv)  + \rho \sum_{e\in \cE_{tr}}  \max _{\vv^e \in \Sp (S)} 
\left[\hat \cR^e(\vv)-\hat \cR^e\left(\vv^e \right)\right] + \gamma_n  \| \vv \|_1 \\
& =  \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv)  + \rho \sum_{e\in \cE_{tr}}  \left[\hat \cR^e(\vv) - c^e_S \right] + \gamma_n  \| \vv \|_1 \\
& = (1+\rho) \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv) - \rho \sum_{e \in \cE_{tr}}  c^e_S + \gamma_n  \| \vv \|_1
\end{align*}
The dependency on $c^e_S$ causes $\hat \cL(\vv)$ to be discontinuous. We reformulate the minimization with a continuous proxy:
% Then, we note that minimizing the loss $\min_{\vv} \hat \cL(\vv)$
% can be written $\min_{S\in \cS} \min_{\vv} \hat \cL(\vv)$ 
% for $\cS \coloneqq \{S \in 2^d \mid |S| \le d_\inv\}$.
\begin{align}
     % \min_{S\in \cS}\min_{\vv \in \Sp(S)}\hat{\cL}(\vv) 
    % + \gamma_n  \| \vv \|_1 
    \tilde \cL(\vv) 
    & = - \log \sum_{S \in \cS} \exp \cs{- \hat \cL(\vv;S)}\\
    & = - \log \sum_{S \in \cS} \exp \cs{-(1+\rho) \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv;S) + \rho \sum_{e \in \cE_{tr}}  c^e_S },
\end{align}
observing that 
\begin{equation}
    \min_{\vv }\tilde \cL(\vv) 
    \le 
    \min_{\vv} \hat \cL(\vv) + \log \binom{d}{d_\inv} 
    = \min_{\vv} \hat \cL(\vv) + O(d_\inv \log d).
\end{equation}

% Denoting $\hat{\cL}(\vv) := \hat{\cL}_{\mm}(\vv)$ and $c^e = \min_{\vv^e \in \Sp (S)} \hat{\cR}^e(\vv^e)$, the convex relaxed regularized problem can be written as that of minimizing
% \begin{align*}
% \hat{\cL}(\vv) + \gamma_n  \| \vv \|_1 & = 
% \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv)  + \rho \sum_{e\in \cE_{tr}}  \max _{\vv^e \in \Sp (S)} 
% \left[\hat \cR^e(\vv)-\hat \cR^e\left(\vv^e \right)\right] + \gamma_n  \| \vv \|_1 \\
% & =  \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv)  + \rho \sum_{e\in \cE_{tr}}  \left[\hat \cR^e(\vv) - c^e \right] + \gamma_n  \| \vv \|_1 \\
% & = (1+\rho) \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv) - \rho \sum_{e \in \cE_{tr}}  c^e + \gamma_n  \| \vv \|_1
% \end{align*}


Let $E$ denote the total number of environments, each with $n$ samples, and let $X^e \in \R^{n \times p}$ denote the design matrix for environment $e \in \cE_{tr}$, with independent rows $\vx_i^e$, $i \in [n]$, $e \in [E]$. \abcomment{Need $\Sigma = \E[\vx \vx^\top]$ with $\lambda_{\max}(\Sigma) := \lambda_{\max}$ -- this should have been defined earlier}
We start by showing that $\vx_i^e$ are subGaussian with $\psi_2$ norm of the order of $\min(1,\sqrt{d_s})$, where $d_s$ is the number of spurious features. \abcomment{will also depend on $\lambda_{\max}$ for anisotropic designs $\vx_{\inv}$, with largest eigenvalue of covariance $\lambda_{\max}$}
\begin{lemma}[Sub-Gaussian Design]
    $\kappa_{\vx} := \| \Sigma^{-1/2} \vx^e_i \|_{\psi_2} \leq c_1 \min(1,\sqrt{d_s})$. 
    \label{eqn sub-gaussian design}
\end{lemma}
Next, following developments in the high-dimensional statistics literature \citep{negahban_2009_higdim_mestimators, banerjee2015estimation, Wainwright2019-tb}, with $\vv^* = [\gamma, 0, 0]$ denoting the population parameter, we set
\begin{align}
\gamma_n \geq 2 \left\| \nabla \hat{\cL}(\vv^*) \right\|_{\infty}~.
\label{eq:lambda_low}
\end{align}


Through an analysis based on the form of $\hat{\cL}, \vv^*, X^e$, we have the following result.
\begin{lemma}[Regularization Weight]
Setting $\gamma_n = c (1+\rho) \kappa_{\vx} \kappa_{\eps_{\inv}} \sqrt{\lambda_{\max}} \sqrt{ n E \log p }$, where $\kappa_{\vx}, \kappa_{\eps_{\inv}}$ are respectively the $\psi_2$ norms of covariates $\vx^e_i$ and noise $\eps_{\inv}$,  satisfies the requirement on $\gamma_n$ in \eqref{eq:lambda_low} with high probability.
\end{lemma}
\proof By definition
\begin{align*}
\frac{1}{1+\rho} \nabla \hat{\cL}(\vv^*) & =   \sum_{e \in \cE_{tr}}  \nabla \hat \cR^e(\vv^*) \\
& = \sum_{e \in \cE_{tr}}  \sum_{i=1}^n - (y_i^e - (\vx_i^e)^\top \vv^*) \vx_i^e \\
& = - \sum_{e \in \cE_{tr}}  \sum_{i=1}^n \eps_{\inv,i}^e \vx_i^e \\
& = - \sum_{e \in \cE_{tr}} (X^e)^\top \eps_{\inv}^e \\
& = - X^\top \eps_{\inv}~,
\end{align*}
where $X \in \R^{nE \times p}$ is the concatenation of covariates from all environments and $\eps_{\inv} \in \R^{nE}$ is the concatenation of all noise terms $\eps_{\inv,i}^e$ over all environments. Then, following Theorem~3 of \citet{banerjee2015estimation}, effectively with the scaling $u = \frac{\eps_{\inv}}{\|\eps_{\inv}\|_2} \in S^{nE-1}$, i.e., a unit vector, we have 
\begin{align*}
\frac{1}{1+\rho} \E \left[ \left\| \nabla \hat{\cL}(\vv^*) \right\|_\infty \right] & \leq c_1 \E[\| \eps_{\inv} \|_2 ] \sup_{u \in S^{nE-1}} \E \left[ \left\| X^\top u \right\|_\infty \right] \\
& \leq c_2 \kappa_{\eps_{\inv}} \sqrt{nE} \times \sqrt{\lambda_{\max}} \kappa_{\vx} \sqrt{\log p}~.
\end{align*}
Further, following Theorem~4 of \citet{banerjee2015estimation}, a bound of the same order holds with high probability. Rearranging terms completes the proof. \qed 




Let $\hat{\vv}_n$ be the solution to the regularized estimation problem and consider the estimation error $\Delta_n := \hat{\vv}_n - \vv^*$. Since the loss function for the convex relaxation based on regularization is effectively a scaled least-squares loss, the restricted strong convexity (RSC) condition \citep{Wainwright2019-tb} reduces to the restricted eigenvalue (RE) condition \citep{JMLR:v11:raskutti10a, pmlr-v119-sivakumar20a}. For our specific setting, we show that the RE condition is indeed satisfied with suitable sample complexity.
\begin{lemma}[RE Condition]
\label{lemma: re condition}
If the number of environments $E$ and samples per environment $n$ satisfy $nE \geq c \lambda_{\max} \kappa_{\vx}^2 d_{\inv} \log p$, then with high probability $\| X \Delta_n \|_2^2 \geq \kappa n E \| \Delta_n \|_2^2$ for some positive $\kappa = \Omega(1)$.
\end{lemma}
\proof 
\abcomment{follows from Theorem 6 in \citet{banerjee2015estimation}} 
% We cite Lemma 3 in \citet{pmlr-v119-sivakumar20a} to prove this results.
We note that our stacked design matrix $X \in \RR^{n\ds{\cE} \times d}$ consists of rows $\vx^e$ where columns corresponding to the spurious features $\vx^e_s$ are dependent on the other features; specifically, on the invariant features $\vx^e_\inv$ through the generative model in \Cref{eqn:problem-setting}.

Let $x^e_{i}$ be the $i$th coordinate of $\vx^e$, which in this case is a spurious feature. Then, $x^e_i = (\gamma^\top \vx_\inv) \zeta_{s,i}  + \alpha^e_i \epsilon_{s,i}$. Because $\epsilon_{s,i}$ is sampled i.i.d.\ sub-Gaussian with $\Ds{\epsilon_{s,i}}_{\psi_2} = \kappa_s$, we invoke Lemma~3 of \citet{pmlr-v119-sivakumar20a}:
\begin{equation}
    \lambda_{\min} (\EE_{\vx^e} [\vx^e (\vx^e)^\top ]) \ge c_1 \kappa_s^2.
\end{equation}
Restated, we have the RE condition as desired.
\qed 

