\subsection{Convex Relaxation with LASSO}
\label{proof:l1}
Below is the proof of \Cref{thm:convex-relaxation}. We restate it for reference:
% \begin{theorem}[Sparse IRM with IHT]
% \label{thm:iht}
\paragraph{Theorem 3}
Assume $n$ samples per training environment, for $n > O\ps{\textnormal{poly}(d_\inv)\log(d)\log\ps{\frac{ |E|}{\delta}}}$.
% with at least $n > Q\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |E|}{\delta}}}$ per training environment, 
Together with assumptions in \Cref{sec:assumptions}, 
we can say with probability at least $1-\delta$:
\begin{equation}
\label{eq:basic-setup}
\begin{gathered}
\tilde \beta = 
\arg\min _{\vv} \hat \cL(\vv)
\text { s.t. } 
\vv \in\bbR^{d}, \Ds{\vv}_0\leq d_\inv, %\Ds{\vv'}_1 
\end{gathered}
\end{equation}
returns a parameter $\tilde\beta$ with low estimation error $\Ds{\tilde\beta - \beta^*_\inv}_2 \le O(\sqrt{\frac{d_\inv} {n}})$.
% \end{theorem}

% \jdcomment{Question here:}
% The bounds for $\xi_a(S), \xi_b(S),$ and $\xi_c(S)$ are controlled by $|S|$.
% Most notably, $\xi_c(S)$ is bounded in \Cref{eqn:c1} by a summation over at most $|S|$ invariant features. 
% % $\xi_a(S)$ and $\xi_b(S)$ are bounded by $\min(|S|, d_\inv)$ and do not explicitly require $|S|$ to be small?

% However, \Cref{thm:info-theory} contains a L0 constraint that is computationally intractable. It is an information-theoretical result that may be approximated with one of several different convex relaxations. 

%\abcomment{we will do the L1 analysis here ... theorem statement will need to be changed and notation will need work}

Denoting $\hat{\cL}(\vv) := \hat{\cL}_{\texttt{IRMV1}}(\vv)$, 
% and $c^e = \min_{\vv^e \in \Sp (S)} \hat{\cR}^e(\vv^e)$, 
the convex relaxed regularized problem can be written as that of minimizing
\begin{align*}
\hat{\cL}(\vv) + \gamma_n  \| \vv \|_1 & = 
\sum_{e \in \cE_{tr}}  \hat \cR^e(\vv)  + \rho \sum_{e\in \cE_{tr}}  
\|\nabla_{\vv} \hat \cR^e(\vv)\|_2^2
+ \gamma_n  \| \vv \|_1 
% & = (1+\rho) \sum_{e \in \cE_{tr}}  \hat \cR^e(\vv) - \rho \sum_{e \in \cE_{tr}}  c^e + \gamma_n  \| \vv \|_1
\end{align*}
Let $E$ denote the total number of environments, each with $n$ samples, and let $X^e \in \R^{n \times p}$ denote the design matrix for environment $e \in \cE_{tr}$, with independent rows $\vx_i^e$, $i \in [n]$, $e \in [E]$. \abcomment{Need $\Sigma = \E[\vx \vx^\top]$ with $\lambda_{\max}(\Sigma) := \lambda_{\max}$ -- this should have been defined earlier}
We start by showing that the rows $\vx_i^e$ are sub-Gaussian with $\psi_2$ norm of the order of $\min(1,\sqrt{d_s})$, where $d_s$ is the number of spurious features. \abcomment{will also depend on $\lambda_{\max}$ for anisotropic designs $\vx_{\inv}$, with largest eigenvalue of covariance $\lambda_{\max}$}
\begin{lemma}[Sub-Gaussian Design]
    $\kappa_{\vx} := \| \Sigma^{-1/2} \vx^e_i \|_{\psi_2} \leq c_1 \min(1,\sqrt{d_s})$. 
    \label{eqn sub-gaussian design}
\end{lemma}
Next, following developments in the high-dimensional statistics literature \citep{negahban_2009_higdim_mestimators, banerjee2015estimation, Wainwright2019-tb}, with $\vv^* = [\gamma, 0, 0]$ denoting the population parameter, we set
\begin{align}
\gamma_n \geq 2 \left\| \nabla \hat{\cL}(\vv^*) \right\|_{\infty}~.
\label{eq:lambda_low}
\end{align}


Through an analysis based on the form of $\hat{\cL}, \vv^*, X^e$, we have the following result.
\begin{lemma}[Regularization Weight]
Setting $\gamma_n = c (1+\rho) \kappa_{\vx} \kappa_{\eps_{\inv}} \sqrt{\lambda_{\max}} \sqrt{ n E \log p }$, where $\kappa_{\vx}, \kappa_{\eps_{\inv}}$ are respectively the $\psi_2$ norms of covariates $\vx^e_i$ and noise $\eps_{\inv}$,  satisfies the requirement on $\gamma_n$ in \eqref{eq:lambda_low} with high probability.
\end{lemma}
\proof By definition
\begin{align*}
\frac{1}{1+\rho} \nabla \hat{\cL}(\vv^*) & =   \sum_{e \in \cE_{tr}} \left [ \nabla \hat \cR^e(\vv^*) + \rho\nabla \hat \cJ^e(\vv^*)
 \right]
\\
& = \sum_{e \in \cE_{tr}}  \sum_{i=1}^n \left[ - (y_i^e - (\vx_i^e)^\top \vv^*) \vx_i^e 
- \rho(y_i^e - (\vx_i^e)^\top \vv^*) \vx_i^e  (\vx_i^e) ^\top  \vx_i^e \right]
\\
& = - \sum_{e \in \cE_{tr}}  \sum_{i=1}^n \left[ \eps_{inv,i}^e \vx_i^e  + \rho \eps_{inv,i}^e \vx_i^e (\vx_i^e) ^\top  \vx_i^e \right]\\
& = - \sum_{e \in \cE_{tr}} \left[ (X^e)^\top \eps_{\inv}^e + \rho (X^e)^\top (X^e) (X^e)^\top   \eps_{\inv}^e \right] \\
& = - X^\top \eps_{\inv} - \rho X^\top X X^\top   \eps_{\inv}~,
\end{align*}
where $X \in \R^{nE \times p}$ is the concatenation of covariates from all environments and $\eps_{\inv} \in \R^{nE}$ is the concatenation of all noise $\eps_{inv,i}^e$ over all environments. Then, following Theorem 3 of \citet{banerjee2015estimation}, effectively with the scaling $u = \frac{\eps_{\inv}}{\|\eps_{\inv}\|_2} \in S^{nE-1}$, i.e., a unit vector, we have 
\begin{align*}
\frac{1}{1+\rho} \E \left[ \left\| \nabla \hat{\cL}(\vv^*) \right\|_\infty \right] & \leq c_1 \E[\| \eps_{\inv} \|_2 ] \sup_{u \in S^{nE-1}} \E \left[ \left\| X^\top u \right\|_\infty \right]\\
& \leq c_2 \kappa_{\eps_{\inv}} \sqrt{nE} \times \sqrt{\lambda_{\max}} \kappa_{\vx} \sqrt{\log p}~.
\end{align*}
Further, following Theorem 4 of \citet{banerjee2015estimation}, a bound of the same order holds with high probability. Rearranging terms completes the proof. \qed 




Let $\hat{\vv}_n$ be the solution to the regularized estimation problem and consider the estimation error $\Delta_n := \hat{\vv}_n - \vv^*$. Since the loss function for convex relaxation based on regularization is effectively scaled least squares, the restricted strong convexity (RSC) condition \citep{Wainwright2019-tb} reduces to the restricted eigenvalue (RE) condition \citep{JMLR:v11:raskutti10a, pmlr-v119-sivakumar20a}. For our specific setting, we show that the RE condition is indeed satisfied with suitable sample complexity.
\begin{lemma}[RE Condition]
\label{lemma: re condition}
If the number of environments $E$ and samples per environment $n$ satisfy $nE \geq c \lambda_{\max} \kappa_{\vx}^2 d_{\inv} \log p$, then with high probability $\| X \Delta_n \|_2^2 \geq \kappa n E \| \Delta_n \|_2^2$ for some positive $\kappa = \Omega(1)$.
\end{lemma}
\proof 
\abcomment{follows from Theorem 6 in \citet{banerjee2015estimation}} 
% We cite Lemma 3 in \citet{pmlr-v119-sivakumar20a} to prove this results.
We note that our stacked design matrix $X \in \R^{nE \times p}$ consists of rows $\vx^e$ whose columns corresponding to the spurious features $\vx^e_s$ depend on the other features, specifically on the invariant features $\vx^e_\inv$, through the generative model in \Cref{eqn:problem-setting}.

Let $x^e_{i}$ be the $i$th coordinate of $\vx^e$, which in this case is a spurious feature. Then, $x^e_i = (\gamma^\top \vx_\inv) \zeta_{s,i}  + \alpha^e_i \epsilon_{s,i}$. Because the $\epsilon_{s,i}$ are i.i.d.\ sub-Gaussian with $\Ds{\epsilon_{s,i}}_{\psi_2} = \kappa_s$, Lemma 3 of \citet{pmlr-v119-sivakumar20a} gives
\begin{equation}
    \lambda_{\min} (\EE_{\vx^e} [\vx^e (\vx^e)^\top ]) \ge c_1 \kappa_s^2.
\end{equation}
Restated, we have the RE condition as desired.
\qed 

