




\subsection{Iterative Hard Thresholding}
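% Section label kept distinct from the theorem label thm:iht, so that \Cref{thm:iht} resolves to the theorem.
\label{sec:proof-iht}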
Below is the proof for \Cref{thm:iht}. We copy the theorem below for reference:

% \begin{theorem}[Sparse IRM with IHT]
% \label{thm:iht}
\paragraph{Theorem 3} 
Assume $n$ samples per training environment, for $n > Q\ps{\textnormal{poly}(d_\inv)\log(d)\log\ps{\frac{ |E|}{\delta}}}$.
% with at least $n > Q\ps{\textnormal{poly}(d_\inv)\log\ps{\frac{d \cdot |E|}{\delta}}}$ per training environment, 
Then, under the assumptions in \Cref{sec:assumptions}, 
with probability at least $1-\delta$, the sparsity-constrained program
\begin{equation*}
\begin{gathered}
\tilde \beta \in
\operatorname*{arg\,min}_{\vv} \hat \cL(\vv)
\quad \text{s.t.} \quad
\vv \in\bbR^{d},\ \Ds{\vv}_0\leq d_\inv %\Ds{\vv'}_1 
\end{gathered}
\end{equation*}
returns a parameter $\tilde\beta$ with low estimation error $\Ds{\tilde\beta - \beta^*_\inv}_2 \le O(\sqrt{\frac{d_\inv} {n}})$.
% \end{theorem}

\begin{proof}
\label{proof:IHT}
We apply Theorem 3 of \citet{jain_iterativehardthreshold_2014}, which specifically contains an example for Sparse Linear Regression. 

\textbf{RSS and RSC}:  
With parameter $\beta\in \RR^d, \beta' \in \RR^d$, we define
\begin{equation}
    \delta \cR^e(\beta) \coloneqq  \cR^e(\beta') - \cR^e(\beta) - \langle \nabla_\beta \cR^e (\beta), \beta' - \beta \rangle,
\end{equation}
and likewise 
\begin{equation}
    \delta \cJ^e(\beta) \coloneqq  \cJ^e(\beta') - \cJ^e(\beta) - \langle \nabla_\beta \cJ^e (\beta), \beta' - \beta \rangle.
\end{equation}
To apply results from Iterative Hard Thresholding, we show that this problem satisfies the Restricted Strong Convexity (RSC) and Restricted Strong Smoothness (RSS) conditions. Writing $\Delta = \beta' - \beta$, RSC requires
\begin{align}
\delta \cR^e(\beta) + \rho \delta \cJ^e(\beta) \ge \frac{\alpha_{\irm}}{2} \|\Delta\|^2_2.
\end{align}
Likewise, for RSS condition,
\begin{align}
\delta \cR^e(\beta) + \rho \delta \cJ^e(\beta) \le \frac{L_{\irm}}{2} \|\Delta\|^2_2.
\end{align}

Let $ \Delta= \beta' - \beta$.
\begin{align}
    \delta \cJ^e(\beta) 
    &= \cJ^e(\beta + \Delta) - \cJ^e(\beta)  - \langle \nabla_{\beta} \cJ^e (\beta), \Delta\rangle
    \\
    &= \Ds{ \frac{1}{n} (-X^e) ^\top (Y-X^e\beta - X^e\Delta)}_2^2 - \Ds{ \frac{1}{n} (-X^e) ^\top (Y-X^e\beta)}_2^2 \nonumber \\
    &\quad- \left\langle \frac{2}{n^2} (-X^{e\top}X^eX^{e\top}) (Y-X^e\beta), \Delta\right\rangle \\
    &= \Ds{ \frac{1}{n} (X^e) ^\top X^e\Delta}_2^2 \\
     &= \frac{1}{n^2}  \Ds{(X^e) ^\top X^e\Delta}_2^2.
\end{align} 
% This is similar to the Restricted Eigenvalue (RE) conditions used to show recovery of 
% \jdcomment{So Cauchy-Schwartz is a bad argument here
% \begin{equation}
% \sigma_{\min } ^2 (\frac{X^e}{\sqrt n}) \left \| \frac{X^e\Delta}{\sqrt n}\right \|_2^2 
% \le \Ds{ \frac{1}{n} (X^e) ^\top X^e\Delta}_2^2 
% % \delta \cJ^e(\beta)
%     \le \sigma_{\max }^2 (\frac{X^e}{\sqrt n}) \left \| \frac{X^e\Delta}{\sqrt n}\right \|_2^2.
%     % \le  \left \| \frac{X^e}{\sqrt n}\right \|_2^2 
%     % \left \| \frac{X^e\Delta}{\sqrt n} \right \|^2_2 
% \end{equation}}

If we set $\alpha_{\irm} = \alpha_s$ as defined by \citet{jain_iterativehardthreshold_2014}, which defines RSC for the least-squares component $\delta \cR^e(\beta)$, we recover the RSC property for any $\rho \ge 0$: 
\[\delta \cR^e(\beta) + \rho \delta \cJ^e(\beta) =\frac{1}{n} \Ds{  X^e\Delta}_2^2 +\frac{\rho}{n^2} \Ds{  (X^e) ^\top X^e\Delta}_2^2\ge \frac{1}{n} \Ds{ X^e\Delta}_2^2 \ge \frac{\alpha_{\irm}}{2} \|\Delta\|^2_2,  \]
where the first inequality holds because the penalty term is a non-negative squared norm (equivalently, $\frac{1}{n}{X^e}^\top {X^e}$ is positive semi-definite), and the second inequality is the RSC property of the least-squares loss.

We next upper-bound $\delta \cJ^e(\beta)$, again using $L_{s}$ as defined by \citet{jain_iterativehardthreshold_2014}.
Let $X = X^e$, the data matrix for a single environment.
If we write the eigendecomposition $X^\top X = V\Lambda V^\top$, with diagonal elements of 
 $\Lambda$ as $\lambda_i$ for $i\in [d]$, we can also write $\Delta=V\valpha$ for some coefficients $\alpha$.
For least squares, we have bounds for $\|X\Delta \|_2^2= \|\Lambda ^{1/2}\valpha \|_2^2 = \sum_{i=1}^d \lambda_i \alpha_i^2 \le \frac{L_s}{2} \|\Delta\|^2_2$. 

Define $L_0 = \frac{L_s}{2} \|\Delta\|^2_2$.
First, since $V$ is orthogonal, $\|\Delta\|^2_2 = \|\valpha\|^2_2$; because both sides of the bounds above are homogeneous of degree two in $\Delta$, we may normalize $\|\Delta\|^2_2 = \|\valpha\|^2_2 = 1$ without loss of generality. 
Furthermore, we note that $\Delta = \beta' - \beta$ for iterates of the IHT algorithm, and let $\|\beta\|_0 = s$ and $\|\beta'\|_0 = s'$, where $s + s' < d$. Then, $\|\Delta\|_0\le s + s'$.
Because $\Delta$ is sparse, we can assume there is a set $T$ of eigenvectors where $|T| \le s' + s$, which defines $\|X\Delta\|_2^2$.
\begin{equation}
\sum_{i=1}^d \lambda_i \alpha_i^2 = \sum_{i \in T} \lambda_i \alpha_i^2 \le L_0,
\quad \text{and}\quad
    \sum_{i=1}^d \lambda_i^2  \alpha_i^2 = \sum_{i \in T} \lambda_i^2 \alpha_i^2.
\end{equation}
The first bound applies to every restricted direction $\Delta$, that is, $\sum_{i \in T} \lambda_i  \alpha_i^2\le L_0$ for any $\valpha$ supported on $T$ with $\|\valpha\|_2^2 = 1$. Choosing $\valpha = \ve_i$, the $i$-th standard basis vector, gives $\lambda_i \le L_0$ for all $i \in T$, which means $\frac{\lambda_i}{L_0} \le 1$.
% \begin{align}
%     \Delta^\top (X^\top X)^2 \Delta = \sum_i \lambda_i^2 \alpha_i^2
%     \le \lambda_{\max } \sum_i \lambda_i \alpha_i^2
% \end{align}
Then, it must be that 
\begin{align}
\sum_{i\in T} \frac{\lambda_i^2}{L_0^2} \alpha_i^2 \le \sum_{i\in T} \frac{\lambda_i}{L_0} \alpha_i^2 \le 1 ,
\end{align}
% The first inequality is by $\frac{\lambda_i}{L_0} < 1$. 
and
\begin{align}
    \sum_{i\in T} \lambda_i^2\alpha_i^2 \le {L_0^2}.
\end{align}
Then we can set $L_{\irm} = L_s^2$.
Following the sparse linear regression example of Theorem 3 in \citet{jain_iterativehardthreshold_2014}, we apply the same sample complexity $n > 5c_1 d_\inv \log d (\lambda_{\min}^e)$, where $\lambda^e_{\min} = \min_{i\in [d]} \lambda_i(X^e)$, which gives the conditioning constant:
\begin{equation}
    K \coloneqq \frac{(\lambda^e_{\max})^2} {\lambda^e_{\min}}.
\end{equation}
We substitute this back into the error bound of IHT. 
Then with probability at least $1-c_1 p^{-c_2}$ for constants $c_1, c_2 >0$, we end up with the bound
% We are required to show that the minimax IRM loss $\hat \cL$ satisfies both the RSC and RSS conditions, with parameters $a_{t}$ and $L_{t}$ as defined in Definition 1 and Definition 2 \citep{jain_iterativehardthreshold_2014}, at sparsity level $t = t' + d_\inv$ for some tolerance $t'$.
% for $M_S \coloneqq \sum_{e \in \cE}  \min_{\vv^e_S\in \Sp(S)} \hat \cR^e\left(\vv^e_S \right)$, a constant $M_S > 0$.
% Because $\hat \cR (\vv) = \frac{1}{n}\sum_{i=1}^n (y_i - \vv^\top \vx^e)^2$ is the least squares loss, 
% we have RSC with parameter $L_{t} = \lambda_{\max }((X^e)^\top X^e)$, which may be bound in high probability as per \Cref{eqn sub-gaussian design}.
% Applying these results to Theorem 3, we have for constant $c_1 >0$,
\begin{equation}
    \Ds{\tilde \beta - \beta^*_\inv}_2 \le c_1 \frac{\lambda_{\max}^2}{9 \kappa_s^2} \kappa_s \max_{i\in [d_s]}{\alpha_i^e}\kappa_s 
    \sqrt {
        \frac{d_\inv \log d}{n}
    }
    + 2 \sqrt{\frac{\sigma_\inv^2}{\kappa_s^2}}
    = O \ps{\lambda_{\max}^2 A \sqrt {\frac{d_\inv \log d}{n}}
    +\frac{\sigma_\inv}{\kappa_s} }.
\end{equation}
% We note that $ \frac{\sigma_\inv}{\kappa_s} $ is a constant for this problem; 

\begin{remark}
The minimax loss in \Cref{eqn:irm-minimax-vspecific}, which formulates the IRM penalty as a loss difference $\left[ \hat \cR^e(\vv_S)- \hat \cR^e\left(\vv^e_S \right)\right]$, has notable discontinuities between different parameters with different footprints $S$:
\begin{align}
% \begin{split}
\hat \cL(\vv_S) 
&\coloneqq 
\sum_{e \in \cE}  
\hat \cR^e(\vv_S)  
+ \rho\sum_{e\in \cE} 
\max _{\vv^e_S\in \Sp(S)} 
\left[ \hat \cR^e(\vv_S)- \hat \cR^e\left(\vv^e_S \right)\right]
\nonumber
\\
&= (1+\rho)\sum_{e \in \cE}  
\hat \cR^e(\vv_S)  
- \rho
\sum_{e \in \cE}  
\min_{\vv^e_S\in \Sp(S)} 
\hat \cR^e\left(\vv^e_S \right)
\nonumber
\\
&= (1+\rho)\sum_{e \in \cE}  
\hat \cR^e(\vv_S)  
- \rho
M_S.
% \end{split} 
\end{align}
This presents challenges in applying existing results in linear regression with restricted parameter error, such as by LASSO \citep{negahban_2009_higdim_mestimators, banerjee2015estimation, Wainwright2019-tb}, or especially IHT \citep{jain_iterativehardthreshold_2014}. 
Instead, we directly analyze the IRMv1 penalty.
\end{remark}
\end{proof}