%!TEX root = ../paper.tex
The noise variables are independent sub-Gaussian random variables (or vectors), with 0 mean and (lower-)bounded variance and bounded sub-Gaussian norm. 
% Let $ \epsilon_\inv$  have sub-Gaussian norm $\Ds{\epsilon_\inv}_{\psi_2}$. 
% Without loss of generality, 
Finally, we have sub-Gaussian norms $\kappa_\inv = \Ds{\epsilon_\inv}_{\psi_2}$, $\kappa_{s,j} = \Ds{\epsilon_{s,j}}_{\psi_2}$ for spurious features $j \in [d_s]$ and $\kappa_{r,j} = \Ds{\epsilon_{r,j}}_{\psi_2}$ for random features $j \in [d_r]$. For simplicity, we will often work with the largest constants $\kappa_s = \max_{j\in [d_s]} \kappa_{s,j}$ and $\kappa_r = \max_{j\in [d_r]} \kappa_{r,j}$.
% and $\Ds{\epsilon_{s,j}}_{\psi_2} = 1$ for every spurious feature $\vx_{s,j}^e$.

 

% For clarity, we will write $\vx^e$ as a concatenation of these features, but the analysis will hold even with the features shuffled. 
We denote the design matrix by ${X^e}\in \RR^{n\times d}$, 
where 
$X^e = [\vx^e_1, \vx^e_2, \cdots, \vx^e_n]^\top$, 
and the label vector by $\vy^e \in \RR^n$, where $\vy^e = [y^e_1, y^e_2, \cdots, y^e_n]^\top$. 
% $X^e$ is therefore a design matrix with sub-Gaussian rows.
Each feature in $\vx^e$ is a sub-Gaussian random variable, and we assume $\Ds{\gamma}_2 = \Ds{\vx_\inv}_2 = 1$. 
% This leads to the overall bound ${\vx^e }_2 = O(\sqrt{s})$.



For the following analysis, we will use the notation
\begin{equation}
    \EE^e[f(x^e)] = \int_{x^e\in e} f(x^e) \, d \Pr(x^e), \quad
    \hat \EE^e[f(x^e)] = \frac{1}{n_e} \sum_{i=1}^{n_e} f(x_i^e).
\end{equation}
Likewise, we assume that the environmental expectations are defined as follows:
\begin{equation}
    \EE[f(x^e)] = \frac{1}{\ds{\cE}} \sum_{e \in \cE}\EE^e[f(x^e)], \quad
    \hat \EE[f(x^e)] =  \sum_{e \in \cE_{tr}} \frac{n_e}{N}\hat \EE^e[f(x^e)]
\end{equation}
for the total number of training points available $N = \sum_{e\in \cE_{tr}} n_e$.
Note that the empirical expectation over the environment mixture only has access to $\cE_{tr} \subset \cE $. It is common to assume $n_{i} = n_{j}$ for environments $i\ne j \in \cE_{tr}$, in which case $\hat \EE[f(x^e)] =  \frac{1}{\ds{\cE_{tr}}}\sum_{e \in \cE_{tr}} \hat \EE^e[f(x^e)]$.

This notation extends to modifiers for environment, footprints that are not the invariant footprint, and empirical risk:  
\begin{align*}
    \beta^e \coloneqq \argmin_{\substack{\vv \in \Sp(S_\inv)\\ \Ds{\vv}_2 \le 1}} \cR^e(\vv),\\
    \beta_S^* \coloneqq \argmin_{\substack{\vv \in \Sp(S)\\ \Ds{\vv}_2 \le 1}}  \sum_{e\in \cE}\cR^e(\vv),\\
    \hat \beta_S \coloneqq \argmin_{\substack{\vv \in \Sp(S)\\ \Ds{\vv}_2 \le 1}}  \sum_{e\in \cE} \hat \cR^e(\vv).
\end{align*}

For the purposes of our analysis, we assume the population optima $\beta^e$,  $\beta^*$ and $\beta^e_S$ are normalized.

Then,
$\hat \Sigma^e 
= \hat \EE^e [\vx^e(\vx^e)^\top] 
= \frac{1}{n}\sum_{i=1}^n \vx^e_i(\vx^e_i)^\top
= \frac{1}{n}  ({X^e})^\top({X^e})$.
Also, let $\lambda^e_{\max} \coloneqq \lambda_{1}(\Sigma^e)$ for eigenvalues sorted in descending order, and $\lambda_{\max} \coloneqq \max_{e\in \cE}(\lambda_{1}(\Sigma^e))$.

We also include a proof for the sub-Gaussian design of feature vectors $\vx^e$, both for \citet{zhouSparseInvariantRisk2022}'s original generative model (\Cref{lemma:xe subg lemma orig}) and for our rescaled model (\Cref{lemma:norm-x}). Notably, the original setup induces a dependency of $\sqrt{d_s}$. 

% \begin{proposition}
%     overparameterization \jdcomment{make sure lines up}
% \end{proposition}

\begin{proposition}[Loss difference substitutes gradient norm penalty] Assuming the environmental risk is RSC, that is,
\label{prop:lossdiff}
\begin{equation}
 \cR^e(\vv') \ge \cR^e(\vv) + \langle \vv' - \vv, \nabla_{\vv}\cR^e(\vv) \rangle + \frac{\alpha}{2} \Ds{\vv' - \vv }^2_2,
\end{equation}
the gradient norm function $\cL_{\textnormal{IRMv1}}$ is a proxy of the loss difference function $\cL_\mm$. 
\end{proposition}
\begin{proof}
\label{proof:lossdiff}
We restate the RSC condition for the environmental risk $\cR^e(\vv)$. 
For classifiers $\vv\in \RR^d$ and $\vv'\in \RR^d$, we define the quadratic lower bound
\begin{equation}
\tilde \cR^e(\vv') := \cR^e(\vv) + \langle \vv' - \vv, \nabla_{\vv}\cR^e(\vv) \rangle + \frac{\alpha}{2} \Ds{\vv' - \vv }^2_2,
\end{equation}
for $\alpha = 2\Ds{\vx^e}_2^2$, which has bounded sub-Gaussian norm, as shown in \Cref{lemma:norm-x}.

By the RSC condition, $ \cR^e(\vv') \ge \tilde \cR^e(\vv') $ for all $\vv' \in \RR^d$.
We find $ \inf_{\vv'} \tilde \cR^e(\vv') $ at the critical point of $\tilde \cR^e$,
\[
0 = \nabla_{\vv'} \tilde \cR^e(\tilde \vv) = \nabla_{\vv}  \cR^e(\vv) + \alpha(\tilde \vv - \vv),
\]
for a minimizer $\tilde \vv = \vv - \frac{1}{\alpha} \nabla_{\vv} \cR^e(\vv)$. The quadratic lower bound then evaluates to 
\begin{equation}
\inf_{\vv'} \tilde \cR^e(\vv') = \tilde \cR^e(\tilde \vv) = \cR^e(\vv) - \frac{1}{2\alpha} \Ds{\nabla_{\vv} \cR^e(\vv)}^2_2,
\end{equation}
for a reference classifier $\vv\in\RR^d$.

We then get the inequality
\begin{equation}
    \cR^e(\vv') \ge \tilde \cR^e(\vv') \ge \tilde \cR^e(\tilde \vv)
     = \cR^e(\vv) - \frac{1}{2\alpha} \Ds{\nabla_{\vv} \cR^e(\vv)}^2_2
\end{equation}

From this, we can say 
\begin{equation}
    \cR^e(\vv) - \cR^e(\vv') \le \frac{1}{2\alpha} \Ds{\nabla_{\vv} \cR^e(\vv)}^2_2.
\end{equation}
Letting $\vv' = \vv^e$, we can see that minimizing the gradient norm penalty can approximate minimizing the minimax loss $\cL(\vv)$.

\end{proof}


\begin{lemma}[Sub-Gaussian Design for \citet{zhouSparseInvariantRisk2022}]
\label{lemma:xe subg lemma orig}
When $\vzeta_s^e = \vone^{d_s}$, as in the original generative model introduced by \citet{zhouSparseInvariantRisk2022}, we still have 
    $\kappa_{ones} := \| (\Sigma^e)^{-1/2} (\vx^e)^{(i)} \|_{\psi_2} \leq c_1 \sqrt{d_s}$ for all $i \in [n], e\in \cE$. 
\end{lemma}


\begin{proof}
    Let $I_s$ be the indices of the spurious features.
    % Let $I_\inv, I_s, I_r$ be the indices for invariant, spurious, and random features respectively. 
    The invariant features $\vx^e_\inv$, the label noise $\epsilon_\inv$, and the random features $\vx^e_r$ are all independent, and identically distributed across samples. Then, for spurious features $j \in I_s$, we have 
    \[\vx^e_j 
    = y + \alpha_j^e \epsilon_j 
    = \gamma^\top \vx^e_\inv + \epsilon_\inv + \alpha_j^e \epsilon_j. \]
    % Then, establishing the dependencies between features,
    % \[
    % [\vx^e]_j = \begin{cases} 
    %     \text{i.i.d. } SG(\kappa_\inv) &\text{if } j \in I_\inv,\\
    %     y + \alpha_i^e \epsilon_j =  \gamma^\top \vx_\inv + \epsilon_\inv + \alpha_i^e \epsilon_j &\text{if } j \in I_s,\\
    %     \text{i.i.d. } SG(\kappa_r) &\text{if } j \in I_r.     
    % \end{cases}
    % \]
    We keep $\Ds{\epsilon_j}_{\psi_2} = 1$ for the spurious features. Let  $\va = [\va_\inv, \va_s, \va_r]$ satisfy $\Ds{\va}_2^2 = 1$. 
    % \[
    % \kappa_{\vx} 
    % = \sup_{\Ds{\va}_2^2 = 1} \Ds{\langle \vx^e, \va \rangle}_{\psi_2}.
    % % = \sup_{\Ds{\va}_2^2 = 1}  
    % %     \sum_{j \in I_\inv} \alpha_i x_{inv,j}
    % %     + \sum_{j \in I_s} \alpha_i x_{s,j}
    % %     + \sum_{j \in I_r} \alpha_i x_{r,j}
    % \] 
    Then for any $t\in \RR$, 
    \begin{align}
    % \EE_{\vx^e}\exp\ps{t\langle \vx^e, \va \rangle}
    \EE_{\vx^e}\bs{\exp\ps{t\va^\top \vx^e}}
    = \EE_{\substack{\vx_\inv\\ \epsilon_\inv}} \bs{
    \exp \ps{ t\va_\inv^\top \vx^e_\inv }
    \EE_{\substack{\vx_s | \vx_\inv, \\ \epsilon_\inv}} \bs{
    \exp \ps{ t\va_s^\top \vx^e_s }
    }
    }
    \EE_{\vx^e_r} \bs{
    \exp \ps{ t\va_r^\top \vx^e_r }}
    .\nonumber
    % = \EE_{\substack{\vx_\inv\\ \epsilon_\inv}} \bs{
    % \exp\ps{ t\sum_{i\in I_\inv} a_j x^e_j} 
    % \EE_{\substack{\vx_s | \vx_\inv, \\ \epsilon_\inv}} \bs{\exp \ps{ t\sum_{i \in I_s} a_j x^e_j}}
    % }\EE_{\vx_r} \exp \ps{t\sum_{i \in I_r} a_j x^e_j}
    % &=  \EE_{\vx_\inv, \epsilon_\inv}\bs{
    % \exp\ps{t(\sum_{j \in I_\inv}a_j x_{\inv,j})}
    % \EE_{\vx_s | \vx_\inv, \epsilon_\inv} \bs{
    %     \prod_{j \in I_s} \exp \ps{  t(\gamma^\top \vx_\inv + \epsilon_\inv \alpha_j \epsilon_j  }
    %     }
    % }
    \end{align}
    The random features are bounded with   
    $\EE_{\vx_r} [\exp (t\va_r^\top \vx_r^e)] \le \exp (c_3 t^2  \kappa_r^2)$.    
    From here, we can condition on $\vx_\inv, \epsilon_\inv$, getting
    % keeping in mind that $y$ is a linear combination of those independent features.
    \begin{align}
        \EE_{\substack{\vx_s | \vx_\inv, \\ \epsilon_\inv}} \bs{
        \exp \ps{ t\va_s^\top \vx^e_s }
        }
        &= \EE_{\substack{\vx_s | \vx_\inv, \\ \epsilon_\inv}} \bs{
         \exp \ps{  \sum_{j \in I_s}ta_j(y  + \alpha_j \epsilon_j) }
        }
        ,\\
        &\le \exp \ps{
        ty  \vone^\top \va_s
        }\EE_{\substack{\vx_s | \vx_\inv, \\ \epsilon_\inv}} \bs{
        \exp ( t\sum_{j \in I_s} a_j\alpha_j \epsilon_j )
        }
        ,\\
        &\le \exp \ps{
        ty  \vone^\top \va_s
        }
        \exp\ps{ c_2t^2A},
    \end{align}
    for $A = \max_{j \in I_s} \alpha_j^2$ and $c_2 > 0$.
    Then,
    % Then, the remaining expression dependent on $\vx^e_\inv, \epsilon_\inv$ can be evaluated: 
    \begin{align}
        &\EE_{\substack{\vx_\inv\\ \epsilon_\inv}} \bs{
            \exp \ps{ t\va_\inv^\top \vx^e_\inv } 
            \exp \ps{ t (\gamma^\top \vx^e_\inv + \epsilon_\inv)  \vone^\top \va_s}
            \exp\ps{ c_2t^2A}
            },\\
        &= \EE_{\substack{\vx^e_\inv\\ \epsilon_\inv}} \bs{
            \exp \ps{
                t ( \va_\inv + \vone^\top \va_s\gamma)^\top \vx^e_\inv 
                + t ( \epsilon_\inv \vone^\top \va_s)
            }
            \exp\ps{ c_2t^2A}
        }
        ,\\
        &\le \exp \ps{
            t^2 c_4\ps{            
            (1+ \sqrt{d_s})^2 \Ds{\vx^e_\inv}_{\psi_2}^2
            +d_s \Ds{\epsilon_\inv}_{\psi_2}^2
            }
        }\exp\ps{ c_2t^2A}
        .
    \end{align}
    The above inequality uses $ \vone^\top \va_s \le \sqrt{ d_s}$ and 
    $\gamma_j \le 1$.
    We then have the bound
    \begin{equation}
        \EE_{\substack{\vx}} \bs{
        % \exp \ps{ t\va^\top \ps{(\Sigma^e)^{-1/2}\vx^e}}
        \exp \ps{ t\va^\top \vx^e}
        } 
        \le 
        \exp \ps{
            t^2 c_5
            (d_s + 2\sqrt {d_s} + 1 + d_s\Ds{\epsilon_\inv}_{\psi_2}^2 
            + c_2A + c_ 3\kappa_r^2
            )
        }.
    \end{equation}
    Taking the square root of the exponent gets $\Ds{\vx^e}_{\psi_2} = O(\sqrt{d_s}\Ds{\vx^e_\inv}_{\psi_2})$; the  isotropic vector $(\Sigma^e)^{-1/2} (\vx^e)^{(i)}$ then satisfies $\kappa_{ones} \le {c_1}{\lambda_{\max}}  \sqrt{d_s}$.

\begin{remark}
    This implies that if we are interested in finding the norm for a subset of the features, i.e.~$\vm \odot \vx^e$ for $\vm \in \{0,1\}^d$, this bound scales with the size of the subset $\Ds{\vm}_1$. This is pertinent for when we select a smaller (usually $O(d_\inv)$) subset of features with a sparse predictor under $L_0$ constraints.
\end{remark}
\end{proof}

\begin{lemma}[Feature vector L2 bound]
\label{lemma:norm-x}
We have with probability $1-\delta$ the bound 
\begin{equation}    
\Ds{\vx^e}_2 \le  1+ c_s
    % + \alpha^e\sqrt {d_s}  
    + c_a \kappa_s
    + c_r \kappa_r + O \ps{
    (c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)\sqrt{\log \frac{1}{\delta}}
    },
\end{equation}
for positive constants $c_s, c_a, c_r$ as defined in model generation. The norm itself is a sub-Gaussian RV with $\Ds{\vx^e}_{\psi_2} = \kappa_{\vx} = O(\max\{\kappa_s^2, \kappa_r^2\})$ and mean 
$    \EE [\Ds{{\vx^e}}_2] = 1 + c_s
    + c_a \kappa_s
    + c_r \kappa_r.$
\end{lemma}

\begin{proof}
We first apply triangle inequality on the three feature blocks.
% Breaking down the vector's components allows us to separate the independent components from the spurious features. 
\begin{equation*}
\Ds{\vx^e}_2 \le \Ds{\vx^e_\inv}_2 + \Ds{\vx^e_s}_2 + \Ds{\vx^e_r}_2
\end{equation*}
We bound the three terms in order. First, we use the assumption that $\Ds{\vx^e_\inv}_2 = 1$. 

%%%%%%%%%%%%
% Then, also assuming $\EE[\vx_r^2] = \nu^2$ with $\nu \ge 1$, we have
% the following high probability bound:
% \[\Pr\ps{ \left | \Ds{\vx^e_r}_2 - \sqrt{d_r }\nu \right | \ge t} 
% \le 2 \exp \ps{-\frac{ct^2}{\nu ^2\kappa_r^4}},\]
% with a constant $c >0$. This gets the upper bound with probability $1-\delta_r$ that $\Ds{\vx^e_r}_2 \le 
% \nu \sqrt {d_r }  + O( \nu \kappa_r^2\log \frac{1}{\delta_r})$. 
%%%%%%%%%%%

Then, to evaluate $\Ds{\vx_s^e}_2$, we again use the triangle inequality to separate the label component from the sub-Gaussian noise, getting $\Ds{\vx_s^e}_2 \le \Ds{y^e \vzeta }_2 + \Ds{\valpha^e \odot \vepsilon_s }_2$. 
% \jdcomment{Where the explicit bounding of $\vx_s$ comes in}
With Cauchy--Schwarz, we have $\Ds{y^e \vzeta }_2 \le \Ds{\gamma}_2 \Ds{ \vx_\inv  }_2 \Ds{\vzeta_s^e }_2 = c_s$. 

To bound the second noise component, we apply a variant of Theorem 3.1.1 from \citet{vershyninHighDimensionalProbability} for zero-mean sub-Gaussian variables with different sub-Gaussian norms on different features. 

We define a random variable $Z  = [Z_1, Z_2, \cdots, Z_{d_s}]$ with $Z_i =  \lvert \alpha^e_i \epsilon_{s,i} \rvert $, and we aim to bound $ \Ds{Z}_2$. 
Firstly,
$\EE[\Ds{Z}_2^2] = c_a^2\kappa_s^2$
and
$ \EE[Z_i^2] = (\alpha_i^e )^2 \kappa_s^2 $.
We know that $Z_i$ is sub-Gaussian with $\Ds{Z_i}_{\psi_2} = \alpha_i^e \kappa_s$, so it must be that $ Y_i = Z_i^2 - (\alpha_i^e )^2 \kappa_s^2$ is sub-exponential and zero-mean.

Then, let 
$K = \max \ps{\Ds{Y_i}_{\psi_1}} \le \max \ps{  c_1 \Ds{Z_i}_{\psi_2}^2} \le 
c_1\max_i \ps { (\alpha_i^e )^2 }\kappa_s^2  = c_1  c_a^2\kappa_s^2$ for an absolute constant $c_1 > 0$. Note that variables named $c_1, c_2,$ etc.\ will also be positive constants going forward. 
% Note that since $(\alpha^e_i )^2 \le 1$ this depends on both $\kappa_s$ and $\alpha_i^e$, an

We apply Bernstein's inequality to get 
\begin{equation}
    \Pr \ps{
    \left \lvert
    \sum_{i=1}^{d_s} Z_i^2 - c_a^2\kappa_s^2
    \right \rvert \ge u 
    }
    \le 
    2 \exp \ps{
    -\frac{c_2d_s}{K_0} \min \ps{ u^2,u }
    },
\end{equation}
where $K_0 = \max \ps{K^2, K} $. Then, we use the fact that for non-negative $z,a$, the inequality $\lvert z - a \rvert \ge \delta $ implies $\lvert z^2 - a^2 \rvert \ge \max (\delta, \delta^2)$. If we let $u = \max (\delta, \delta^2)$, we have $\delta^2 = \min (u^2, u)$, and
\begin{align}
\Pr \ps{
\left \lvert
\frac{1}{\sqrt{d_s}}\ps{ \Ds{Z }_2 - c_a \kappa_s }
\right \rvert
\ge \delta
}
&\le
\Pr \ps{
\left \lvert
\frac{1}{d_s}  \ps{ \Ds{Z}^2_2 - c_a^2 \kappa_s^2}
\right \rvert
\ge \max (\delta, \delta^2)
}
\\
&\le 2 \exp \ps {-\frac{c_2d_s}{K_0^2} \cdot  \delta^2}.
\end{align}
This then gets the bound, with $t  = \delta \sqrt {d_s},$
\begin{equation}
\Pr \ps{
\left \lvert
\Ds{Z}_2-c_a \kappa_s
\right \rvert
\ge t
}\le 2 \exp \ps{ - \frac{c_2t^2}{K_0^2}}.
\end{equation}
Then, we have with probability $1-\delta_{s}$  that $\Ds{\valpha^e \odot \vepsilon_s }_2 \le c_a \kappa_s
+ \sqrt {\frac{K_0^2}{c_2} \log \frac{1}{\delta_{s}}}$. 
Together, 
\[\Ds{\vx^e_s}_2 \le c_s  + c_a \kappa_s
+ \sqrt {\frac{K_0^2}{c_2} \log \frac{1}{\delta_{s}}}.
\] 
Then, a similar argument gets 
$\Ds{\vx^e_r}_2 \le c_r \kappa_r +  \sqrt {\frac{K^2_1}{c_3} \log \frac{1}{\delta_{r}}} $. 
Here, the constant is $K_1 
\le \max \{K', K'^2\} % \max_i((\zeta_{r,i}^e)^2)
$ for 
$K' = c_3 c_r^2 \kappa_r^2$.
For the final answer, we will assume $K, K' \ge 1$. Then,
letting $\delta = \delta_r + \delta_s$, we get the final bound: with probability $1- \delta$, we have 
\begin{align}
    % R \coloneqq
    \Ds{\vx^e}_2 
    &\le 1 
    + c_s
    % + \alpha^e\sqrt {d_s}  
    + c_a \kappa_s
    + c_r \kappa_r
    + \sqrt {\frac{c_a^4\kappa_s^4}{c_2} \log \frac{1}{\delta}}
    + \sqrt {\frac{c_r^4\kappa_r^4}{c_3} \log \frac{1}{\delta}}
    ,
    \nonumber
    \\
    &= O \ps{c_s
    % + \alpha^e\sqrt {d_s}  
    + c_a \kappa_s
    + c_r \kappa_r + 
    (c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)\sqrt{\log \frac{1}{\delta}}
    }
    % \\
    % &=O \ps{\sqrt{\log \frac{1}{\delta}}}
    .
\end{align}
This demonstrates that $\Ds{\vx^e}_2$ is a sub-Gaussian random variable; its sub-Gaussian norm is  $\kappa_{\vx} = O(\max\{\kappa_s^2, \kappa_r^2\})$. Also from the above analysis, we can see that the population mean of the norm is 
\begin{equation}
    \EE [\Ds{{\vx^e}}_2] = 1 + c_s
    + c_a \kappa_s
    + c_r \kappa_r.
\end{equation}
% $\EE[\vx_r^2] = \mu_r^2$
% $\sigma_r^2 = \sum_{i\in S_r} \sigma_{i}^2 $. 
\end{proof}

% \begin{proposition}[Environmental estimation error with footprint]
% \label{prop:vomega-S}
%     We have
%     \begin{equation}
%         \Ds{\hat \beta^e_S - \beta^e_S}_2 \le \sqrt{d_\inv/n}.
%     \end{equation}
%     \end{proposition}
% \begin{proof}
% Note that the label randomness is entirely contained within $\epsilon^e$. We also only consider $|S| \le d_\inv$. 
% \begin{align}
%     y &= \gamma^ \top \vx^e_\inv + \epsilon_\inv = \beta^{*\top}\vx^e + \epsilon_\inv,\\
%     &= \beta_S^{e\top}\vx^e+ (\beta^* - \beta^e_S)^{\top}\vx^e + \epsilon_\inv
%     % \\
%     % &= \hat {\beta^e_S}^\top \vx^e
% \end{align}
% From this, we are back to linear regression with a $L_0$ norm regularization. Then, by Theorem 2 in \citep{banerjee2015estimation}, we have $\Ds{\hat \beta^e_S - \beta^e_S}_2\le \frac{w(B_{L_0})}{\sqrt n}$ for Gaussian width of the regularizer norm ball $w(B)$. We know that the Gaussian width of a $L_0$ ball is $\sqrt{d}$ for $d$ nonzero features, and we know that 
% $|S| \le d_\inv$. 
% We then have $\Ds{\hat \beta^e_S - \beta^e_S}_2\le\sqrt{d_\inv/n}$.
% \end{proof}



\begin{lemma}[Empirical gap]
\label{lemma:missing err-appx}
% Given environment $e \in \cE$ and selected features $S\in 2^d$, and with probability $1-\delta$,
Let  
$\beta_S^e = \argmin_{\vv \in \Sp(S)} \EE^e[(y-\vv^\top \vx)^2]$ and  
$\hat \beta_S^e = \argmin_{\vv \in \Sp(S)} \hat \EE^e[(y-\vv^\top \vx)^2]$ respectively be 
population and empirical minimizers, with covariance matrix 
$\hat \Sigma^e = \hat \EE^e [  \vx \vx ^\top] $ 
and footprint size $|S| \le d_\inv$.
Then we have with probability $1-\delta$,
\begin{equation}
\label{eqn:empirical}
\hat \EE^e[(y - \hat {\beta_S^e} ^\top \vx )^2] - \hat \EE^e[(y-{\beta_S^e}^\top \vx)^2] = \Ds{\beta_S^e- \hat \beta_S^e}_{\hat \Sigma^e}^2  +  \err(1/\delta, n), 
\end{equation}
where $\err(\frac{1}{\delta}, n) $ depends on assumptions on the generation model, specified in \Cref{prop:missing-emp-general}.
\end{lemma}
\begin{proof}
\label{prf:missing err-appx}
We note that $\vv_S^\top \vx = \vv^\top \Phi(\vx)$ if $\Phi$ selects the same features as $S$, where $\Phi(\vx)$ masks $\vx$ and $\vv_S$ masks $\vv$. We continue with the set notation $\beta_S$ for this proof, but the proof applies when working with feature mask $\Phi$.

First, we define a noise variable $\omega_S^e$:
\begin{equation}
\label{eqn:define-omegae}
\omega^e_S =y - (\beta^e_S)^\top {\vx^e}  = (\beta^* - \beta^e_S)^\top{\vx^e} + \epsilon_\inv.
\end{equation}

We proceed with the algebraic proof.
\begin{align}
&\hat \EE^e \bs{
    (y-{\vx^e}^\top \hat \beta^e_S)^2 - (y-{\vx^e}^\top \beta^e_S)^2
}
\\
&= 
    \hat \EE^e \bs{
    ({\vx^e}^\top \hat \beta^e_S)^2 
    } 
    - 2\hat \EE^e \bs{
     y{\vx^e}^\top (  \hat \beta^e_S - \beta^e_S) 
    }
    -\hat \EE^e \bs{
     ({\vx^e}^\top \beta^e_S)^2
    }
\label{align:step1}
\\
&=
        \hat \EE^e \bs{
            ({\vx^e}^\top \hat \beta^e_S)^2
        } 
        - 2\hat \EE^e \bs{
         ({\beta^e_S}^\top {\vx^e}  + \omega^e_S){\vx^e}^\top (  \hat \beta^e_S - \beta^e_S) 
        }-\hat \EE^e \bs{
     ({\vx^e}^\top \beta^e_S)^2
    }
\\
&=
        \hat \EE^e \bs{
            ({\vx^e}^\top \hat \beta^e_S)^2
        } 
        - 2\hat \EE^e \bs{
         ({\vx^e}^\top \beta^e_S){\vx^e}^\top (  \hat \beta^e_S - \beta^e_S) 
        }
         -
         \hat \EE^e \bs{
        ({\vx^e}^\top \beta^e_S)^2
        } 
        + 
        \err(1/\delta, n )
\label{align:step2}
\\&=
    \hat \EE^e \bs{
    ({\vx^e}^\top \hat \beta^e_S)^2 
    } 
    -2 \hat \EE^e \bs{
    {\vx^e}^\top \hat \beta^e_S {\vx^e}^\top \beta^e_S 
    }
    + \hat \EE^e \bs{
    ({\vx^e}^\top \beta^e_S)^2
    }
    + \err(1/\delta, n ) 
    \nonumber
\\
&= 
    \hat \EE^e \bs{
    \ps{({\vx^e}^\top \hat \beta^e_S)
    - ({\vx^e}^\top \beta^e_S)}^2
    } 
 + \err(1/\delta, n ) \nonumber
\\&= 
    (\hat \beta^e_S - \beta^e_S)^\top
    \hat\EE^e \bs{{\vx^e} {\vx^e}^\top
    % ({\vx^e}^\top \hat \beta^e_S)
    % - ({\vx^e}^\top \beta^e_S))^2
    } (\hat \beta^e_S - \beta^e_S)
+ \err(1/\delta, n )
\nonumber
\\&= \left \|  
    \hat \beta^e_S - \beta^e_S
\right\|_{\hat \Sigma^e}^2 
+ \err(1/\delta, n ).
\end{align}


We note that step \Cref{align:step1} to \Cref{align:step2} is where the $\err(\frac{1}{\delta},n)$ term is introduced.
The proof of Proposition 5 in \citet{hsu2014random} applies to population error, making use of $\EE[y] = \EE[ {\beta^*}^\top \vx]$, which holds neither for the empirical risk nor for classifiers $\beta^e$ or $\beta_S^e$ that are not the (invariant optimal) ground truth. 
\end{proof}

\begin{corollary}[Empirical gap dominates]
\label{cor:lessthanzero}
Following definitions introduced in \Cref{lemma:missing err-appx}, we have with probability $1-\delta$,
\begin{equation}
\label{eqn:empirical-gap-dominates}
 \Ds{\beta_S^e- \hat \beta_S^e}_{\hat \Sigma^e}^2 
 \le  |\err(1/\delta, n)|.
\end{equation}
\end{corollary}
\begin{proof}
Recalling that $\hat \beta_S^e = \argmin_{\vv \in \Sp(S)} \hat \EE^e[(y-\vv^\top \vx)^2]$, we know that  
\[
% $
\hat \EE^e \bs{
    (y-{\vx^e}^\top \hat \beta^e_S)^2 - (y-{\vx^e}^\top \beta^e_S)^2
} \le 0.
% $
\]
\end{proof}
%
%
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%%  3 situations evaluating err
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proposition}[Empirical gap general]
% This brings us to the result in \cref{eqn: v diff hat sigma}. We then evaluate $\err(\logdel, n)$.
\label{prop:missing-emp-general}
Given environment $e \in \cE$ and selected features $S\in 2^d$, and with probability $1-\delta$,
\begin{equation}
    \label{eqn: concentration y x vdiff}
    \err(1/\delta, n) 
\coloneqq O \ps{
 d_\inv c_\total + 
 \frac{Kd_\inv }{\sqrt n}\log\frac{{1}}{{\delta}}}
\end{equation}
for $c_\total = \max \{1 + \kappa_\inv^2 + (c_z^2 + \kappa_\inv^2 + c_a^2) + c_r^2 \}$ and $K^2 = \max \ps{\kappa_{\vx}^4, \kappa_\inv^2}$.
\end{proposition}
\begin{proof}
\label{proof:missing-emp-general}
We  have the definition
\begin{equation}
    \err(\frac{1}{\delta}, n) \coloneqq 
    -2 \hat \EE^e[\omega^e_S{\vx^e}^\top(\hat \beta^e_S - \beta^e_S)].
\end{equation}
 We will now upper bound the second term with probability $1-\delta$.
\begin{align}
\label{eqn:3factors}
    \err(\frac{1}{\delta}, n)
    &\le   2 \left \lvert
    \hat \EE^e \bs{
        \omega^e_S(\vx^e)^\top (\hat \beta^e_S - \beta^e_S)
    } \right \rvert 
    \\
     % &\le 
    % \left \lvert
    % \hat \EE^e [
    % (\beta^* - \beta^e_S)^\top  \vx^e (\vx^e)^\top (\hat \beta^e_S - \beta^e_S)
    % ]
    % \right \rvert
    % + 
    % \left \lvert
    % \hat \EE^e [ \epsilon_\inv \vx^e]^\top (\hat \beta^e_S - \beta^e_S) 
    % \right \rvert
    % \\
    % &\le \frac{1}{n} \sum_{i=1}^n 
    % A_i B_i
    % + \sqrt{2 d_\inv}  
    % \left \lVert \frac{1}{n}\sum_{i=1}^n\epsilon_{\inv,i}\vx^e_{i} \right \rVert_2\\ 
    &\le 
    \left \lvert
    (\beta^* - \beta^e_S)^\top \hat \EE^e [ \vx^e (\vx^e)^\top ](\hat \beta^e_S - \beta^e_S)
    \right \rvert
    + 
    \left \lvert
    \hat \EE^e [ \epsilon_\inv \vx^e]^\top (\hat \beta^e_S - \beta^e_S) 
    \right \rvert\label{eqn:errbound}
    \\
    &\le 2d_\inv \left \lVert\frac{1}{n}(X^e)^\top X^e\right \rVert _2 
    + \sqrt{2 d_\inv}  
    \left \lVert \frac{1}{n}\sum_{i=1}^n\epsilon_{\inv,i}\vx^e_{i} \right \rVert_2 
    %     &\le \frac{1}{n} \Ds{(\beta^* - \beta^e_S)^\top X^e  + \epsilon_\inv \vone^{n_e}}_2 
    % \Ds{ {X^e}^\top(\hat \beta^e_S - \beta^e_S)}_2
.
\end{align}
% Above, we apply Cauchy-Schwartz to get $A_i =\Ds{ (\beta^* - \beta^e_S)^\top  \vx^e}_2 $ and $B_i = \Ds{(\hat \beta^e_S - \beta^e_S)^\top \vx^e }_2$. 
% Since we have a high-probability bound on $\Ds{\vx^e}_2$, we can say that 



With constants $c_1, c_2 > 0$ and $t_1, t_2 > 0$ we can use Bernstein's inequality to get, with probability 
$1 - 2\exp \ps{ -c_1 \min \{
\frac{t_1^2}{\kappa_{\vx}^4}, 
\frac{t_1}{\kappa_{\vx}^2}\} n } $,
\begin{equation}
\label{eqn:secondmoment-part}
\left \lVert\frac{1}{n}(X^e)^\top X^e\right \rVert _2
\le \frac{1}{n} \sum_{i=1}^n \left \lVert \vx_i^e\right \rVert^2 _2
\le \EE^e[\left \lVert \vx^e\right \rVert^2 _2] +  t_1
.
\end{equation}
Similarly, with the Hoeffding-type inequality, we have with probability $1 - e \cdot \exp\ps{-\frac{c_2t^2_2n}{K^2}}$,
% \jdcomment { the norm is itself a sub-gaussian RV, calculate its psi 2 norm}
\begin{equation}
\label{eqn:einv-part}
\left \lVert \frac{1}{n}\sum_{i=1}^n\epsilon_{\inv,i}\vx^e_{i} \right \rVert_2
\le \frac{1}{n} \sum_{i=1}^n \left \lVert \epsilon_{\inv,i}\vx^e_{i} \right \rVert_2 
\le \EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2] +  t_2
.
\end{equation}
Combining, we get
\begin{equation}
\err(\frac{1}{\delta}, n) \le 2 d_\inv (\EE^e[\left \lVert \vx^e\right \rVert^2 _2] 
 + t_1)+ \sqrt{2d_\inv}  (\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2] +  t_2).
\end{equation}
We let both share the same bound $t_1 = t_2 = t$. Then, with constants $C,c >0$ and $K^2 = \max \ps{\kappa_{\vx}^4, \kappa_\inv^2}$, we apply a union bound to obtain a bound that holds with probability
% In combination, for $t = \min(t_1, t_2)$ and 
$1-c \exp \ps{ -C \min \{
\frac{t^2}{K^2} , 
\frac{t}{\kappa_{\vx}^2}\} n } 
% + e \cdot \exp\ps{\frac{c_2t^2_2n}{K^2}}
$. Due to the mixture of tails resulting from Bernstein's inequality of a sum of sub-exponential variables, we will upper bound the maximum of the two. In other words, $t \le \sqrt{\frac{K^2}{Cn}}\log \frac{c}{\delta}$, and 
\begin{equation}
\err(\frac{1}{\delta}, n) 
\le 2 d_\inv \EE^e[\left \lVert \vx^e\right \rVert^2 _2]  
+  \sqrt{2 d_\inv}\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2]
+ 2 d_\inv \sqrt{\frac{K^2}{Cn}}\log \frac{c}{\delta}. 
\end{equation}
We substitute $\EE^e[\left \lVert \vx^e\right \rVert^2 _2] \le 1 + \kappa_\inv^2 + (c_z^2 + \kappa_\inv^2 + c_a^2\kappa_s^2) + c_r^2 \kappa_r^2$ and $\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2] \le \kappa_\inv ( 1 + c_s
    + c_a \kappa_s
    + c_r \kappa_r)$ into the above to get the desired result.

% \begin{equation}
    
% \end{equation}


% \begin{align}
% \label{eqn:3factors}
%     \err(\frac{1}{\delta}, n)
%     &\le   2 \left \lvert
%     \hat \EE^e \bs{
%         \omega^e_S{\vx^e}^\top(\hat \beta^e_S - \beta^e_S)
%     }
%     \right \rvert \\
%     &\le   2 \left \lvert \hat \EE^e \bs{
%         \ps{(\beta^* - \beta^e_S)^\top \vx^e + \epsilon_\inv}
%         {\vx^e}^\top(\hat \beta^e_S - \beta^e_S)
%     } \right \rvert
%     % + 2 \left \lvert \hat \EE^e \bs{
%     %     \epsilon_\inv
%     %     {\vx^e}^\top(\hat \beta^e_S - \beta^e_S)
%     % }\right \rvert 
%     \\
%     &\le \frac{1}{n} \Ds{(\beta^* - \beta^e_S)^\top X^e  + \epsilon_\inv \vone^{n_e}}_2 
%     \Ds{ {X^e}^\top(\hat \beta^e_S - \beta^e_S)}_2
%     % + 2 \left \lvert \hat \EE^e \bs{
%     %     \epsilon_\inv
%     %     {\vx^e}^\top(\hat \beta^e_S - \beta^e_S)
%     % }\right \rvert
% .
% \end{align}


\end{proof}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%%  with ones
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{corollary}[Empirical gap with ones]
\label{cor:missing-emp-ones}
In the original setting in \citet{zhouSparseInvariantRisk2022}, where the scaling variables are $\vzeta^e_s = \vone^{d_s}$ and $\vzeta_r = \vone^{d_r}$, we have
\begin{equation}
    \err(\frac{1}{\delta},n)
    \le c_{\textnormal{ones}} d_\inv
+ O \ps{ d_\inv \sqrt{\frac{K^2}{n}}\log \frac{1}{\delta}}
,
\end{equation}
for $c_{\textnormal{ones}} =  c(1  +\kappa_\inv^2 + c_A^2\kappa_s^2)^2$ with a constant $c > 0$, and $K^2 = O(d_\inv)$.
\end{corollary}
\begin{proof}
We modify the bound for the error term introduced in \Cref{prop:missing-emp-general} with $\vzeta^e_s = \vone^{d_s}$ and $\vzeta_r = \vone^{d_r}$. 
That is,  $\vx_s = y \vone^{d_s} + \valpha^e \odot \vepsilon_s$ and $\vx_r = \vepsilon_r$. 
\begin{remark}The spurious features are generated with constant contribution from the label per feature. That is,
\[\EE^e[x_{s,i}] = \EE^e[y] = \EE^e[\gamma^\top \vx_\inv] = O(1).
\] 
In this case, $\EE^e[\Ds{\vx^e}_2] \ge c\sqrt{d_s}$ for constant $c > 0$. Because this will introduce an undesirable dependency on $d_s$, we avoid evaluating $\Ds{\vx^e}_2$ explicitly.
\end{remark}
Again, we want to bound the expression from \Cref{eqn:errbound}.
% , with the primary change in computing the expressions in \Cref{eqn:secondmoment-part} and \Cref{eqn:einv-part}. 
% The second moment in \Cref{eqn:secondmoment-part} is instead $O(\sqrt{d_s})$, shown in \Cref{lemma:xe subg lemma orig}.
% \jdcomment{snippets that may come in handy}
% We want to bound
% \begin{equation}
% \err(\frac{1}{\delta}, n)
%     % \le  \left \lvert 2 \hat \EE^e \bs{\ds{
%     %     \omega^e_S{\vx^e}^\top(\hat \beta^e_S - \beta^e_S)
%     % }} \right \rvert
%     \le 2 \hat \EE^e \bs{\ds{
%         \omega^e_S
%     }}
%    \EE^e \bs{ {\vx^e}}^\top
%      (\hat \beta^e_S - \beta^e_S).
% \end{equation}
\begin{align}
\label{eqn:3factors-ones}
    \err(\frac{1}{\delta}, n)
    &\le   2 \left \lvert
    \hat \EE^e \bs{
        \omega^e_S(\vx^e)^\top (\hat \beta^e_S - \beta^e_S)
    } \right \rvert 
    \\
     &\le 
    \left \lvert
    \hat \EE^e [
    (\beta^* - \beta^e_S)^\top  \vx^e (\vx^e)^\top (\hat \beta^e_S - \beta^e_S)
    ]
    \right \rvert
    + 
    \left \lvert
    \hat \EE^e [ \epsilon_\inv \vx^e]^\top (\hat \beta^e_S - \beta^e_S) 
    \right \rvert
    \\
    &\le \frac{1}{n} \sum_{i=1}^n 
    A_i B_i
    + \sqrt{2 d_\inv}  
    \left \lVert \frac{1}{n}\sum_{i=1}^n\epsilon_{\inv,i}\vx^e_{i} \right \rVert_2 
    % &\le 
    % \left \lvert
    % (\beta^* - \beta^e_S)^\top \hat \EE^e [ \vx^e (\vx^e)^\top ](\hat \beta^e_S - \beta^e_S)
    % \right \rvert
    % + 
    % \left \lvert
    % \hat \EE^e [ \epsilon_\inv \vx^e]^\top (\hat \beta^e_S - \beta^e_S) 
    % \right \rvert
    % \\
    % &\le 2d_\inv \left \lVert\frac{1}{n}(X^e)^\top X^e\right \rVert _2 
    % + \sqrt{2 d_\inv}  
    % \left \lVert \frac{1}{n}\sum_{i=1}^n\epsilon_{\inv,i}\vx^e_{i} \right \rVert_2 \label{eqn:errbound}
    %     &\le \frac{1}{n} \Ds{(\beta^* - \beta^e_S)^\top X^e  + \epsilon_\inv \vone^{n_e}}_2 
    % \Ds{ {X^e}^\top(\hat \beta^e_S - \beta^e_S)}_2
.
\end{align}
Above, we apply Cauchy--Schwarz to get $A_i =\Ds{ (\beta^* - \beta^e_S)^\top  \vx^e}_2 $ and $B_i = \Ds{(\hat \beta^e_S - \beta^e_S)^\top \vx^e }_2$. 
% Since we have a high-probability bound on $\Ds{\vx^e}_2$, we can say that 
% Starting again from \Cref{eqn:define-omegae}, we observe
% \begin{equation}    
% \ds{(\beta^* - \beta^e_S)^\top \vx^e }
% \le \left | \sum_{i=1}^n \bbone [i \in S \cup S_\inv] x^e_i \right |
% \le c_1 \sqrt{d_\inv} + c_2\sqrt{\log \frac{1}{\delta'}}
% % O(\sqrt{d_\inv \log \frac{1}{\delta_{\vx}}})
% .
% \end{equation}
$A_i$ is a sub-Gaussian random variable with mean at least $\sqrt{d_\inv}$. Likewise for $B_i$. Furthermore, let $K = \max\{\Ds{A_i}_ {\psi_2}, \Ds{B_i}_ {\psi_2}\} $, where the sub-Gaussian norm of both $A_i$ and $B_i$ is $O(\sqrt{d_\inv})$ (see final remark in \Cref{lemma:xe subg lemma orig}), given that we only sum up to $2d_\inv$ elements of the total features of $\vx^e$.

Then, for constants $C,c >0$ and $t_1> 0$, with probability 
$1 - 2\exp \ps{ -c \min \{
\frac{t_1^2}{K^4}, 
\frac{t_1}{K^2}\} n }  
$, we can say 
\begin{equation}
\frac{1}{n} \sum_{i=1}^n 
    A_i B_i
    \le 
    c d_\inv(1  +\kappa_\inv^2 + c_A^2\kappa_s^2)  + t_1.
\end{equation}
Combining the two terms, we obtain the bound 
\begin{align*}
\err(\frac{1}{\delta},n)
\le c d_\inv(1  +\kappa_\inv^2 + c_A^2\kappa_s^2)^2
% &\le
% ((\mu_{\vx} + R')\sqrt{2d_\inv} + \kappa_\inv + \sqrt{\log 1/\delta})(\mu_{\vx} + R) \sqrt{\frac{d_\inv}{n}} \\
% &= O \ps{
% \frac{ d_sd_\inv}{\sqrt n}\log\frac{{1}}{{\delta}}
% }
+  \sqrt{2 d_\inv}\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2]
+ 2 d_\inv \sqrt{\frac{K^2}{Cn}}\log \frac{c}{\delta}
. 
\end{align*}

% Putting this together, wehave the same bound 
% Instead of repeatedly applying Cauchy-Schwartz, we note that 
% the bound for $\EE^e[\Ds{\vx^e}_2]$ scales with $\sqrt{\EE^e[x^2_{s,i}]} $
% \[\EE^e[x^2_{s,i}] 
% % \ge \EE^e[(y + \alpha^e_i \epsilon_{s,i})^2] 
% \ge \EE^e[y^2] = \EE^e[(\gamma^\top \vx_\inv)^2] + \Var(\epsilon_\inv) = O(1)
% \] 

% With the original data generation model, we have  $\mu_{\vx} = \EE[\Ds{\vx^e}_2] \ge 1+ \kappa_s\sqrt{d_s}$. 
% This is shown in \Cref{cor:l2-bound-theirs}.

% \begin{remark}
% This takes advantage of the constant expected value $\EE^e[\Ds{\vx^e}_2]$, due to $\vzeta_s^e$ and $\valpha^e$ having bounded norms. 
% Because this is not the case in the original setup of \citep{zhouSparseInvariantRisk2022}, this approach will induce a $\sqrt{d_s}$ dependency. 
% that $(\beta^* - \beta^e_S)^\top \vx^e$ selects at most $2d_\inv$ features of $\vx^e$ in \Cref{cor:missing-emp}.
% % Instead, we see that the ${\log\frac{d}{\delta}}$ arises instead.
% \end{remark}

% \jdcomment{Something second momenty here?}
% Substituting this into the analysis of \Cref{prop:missing-emp-general}, 
% with a final expression with $K = O(\sqrt{d_\inv}$ 



% With probability $1-\delta$, we have the bound 
% \begin{equation}    
% \Ds{\vx^e}_2 \le  O\ps {
% 1
% + \sqrt{d_s} \kappa_s
% + \sqrt{d_r}\kappa_r
% +\sqrt { \log \frac{1}{\delta}}},
% \end{equation}
% for positive constants $c_s, c_a, c_r$.

% % From \Cref{lemma:xe subg lemma orig} we h
% Note that each spurious feature has a constant mean at least.
% \begin{equation}
%     \EE^e[x_{s,i}] 
%     =\EE^e[ \gamma^\top \vx_\inv + \epsilon_\inv  +\alpha^e_i \epsilon_{s,i}] 
%     = \gamma^\top\EE^e[ \vx_\inv] = O(1).
% \end{equation}
% Knowing this, a lower bound of the feature vector as a whole has to be $\ge \sqrt {d_s}$.
% \begin{equation}
%     \Ds{\vx^e}_2 = 
% \end{equation}



\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%%  with uniform feature
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{corollary}[Missing empirical term with uniform feature assumption]
\label{cor:missing-emp-uniform} When the norms of $\vx_s, \vx_r$ are uniformly distributed, i.e.~$\vzeta_s^e = \frac{1}{d_s} \cdot \vone^{d_s} $ and $\vzeta_r = \frac{1}{d_r}\cdot\vone^{d_r}$, we have with probability $1-\delta$,
\begin{equation}
    \err(\frac{1}{\delta},n)
\le  O( (c_s^2 + c_a^2 + c_r^2)^2 \cdot \frac{d_\inv}{\min\{d_s, d_r\}})
+  \sqrt{2 d_\inv}\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2]
+ 2 d_\inv \sqrt{\frac{\kappa_\inv^2}{Cn}}\log \frac{c}{\delta}
. 
\end{equation}
\end{corollary}
\begin{proof}
We can also infer that for a constant $b > 0$, $\max_{j\in S_\inv} x_j^2 \ge \frac{b}{d_\inv}$ because $\Ds{\vx_\inv}_2^2 = 1$.

We again bound \Cref{eqn:errbound} by considering  $A_i =\Ds{ (\beta^* - \beta^e_S)^\top  \vx^e}_2 $ and $B_i = \Ds{(\hat \beta^e_S - \beta^e_S)^\top \vx^e }_2$. 
With the uniformity assumptions, we can then say that for vectors $\vv \in \Sp(S_1), \vu \in \Sp(S_2)$, for $|S_1|, |S_2|  \le d_\inv$,
\begin{equation}
    (\vv - \vu)^\top \vx^e 
    \le  
    2 + \sum_{j\in S_s}(v_j - u_j) x_j^e + \sum_{j \in S_r}(v_j - u_j) x_j^e
    \le 2 +\sqrt{ d_\inv} \ps{\frac{c_s^2 + c_a^2}{\sqrt{d_s}} 
    + \frac{c_r^2}{\sqrt{d_r}} }.
\end{equation} 
% Letting $\frac{1}{d'} = \max \{\frac{1}{d_s}, \frac{1}{d_r}\}$, 
We get $A_i B_i \le O( (c_s^2 + c_a^2 + c_r^2)^2 \cdot \frac{d_\inv}{\min\{d_s, d_r\}})$.
Then, for constants $C, c > 0$ and any $t_1 > 0$, with probability at least
$1 - 2\exp \ps{ -c \min \{
\frac{t_1^2}{K^4},
\frac{t_1}{K^2}\} n }  
$, we can say 
\begin{equation}
    \err(\frac{1}{\delta},n)
\le O( (c_s^2 + c_a^2 + c_r^2)^2 \cdot \frac{d_\inv}{\min\{d_s, d_r\}})
+  \sqrt{2 d_\inv}\EE^e[\left \lVert \epsilon_\inv \vx_i^e\right \rVert _2]
+ 2 d_\inv \sqrt{\frac{\kappa_\inv^2}{Cn}}\log \frac{c}{\delta}
. 
\end{equation}

\begin{remark}
\label{rem:10overparameterized}
This result is significantly tighter due to the $\frac{d_\inv^2}{\min\{d_s, d_r\}}\le d_\inv$ factor in the first term, generated from the mean of $\omega^e_S(\vx^e)^\top (\hat \beta^e_S - \beta^e_S)$. 
Because in the overparameterized case $d_\inv \ll d_s + d_r$, we expect that the predictors $(\hat \beta^e_S - \beta^e_S)$ and $(\beta^* - \beta^e_S)$ are not likely to pick up the majority of the length of $\vx^e$ with only $2d_\inv$ features. This is the best-case scenario, in which no ``heavy hitters'' contributing to $c_s^2$ or $c_r^2$ are captured by the linear predictors.
\end{remark}
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% old thing for the general case proposition
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



% \jdcomment{is the below wrong}
% First, we use triangle inequality with the definition in  \Cref{eqn:define-omegae} to bound $\hat \EE^e[|\omega^e_S|] \le\hat  \EE^e[\ds{(\beta^* - \beta^e_S)^\top \vx^e}] + \hat \EE^e[|\epsilon_\inv|] $. 
% Then, applying a Hoeffding-type inequality to bound $ \hat \EE^e[{\vx^e}]$
% we have with probability $1- \delta'$, and  $R' = \sqrt{\frac{\kappa_{\vx}^2}{c_1n}\log \frac{1}{\delta'}}$ for a constant $c_1 >0$, 
% \begin{equation}    
% \ds{(\beta^* - \beta^e_S)^\top \vx^e }
% \le 
% (\mu_{\vx} + R')\sqrt{2d_\inv}
% % O(\sqrt{d_\inv \log \frac{1}{\delta_{\vx}}})
% .
% \end{equation}
% % and the assumption $\Ds{\beta^e_S}_2\le d_\inv$, 
% We then upper bound $ \hat \EE^e[|\epsilon_\inv|] \le c_2\kappa_\inv + \sqrt{\frac{\kappa_\inv^2}{c_3n}\log \frac{1}{\delta_\inv}}$ with probability $1-\delta_\inv$, again with a Hoeffding-type inequality and absolute constants $c_2, c_3 > 0$.
% % \begin{equation*}
% % \hat \EE[ \ds{ 
% % (\beta^* - \beta^e_S)^\top \vx^e
% % }] 
% % \le \Ds{\beta^* - \beta^e_S}_2 
% % \hat \EE[\Ds{\vx^e}_2]
% % \le \sqrt{2d_\inv}
% % (\mu_{\vx}+ \sqrt {\frac{\kappa_{\vx}^2}{cn} \log \frac{1}{\delta_{\vx}}})
% % \end{equation*}
% for $\kappa_{\vx}$ the sub-Gaussian norm of $\Ds{\vx^e}_2$ as defined in \Cref{lemma:norm-x}.
% % Then, $\ds{\epsilon_\inv}$ is the absolute of a sub-Gaussian variable, thus has sub-Gaussian norm $\kappa_\inv$. This allows us to bound with probability $1 - \delta_1$ that $\hat \EE^e[|\epsilon_\inv|] \le \kappa_\inv + O(\kappa_\inv\sqrt{\log \frac{1}{\delta_1}})$.
% % Then, the last component can be separated into 
% \begin{equation}
% \label{eqn: bounding empirical omegae}
%     % \hat \EE^e[|\omega^e_S|]
%     \err(\frac{1}{\delta},n)
%     \le 2((\mu_{\vx} + R')\sqrt{2d_\inv} + \kappa_\inv + \sqrt{\log 1/\delta})(\mu_{\vx} + R) \sqrt{\frac{d_\inv}{n}}
% \end{equation}
% To analyze the factors of \Cref{eqn:3factors}, we first note from \Cref{lemma:norm-x} that we have with probability $1-\delta_{\vx}$ 
% $\Ds{{\vx^e}}_2 \le \mu_{\vx} + R$ for $\mu_{\vx} = \EE[\Ds{\vx^e}_2] = 1+ c_s    + c_a \kappa_s     + c_r \kappa_r= O(c_s    + c_a    + c_r)$,
% and $R = O\ps{(c_a^2\kappa_s^2 + c_r^2 \kappa_r^2)\sqrt {\log \frac{1}{\delta_{\vx}}}}$. 
% % The norms are delineated in the definition of \Cref{lemma:norm-x} but will be left out going forward for clarity.

% We assume population optima are normalized $\Ds{\beta^*}_2  = \Ds{\beta^e_S}_2 = 1$. Note that $\Ds{\beta^*}_0 = d_\inv$ and $\Ds{\beta^e_S}_2 \le |S| \le d_\inv$.
% Then, $\Ds{\hat \beta^e_S - \beta^e_S}_2 \le \sqrt{d_\inv/n}$
% through an application of Theorem 2 in \citep{banerjee2015estimation} to the $L_0$ restricted error set, as detailed in \Cref{prop:vomega-S}.

% Using $\delta = \delta' + \delta_{\vx} + \delta_\inv$ gets us
% \begin{equation}
% \err(\frac{1}{\delta},n)
% \le
% % O \ps{
% % \sqrt{\log \frac{1}{\delta}}
% % } 
% O \ps{c_{\total}
% \frac{d_\inv}{\sqrt n}\log\frac{{1}}{{\delta}}
% }
% \end{equation}
% where the combined dependence of the constants is $c_{\total} = \kappa_\inv\max \{c_s^2, c_a^4\kappa_s^4, c_r^4\kappa_r^4 \}$. 
% Stripping away the constants, with an overall probability $1-\delta$, we have the desired bound for $\err(1/\delta, n)$.