\begin{proof}
\label{appdx:proof-mtl}
The proof starts by using the same error breakdown as the proof of \Cref{thm:info-theory} in \Cref{proof:info-theoretic-thm}. The application of \Cref{xib lemma} to bound $\xi_a(S)$ and $\xi_b(S)$ remains the same, as does the bound for $\xi_c(S)$. 

The departure comes into effect when comparing $\hat \cL_\mm(S_\inv)$ with $\hat \cL_\mm(S)$, for two categories of $S$: those that include spurious features, and those without. 

We first state some additional assumptions for the modified Multitask case.
\subsection{Multitask assumptions}
\label{sec:assumptions}

\begin{assumption}[Multitask IFM assumption]
In addition to the condition on $\sigma_0^2$ from before, we need $\sigma_\total^2$ to be large enough, where
\begin{equation}
\sigma_\total^2\coloneqq
\sum_{e\in \cE}
     \cR^e( \gamma^e)
     = \sum_{e\in \cE}
     \cR^e( \beta_\inv^e)
     = \ds{\cE} \sigma_\inv^2
.
\end{equation}
% We are essentially interested in $\epsilon_\inv^e$, which is allowed to change per environment, must have a minimum total. 

\end{assumption}



\subsection{Multitask proof: differences in L0 case}
We will first show that $\hat \cL_\mm(S_\inv) \le \hat \cL_\mm(S)$ for all $S \in \cS_\text{with-spu}$. 
We include only the lines that differ from the single-task analysis; for ease of reading, the highlighted terms mark the differences within the multi-line algebra.
\begin{align}
\label{eqn: inv,r,s vs inv}
\hat \cL_\fmm (S) - \hat \cL_\fmm (S_{\inv})
&= \sum _e \bs{
\mathhl{\hat \cR^e(\hat \beta^e_S)} + \rho \ps{ \hat \cR^e(\hat \beta_S) - \hat \cR^e (\hat \beta_S^e)} 
}\nonumber\\
&\quad - 
    \sum _e \bs{
    \mathhl{\hat \cR^e(\hat \beta^e_\inv)} + \rho \ps{ \hat \cR^e(\hat \beta_\inv) - \hat \cR^e (\hat \beta_\inv^e)}
}   ,
\\
%
&= \sum _e \bs{
\hat \cR^e (\hat \beta^e_S)}
+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S)
\nonumber \\
&\quad - \xi_b(S_\inv) - \mathhl{\sigma_\total^2}
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv).
\end{align}
For the second equality, instead of $\xi_a(S_\inv)$, we examine $\xi_b(S_\inv)$, using $\sigma_\inv^2$ as the variance of $\epsilon_\inv$: 
\[
\xi_b(S_\inv) = \sum_e\bs{\hat \cR^e(\hat \beta^e_\inv) - \cR^e( \beta_\inv^e)} 
= \sum_e \bs{\hat \cR^e(\hat \beta^e_\inv)} - \sigma_\total^2 .
\]
By dropping the nonnegative empirical risk terms $\sum_e\hat \cR^e( \cdot)  \ge 0$ and using $\ds{\xi_a(S)} \le \ds{\xi_b(S)}$ for all $S\in 2^d$, we get a lower bound:
\begin{align}
\label{eqn:converge}
\hat \cL_\fmm (S) - \hat \cL_\fmm (S_{\inv})
&\ge - \rho\xi_b(S) + \rho \xi_c(S) 
\nonumber \\ &\quad
\mathhl{- \xi_b(S_\inv) - \sigma_\total^2}
- \rho\xi_a(S_\inv)  
- \rho \xi_c(S_\inv),\\
&\ge -(2\rho + 1)\ds{\xi_b} - \sigma_\total^2
+  \rho \xi_c(S).
\end{align}
From \cref{eqn:converge} onward, the analysis coincides with the single-task case. For the sake of completeness, we repeat the remaining steps: the second inequality uses $\xi_c(S_\inv) = 0$.
We then select $\rho$ large enough that $\rho\, \xi_c(S)$ offsets the $-\sigma_\total^2$ term, which gives
% We set our penalty weight to let the gap be at least 0.
\begin{equation}
    \rho = 
     \frac{\sigma^2_\total}{C^2\Delta^2/\lambda_{\max}^3}
     \ge \frac{\sigma^2_\total}{\xi_c(S)}.
\end{equation}
This evaluates to the same expression as in the single-task case, as $\sigma_\total^2 = \sum_{e} \sigma_\inv^2$.


Then, for the second case, we still have $\hat \cL_\fmm(S_\inv) \le \hat \cL_\fmm(S)$ for all $S \in \cS_\text{no-spu}$: 
\begin{align}
\hat \cL_{\fmm} (S) - \hat \cL_\fmm (S_{\inv})
&= \sum _e \bs{
\hat \cR^e (\hat \beta^e_S)}
+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S)
\nonumber \\
&\quad - \xi_b(S_\inv) - \sigma_\total^2
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv).
\end{align}

Now, we have $\cR^e(\beta_S^{\mathhl{e}}) - \cR^e(\beta^{\mathhl e}_\inv) = \sum_{i \in S_\inv \setminus S} \gamma_i \cdot x_{\inv, i} ^2$, so that
\[\mathhl{\xi_b(S)} 
= \sum _e \bs{\hat \cR^e (\hat \beta^e_S) - \cR^e(\beta^e_\inv) - \cR^e(\beta^e_S) + \cR^e(\beta^e_\inv)}
= \sum _e \hat \cR^e (\hat \beta^e_S) - \sigma_\total^2 - \ds{\cE}\sum_{i \in S_\inv \setminus S} \gamma_i x_{\inv, i} ^2 .\]
We again use the definitions in \Cref{assn: gamma} to lower bound $\gamma_i$ and $x_{\inv,i}$ with high probability. 
\begin{align}
\hat \cL_{\mm} (S) - \hat \cL_\mm (S_{\inv})
&\ge \ps{\mathhl{\xi_b(S)} + \ds{\cE}\cdot \sigma_\inv^2
+ \ds{\cE} \sum_{i \in S_\inv \setminus S}\gamma_i x_{\inv,i}^2} \nonumber \\
&\quad+ \rho \xi_a(S) - \rho\xi_b(S) + \rho\xi_c(S) \nonumber \\
&\quad \mathhl{- \xi_b(S_\inv) -  \sigma_\total^2}
\nonumber \\
&\quad
- \rho \xi_a(S_\inv) + \rho \xi_b(S_\inv) - \rho \xi_c(S_\inv),
\\
&\ge -(2\rho + 1) \ds{\xi_b} + \ds{\cE} \cdot \bar \gamma \cdot \sigma_0^2.
\label{eqn:no-spu-diff}
\end{align}
Because the final expression in \cref{eqn:no-spu-diff} remains the same as for the single-task case, we arrive at the same final sample complexities:
\begin{equation}
    n > 
     \max\cs{
        \frac{\rho  \sgnorm
                d_\inv^{1.5}
                \lambda_{\max}^e
                \log \ps{\frac{1}{\delta_b}}
        }
        {\bar \gamma \cdot \sigma_0^2} 
        ,
        \frac{\rho^2 d_s d_\inv 
            \ps{
                d_\inv + \Ds{\epsilon_\inv}_{\psi_2} ^2
                \log\ps { \frac{1}{\delta_b}}
            }
        }
        {\bar \gamma^2 \cdot \sigma_0^4} 
    }
\end{equation}
Combining \cref{eqn: first complexity } and \cref{second complexity} results in a bound $n > Q_1 + Q_2 \log (d/\delta)$. 

\begin{equation}
    \label{q1}
    Q_1 = O\ps{
        \rho^2 d_sd_\inv^2
        \ps{  \frac{1}{\Ds{\epsilon_\inv}_{\psi_2}^4} + \frac{1}{\bar \gamma^2 \cdot \sigma_0^4} } 
    }.
\end{equation}
\begin{equation}
    \label{q2}
    Q_2 = O\ps{
        \ps{
        \rho  d_\inv^{1.5} \lambda_{\max}^e
        \ps{  
             % \frac{1}{\sgnorm} 
            1
            + \frac{\sgnorm}{\bar \gamma \cdot  \sigma_0^2} 
        } 
        % \log \ps{\frac{1}{\delta_b}}
        + 
        \rho^2 d_s d_\inv  
        \ps{  
            \frac{1}{\sgnorm} 
            % 1
            + \frac{\sgnorm}{\bar \gamma^2 \cdot  \sigma_0^4} 
        }
        }
         \log \ps{\frac{1}{\delta_b}}
    }.
\end{equation}
% from the derivation in \cite{zhouSparseInvariantRisk2022}.
Because $\ds{\cS_{\text{with-spu}} \cup \cS_{\text{no-spu}}} = d^{d_\inv}$, taking the union bound over all possible footprints incurs an extra $O(d_\inv \log \frac{d}{\delta})$ factor. The final complexity in terms of the dimensions is $O\ps{d_\inv^{2.5}d_s \log \frac{d}{\delta}}$.
\end{proof}




\subsection{Remarks on IFM}
\jdcomment{Remove in final draft}
While brainstorming how to make the MTL setting work, I investigated some other paths before deciding on changing the objective (and not the penalty expression). This section summarizes those ideas, as well as the initial problem that motivated them.

The primary problem with the existing L0 analysis is that it depends on analyzing the loss of a featurizer, not of a classifier. In other words, we know how to analyze $\hat \cL_\mm(S)$, not $\hat \cL_\mm (\vv)$. When applied to the multitask setting, it reduces to finding $E$ different columns $V_{:,j}$ independently.

If we stick with the $\hat \cL_\mm(S)$ analysis, we are now just minimizing
\[\sum_{e\in \cE}\min_{V^e \in \Sp(S)} \hat \cR^e(V^e).\]
In this case, there are two paths going forward.
\begin{itemize}
    \item First, just ignore this. Can we prove that $\hat \cL _{\fmm} (S_\inv)  <  \hat \cL _{\fmm} (S)$ for all $S  \ne S_\inv$? Just set up the $\alpha^e_i$ values such that this holds true. If some environments do not have a tiny $\alpha^e$ available to them, they will not be able to use the trick answer of putting 0s on all features except for the one highly correlated spurious feature. In other words, at least one environment must depend on the invariant features.
    \item Second, apply an additional constraint to our optimization. The smallest absolute entry, $|V_{i,j}|$, is not allowed to be too small. This involves optimizing in a weird region; not sure how the optimization will hold up.
    \begin{figure}[h]
    \centering
    \includegraphics[width=4cm]{images/mtl-region.png}
    \end{figure}
\end{itemize}

\jdcomment{This is the one I went forward with}
On the other hand, we could change the problem to be closer to ``classical IRM but with noise,'' which is 
\begin{equation}
    \hat \cL_\mm(S) \coloneqq 
    \min_{\vv \in \Sp(S)}
    \sum_{e \in \cE_{tr}} 
    \min_{\vv^e \in \Sp(S)}
    \hat \cR^e(\vv^e) 
    + \rho\sum_{e\in \cE_{tr}} 
    \left[\hat \cR^e(\vv)-\hat \cR^e\left(\vv^e \right)\right].
\end{equation}

In other words, keep the same penalty, but change the main objective to be the environmental risk instead. 

% \begin{theorem}
% \label{thm:l21}
% Given $n_{e_1} = n_{e_2} =n $ samples, together with assumptions in \Cref{sec:assumptions}, we can say with probability at least $1-\delta$ that
% IFM is able to find the invariant-feature optimal classifier. 
% \begin{equation}
%     \hat \cL _{\fmm} (V^*_\inv)  <  \hat \cL _{\fmm} (W); W \in \Sp(S), S  \ne S_\inv .
% \end{equation}
% for all $\Ds{S}_{2,1} \le c\Ds{S_\inv}_{2,1} = d_\inv$. 
% \end{theorem}


