% \subsection{Configurations}
% \begin{itemize}
%     \item Tried pruning 5, 10, 15 ... 195 values from the last layer of resnet18
%     \item Corresponds to 0.9 \% ... 38\% density in Probmask (theirs)
%     \item MLP details
% \end{itemize}
\label{appx:experiments}
The hyperparameters used for the experiments in \Cref{sec:experiments} are included below. 
Starred hyperparameters were tuned via grid search.
The remaining hyperparameters are kept from prior work \citep{arjovsky2020invariant,zhouSparseInvariantRisk2022}.
Hyperparameters for the SparseIRM + PM method, which are not included in \Cref{tab:hparams}, are taken from \citet{zhouSparseInvariantRisk2022}.
% O

\begin{table*}[h]
\begin{center}
\caption{Hyperparameter configurations for experiments.}
\label{tab:hparams}
\begin{tabular}{llll}
\toprule
Dataset                        & 2-CMNIST & 10-CMNIST & MNISTCIFAR \\
Model                          & MLP390   & MLP390    & ResNet-18  \\
\midrule
GPUs (NVIDIA A100)             & 1        & 1         & 1          \\
Epochs                         & 1500     & 1500      & 50         \\
Optimizer                      & Adam     & Adam      & Adam       \\
Learning Rate                  & 0.0004   & 0.001     & 0.001      \\
IRMv1 Penalty Weight           & 10000    & 10000     & 10000      \\
IRMv1 Anneal Start Epoch       & 200      & 200       & 13         \\
Learning Rate Scheduler        & Cosine   & Cosine    & Cosine     \\
\# Zeroed Weights (last layer)*& 40       & 40        & 60         \\
IHT starting epoch*            & 1200     & 1200      & 46         \\
Updates between IHT projection*& 5        & 5         & 5          \\
\bottomrule
\end{tabular}
\end{center}
\end{table*}

\subsection{Datasets}
Correlation tuples, class counts, and noise levels used to construct the IRM datasets are listed in \Cref{tab:dataset-config}.
% \jdcomment{dataset configuartions (noise \%, correlation tuples. Cite these; most were taken from \citep{zhouSparseInvariantRisk2022})}
\begin{table*}[h]
\begin{center}
\caption{Dataset configurations for experiments.}
\label{tab:dataset-config}
\begin{tabular}{llll}
\toprule
                  & 2-CMNIST        & 10-CMNIST         & MNISTCIFAR        \\
\midrule
Number of Classes & 2               & 10                & 2                 \\
Correlation Tuple & (0.9, 0.8, 0.1) & (0.999, 0.7, 0.1) & (0.999, 0.7, 0.1) \\
Noise             & 25\%            & 20\%              & 10\%            \\
\bottomrule
\end{tabular}
\end{center}
\end{table*}

Values reported with an error bar are given as mean $\pm$ one standard deviation; e.g., in $62.44 \pm 0.96$, 62.44 is the mean and 0.96 is one standard deviation above and below it.

% made by grouping MNIST digits 0-4 to one class and 5-9 to the other, with two training environments with different color configurations.  For both CMNIST variations, the oracle is simply trained on the grayscale (original MNIST) image.


% The label is the label of the CIFAR-10 image, and the spurious feature is the MNIST image . We choose two classes from CIFAR-10 and two digits from MNIST to create this binary dataset.
% For MNISTCIFAR, the oracle is trained on the CIFAR-10 with the space normally taken by MNIST whited out. 

\subsection{Tuning the sparsity}
In practice, we do not have access to $d_\inv$ when training a model with Sparse IRM.
The table below reports performance when the value used for $d_\inv$ is perturbed.
 
\begin{table}[h]
\centering
\begin{tabular}{llll}
\toprule
{Perturbation to $d_{\text{inv}}$ (\%)} & {Train Accuracy (\%)} & {Test Accuracy (\%)} & {$\ell_1$ Norm} \\
\midrule
-5 & 59.61 $\pm$ 0.32 & 56.98 $\pm$ 0.27 & 5.17 $\pm$ 1.43 \\
-2 & 62.11 $\pm$ 0.51 & 59.05 $\pm$ 0.43 & 6.07 $\pm$ 1.05 \\
+0 & 63.39 $\pm$ 0.55 & 60.94 $\pm$ 0.46 & 5.79 $\pm$ 4.05 \\
+2 & 59.41 $\pm$ 0.52 & 57.36 $\pm$ 0.42 & 7.97 $\pm$ 4.05 \\
+5 & 60.34 $\pm$ 0.80 & 58.03 $\pm$ 0.63 & 6.98 $\pm$ 3.18 \\
\bottomrule
\end{tabular}
\caption{Performance metrics across different perturbations to $d_{\text{inv}}$.}
\label{tab:sparsity-perturbation}
\end{table}
