% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsfonts} %mathbb
\usepackage{amsmath}
% \R: blackboard-bold R, the set of real numbers.
% Defined as an ordinary symbol macro; \DeclareMathOperator would typeset it
% as a function name (operator spacing), which is wrong for a set symbol.
\newcommand*{\R}{\mathbb{R}}
% Subfigure and subcaption
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amssymb}
% Definition
\usepackage{amsthm}
\usepackage{bm}
% \theoremstyle{definition}
% Theorem-like environments (amsthm). Everything declared below uses the
% "plain" theorem style selected here.
\theoremstyle{plain}
% "Metric": own counter, numbered within (and reset at) each section.
\newtheorem{metric}{Metric}[section]
% "Theorem": own counter, numbered within each section.
\newtheorem{theorem}{Theorem}[section]
% "Corollary": own counter, numbered within (and reset at) each theorem.
\newtheorem{corollary}{Corollary}[theorem]
% "Lemma" and "Definition" share the theorem counter, so theorems, lemmas and
% definitions are numbered in one common sequence per section.
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
\usepackage{cleveref}



% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 

% \externaldocument{introduction}
% \externaldocument{backgorund}
% \externaldocument{method}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory
% arguments in reversed order with the optional separator (default "-")
% between them.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% xr for overleaf
%----Helper code for dealing with external references----
% (by cyberSingularity at http://tex.stackexchange.com/a/69832/226)

\usepackage{xr}
\makeatletter

% \addFileDependency{<file name including extension>}
% Announces <file> to the build tooling so that it is tracked as a dependency:
%   * \typeout{(<file>)} writes the name to the log; latexmk will find this
%     if $recorder=0. In that case latexmk ignores <file> if it is an
%     existing .aux or .pdf file etc.; if it does not exist, it appears in
%     the list of dependents regardless.
%   * \@addtofilelist makes <file> appear in the \listfiles output (not
%     really necessary; latexmk does not use this).
%   * If <file> does not exist (yet), the message "No file <file>." is
%     written, which latexmk will find.
% Every line of the body ends in % so that the macro expands to no stray
% space tokens, wherever it is called.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name>}
% Makes the labels of the external document <base name> available via xr's
% \externaldocument, then registers <base name>.tex and <base name>.aux as
% file dependencies through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
%------------End of helper code--------------
% Print figure numbers as plain arabic numerals and start the counter at 5,
% so that the first figure of this document is numbered 6.
% NOTE(review): presumably continues the numbering of the main paper's
% figures -- confirm that the main paper ends at Figure 5.
\renewcommand\thefigure{\arabic{figure}}
\setcounter{figure}{5}

% \externaldocument{main}
\myexternaldocument{gorji_368} % overleaf

% unnumbered footnote
% \nnfootnote{<text>}: footnote without a visible mark or number.
%   * The NoHyper environment (provided by hyperref -- presumably loaded by
%     the document class; confirm) suppresses the hyperlink for this footnote.
%   * \thefootnote is emptied only inside the environment's group, so regular
%     footnotes keep their numbering style.
%   * \footnote steps the footnote counter; \addtocounter (a global
%     operation) steps it back so later footnotes are numbered as if this
%     one did not exist.
% All lines end in % so that no spurious space is typeset around the
% footnote when the macro is used in running text.
\newcommand\nnfootnote[1]{%
  \begin{NoHyper}%
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \end{NoHyper}%
}

\title{A Scalable Walsh-Hadamard Regularizer to Overcome the Low-degree Spectral Bias of Neural Networks (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Ali Gorji$^*$}
\author[1]{Andisheh Amrollahi$^*$}
\author[1]{Andreas Krause}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Department\\
    ETH Zurich\\
    Zurich, Switzerland
}

  
\begin{document}
\maketitle
\nnfootnote{*These authors contributed equally to this work}
\appendix
% \input{appendix}
% % NOTE: necessary when ptmx or nomathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
\section{Walsh-Hadamard transform matrix form }
\label{app:sec:walsh_hadamard}
The Fourier analysis equation is given by:
\[\widehat{g}(f) = \frac{1}{\sqrt{2^n}} \sum\limits_{x \in \{0,1\}^n}  g(x)  (-1)^{\langle f, x \rangle}\] 
Since this transform is linear, it can be represented by matrix multiplication.  
Let $\mathbf{X}\in \{0,1\}^{2^n\times n}$ be a matrix that has the enumeration over all possible $n$-dimensional binary sequences ($\{0,1\}^n$) in some arbitrary but fixed order as its rows.  Assume $\mathbf{g}(\mathbf{X})\in \mathbb{R}^{2^n}$ to be the vector of $g$ evaluated on the rows of $\mathbf{X}$.  We can compute the Fourier spectrum as:
\[
\widehat{\mathbf{g}} = \frac{1}{\sqrt{2^n}}\mathbf{H}_n \mathbf{g}(\mathbf{X})
\] 
where $\mathbf{H}_n\in \{\pm1\}^{2^n\times 2^n}$ is a matrix that is orthogonal up to the scaling factor $\frac{1}{\sqrt{2^n}}$ and is given as follows. Each row of $\mathbf{H}_n$ corresponds to some fixed frequency $f \in \{0,1\}^n$ and the elements of that row are given by $(-1)^{\langle f, x \rangle}, \forall x \in \{0,1\}^n$, where the ordering of the $x$ is the same as the fixed order used in the rows of $\mathbf{X}$. The ordering of the rows in $\mathbf{H}_n$, i.e., the ordering of the frequencies considered, is arbitrary and determines the order of the Fourier coefficients in the Fourier spectrum $\widehat{\mathbf{g}}$.

It is common to define the Hadamard matrix $\mathbf{H}_n \in \{\pm1\}^{2^n\times 2^n}$ through the following recursion:
\begin{equation*}
    \mathbf{H}_n = \mathbf{H}_1 \otimes\mathbf{H}_{n-1},
\end{equation*}
where $\mathbf{H}_1 := \begin{bmatrix}
  1 & 1\\ 
  1 & -1
\end{bmatrix}$, and $\otimes$ is the Kronecker product. We use this in our implementation. This definition corresponds to the ordering similar to $n$-bit binary numbers (e.g., $[0,0,0], [0,0,1], [0,1,0], \dots, [1,1,1]$ for $n=3$) for both frequencies and time (input domain). 

Computing the Fourier spectrum of a network using a matrix multiplication lets us utilize a GPU to efficiently compute the transform and its gradient, and to conveniently apply the back-propagation algorithm. 
% \section{HashWH Details}
\section{Algorithm Details}
\label{app:sec:HashWH_details}

Let $g:\{0,1\}^n \rightarrow \mathbb{R}$ be a pseudo-boolean function with Fourier transform $\widehat{g}$. In the context of our work, this pseudo-boolean function is the neural network function. One can sort the Fourier coefficient of $g$ according to magnitude, from biggest to smallest, and consider the top $k$ biggest coefficients as the most important coefficients. This is because they capture the most energy in the Fourier domain and by Parseval's identity also in the time (original input) domain. It is important to us that these $k$ coefficients $\widehat{g}(f_1), \dots, \widehat{g}(f_k)$ are not hashed into the same bucket. Say for example two large coefficients $\widehat{g}(f_i), \widehat{g}(f_j), i \neq j$ end up in the same bucket, an event which we call a \emph{collision}. If they have different signs, their sum can form a cancellation and the $L_1$ norm will enforce their sum to be zero. This entails an approximation error in the neural network: Our goal is to sparsify the Fourier spectrum of the neural network and ``zero out'' the non-important (small-magnitude) coefficients, not to impose wrong constraints on the important (large magnitude) coefficients. 

With this in mind, we first prove our hashing result Equation~\ref{eq:hash_sum} from Section~\ref{subsec:wht_background}. Next, we provide guarantees on how increasing the hashing bucket size reduces collisions. Furthermore, we show how independently sampling the hashing matrix over different rounds guarantees that each coefficient does not collide too often. Ideas presented there can also be found in \cite{alon1999linear, amrollahi_efficiently_2019}. We finally review \textsc{EN-S} and showcase the superiority and scalability of our method in terms of computation. 

\subsection{Proof of Equation~\ref{eq:hash_sum}}
\label{app:subsec:hash_sum_proof}
Let 
\[u_\mathbf{\sigma}(\tilde{x}) = \sqrt{\frac{2^n}{2^b}} g(\mathbf{\sigma} \tilde{x}), \forall \tilde{x} \in \{0,1\}^b\]
as in Section~\ref{subsec:wht_background}.

We can compute its Fourier transform $\widehat{u}_\mathbf{\sigma}(\Tilde{f})$  as:
\begin{align}
    \widehat{u}_\mathbf{\sigma}(\Tilde{f}) &= \frac{1}{\sqrt{2^b}} \sum_{\tilde{x} \in \{0, 1\}^b} u_\mathbf{\sigma}(\tilde{x})(-1)^{\langle \Tilde{f}, \tilde{x} \rangle} \nonumber\\
    &= \frac{1}{\sqrt{2^b}} \sum_{\tilde{x} \in \{0, 1\}^b}  \sqrt{\frac{2^n}{2^b}} \  g(\mathbf{\sigma} \tilde{x})(-1)^{\langle \Tilde{f}, \tilde{x} \rangle} \nonumber\\
    &= \frac{\sqrt{2^n}}{2^b} \sum_{\tilde{x} \in \{0, 1\}^b}  g(\mathbf{\sigma} \tilde{x})(-1)^{\langle \Tilde{f}, \tilde{x} \rangle} \label{eq:hash_sum_expansion}
\end{align}

Inserting the Fourier expansion of $g$ into Equation~\eqref{eq:hash_sum_expansion} we have:
\begin{align*}
    \widehat{u}_\mathbf{\sigma}(\Tilde{f}) &= \frac{1}{2^b} \sum_{\tilde{x} \in \{0, 1\}^b}  (-1)^{\langle \Tilde{f}, \tilde{x} \rangle}\sum_{f \in \{0,1\}^n}\widehat{g}(f)(-1)^{\langle f, \mathbf{\sigma} \tilde{x} \rangle} \nonumber\\
    &= \frac{1}{2^b} \sum_{\tilde{x} \in \{0, 1\}^b}  \sum_{f \in \{0,1\}^n} \widehat{g}(f)(-1)^{\langle \mathbf{\sigma}^\top f, \tilde{x} \rangle} (-1)^{\langle \Tilde{f}, \tilde{x} \rangle}  \nonumber\\
    & = \frac{1}{2^b} \sum_{f \in \{0,1\}^n} \widehat{g}(f) \sum_{\tilde{x} \in \{0, 1\}^b}  (-1)^{\langle \mathbf{\sigma}^\top f + \Tilde{f}, \tilde{x} \rangle}
\end{align*}
The second summation is always zero unless $\mathbf{\sigma}^\top f + \Tilde{f}=0$, i.e., $\mathbf{\sigma}^\top f=\Tilde{f}$, in which case the summation is equal to $2^b$. Therefore:
\begin{align*}
    \widehat{u}_\mathbf{\sigma}(\Tilde{f}) = \sum_{f \in \{0,1\}^n:\ \mathbf{\sigma}^\top f=\Tilde{f}} \widehat{g}(f)
\end{align*}

\subsection{Collisions for \textsc{HashWH}}
\label{app:subsec:collision_probability}
We first review the notion of \emph{pairwise independent} families of hash functions introduced by \cite{carter1979universal}. We compute the expectation of the number of collisions for this family of hash functions. We then show that uniformly sampling $\sigma \in \{0,1\}^{n\times b}$ in our hashing procedure (in \textsc{HashWH}) gives rise to a pairwise independent hashing scheme. 

\begin{definition}[Pairwise independent hashing]\label{def:pairwise_independent}
Let $\mathcal{H} \subseteq \{h \mid h\colon \{0,1\}^n\rightarrow\{0,1\}^b\}$ be a family of hash functions.  Each hash function maps an $n$-dimensional input $f\in\{0,1\}^n$ into a $b$-dimensional bucket $u=h(f)\in\{0,1\}^b$ and is picked uniformly at random from $\mathcal{H}$. We call this family \emph{pairwise independent} if for any distinct pair of inputs $f_1 \neq f_2\in\{0,1\}^n$ and an arbitrary pair of buckets $u_1, u_2\in\{0,1\}^b$:
\begin{enumerate}
    \item $P(h(f_1)=u_1)=\frac{1}{2^b}$
    \item $P((h(f_1)=u_1) \land (h(f_2)=u_2))=\frac{1}{2^{2b}}$
\end{enumerate}
\end{definition}
(randomness is over the sampling of the hash function from $\mathcal{H}$)

Assume $S=\{f_1, ..., f_k\} \subseteq \{0,1\}^n$ is a set of $k$ arbitrary elements to be hashed using the hash function $h \in \{0,1\}^n\rightarrow\{0,1\}^b$ which is sampled from a pairwise independent hashing family. Let $c_{ij}$ be an indicator random variable for the collision of $f_i, f_j, i \neq j$, i.e., $c_{ij} = \begin{cases}
      1 & h(f_i)=h(f_j) \\
      0 & h(f_i) \neq h(f_j)
    \end{cases}\,$, for $i \neq j \in [k]$.

\begin{lemma}
\label{lemma:collision_count}
The expectation of the total number of collisions $C=\sum_{i \neq j \in [k]} c_{ij}$ in a pairwise independent hashing scheme is given by: 
$\mathbb{E}[C]=\frac{(k-1)^2}{2^{b}}$.
\end{lemma}
\begin{proof}
\begin{align*}
    \mathbb{E}[C]&=\sum_{i \neq j \in [k]} \mathbb{E}[c_{ij}]\\
    &= \sum_{i \neq j \in [k]} \sum_{u \in \{0,1\}^b} P((h(f_i)=u) \land (h(f_j)=u)) \\
    &= \frac{(k-1)^2}{2^b},
\end{align*}
where we have applied the linearity of expectation. 
\end{proof}
The next Lemma shows that the hashing scheme of \textsc{HashWH} introduced in Section~\ref{subsec:Hashwh} is also a pairwise independent hashing scheme. However, there is one small caveat: the hash function always maps $0 \in \{0,1\}^n$ to $0 \in \{0,1\}^b$ which violates property 1 of the pairwise independence Definition \ref{def:pairwise_independent}. If we remove $0$ from the domain then it becomes a pairwise independent hashing scheme. 
\begin{lemma}
\label{lemma:HashWH_pairwise}
The hash function used in the hashing procedure of our method \textsc{HashWH}, i.e., $h(.)=\mathbf{\sigma}^\top(.)$ where $\mathbf{\sigma} \sim \mathcal{U}_{\{0,1\}^{n\times b}}$ is a hashing matrix whose elements are sampled independently and  uniformly at random (with probability $\frac{1}{2}$)  from $\{0,1\}$, is pairwise independent if we exclude $f=0$ from the domain.
\end{lemma}
\begin{proof}
Note that for any input $f\in\{0,1\}^n, f \neq 0$, its hash $\mathbf{\sigma}^\top f$ is a linear combination of columns of $\mathbf{\sigma}^\top$, where $f$ determines the columns. We denote $i$\textsuperscript{th} column of $\mathbf{\sigma}^\top$ by $\mathbf{\sigma}^\top_{\bullet i}$.
Let $f$ be non-zero in $t\geq 1$ positions (bits) $\{i_1, ..., i_t\}\subseteq[n]$. The value of $h(f)$ is equal to the summation of the columns of $\mathbf{\sigma}^\top$ that corresponds to those $t$ positions: $\mathbf{\sigma}^\top_{\bullet i_1}, \cdots, \mathbf{\sigma}^\top_{\bullet i_t}$. Let $u \in \{0,1\}^b$ be an arbitrary bucket. The probability the sum of the columns equals $u$ is $\frac{1}{2^b}$ as all sums are equally likely i.e. 
\[P(h(f) = u) = \frac{1}{2^b}.\]
    
Let $f_1, f_2 \neq 0, f_1 \neq f_2 \in \{0,1\}^n$ be a pair of distinct non-zero inputs. Since $f_1$ and $f_2$ differ in at least one position (bit), $h(f_1)$ and $h(f_2)$ are independent random variables. Therefore, for any arbitrary $u_1, u_2 \in \{0,1\}^b$
\begin{align*}
&P(h(f_1)=u_1 \land h(f_2)=u_2) \\
= &P(h(f_1)=u_1)P(h(f_2)=u_2) = \frac{1}{2^{2b}}
\end{align*}
\end{proof}

Lemmas \ref{lemma:collision_count} and \ref{lemma:HashWH_pairwise} imply that the expected total number of collisions $C$ in hashing frequencies of the top $k$ coefficients of $g$ in our hashing scheme is also equal to: $\mathbb{E}[C] = \frac{(k-1)^2}{2^b}$. Our guarantee shows that the number of collisions goes down linearly in the number of buckets $2^b$. 

Finally, let $f_1$ be an important frequency i.e. one with a large magnitude $|\widehat{g}(f_1)|$. By independently sampling a new hashing matrix $\mathbf{\sigma}$ at each round of back-prop, we avoid always hashing this frequency into the same bucket as some other important frequency. By a union bound on the pairwise independence 
property, the probability that a frequency $f_1$ collides with any other frequency $f_2, \dots, f_k$ is upper bounded by $\frac{k-1}{2^b}$. Therefore, over $T$ rounds of back-prop the number of times this frequency collides follows a binomial distribution with $p \leq \frac{k-1}{2^b}$ ($\frac{k-1}{2^b}<1$ for a large enough $b$). We denote the number of times frequency $f_1$ collides over the $T$ rounds as $C_{f_1}$. The expected number of collisions is $\mu \triangleq Tp$ which goes down linearly in the number of buckets. With a Chernoff bound we can say that roughly speaking, the number of collisions we expect can not be too much larger than a fraction $p$ of the $T$ rounds.

By a Chernoff's bound we have:
\[
P(C_{f_1} \geq (1+\delta )\mu ) \leq e^{-\frac{\delta^{2}\mu}{2+\delta}}
\]
where $\mu=Tp$ as mentioned before.

For example, setting $\delta =1$ gives:
\[
P(C_{f_1} \geq 2 \mu ) \leq e^{-\frac{\mu}{3}}
\]
As $T \rightarrow \infty$ this probability goes to zero. This means that the probability that the frequency collides in more than a fraction $(1+\delta)p=2p$ of the $T$ rounds is, for all practical purposes, essentially zero. Building on this intuition, we can see that for any fixed $0<\epsilon<1$, setting $b=\log_2(\frac{k-1}{\epsilon})$ guarantees that a collision of a given frequency happens on average in at most a fraction $\epsilon$ of the $T$ rounds and not much more. 
% For each bucket $f_u \in \{0, 1\}^b$ with only one frequency $f \in \text{supp}(g_\theta)$ hashed into, $\widehat{u}_\mathbf{\sigma}(f_u)=\widehat{g_\theta}(f)$. Therefore, we can approximate the non-zero part of $\widehat{\mathbf{g_\theta}}$ using $\widehat{\mathbf{u_\mathbf{\sigma}}}$, where the lower number of collisions leads to a better approximation. In case of no collisions, $\|\widehat{\mathbf{u_\mathbf{\sigma}}}\|_1=\|\widehat{\mathbf{g_\theta}}\|_1$, which has the probability of $P(C=0)\geq 1-\frac{k^2}{2^b}$ (Markov's inequality).
\subsection{\textsc{EN-S} details}
\label{app:subsec:EN-S}
To avoid computing the exact Fourier spectrum of the network at each back-propagation iteration in \textsc{FullWH}, \cite{aghazadeh_epistatic_2021} suggest an iterative regularization technique to enforce sparsity in the Fourier spectrum of the network called \textsc{EN-S}. 

We first briefly describe the Alternating Direction Method of Multipliers \citep{boyd_distributed_2011} (ADMM) which is an algorithm that is used to solve convex optimization problems. This algorithm is used to derive \textsc{EN-S}. Finally, we discuss \textsc{EN-S} itself and highlight the advantages of using our method \textsc{HashWH} over it.

\paragraph{ADMM.} Consider the following separable optimization objective:
\begin{align*}
    \min_{\bm{x}\in\mathbb{R}^n,\bm{z}\in\mathbb{R}^m} f(\bm{x}) + g(\bm{z}) \\ \text{subject to  } \mathbf{A}\bm{x}+\mathbf{B}\bm{z}=\mathbf{c},
\end{align*}
where $\mathbf{A}\in\mathbb{R}^{p\times n}$, $\mathbf{B}\in\mathbb{R}^{p\times m}$, $\mathbf{c}\in\mathbb{R}^{p}$, and $f\colon \mathbb{R}^n\rightarrow \mathbb{R}$ and $g\colon \mathbb{R}^m\rightarrow \mathbb{R}$ are arbitrary \emph{convex} functions.
%(w.l.o.g., $\mathbb{R}$ can be replaced by any set of interest in the domains). 
The augmented Lagrangian of this objective is formed as:
\begin{align*}
L_\rho(\bm{x}, \bm{z}, \mathbf{\gamma})=f(\bm{x})+g(\bm{z})+\mathbf{\gamma}^\top (\mathbf{A}\bm{x}+\mathbf{B}\bm{z}-\mathbf{c}) \\+ \frac{\rho}{2} \|\mathbf{A}\bm{x}+\mathbf{B}\bm{z}-\mathbf{c}\|_2^2,
\end{align*}
where $\mathbf{\gamma}\in\mathbb{R}^{p}$ are the dual variables. 

Alternating Direction Method of Multipliers \citep{boyd_distributed_2011}, or in short \emph{ADMM}, optimizes the augmented Lagrangian by alternately minimizing it over the two variables $\bm{x}$ and $\bm{z}$ and applying a dual variable update:
\begin{align}
    \bm{x}^{k+1} &= \operatorname*{argmin}_{\bm{x}\in\mathbb{R}^n} L_\rho(\bm{x}, \bm{z}^k, \mathbf{\gamma}^k) & (\bm{x}\text{-minimization})\nonumber\\
    \bm{z}^{k+1} &= \operatorname*{argmin}_{\bm{z}\in\mathbb{R}^m} L_\rho(\bm{x}^{k+1}, \bm{z}, \mathbf{\gamma}^k) & (\bm{z}\text{-minimization})\nonumber\\
    \mathbf{\gamma}^{k+1} &= \mathbf{\gamma}^{k} + \rho(\mathbf{A}\bm{x}^{k+1}+\mathbf{B}\bm{z}^{k+1}-\mathbf{c})  & (\text{dual var. update}) \nonumber
\end{align}

In a slightly different formulation of ADMM, known as \emph{scaled-dual} ADMM, the dual variable can be scaled which results in a similar optimization scheme:
\begin{align}
    \bm{x}^{k+1} &= \operatorname*{argmin}_{\bm{x}\in\mathbb{R}^n} f(\bm{x}) + \frac{\rho}{2}\|\mathbf{A}\bm{x} + \mathbf{B}\bm{z}^k-\mathbf{c}+\mathbf{\gamma}^k \|^2_2 \nonumber\\
    \bm{z}^{k+1} &= \operatorname*{argmin}_{\bm{z}\in\mathbb{R}^m} g(\bm{z}) + \frac{\rho}{2}\|\mathbf{A}\bm{x}^{k+1} + \mathbf{B}\bm{z}-\mathbf{c}+\mathbf{\gamma}^k \|^2_2 \nonumber\\
    \mathbf{\gamma}^{k+1} &= \mathbf{\gamma}^{k} + \mathbf{A}\bm{x}^{k+1}+\mathbf{B}\bm{z}^{k+1}-\mathbf{c}\label{eq:scaled_dual_ADMM}
\end{align}

Using ADMM, one can decouple the joint optimization of two separable groups of parameters into two alternating separate optimizations for each individual group.


\paragraph{\textsc{EN-S}.}
To apply ADMM, \cite{aghazadeh_epistatic_2021} reformulate the \textsc{FullWH} loss, by introducing a new variable $\bm{z}$ and adding a constraint such that it is equal to the Fourier spectrum:
\begin{align*}
&\mathcal{L}_{EN-S}=\mathcal{L}_{net}+\lambda\|\bm{z}\|_1 \\
&\text{subject to:   } \bm{z}=\widehat{\mathbf{g_\theta}}=\mathbf{H}_n\mathbf{g_\theta}(\mathbf{X})
\end{align*}
where $g_{\bm{\theta}}$ is the neural network parameterized by $\bm{\theta}$, $\mathbf{H}_n\in\{\pm1\}^{2^n\times2^n}$ is the Hadamard matrix, and $\mathbf{X}\in\{0,1\}^{2^n\times n}$ is the matrix of the enumeration over all points on the Boolean cube $\{0,1\}^n$. 

They use the scaled-dual ADMM~\eqref{eq:scaled_dual_ADMM} followed by a few further adjustments to reach the following alternating scheme for optimization of $\mathcal{L}_{EN-S}$:
\begin{align}
    \bm{\theta}^{k+1} &= \operatorname*{argmin}_{\bm{\theta}} \mathcal{L}_{net} + \frac{\rho}{2}\|\mathbf{g_\theta}(\mathbf{X}_T) - \mathbf{H}_T\bm{z}^k+\mathbf{\gamma}^k \|^2_2 \nonumber\\
    \bm{z}^{k+1} &= \operatorname*{argmin}_{\bm{z}} \lambda \|\bm{z}\|_0 + \frac{\rho}{2}\|\mathbf{g}_{\bm{\theta}^{k+1}}(\mathbf{X}_T) - \mathbf{H}_T\bm{z}+\mathbf{\gamma}^k \|^2_2 \nonumber\\
    \mathbf{\gamma}^{k+1} &= \mathbf{\gamma}^{k} + \mathbf{g}_{\bm{\theta}^{k+1}}(\mathbf{X}_T) - \mathbf{H}_T\bm{z}^{k+1}, \label{eq:scaled_dual_ENS}
\end{align}
where $\mathbf{X}_T\in \{0,1\}^{O(2^mn)\times n}$ is the input enumeration matrix $\mathbf{X}\in\{0,1\}^{2^n\times n}$ sub-sampled at $O(2^mn)$ rows, $\mathbf{H}_T\in \{\pm1\}^{O(2^mn)\times 2^n}$ is the Hadamard matrix $\mathbf{H}_n\in\{\pm1\}^{2^n\times 2^n}$ sub-sampled at the same $O(2^mn)$ rows, and $\mathbf{\gamma}\in\mathbb{R}^{O(2^mn)}$ is the dual variable. We will introduce the hash size parameter $m$ momentarily.

Using the optimization scheme \eqref{eq:scaled_dual_ENS}, they decouple the optimization of $\mathcal{L}_{EN-S}$ into two separate alternating optimizations: 1) minimizing $\mathcal{L}_{net}$ by fixing $\bm{z}$ and optimizing network parameters using SGD for an epoch ($\bm{\theta}$-minimization), 2) fixing $\theta$ and computing a sparse Fourier spectrum approximation of the network at the end of each epoch and updating the dual variable ($\bm{z}$-minimization).

To approximate the sparse Fourier spectrum of the network at $\bm{z}$-minimization step, they use the ``SPRIGHT'' algorithm \citep{li_spright_2015}. SPRIGHT requires $O(2^mn)$ samples from the network to approximate its Fourier spectrum and runs with the complexity of $O(2^mn^3)$, where $m$ is the hash size used in the algorithm (the equivalent of $b$ in our setting).
In \textsc{EN-S} optimization scheme \eqref{eq:scaled_dual_ENS}, these $O(2^mn)$ inputs are denoted by the matrix $\mathbf{X}_T\in \{0,1\}^{O(2^mn)\times n}$, and are fixed during the whole optimization process. This requires the computation of the network output on these $O(2^mn)$ inputs at each back-prop iteration in $\bm{\theta}$-minimization, as well as at the end of each epoch to run SPRIGHT in $\bm{z}$-minimization.

\paragraph{\textsc{EN-S} vs. \textsc{HashWH}.}
The hashing done in our method, \textsc{HashWH}, is basically the first step of many (if not all) sparse Walsh-Hadamard transform approximation methods \citep{li2015active,scheibler2015fast,li_spright_2015,amrollahi_efficiently_2019}, including \textsc{SPRIGHT} \citep{li_spright_2015} that is used in \textsc{EN-S}. In the task of sparse Fourier spectrum approximation, further, extra steps are done to infer the \emph{exact} frequencies of the support and their associated Fourier coefficients. These steps are usually computationally expensive. Here, since we are only interested in the $L1$-norm of the Fourier spectrum of the network and are not necessarily interested in retrieving the exact frequencies in its support, we found the idea of approximating it with the $L1$-norm of the Fourier spectrum of our hash function compelling. This is the core idea behind \textsc{HashWH} which lets us stick to the \textsc{FullWH} formulation using a scalable approximation of the $L1$-norm of the network's Fourier spectrum.

From the mere computational cost perspective, \textsc{EN-S} requires a rather expensive sparse Fourier spectrum approximation of the network at the end of each epoch. We realized, one bottleneck of their algorithm was the evaluation of the neural network on the required time samples of their sparse Fourier approximation algorithm. We re-implemented this part on a GPU to make it substantially faster. Still, we empirically observe that more than half of the run time of each \textsc{EN-S} epoch is spent on the Fourier transform approximation. Furthermore, in \textsc{EN-S}, the network output needs to be computed for $\Omega(2^mn)$ samples at each back-prop iteration. 

On the contrary, in \textsc{HashWH}, the network Fourier transform approximation is not needed anymore. We only compute the network output on precisely $2^b$ samples at each round of back-propagation to compute the Fourier spectrum of our sub-sampled neural network. Remember that our $b$ is roughly equivalent to their $m$, since the very first step in their sparse Fourier approximation algorithm is a hashing step into $2^m$ buckets. 

Let us compare our method with \textsc{EN-S} more concretely. For the sake of simplicity, we ignore the network sparse Fourier approximation step ($\bm{z}$-minimization) that happens at the end of each epoch for \textsc{EN-S} and assume their computational complexity is only dominated by the $\Omega(2^m n)$ evaluations made during back-prop. In order to use the same number of samples as \textsc{EN-S}, we can set our hashing size to $b=m+\log_2(n)+c$, where $c$ is a constant which we found in practice to satisfy $c\geq 3$. In the case of our avGFP experiment, this would be for instance $b\geq18$ in \textsc{HashWH} for \textsc{EN-S} with $m=7$. There, we outperformed \textsc{EN-S} using $b\in\{7, 10, 13, 16\}$ in terms of $R^2$-score. Note that even with $b=18$ we are still at least two times faster than \textsc{EN-S} as we do not go the extra mile of approximating the Fourier spectrum of the network at each epoch. 

%Finally, we allow for a new hashing matrix at the beginning of each round of back-prop. The \textsc{SPRIGHT} uses a deterministic hash matrix which is not   Therefore we are more robust to collisions also are not constrained to use a fixed sample set to approximate the Fourier spectrum of the network, which makes \textsc{HashWH} statistically more robust against ``harmful'' collisions in the hashing procedure (see Section~\ref{app:subsec:collision_probability}).

% Furthermore, some of ADMM assumptions, for instance, the separability of the parameter groups considered as well as the convexity of the functions and linearity of the constraint with respect to the parameters, do not seem to be met in the formulation of \textsc{EN-S}, as it originally stands. Therefore, we believe it requires more theoretical investigations before guaranteeing the expected results obtained through ADMM.

\section{Datasets}
\label{app:sec:datasets}
We list all the datasets used in the real dataset Section~\ref{sec:exp:real_data}. 
\paragraph{Entacmaea quadricolor fluorescent protein (Entacmaea).}
\cite{poelwijk_learning_2019} study the fluorescence brightness of all $2^{13}$ distinct variants of the Entacmaea quadricolor fluorescent protein, mutated at $13$ different sites. 
They examine the goodness of fit ($R^2$-score) when only using a limited set of frequencies of the highest amplitude. They report that only $1\%$ of the frequencies are enough to describe data with a high goodness of fit ($R^2=0.96$), among which multiple high-degree frequencies exist.
% They report the existence of multiple high-degree frequencies in its support (referred to as high-order epistatic interactions), given the observed data.

\paragraph{GPU kernel performance (SGEMM).}
\cite{nugteren_cltune_2015} measures the running time of a matrix product using a parameterizable SGEMM GPU kernel, configured with different parameter combinations. The input has $14$ categorical features that we one-hot encode into $40$-dimensional binary vectors.

\paragraph{Immunoglobulin-binding domain of protein G (GB1).}
\cite{wu_adaptation_2016} study the ``fitness'' of variants of protein GB1, that are mutated at four different sites. Fitness, in this work, is a quantitative measure of the stability and functionality of a protein variant. Given the $20$ possible amino acids at each site, they report the fitness for $20^4=160,000$ possible variants, which we represent with one-hot encoded 80-dimensional binary vectors. In a noise reduction step, they included $149,361$ data points as is and replaced the rest with imputed fitness values. We use the former, the untouched portion, for our study.

\paragraph{Green fluorescent protein from Aequorea victoria (avGFP).}
\cite{sarkisyan_local_2016} estimate the fluorescence brightness of random mutations over the green fluorescent protein sequence of Aequorea victoria (avGFP) at 236 amino acid sites. We transform the data into the boolean space of the absence or presence of a mutation at each amino acid site by averaging the brightness for the mutations with similar binary representations. This converts the original $54,024$ distinct amino acid mutations into $49,089$ 236-dimensional binary data points.

\section{Implementation technical details}
\label{app:sec:technical_details}
\paragraph{Neural network architecture and training}
We used a 5-layer fully connected neural network including both weights and biases and LeakyReLu as activations in all settings. For training, we used MSE loss as the loss of the network in all settings. We always initialized the networks with Xavier uniform distribution.  We fixed 5 random seeds in order to make sure the initialization was the same over different settings. The Adam optimizer with a learning rate of $0.01$ was used for training all models. We always used a single Nvidia GeForce RTX 3090 to train each model to be able to fairly compare the runtime of different methods. 
%We use early stopping for picking the model using which we report the test $R^2$ (details in the following data split segment).
% We run all synthetic experiments done for Fourier spectrum evolution experiments for 500 epochs (we reported up to 300 epochs in the figures for the sake of clearer visualization)
We did not utilize other regularization techniques such as Batch Normalization or Dropout to limit our studies to analyze the mere effect of Fourier spectrum sparsification. We use networks of different widths in different experiments which we detail in the following:
\begin{itemize}
    \item \textit{Fourier spectrum evolution:} The architecture of the network is $10 \times 100 \times 100 \times 10 \times 1$.
    \item \textit{High-dimensional synthetic data:} For each $n \in \{25, 50, 100\}$, the architecture of the network is $n \times 2n \times 2n \times n \times 1$.
    \item \textit{Real data:} Assuming $n$ to be the dimensionality of the input space, we used the network architecture of $n \times 10n \times 10n \times n \times 1$ for all the experiments except avGFP. For avGFP with $n=236$, we had to down-size the network to $n \times n \times n \times n \times 1$ to be able to run \textsc{EN-S} on GPU as it requires a significant amount of samples to compute the Fourier transform at each epoch in this dimension scale.
\end{itemize}

\paragraph{Data splits}
In the Fourier spectrum evolution experiment, where we do not report $R^2$ of the predictions, we split the data into training and validation sets (used for hyperparameter tuning). For the rest of the experiments, we split the data into three splits: training, validation, and test sets. We use the validation set for the hyperparameter tuning (mainly the regularizer multiplier $\lambda$ and details to be explained later) and early stopping. We stop each training after 10 consecutive epochs without any improvements over the best validation loss achieved and use the epoch with the lowest validation loss for testing the model. All reported $R^2$ values are the performance of the model on the (hold-out) test set.

% We always continued the experiments until the stop due to no improvement happens.

For each experiment, we used different training dataset sizes that are explicitly mentioned in the main body of the paper. Here we list the validation and test dataset sizes:
\begin{itemize}
    \item \textit{Fourier spectrum evolution:} Given that $n=10$ and the Boolean cube is of size $2^n=1024$, we always use the whole data and split it into training and validation sets. For example, for the training set of size $200$, we use the rest of the $824$ data points as the validation set.
    \item \textit{High-dimensional synthetic data:} For each training set, we use validation and test sets of five times the size of the training set. That is, for a training set of size $c \cdot 25n$, both of our validation and test sets are of size $c \cdot 125n$.
    \item \textit{Real data:} After taking out the training points from the dataset, we split the remaining points into two sets of equal size, one for validation and one for testing.
    %Given $2^{13}=8192$ data points in the Entacmaea dataset, we divide the data not used in the training set into two equal-size sets of validation and test for this dataset. For example for the training set of size $200$, validation and test sets would be of size $3996$. For the rest of the datasets, we use validation and test sets of size $20,000$.
\end{itemize}

\paragraph{Hyper-parameter tuning.}
In all experiments, we hand-picked candidates for important hyper-parameters of each method studied and tested every combination of them, and picked the version with the best performance on the validation set.
This includes testing different $\lambda \in \{0.0001, 0.001, 0.01, 0.1\}$ for HashWH, $\lambda \in \{0.01, 0.1, 1\}$ and $\rho \in \{0.001, 0.01, 0.1\}$ for \textsc{EN-S}, and $\lambda \in \{0.01, 0.1, 1\}$ for \textsc{FullWH}. Furthermore, we also used the following hyper-parameters for the individual experiments:
\begin{itemize}
    \item \textit{Fourier spectrum evolution:} We used $b\in\{5, 7, 8\}$ for HashWH and $m=5$ for the \textsc{EN-S}. We did not tune $b$ for HashWH as we reported all the results in order to show the graceful dependence with increasing the hashing matrix size. 
    \item \textit{High-dimensional synthetic data:} We used $b\in\{7, 10, 13\}$ for HashWH and $m=7$ for the \textsc{EN-S}. We did not tune $b$ for HashWH as we reported each individually.
    \item \textit{Real data:} We used $b\in\{7, 10, 13\}$ for HashWH and $m=7$ for \textsc{EN-S} in the Entacmaea, SGEMM, and GB1 experiments. Furthermore, for avGFP, we also considered $b=16$ for HashWH. Unlike the synthetic experiments, where we reported results for each $b$ individually, we treated $b$ as a hyper-parameter in real data experiments. For Lasso, we tested different L1 norm coefficients of $\lambda \in \{10^{-5}, 10^{-4}, 10^{-3}, 10^{-2}, 10^{-1}, 1\}$. For Random Forest, we tested different numbers of estimators \texttt{n\_estimators} $\in \{100, 200, 500, 1000\}$, and different maximum depths of estimators \texttt{max\_depth} $\in \{5, 10, 15\}$ for Entacmaea experiments and \texttt{max\_depth} $\in \{10, 20, 30, 40, 50\}$ for the rest of the experiments. We tested the exact same hyper-parameter candidates we considered for Random Forest in our XGBoost models.
\end{itemize}
As is common practice, we always picked the hyper-parameter combination resulting in the minimum loss on the validation set, and reported the model's performance on the test (hold-out) dataset.

%\paragraph{Plots.}
%In the plots representing the Fourier spectrum (e.g. Figures \labelcref{fig:data_freq_heatmap,fig:sc_heatmap}), the spectrums are averaged over multiple runs ($125$ runs for Figure \ref{fig:data_freq_heatmap} and $25$ runs for Figure \ref{fig:sc_heatmap}). For the rest of the plots reporting Test $R^2$ or runtimes, the values are averaged over multiple runs for each specific setting and are accompanied by error bars representing the standard error intervals.

\paragraph{Code repositories.}
All the implementations for the methods as well as the experiments are publicly accessible through \href{https://github.com/agorji/WHRegularizer}{https://github.com/agorji/WHRegularizer}. 
% All the implementations for our methods as well as the experiments will be publicly accessible in the final version of the paper after the double-blind review process is over, by making the GitHub repository of the project publicly available.

For \textsc{EN-S} and \textsc{FullWH} regularizers, we used the implementation shared by \cite{aghazadeh_epistatic_2021}\footnote{https://github.com/amirmohan/epistatic-net}. We applied minor changes so as to compute the samples needed for the Fourier transform approximation in \textsc{EN-S} on GPU, making it run faster and yielding a fairer comparison with our method.

We used the python implementation of \verb|scikit-learn|\footnote{https://scikit-learn.org} for our Lasso and Random Forest experiments. We also used the XGBoost\footnote{https://xgboost.readthedocs.io} python library for our XGBoost experiments.

% \begin{itemize}
%     \item hadamard lambdas used for different methods and experiments
%     \item learning rate and optimizer
%     \item scaling of synthetic dataset
% \end{itemize}

\section{Ablation study details}
\label{app:sec:ablation_details}
To study the effect of the low-degree simplicity bias on generalization on the real-data distribution, we conduct an ablation study by fitting a sparse Fourier transform to two of our datasets. To this end, we fit Random Forest models on the Entacmaea and SGEMM datasets, such that they achieve a test $R^2$ of nearly 1 on an independent test set not used in the training. Then, we compute the exact sparse Fourier transform of each Random Forest model, which essentially results in a sparse Fourier function that has been fitted to the training dataset. In our ablation study, finally, we remove frequencies based on two distinct regimes of lower-amplitudes-first and higher-degrees-first and show that the latter harms the generalization more. This is against the assumption of simplicity bias being always helpful.

In the next two subsections, we provide the details on how to compute the exact sparse Fourier transform of a Random Forest model as well as finer details of the study setup.




\subsection{Fourier transform of (ensembles of) decision trees}
\label{app:subsec:tree_fourier}
% In the previous section, we saw how the Fourier representation provides us with insight in the form of Shapley values, helps to identify \emph{complementary} and \emph{substitutable} features, and how {\em Fourier sparsity} helps for their {\em efficient calculation}. In this section, we go even further and review how decision trees, 
% %convey why it makes sense to compile these representations into decision trees, 
% a class of models celebrated for their interpretability, naturally admit sparse Fourier representations \citep{kushilevitz1993learning}. 

%. This approach is motivated by the fact that that decision trees admit sparse Fourier representations \citep{kushilevitz1993learning}. Therefore, one can expect the compiled decision trees would accurately approximate the Fourier transform and hence the original function. We continue by providing more context on decision trees.
A decision tree, in our context, is a rooted binary tree whose nodes can have either zero or two children. Each leaf node is assigned a real number. Each non-leaf node corresponds to one of $n$ binary features. The tree defines a function $t:\{0, 1\}^n \rightarrow  \mathbb{R}$ in the following way: To compute $t(x)$ we look at the root,  corresponding to, say, feature $i \in [n]$. Next, we check the value of the variable $x_i$. If the value of the variable is equal to $0$ we look at the left child. If it is equal to $1$ we look at the right child. Then we repeat this process until we reach a leaf. The value of the function $t$ evaluated at $x$ is the real number assigned to that leaf. In all that follows, when referring to decision trees, we will denote them by the function $t:  \{0, 1\}^n \rightarrow  \mathbb{R}$. 

Given a decision tree, we can compute its Fourier transform recursively. Let $i \in [n]$ denote the feature corresponding to its root. Then the tree can be represented as follows:
\begin{equation} 
\label{eq:recursive_tree}
t(x) = \frac{1+(-1)^{\langle e_i, x \rangle}}{2} t_{\mathrm{left}}(x) + \frac{1-(-1)^{\langle e_i, x \rangle}}{2} t_{\mathrm{right}}(x)
\end{equation}

Hereby, $t_{\mathrm{left}}:\{0,1\}^{n-1}\rightarrow \R$ and $t_{\mathrm{right}}:\{0,1\}^{n-1} \rightarrow \R$ are the left and right sub-trees respectively. Therefore, one can recursively compute the Fourier transform of a decision tree. 
% or even an ensemble of trees (e.g., Random Forest) very efficiently. 

This also portrays why a decision tree of depth $d$ is a function of degree $d$. Moreover, for each tree $t$, if $|\text{supp}(t_{\mathrm{left}})|=k_{\mathrm{left}}$ and $|\text{supp}(t_{\mathrm{right}})|=k_{\mathrm{right}}$, then $|\text{supp}(t)| \leq 2(k_{\mathrm{left}}+k_{\mathrm{right}})$. This implies that a decision tree is $k$-sparse with $k=O(4^d)$. However, in many cases, when the decision tree is not complete or cancellations occur, the Fourier transform is even sparser.

Finally, we can also compute the Fourier transform of an ensemble of trees such as one produced by the random forest and XGBoost algorithms. In the case of regression, the ensemble just predicts the average prediction of its constituent trees.  Therefore its Fourier transform is the (normalized) sum of the Fourier transforms of its trees as well. If a random forest model consists of $T$ different trees then its Fourier transform is $k=O(T 4^d)$-sparse and of degree equal to its maximum depth. 
% This often occurs, e.g., in the case of classification  where labels (leaf values) are zero and one. Nevertheless, in all cases $k=O(4^d)$, which is polynomial in the size of the tree, since a tree of depth $d$ contains at most $2^d$ nodes.     

\subsection{Ablation study setup}
For the Entacmaea dataset, we used a training set of size $5{,}000$ and a test set of size $2{,}000$, for which we trained a Random Forest model with $100$ trees with a maximum depth of $7$. For the SGEMM dataset, we used a training set of size $100{,}000$ and a test set of size $5{,}000$, for which we trained a Random Forest model with $100$ trees with a maximum depth of $10$.

\section{Extended experiment results}
\label{app:extended_results}
% Uncomment figures last to avoid reducing the compilation speed ...
%In the main body of the article, for each plot, we usually reported one version out of all variations of a similar experiment that are different in traits such as the train size used or the version of the random synthetic function considered. 
Here, we report the extended experiment results containing variations not reported in the main body of the paper.

\subsection{Fourier spectrum evolution}
\label{app:subsec:evolution_detailed}
We randomly generated five synthetic target functions $g^*:\{0,1\}^{10}\rightarrow\mathbb{R}$ of degree $d=5$, each having a single frequency of each degree in its support (the randomness is over the choice of support). We create a dataset by randomly sampling the Boolean cube.
Figure~\ref{fig:full_data_freq_heatmap} shows the evolution of the Fourier spectrum of the learned neural network function for different methods over training on datasets of multiple sizes ($100, 200, 300, 400$) limited to the target support. This is the extended version of Figure~\ref{fig:data_freq_heatmap}, where we only reported results for the train size of $200$. We observe that, quite unsurprisingly, each method shows better performance when trained on a larger training set in terms of converging at earlier epochs and also converging to the true Fourier amplitude it is supposed to. It can also be observed that the Fourier-sparsity-inducing (regularized) methods are \emph{always} better than the standard neural network in picking up the higher-degree frequencies, regardless of the training size.

Figure~\ref{fig:full_sc_heatmap} goes a step further and shows the evolution of the full Fourier spectrum (not just the target frequencies) over the course of training. Here, unlike the previous isolated setting where we were able to aggregate the results from different target functions (because of always having a single frequency of each degree in the support), we have to separate the results for each target function $g^*:\{0,1\}^{10}\rightarrow\mathbb{R}$, as each has a unique set of frequencies in its support.
In Figure~\ref{fig:sc_heatmap}, we reported the results for one version of the target function $g^*$ and Figure~\ref{fig:full_sc_heatmap} shows the Fourier spectrum evolution for the other four. We observe that in addition to the spotted inability of the standard neural network in learning higher-degree frequencies, it seems to start picking up erroneous low-degree frequencies as well.

To quantitatively validate our findings, in Figure~\ref{fig:full_function_error}, we show the evolution of Spectral Approximation Error (SAE) during training on both target support and the whole Fourier spectrum. This is an extended version of Figure~\ref{fig:function_error}, where we report the results for the train size of $200$. Here we also include results when using training datasets of three other train sizes $\{100, 300, 400\}$. We observe that even though the standard neural network exhibits comparable performance to \textsc{HashWH} on the target support when the training dataset size is $100$ and $400$, it is always underperforming \textsc{HashWH} when broadening our view to the whole Fourier spectrum, regardless of the train size and the hashing size. 

From a more fine-grained perspective, in Figure~\ref{fig:full_degree_split_function_error}, we categorize the frequencies into subsets of the same degree and show the evolution of SAE and energy on each individual degree. This is an extended version of Figure~\ref{fig:degree_split_function_error}, where we reported the results for the training dataset size of $200$. Firstly, we observe that using more data aids the standard neural network to eventually put more energy on higher-degree frequencies. But it is still incapable of appropriately learning higher-degree frequencies. Fourier-sparsity-inducing methods, including ours, show significantly higher energy in the higher degrees. Secondly, no matter the train size, we note that the SAE on low-degree frequencies first decreases and then increases and the standard neural network starts to overfit. This validates our previous conclusion that the standard neural network learns erroneous low-degree frequencies. Our regularizer prevents overfitting in lower degrees. Its performance can be scaled using the hashing size parameter $b$.

\subsection{High-dimensional synthetic data}
\label{app:subsec:synthetic_detailed}
Figure~\ref{fig:full_synthetic_large} shows the generalization performance of different methods in learning a synthetic degree $d=5$ function $g^*:\{0,1\}^n\rightarrow\mathbb{R}$, for $n\in\{25,50,100\}$, using train sets of different sizes ($c\cdot25n, c\in[8]$). For each $n$ we sample three different draws of $g^*$. This is the extended version of Figure~\ref{fig:synthetic_large}, where we only reported the results for the first draw of $g^*$ for each input dimension $n$. Our regularization method, \textsc{HashWH}, outperforms the standard network and \textsc{EN-S} in all possible combinations of input dimension and dataset sizes, regardless of the draw of $g^*$. We observe that increasing $b$ in \textsc{HashWH}, i.e., increasing the number of hashing buckets, almost always improves the generalization performance. \textsc{EN-S}, on the other hand, does not show significant superiority over the standard neural network other than marginally outperforming it in a few cases when $n=25$. This does not match its performance in the previous section and conveys that it is not able to perform well when increasing the input dimension, i.e., having more features in the data.

To both showcase the computational scalability of our method, \textsc{HashWH}, and compare it to \textsc{EN-S}, we show the achievable performance by the number
of training epochs and training time in Figures~\labelcref{fig:full25_runtime_synthetic_large,fig:full50_runtime_synthetic_large,fig:full100_runtime_synthetic_large}, for all train set sizes and input dimensions individually and limited to the first draw of $g^*$ for each input dimension. This is the extended version of Figure~\ref{fig:runtime_synthetic_large} where we only reported it for $n=50$ and the sample size multiplier $c=5$. We consistently see that the trade-off between the generalization performance and the training time can be directly controlled in \textsc{HashWH} using the parameter $b$. Furthermore, \textsc{HashWH} is able to \emph{always} exhibit a significantly better generalization performance in remarkably less time, in all versions of $b$ tested. This emphasizes the advantage of our method in not directly computing the approximate Fourier spectrum of the network, which resulted in this gap with \textsc{EN-S} in the run time, that increases as the input dimension $n$ grows.

\subsection{Real data}
\label{app:subsec:real_detailed}
Figure~\ref{fig:full_performance_real_data} shows the generalization performance and the training time of different methods, including relevant machine learning benchmarks, in learning four real datasets. It is the extended version of Figure~\ref{fig:score_real_data}, where we only reported the generalization performance and not the training time. The training time for neural nets is considered to be the time until overfitting occurs i.e. we do early stopping. \emph{In addition} to superior generalization performance of our method, \textsc{HashWH}, in most settings, again, we see that it is able to achieve it in significantly less time than \textsc{EN-S}. \textsc{Lasso} is the fastest among the methods but usually shows poor generalization performance.

\begin{figure*}[h]
    \centering
    \begin{subfigure}[b]{0.75\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/data_freq_heatmap_n10_d5_size1.pdf}
         \caption{$\text{Train size}=100$}
     \end{subfigure}
    \begin{subfigure}[b]{0.75\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/data_freq_heatmap_n10_d5_size2.pdf}
         \caption{$\text{Train size}=200$}
     \end{subfigure}
    \begin{subfigure}[b]{0.75\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/data_freq_heatmap_n10_d5_size3.pdf}
         \caption{$\text{Train size}=300$}
     \end{subfigure}
    \begin{subfigure}[b]{0.75\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/data_freq_heatmap_n10_d5_size4.pdf}
         \caption{$\text{Train size}=400$}
     \end{subfigure}
     \caption{Evolution of the Fourier spectrum during training limited to the target support, using training sets of different sizes. All synthetic functions have single frequencies of each degree in their support that are all given the amplitude of $1$. This is an extended version of Figure~\ref{fig:data_freq_heatmap}, where we only reported the results for the train set size $200$. It can be observed that the Fourier-sparsity-inducing (regularized) methods are \emph{always} better than the standard neural network in picking up the higher-degree frequencies, regardless of the training size. Each method shows better performance when trained on a larger training set in terms of converging at earlier epochs and also converging to the true Fourier amplitude it is supposed to.}
     \label{fig:full_data_freq_heatmap}

\end{figure*}

\begin{figure*}[h]
    \centering
\begin{subfigure}[b]{0.8\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/sc_heatmap_n10_d5_size2_seed1.png}
     \end{subfigure}
\begin{subfigure}[b]{0.8\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/sc_heatmap_n10_d5_size2_seed2.png}
     \end{subfigure}
\begin{subfigure}[b]{0.8\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/sc_heatmap_n10_d5_size2_seed3.png}
     \end{subfigure}
\begin{subfigure}[b]{0.8\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/spectrum/sc_heatmap_n10_d5_size2_seed4.png}
     \end{subfigure}
     \caption{Evolution of the Fourier spectrum in learning a synthetic function $g^*:\{0,1\}^{10}\rightarrow\mathbb{R}$ of degree $5$ during training, categorized by frequency degree. All synthetic functions used have single frequencies of each degree in their support that are all given the amplitude of $1$. We reported the results for one draw of $g^*$ in Figure~\ref{fig:sc_heatmap} and the four others here, for the training dataset size of $200$. In addition to being incapable of learning high-degree frequencies, the standard neural network tends to consistently pick up wrong low-degree frequencies. Both of the problems are remedied through our regularizer.}
     \label{fig:full_sc_heatmap}
\end{figure*}

\begin{figure*}[h]
  \centering
        \begin{subfigure}[b]{0.9\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/degree_split_function_error_n10_d5_size1.pdf}
         \caption{$\text{Train size}=100$}
     \end{subfigure}
     % \begin{subfigure}[b]{0.9\linewidth}
     %     \centering
     %        \includegraphics[width=\linewidth]{plots/spectrum/degree_split_function_error_n10_d5_size2.pdf}
     %     \caption{$\text{Train size}=200$}
     % \end{subfigure}
     \begin{subfigure}[b]{0.9\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/degree_split_function_error_n10_d5_size2.pdf}
         \caption{$\text{Train size}=200$}
     \end{subfigure}
     \begin{subfigure}[b]{0.9\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/degree_split_function_error_n10_d5_size3.pdf}
         \caption{$\text{Train size}=300$}
     \end{subfigure}
  \caption{Evolution of the Spectral Approximation Error (SAE) and energy of the network during training, categorized by frequency degree (continued in the next page).}
\end{figure*}
\begin{figure*}[t!]\ContinuedFloat
    \centering
     \begin{subfigure}[b]{0.9\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/degree_split_function_error_n10_d5_size4.pdf}
         \caption{$\text{Train size}=400$}
     \end{subfigure}
      \caption{Evolution of the Spectral Approximation Error (SAE) and energy of the network during training, categorized by frequency degree. This is an extended version of Figure~\ref{fig:degree_split_function_error}, where we only reported results for training dataset size $200$. Firstly, in a standard neural network, the energy is mostly put on low-degree frequencies as compared to the high-degree frequencies. The energy slightly shifts towards high-degree frequencies when increasing the training dataset size. Our regularizer facilitates the learning of higher degrees in all cases. 
      Secondly, over the lower-degree and regardless of the train size, the standard neural network's energy continues to increase while the SAE first decreases then reverts and increases. This shows that the standard neural network emphasizes energy on erroneous low-degree frequencies and overfits. Our regularizer prevents overfitting in lower degrees.}
      \label{fig:full_degree_split_function_error}
\end{figure*}

\begin{figure*}[h]
  \centering
      \begin{subfigure}[b]{0.49\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/function_error_n10_d5_size1.pdf}
         \caption{$\text{Train size}=100$}
     \end{subfigure}
      \begin{subfigure}[b]{0.49\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/function_error_n10_d5_size2.pdf}
         \caption{$\text{Train size}=200$}
     \end{subfigure}
      \begin{subfigure}[b]{0.49\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/function_error_n10_d5_size3.pdf}
         \caption{$\text{Train size}=300$}
     \end{subfigure}
      \begin{subfigure}[b]{0.49\linewidth}
         \centering
            \includegraphics[width=\linewidth]{plots/spectrum/function_error_n10_d5_size4.pdf}
         \caption{$\text{Train size}=400$}
     \end{subfigure}
     \caption{Evolution of the spectral approximation error (SAE) during training. The left plot limits the error to the target support, while the right one considers the whole Fourier spectrum. This is an extended version of Figure~\ref{fig:function_error}, where we only reported results for train size $200$. Compared to our method, the standard neural network is able to achieve a lower, i.e., better (train size $100$), or somewhat similar (train size $400$) SAE on the \emph{target support}. However, our method always achieves lower SAE on the \emph{whole Fourier spectrum}, regardless of $b$ used. This shows how our regularization method is effective in preventing the network from learning the wrong frequencies that are not in the support.}
  \label{fig:full_function_error}
\end{figure*}



% ---------------- High dimensional synthetic ----------------------

\begin{figure*}[h]
    \centering
     \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n25_seed3.pdf}
         \caption{$n=25$, first draw of $g^*$}
     \end{subfigure}
     \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n25_seed2.pdf}
         \caption{$n=25$, second draw of $g^*$}
     \end{subfigure}
    \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n25_seed1.pdf}
         \caption{$n=25$, third draw of $g^*$}
     \end{subfigure}
     \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n50_seed3.pdf}
         \caption{$n=50$, first draw of $g^*$}
     \end{subfigure}
     \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n50_seed2.pdf}
         \caption{$n=50$, second draw of $g^*$}
     \end{subfigure}
    \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n50_seed1.pdf}
         \caption{$n=50$, third draw of $g^*$}
     \end{subfigure}
    \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n100_seed3.pdf}
         \caption{$n=100$, first draw of $g^*$}
     \end{subfigure}
     \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n100_seed2.pdf}
         \caption{$n=100$, second draw of $g^*$}
     \end{subfigure}
    \begin{subfigure}[b]{0.33\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/performance_n100_seed1.pdf}
         \caption{$n=100$, third draw of $g^*$}
     \end{subfigure}
    \caption{Generalization performance $R^2$ on a hold-out test set, in learning a synthetic degree $5$ function $g^*:\{0,1\}^n\rightarrow\mathbb{R}$ for $n\in\{25,50,100\}$, using datasets of size $c \cdot 25n$. We report the results of the first draws of $g^*$ for each input dimension in Figure~\ref{fig:synthetic_large} and the extended version for all three draws of $g^*$ of different dimensions here. Our method, \textsc{HashWH}, \emph{always} outperforms the standard neural network and \textsc{EN-S}. We are capable of significantly increasing the outperformance margin by increasing $b$. \textsc{EN-S}, however, does not show improvement over the standard network in most cases, which indicates its diminishing effectiveness as the size of the input dimension grows, i.e., the number of features increases.}
    \label{fig:full_synthetic_large}
\end{figure*}

\begin{figure*}[h]
    \centering
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size1_seed3.pdf}
         \caption{$n=25$, $c=1$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size2_seed3.pdf}
         \caption{$n=25$, $c=2$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size3_seed3.pdf}
         \caption{$n=25$, $c=3$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size4_seed3.pdf}
         \caption{$n=25$, $c=4$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size5_seed3.pdf}
         \caption{$n=25$, $c=5$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size6_seed3.pdf}
         \caption{$n=25$, $c=6$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size7_seed3.pdf}
         \caption{$n=25$, $c=7$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n25_size8_seed3.pdf}
         \caption{$n=25$, $c=8$}
     \end{subfigure}
    \caption{Best achievable generalization performance $R^2$ up to a certain epoch or training time (seconds), in learning a synthetic degree $5$ function $g^*:\{0,1\}^n\rightarrow\mathbb{R}$, using datasets of size $c \cdot 25n$. This figure is an extended version of Figure~\ref{fig:runtime_synthetic_large}, where we reported similar plots for $n=50$ and $c=5$. Here we report the results for the first draw of $g^*$ with $n=25$. Our method, \textsc{HashWH}, \emph{always} outperforms the $R^2$ score of \textsc{EN-S} in significantly less time. \textsc{HashWH} can also be scaled by the choice of $b$ to achieve better generalization performance at the price of higher training times.}
    \label{fig:full25_runtime_synthetic_large}
\end{figure*}

\begin{figure*}[h]
    \centering
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size1_seed3.pdf}
         \caption{$n=50$, $c=1$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size2_seed3.pdf}
         \caption{$n=50$, $c=2$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size3_seed3.pdf}
         \caption{$n=50$, $c=3$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size4_seed3.pdf}
         \caption{$n=50$, $c=4$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size5_seed3.pdf}
         \caption{$n=50$, $c=5$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size6_seed3.pdf}
         \caption{$n=50$, $c=6$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size7_seed3.pdf}
         \caption{$n=50$, $c=7$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n50_size8_seed3.pdf}
         \caption{$n=50$, $c=8$}
     \end{subfigure}
    \caption{Best achievable generalization performance $R^2$ up to a certain epoch or training time (seconds), in learning a synthetic degree $5$ function $g^*\in\{0,1\}^n$, using datasets of size $c \cdot 25n$. This figure is an extended version of Figure~\ref{fig:runtime_synthetic_large}, where we reported similar plots for $n=50$ and $c=5$. Here we report the results for the first draw of $g^*$ with $n=50$. Our method, \textsc{HashWH}, \emph{always} achieves a higher $R^2$ score than \textsc{EN-S} in significantly less time. \textsc{HashWH} can also be scaled by the choice of $b$ to achieve better generalization performance at the price of higher training times.}
    \label{fig:full50_runtime_synthetic_large}
\end{figure*}


\begin{figure*}[htbp]
    \centering
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size1_seed3.pdf}
         \caption{$n=100$, $c=1$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size2_seed3.pdf}
         \caption{$n=100$, $c=2$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size3_seed3.pdf}
         \caption{$n=100$, $c=3$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size4_seed3.pdf}
         \caption{$n=100$, $c=4$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size5_seed3.pdf}
         \caption{$n=100$, $c=5$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size6_seed3.pdf}
         \caption{$n=100$, $c=6$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size7_seed3.pdf}
         \caption{$n=100$, $c=7$}
     \end{subfigure}
    \begin{subfigure}[b]{0.49\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/synthetic_large/triple_runtime_n100_size8_seed3.pdf}
         \caption{$n=100$, $c=8$}
     \end{subfigure}
    \caption{Best achievable generalization performance $R^2$ up to a certain epoch or training time (seconds), in learning a synthetic degree $5$ function $g^*\in\{0,1\}^n$, using datasets of size $c \cdot 25n$. This figure is an extended version of Figure~\ref{fig:runtime_synthetic_large}, where we reported similar plots for $n=50$ and $c=5$. Here we report the results for the first draw of $g^*$ with $n=100$. Our method, \textsc{HashWH}, \emph{always} achieves a higher $R^2$ score than \textsc{EN-S} in significantly less time. \textsc{HashWH} can also be scaled by the choice of $b$ to achieve better generalization performance at the price of higher training times.}
    \label{fig:full100_runtime_synthetic_large}
\end{figure*}


% --------------------- Real Data ----------------------
\begin{figure*}[htbp]
    \centering
     \begin{subfigure}[b]{0.65\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/Entacmaea_performance.pdf}
         \caption{Entacmaea ($n=13$)}
     \end{subfigure}
     \begin{subfigure}[b]{0.65\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/SGEMM_performance.pdf}
         \caption{SGEMM ($n=40$)}
     \end{subfigure}
     \begin{subfigure}[b]{0.65\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/GB1_reduced_performance.pdf}
         \caption{GB1 ($n=80$)}
     \end{subfigure}
     \begin{subfigure}[b]{0.65\linewidth}
         \centering
         \includegraphics[width=\linewidth]{plots/avGFP_performance.pdf}
         \caption{avGFP ($n=236$)}
    \end{subfigure}
    \caption{Generalization performance of standard/regularized neural networks and benchmark ML models on four real datasets. This figure is an extended version of Figure~\ref{fig:score_real_data}. It also includes the training times (logarithmically scaled in the plot). Our method is able to achieve the best test $R^2$ scores while always training significantly faster than \textsc{EN-S}.}
    \label{fig:full_performance_real_data}
\end{figure*}









\clearpage

\bibliography{gorji_368}
\end{document}
