\appendix
\section{Theoretical Details}

\subsection{Full Functional Derivatives}
\label{sec:app_derivatives}

Our FT is: 
\begin{align}
\mathcal{L}_{\rho, \gamma}(\hat{\mu},\hat{\Lambda}) &\approx \int_{\mathcal{X}}p(x) \rho\bigg[\frac{1}{2}\hat{\Lambda}(x)\hat{r}(x)^2  - \frac{1}{2}\log \hat{\Lambda}(x)\bigg]+p(x)\bar{\rho}\left[\gamma||\nabla\hat{\mu}(x)||_2^2 + \bar{\gamma}||\nabla\hat{\Lambda}(x)||_2^2 \right]dx\nonumber
\end{align}
and its functional derivatives are
\begin{align}
\begin{cases}
\frac{\delta \mathcal{L}}{\delta \hat{\mu}} & = -p(x)\rho\hat{\Lambda}(x)\hat{r}(x) - 2\bar{\rho}\gamma\Delta\hat{\mu}(x)\\
\frac{\delta \mathcal{L}}{\delta \hat{\Lambda}} & = \frac{p(x)\rho}{2} \!\left[\hat{r}(x)^2-\frac{1}{\hat{\Lambda}(x)}\right]\!-2\bar{\rho}\bar{\gamma}\Delta\hat{\Lambda}(x), 
\end{cases}
\end{align}
where $\hat r(x) = y(x) - \hat \mu(x)$.
After setting equal to zero we arrive at
\begin{align}
    \begin{cases}
    \hat{\Lambda}^*(x)(\hat{\mu}^*(x)-y(x)) = 
    2\frac{\bar{\rho}}{\rho}\gamma\frac{\Delta\hat{\mu}^*(x)}{p(x)}\\
    (y(x)-\hat{\mu}^*(x))^2 = \frac{1}{\hat{\Lambda}^*(x)}+ 4\frac{\bar{\rho}}{\rho}\bar{\gamma}\frac{\Delta\hat{\Lambda}^*(x)}{p(x)}.
    \end{cases}
\end{align}

\subsection{Proofs}
\label{sec:app_proofs}
\setcounter{prop}{0}
\begin{restatable}{prop}{prop:existence}
Assuming there exist twice differentiable functions $\mu: \R^d \to \R, \Lambda: \R^d \to \R_{>0}$, the following properties hold
\begin{enumerate}[label=\roman*]
    \item In the absence of regularization ($\rho = 1$), there are no solutions to the FT.
    \item In the absence of data ($\rho = 0$), there is no unique solution to the FT.
    \item In order for there to exist a solution to the FT there must be regularization on the mean function. 
\end{enumerate}
\end{restatable}

\begin{proof}
Without loss of generality, we consider a uniform $p(x)$ and drop it from the equations.
\begin{enumerate}[label=(\roman*)]
    \item When $\rho=1$ the necessary conditions for an optimum are
    \begin{align}
        &\begin{cases}
            \ftp^*(x)(\ftmu^*(x) - y(x)) = 0 \\
            (\ftmu^*(x) - y(x))^2 = \frac{1}{\ftp^*(x)}
        \end{cases}
        \\
        \implies &\begin{cases}
            \ftp^*(x)(\ftmu^*(x) - y(x)) = 0 \\
            \ftp^*(x)(\ftmu^*(x) - y(x))^2 = 1
        \end{cases}
        \\
        \implies &\begin{cases}
            \ftp^*(x)(\ftmu^*(x) - y(x)) = 0 \\
            0 \times (\ftmu^*(x) - y(x)) = 1
        \end{cases}\\
        \implies & 0 = 1
    \end{align} 
    which is a contradiction and there cannot exist $\mu, \Lambda$ that are solutions. 
    \item When $\rho=0$ the integral we seek to minimize is:
    \begin{align}
        \mathcal{L}_{\rho, \gamma}(\hat{\mu},\hat{\Lambda}) &= \int_{\mathcal{X}} \rho\int_{\mathcal{Y}} p(y\sep x)\log \hat p(y\sep x) dy  + \bar{\rho}\left[\gamma||\nabla\hat{\mu}(x)||_2^2  + \bar{\gamma}||\nabla\hat{\Lambda}(x)||_2^2 \right] dx\\
        &= \int_{\mathcal{X}}\left[\gamma||\nabla\hat{\mu}(x)||_2^2  + \bar{\gamma}||\nabla\hat{\Lambda}(x)||_2^2 \right] dx
    \end{align}
    where $\hat p(y\sep x) = \norm(y\sep \hat\mu(x), \hat\Lambda(x))$.
    Each term in this integral is non-negative, so the minimum value it could be is zero. 
    Any pair of constant functions $\mu, \Lambda$ will minimize this integral, of which there are infinitely many. 
    \item In the $(\alpha, \beta)$-regularization, it is equivalent to say $\alpha>0$ is a necessary condition for there to exist a solution to the FT. Recall that we seek to minimize
\begin{align*}
    \mathcal{L}_{\alpha, \beta}(\hat{\mu},\hat{\Lambda}) &= \int_{\mathcal{X}}p(x)(-\log \hat p(y | x) + \alpha||\nabla\hat{\mu}(x)||_2^2  + \beta ||\nabla\hat{\Lambda}(x)||_2^2) dx
\end{align*}
where $\hat p(y | x) = \mathcal{N}(y | \hat\mu(x), \hat\Lambda(x))$.
Suppose $\alpha = 0$. Then the functional simplifies to 
\begin{align}
    \min_{\hat \mu, \hat \Lambda} \mathcal{L}_{\alpha, \beta}(\hat \mu, \hat \Lambda) &= \min_{\hat \mu, \hat \Lambda} \int_{\mathcal{X}} p(x)(-\log \hat p(y | x) +\beta ||\nabla\hat{\Lambda}(x)||_2^2) dx\\
    & \leq \min_{\hat \mu, \hat {\underline \Lambda}} \int_{\mathcal{X}} p(x) \frac{1}{2}(\hat{\underline \Lambda}(x)\hat r(x)^2 -\log \hat{\underline{\Lambda}}(x)) + p(x)\beta ||\nabla\hat{\underline\Lambda}(x)||_2^2 dx\\
    &= \min_{\hat \mu, \hat {\underline \Lambda}}\int_{\mathcal{X}} p(x) \frac{1}{2}(\hat{\underline \Lambda}(x)\hat r(x)^2 - \log \hat{\underline{\Lambda}}(x)) \, dx
\end{align}
where $\hat{\underline\Lambda}$ is a constant function and $\hat r(x) = y(x) - \hat \mu(x)$.  
This provides an upper bound on the integral as we are looking at a restricted class of possible precision functions. 
Since the precision function is constant the gradient penalty, $||\nabla\hat{\Lambda}(x)||_2^2$, is zero. 
There is no penalty on $\hat \mu$ so it can perfectly pass through every data point and the contribution of $\hat{\underline \Lambda}(x)(\hat \mu(x) - y(x))^2$ is zero while $- \log \hat{\underline{\Lambda}}(x)$ can become arbitrarily negative. 
Thus there is no solution if $\alpha=0$.
\end{enumerate}
\end{proof}



\section{Experimental Details}
\subsection{Datasets}
\label{sec:datasets}

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/synth_visualizations.pdf}
\caption{Visualization of heteroskedastic and homoskedastic versions of simulated datasets. Specific details for the functional form of these can be found in \cref{tab:sim_data}.}
\label{fig:sim_data_vis}
\end{figure*}

We chose 64 datapoints in each of the simulated datasets. The generating process for each simulated dataset is included in \cref{tab:sim_data} and can be seen in \cref{fig:sim_data_vis}. The homoskedastic data is simulated in the same way, but with $f(x) = 1$. For testing, we simulate a new dataset of 64 datapoints with the same process. \cref{tab:uci_data} summarizes the UCI datasets. We provide a description of the ClimSim climate data in \cref{sec:climsimdesc}.
\begin{table*} %
\centering

\caption{Simulated datasets. Each dataset is defined by a true $\mu$ function and then a noise function $f$. All data is generated as $\mu(x) + \epsilon(x)$ where $\epsilon(x) \sim \mathcal{N}(0, f(x)^2)$. After the datasets were generated they were scaled to have mean zero and standard deviation one. The homoskedastic versions of each dataset fix $f(x)=1$. The datasets are shown in \cref{fig:sim_data_vis}.}
\begin{tabular}{ llll }
\toprule 
Dataset  & Mean ($\mu$) & Noise Pattern ($f$) & Domain
\\
\midrule
Sine &  $\mu(x) = 2 \sin(4\pi x)$ & $f(x) = \sin(6\pi x) + 1.25$ & $x\in [0, 1]$\\
\midrule 
Cubic   &  $\mu(x) = x^3 
\; $ & $f(x) = \begin{cases}0.1 & \text{for } x < -0.5\\1 & \text{for }x \in [-0.5, 0.0) \\ 3 & \text{for } x \in [0.0, 0.5) \\ 10 & \text{for } x \ge .5 \end{cases}$&$x\in [-1, 1]$
\\
\midrule
Curve  &  $\mu(x) = x - 2x^2 + 0.5x^3$ & $f(x) = x + 1.5$ & $x \in [-1.5, 1.5]$
\\
\bottomrule
\end{tabular}
\label{tab:sim_data}
\end{table*}


\begin{table*} %
\centering

\caption{UCI datasets.}

\begin{tabular}{ lccc }
\toprule 
Dataset & Train Size & Test Size & Input Dimension 
\\
\midrule
Concrete &  687 & 343 & 8 \\
\midrule 
Housing   &  337 & 168 & 13 
\\
\midrule
Power  &  6379 & 3189 & 4 
\\
\midrule
Yacht & 204 & 102 & 6
\\
\bottomrule
\end{tabular}
\label{tab:uci_data}
\end{table*}


\subsection{Training Details}
\label{sec:training}
We take 22 values of $\gamma, \rho$ that range from $10^{-11}$ up to $1-10^{-4}$ on a logit scale for all of the experiments run on neural networks.
The exact values were $(\rho, \gamma \in$ \{0.9999, 0.999, 0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 
0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,
0.00000001, 0.000000001, 0.0000000001,
0.00000000001\}).
For the FTs we take 20 values from  $10^{-6}$ up to $1-10^{-7}$ also on a logit scale $(\rho, \gamma \in$ \{0.9999999, 0.999999, 0.99999, 0.9999, 0.999, 0.99, 0.9,
0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.001, 
0.0001, 0.00001, 0.000001\}). This scaling increases the absolute density of points evaluated near the extreme cases of 0 and 1 where the theoretical analysis of the FT focused. 
The ranges differ slightly due to numerical stability during the fitting.
The limiting cases of $\gamma, \rho \in \{0, 1\}$ were omitted for numerical stability and the ranges of values for the FTs vs neural networks vary slightly for the same reason. 
The values of $\rho, \gamma$ that were taken along the $\rho = 1- \gamma$ line were $\rho, \gamma \in \{0.0, 1.0\times 10^{-11}, 1.0\times 10^{-10}, 1.0\times 10^{-9}, 1.0\times 10^{-8}, 1.0\times 10^{-7},
        1.0\times 10^{-6}, 1.0\times 10^{-5}, 1.0\times 10^{-4},1.0\times 10^{-3},$ 0.01, 0.1,
        0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 
        0.17, 0.18, 0.19, 0.20, 0.21, 0.22,
        0.23, 0.24, 0.25, 0.26, 0.27, 0.28,
        0.29, 0.30, 0.31, 0.32, 0.33, 0.34,
        0.35, 0.36, 0.37, 0.38, 0.39, 0.40,
        0.41, 0.42, 0.43, 0.44, 0.45, 0.46,
        0.47, 0.48, 0.49, 0.50, 0.51, 0.52,
        0.53, 0.54, 0.55, 0.56, 0.57, 0.58,
        0.59, 0.60, 0.61, 0.62, 0.63, 0.64,
        0.65, 0.66, 0.67, 0.68, 0.69, 0.70,
        0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
        0.77, 0.78, 0.79, 0.80, 0.81, 0.82,
        0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
        0.89, 0.9, 0.99, 0.999, 0.9999, 1.0\}.
All experiments were run on Nvidia Quadro RTX 8000 GPUs. Approximately 500 total GPU hours were used across all experiments. 

\subsection{Metrics}
\label{sec:metrics}
\begin{itemize}
    \item Geometric complexity: For the one-dimensional datasets the function is evaluated on a dense grid and then the gradients are approximated via finite differences and a trapezoidal approximation to the integral is taken. In the case of the FT, we only assess the function on the solved for, discretized points while with the neural networks we interpolate between points. 
    For the higher-dimensional UCI datasets the gradients are also numerically approximated in the same way but only at the points in the train/test sets. 
    \item MSE: In the fully non-parametric, unconstrained setting, the maximum likelihood estimates at each $x_i$ are $\hat \mu(x_i) = y(x_i)$ and $\hat \Lambda(x_i) = (y(x_i) - \mu(x_i))^{-2} \implies \hat \Lambda^{-1/2}(x_i) = |y(x_i) - \mu(x_i)|$, serving as motivation for checking these differences. 
\end{itemize}


\paragraph{Variability over runs}
The experiments were each run six times with different seeds.  
The standard deviations over the metrics displayed in \cref{fig:summary} are shown in \cref{fig:sd_summary}. 
The Sobolev norms show that there is the most variability in the overfitting regions $O_{\text{I}}$ and parts of $O_\text{II}$. 
This indicates that the functions themselves vary across runs. 
However, when turning to quality of fits, the MSEs show a different pattern of regions of instability, and $O_\text{I}$ has low variability in terms of actual performance. 

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/heatmaps_std_edit.pdf}
\caption{The standard deviation over the six runs of each metric shown in \cref{fig:summary}}
\label{fig:sd_summary}
\end{figure*}




\subsection{Field Theory}
\label{sec:FT}
For the discretized field theory we take $n_{ft}=4096$ evenly spaced points on the interval $[-1, 1]$. 
There are two datapoints placed beyond $[-1, 1]$ because the method we use to estimate the gradients requires the datapoints to have left and right neighbors. These datapoints were not included when computing our metrics. 
Of these 4096 datapoints 64 were randomly selected to be used for training neural networks $\nnmu, \nnLambda$.
The field theory results were consistent across choices of $n_{ft}\in\{256, 512, 1024, 2048, 4096\}$. 
We present results for $n_{ft}=4096$ in the main paper. 
We train for $100000$ epochs and use the Adam optimizer with a basic triangular cycle that scales initial amplitude by half each cycle on the learning rate. The minimum and maximum learning rates were 0.0005 and 0.01. The cycles were 5000 epochs long. We clip the gradients at 1000. 
A subset of the fits can be seen in \cref{fig:ft-fits}. 



\subsection{Simulated Data with Neural Networks}
\label{sec:nn-sim}
For all of the simulated datasets except for \emph{Sine} we train for $600000$ epochs and use the Adam optimizer with a basic triangular cycle that scales initial amplitude by half each cycle on the learning rate. The minimum and maximum learning rates were 0.0001 and 0.01. The cycles were 50000 epochs. 
The first 250000 epochs are only spent on training $\nnmu$ while the remaining 350000 epochs are spent training both $\nnmu, \nnLambda$. 
We clip the gradients at 1000. 
The training for the \emph{Sine} dataset was the same, except trained for $2500000$ epochs. 
A subset of the fits for the \emph{Sine} dataset can be seen in \cref{fig:mlp-fits}. 

\subsection{UCI Data with Neural Networks}
\label{sec:uci-sim}
For the \emph{concrete, housing} and \emph{yacht} datasets we train for $500000$ epochs and use the Adam optimizer with a basic triangular cycle that scales initial amplitude by half each cycle on the learning rate. The minimum and maximum learning rates were 0.0001 and 0.01. The cycles were 50000 epochs. 
The first 250000 epochs are only spent on training $\nnmu$ while the remaining 250000 epochs are spent training both $\nnmu, \nnLambda$. 
Meanwhile on the \emph{power} dataset, we had to use minibatching due to the size of the dataset. We used minibatches of 1000 and trained for 50000 total epochs with the first 25000 dedicated solely to $\nnmu$ and the remainder training both $\nnmu, \nnLambda$. The same cyclic learning rate was used but with cycle length 5000. 
We clip the gradients at 1000.

\subsection{Practical Suggestion}
\label{sec:diagsearch}
We can also view the $\rho = 1-\gamma$ line that we search from the perspective of the $\alpha, \beta$ parameterization of the regularizers. 
Let $\rho, \gamma \in (0,1)$ such that 
$\rho = 1 - \gamma$.
Furthermore, we know that $\alpha=\frac{1-\rho}{\rho}\gamma$ and that $\beta=\frac{1-\rho}{\rho}(1-\gamma)$.
If we are interested in the model settings for $(\rho(t)=t, \gamma(t)=1-t)$ for $t\in(0,1)$, it then follows that we are equivalently interested in
\begin{align*}
(\alpha(t), \beta(t)) & = \left(\frac{1-\rho(t)}{\rho(t)}\gamma(t), \frac{1-\rho(t)}{\rho(t)}(1-\gamma(t))\right) \\
& = \left(\frac{1-t}{t}(1-t), \frac{1-t}{t}t\right) \\
& = \left(\frac{(1-t)^2}{t}, 1-t \right) \\
\implies &\begin{cases}
 t & = 1- \beta(t) \\
 \alpha(t) & = \frac{\beta(t)^2}{t} \\
\end{cases}
\text{ or } \sqrt{t\alpha(t)} = \beta(t).
\end{align*}


\section{Additional Results}

\subsection{All Synthetic Dataset Results}
Both FT and neural networks were fit to the heteroskedastic and homoskedastic synthetic datasets described in \cref{tab:sim_data}. The main results for these displayed as phase diagrams of various metrics can be seen in \cref{fig:synth_ft} and \cref{fig:synth_nn} respectively. We largely see the same trends as were exhibited by the real-world datasets seen in \cref{fig:summary}.

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/heatmap_synth_ft.pdf}
\caption{Same configuration as \cref{fig:summary}, except all results here pertain to minimizing the FT on six different synthetic datasets described in \cref{tab:sim_data}. Dataset names with an $*$ are the homoskedastic counterparts.}
\label{fig:synth_ft}
\end{figure*}

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/heatmap_synth_nn.pdf}
\caption{Same configuration as \cref{fig:summary} and \cref{fig:synth_ft}, except all results here pertain to training a neural network on six different synthetic datasets described in \cref{tab:sim_data}. Dataset names with an $*$ are the homoskedastic counterparts.}
\label{fig:synth_nn}
\end{figure*}

\subsection{Effect of Neural Network Size}
We used the same training methods to fit models with one and two hidden layers and fit them to the \emph{concrete} dataset. 
The results in the phase diagram were consistent with the other experiments, as can be seen in \cref{fig:nn_ablation}.

\begin{figure*}
\centering
\includegraphics[width=0.6\textwidth]{figures/heatmap_nn_ablation.pdf}
\caption{Same configuration as \cref{fig:summary}, however, these results pertain all to fitting a neural network of various sizes on the \emph{concrete} dataset.}
\label{fig:nn_ablation}
\end{figure*}

\section{Comparison to Baselines}\label{sec:baseline_comp}
We compare the performance of our diagonal $\rho+\gamma=1$ search against two baselines, $\beta$-NLL \citep{seitzer_pitfalls_2022} and an ensemble of six MLE-fit heteroskedastic regression models \citep{lakshminarayanan_simple_2017}. We use $\mu$ MSE, $\Lambda^{-\frac{1}{2}}$ MSE, and expected calibration error (ECE) to evaluate the models. In all cases lower values are better. Note that the method of ensembling multiple individual MLE-fit models from \cite{lakshminarayanan_simple_2017} could be implemented on our method or $\beta$-NLL as well.

\subsection{Model Architecture}
All (individual) models have the same architecture: fully connected neural networks with three hidden layers of 128 nodes and leaky ReLU activations for the synthetic and UCI datasets and fully connected neural networks with three hidden layers of 256 nodes for the ClimSim data \citep{yu_climsim_2023}. 
Note that both baselines model the variance while our approach models the precision (inverse-variance).
In all cases we use a softplus on the final layer of the variance/precision networks to ensure the output is positive.

For the $\beta$-NLL implementation we take $\beta=0.5$ as suggested in \cite{seitzer_pitfalls_2022}. 
The ensemble method we use fits 6 individual heteroskedastic neural networks and combines their outputs into a mixture distribution that is approximated with a normal distribution. We do not add in adversarial noise as the authors state it does not make a significant difference. We fit six $\beta$-NLL models and six MLE-ensembles.

\subsection{Diagonal selection criteria}\label{sec:diag-crit}
After conducting our diagonal search we found the model that minimized $\mu$ MSE and the model that minimized $\Lambda^{-\frac{1}{2}}$ MSE on the \emph{training} data. 
In some cases these models coincided. 
We then used the model that was on the midpoint (on a logit scale) of the $\rho+\gamma=1$ line between these two models to compare.
The results are reported in Table \ref{tab:baselines}. In all cases our method is competitive with or exceeds the performance of these two baselines---particularly on real-world data. 
Note that our goal is to show that we are able to find models that model the mean and standard deviation of the data well, that is, lie in our proposed region \emph{S} of the phase diagram.   
We do not claim that this method will always provide the optimal model within \emph{S}. 

\subsection{Training Details}
Training for our method was conducted as described in  sections \ref{sec:training} and \ref{sec:uci-sim} of the appendix.

For the baselines, on all of the simulated datasets we train for $600000$ epochs and use the Adam optimizer with a basic triangular cycle that scales initial amplitude by half each cycle on the learning rate. The minimum and maximum learning rates were 0.0001 and 0.01. The cycles were 50000 epochs. 
We clip the gradients at 1000. The same optimization scheme is performed for the UCI datasets but for $500000$ epochs for the \emph{Housing}, \emph{Concrete}, and \emph{Yacht} datasets. The \emph{Power} dataset was trained for $50000$ epochs with batches of 1000.

\subsection{ClimSim Dataset}\label{sec:climsimdesc}
The ClimSim dataset \citep{yu_climsim_2023} is a largescale climate dataset. 
Its input dimension is 124 and output dimension is 128. 
We use all 124 inputs to model a single output,
\emph{Visible direct solar flux, SOLS [$W/m^2$
]}.
We use 10,091,520 of the approximately 100 million points for training and a randomly selected 7,209 points to evaluate our models.

\subsection{Deficiency of ECE}
Shortcomings of ECE (in isolation) are well documented \citep{kuleshov_accurate_2018, levi_evaluating_2022, chung_beyond_2021}. 
The main issue with ECE is it measures \emph{average} calibration, while \emph{individual} calibration is more desirable. 
On our diagonal search we found that often times the models that achieved the best ECE were those that were severely underfit and belonged to region $U_{I}$. 
In Table \ref{tab:baselines} we see that the MLE-ensemble is able to achieve low scores while being uncompetitive with respect to the two MSE metrics.
The MLE-ensembles were unstable on several of the datasets with respect to the variance network which is consistent with Proposition \ref{prop:existence}. 
In particular, this can be seen for the synthetic datasets, where the $\Lambda^{-\frac{1}{2}}$ MSE diverges to infinity.




\begin{table*}[htp]
\centering

\caption{Comparison of a deep heteroskedastic regression model with diagonal regularization search against two baselines \citep{seitzer_pitfalls_2022, lakshminarayanan_simple_2017}. For details on the selection criteria of the heteroskedastic model see \cref{sec:diag-crit}. We report the average and standard deviations of expected calibration error (ECE), $\mu$ MSE and $\Lambda^{-\frac{1}{2}}$ MSE on test data. Lowest mean value for each metric is bolded. 
In several cases the MLE ensemble failed to properly converge (yielding $\inf\pm \text{nan}$ results when the standard deviation function diverges to infinity). Individually, there are many pitfalls to using MLE to train heteroskedastic regression models, and it only takes one member of the ensemble to fail to converge to yield these results. 
In particular, note that these numerical issues occur most commonly for the quantities relating to the standard deviation. 
This highlights the instability of MLE training in this setting. Note that the method of ensembling multiple individual MLE-fit models could be performed on our method or $\beta$-NLL as well.}

\begin{tabular}{ l *{3}{c} }
\toprule
{Dataset}&
{Heteroskedastic} &
{$\beta$-NLL} &
{MLE Ensemble} \\
{\phantom{..........}Metric} & {} &
{ \cite{seitzer_pitfalls_2022}} &
{\cite{lakshminarayanan_simple_2017}} \\
\midrule
Cubic & & &\\
\phantom{..........} ECE & \textbf{0.2380} ± 0.03 & 0.2385 ± 0.02 & 0.2411 ± 0.02 \\
\phantom{..........} $\mu$ MSE  & 0.2339 ± 0.01 & \textbf{0.1500} ± 0.01 & 1.1809 ± 1.88 \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE  & 0.2397 ± 0.02 & \textbf{0.1397} ± 0.01 & inf ± nan     \\
Curve & & & \\
\phantom{..........} ECE  & 0.1804 ± 0.02 & \textbf{0.1754} ± 0.02 & 0.2432 ± 0.00 \\
\phantom{..........}  $\mu$ MSE & \textbf{0.4318} ± 0.12 & 0.4877 ± 0.16 & 1.0067 ± 0.19 \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE  & 0.4655 ± 0.09 & \textbf{0.4187} ± 0.20 & inf ± nan     \\
Sine & & & \\
\phantom{..........} ECE   & 0.2499 ± 0.00 & \textbf{0.2082} ± 0.03 & 0.2313 ± 0.05 \\
\phantom{..........} $\mu$ MSE   & \textbf{0.7968} ± 0.00 & 4.4107 ± 6.90 & 0.9716 ± 0.06 \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE   & \textbf{0.7968} ± 0.00 & 4.3524 ± 6.89 & inf ± nan     \\
\midrule
Concrete & & & \\
\phantom{..........} ECE  & 0.2471 ± 0.01 & 0.2552 ± 0.00           & \textbf{0.0655} ± 0.01           \\
\phantom{..........} $\mu$ MSE  & \textbf{0.1055} ± 0.02 & 0.5461 ± 0.30         & 2.2454 ± 1.74           \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE  & \textbf{0.3028} ± 0.51 & 1.0867 ± 0.20 & $1.3 \times 10^5$ ± $1.2 \times 10^5$ \\
Housing & & & \\
\phantom{..........} ECE   & \textbf{0.0653} ± 0.00 & 0.2631 ± 0.01           & 0.1332 ± 0.02           \\
\phantom{..........} $\mu$ MSE   & 1.2236 ± 0.00 & \textbf{0.3175} ± 0.06      & 155.4494 ± 128.27       \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE   & \textbf{0.7610} ± 0.00 & 0.8820 ± 0.03    & 218.8269 ± 195.38       \\
Power & & & \\
\phantom{..........} ECE    & 0.2233 ± 0.01 & 0.2370 ± 0.00           & \textbf{0.0285} ± 0.01           \\
\phantom{..........} $\mu$ MSE    & 0.0350 ± 0.01 & 0.1013 ± 0.01           & \textbf{0.0177} ± 0.00           \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE    & 0.0343 ± 0.01 & 0.3081 ± 0.37           & \textbf{0.0091} ± 0.00           \\
Yacht & & & \\
\phantom{..........} ECE    & 0.3038 ± 0.04 & 0.2882 ± 0.02           & \textbf{0.0463} ± 0.02           \\
\phantom{..........} $\mu$ MSE    & \textbf{0.0077} ± 0.01 & 0.0137 ± 0.01        & 6.2670 ± 13.96          \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE    & \textbf{0.0076} ± 0.01 & 1.3275 ± 0.02        & 8.0599 ± 19.18          \\
\midrule
Solar Flux & & & \\
\phantom{..........} ECE  & \textbf{0.1503} ± 0.00 & 0.3007 ± 0.00 & 0.1924 ± 0.04                    \\
\phantom{..........} $\mu$ MSE  & \textbf{0.2887} ± 0.00 & 0.3771 ± 0.00 & 1.0067 ± 0.19                    \\
\phantom{..........} $\Lambda^{-\frac{1}{2}}$MSE  & \textbf{0.1175} ± 0.00 & 0.3217 ± 0.00 & \hspace{0.85em} $4.6\times10^{9}$ ± $9.9\times 10^{9}$  \\
\bottomrule
\end{tabular}
\label{tab:baselines}
\end{table*}

\section{Fourier Feature Mappings and Geometric Complexity}\label{sec:ffmap-gc}
As a preprocessing step we apply Fourier feature mapping \cite{tancik_fourier_2020} before passing our data into MLPs. 
That is, we map inputs $x\to \gamma(x)$ where $\gamma(\cdot)$ is defined as follows
\begin{equation*}
    \gamma(x) = [\cos (2\pi b_1^T x), \dots, \cos (2\pi b_k^T x)]
\end{equation*}
and the $\{b_i\}_{i=1}^k$ are independently sampled from a $\mathcal{N}(0, \sigma^2)$ distribution. 
This method has been shown to allow MLPs to learn high frequency data and to reduce training time. 
The motivation for this additional step is to encourage the mean and precision networks to overfit and hopefully, mimic some of the behaviors of the FT more faithfully. 
We try this in two different settings. 
In the first we add in the Fourier Features layers with an L2 penalty and then again with the Dirichlet energy/geometric complexity as the regularizer similar in spirit to \cite{hoffman_robust_2019}. 
Just as in the earlier experiments we penalize the mean and precision networks separately and weigh the regularizers in the same $(\rho, \gamma)$-scheme. 

In the case where we penalize the geometric complexity the regularizer now matches exactly the regularizer of the field theory setting. 
We find even greater correspondence between results. 
However, there is a heavy computational burden where training takes on the order of 10 times slower in wall clock time. 

We use 2-layer MLPs with width 128 and set the Fourier features mapping to be 64-dimensional, with $\sigma=2$ when sampling the weights for both the $\mu$ and $\Lambda$ networks. 
We remove one layer from the MLPs (compared to the earlier experiments) to accommodate the fact that we add in the Fourier feature mapping. 
We fit the models to 128 samples generated in the same way as the \emph{Sine} dataset, with 5000 epochs of warmup for the mean network, 150000 epochs in total, and a batch size of 32. 
Despite fewer training epochs than the earlier neural network experiments, we still achieve overfitting behavior. 
We use the same set of $(\gamma, \rho)$-pairings as in the neural network experiments described in section \ref{sec:nn-sim}. 
Summary plots of $\mu$- and $\Lambda^{-1/2}$-MSE as well as Dirichlet energies can be found in \cref{fig:ff-summary}. 
A subset of resulting fits can be seen in \cref{fig:ff-de-fits,fig:ff-l2-fits}.

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/ff-summary.pdf}
\caption{Phase diagrams for the field theory (left), and MLPs with Fourier feature mappings with Dirichlet energy regularization (middle) and L2 regularization (right) fit to the \emph{Sine} dataset.}
\label{fig:ff-summary}
\end{figure*}


\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/ft_fits.pdf}
\caption{Subsample of fits of the field theory. Moving right to left increases $\rho$ while moving from bottom to top increases $\gamma$. Training data is shown in orange, the mean function is shown in red, and $\pm$ 1 SD is shaded. Note: FT was fit to 4096 datapoints, but here we display a thinned subset of the points for visual clarity.}
\label{fig:ft-fits}
\end{figure*}

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/mlp_fits.pdf}
\caption{Subsample of fits of the neural networks on the \emph{Sine} dataset. Training data is shown in orange, the mean function is shown in red, and $\pm$ 1 SD is shaded. Notice the abrupt phase transition from overfitting to underfitting in the mean function (in the lower right corner) and similarly in the precision function (in the upper right corner). Moving right to left increases $\rho$ while moving from bottom to top increases $\gamma$. }
\label{fig:mlp-fits}
\end{figure*}

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/ff-de_fits.pdf}
\caption{Subsample of fits of neural networks with Fourier Feature layers and Dirichlet energy/geometric complexity regularization. Training data is shown in orange, the mean function is shown in red, and $\pm$ 1 SD is shaded. Moving right to left increases $\rho$ while moving from bottom to top increases $\gamma$.}
\label{fig:ff-de-fits}
\end{figure*}

\begin{figure*}
\centering
\includegraphics[width=\textwidth]{figures/ff-l2_fits.pdf}
\caption{Subsample of fits of neural networks with Fourier Feature layers and L2 complexity regularization. Training data is shown in orange, the mean function is shown in red, and $\pm$ 1 SD is shaded. Moving right to left increases $\rho$ while moving from bottom to top increases $\gamma$.}
\label{fig:ff-l2-fits}
\end{figure*}
