%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
%\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%Our imports
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{svg}

\usepackage{hyperref}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage{xcolor}
\newcommand{\cblue}{\textcolor{blue}}
\newcommand{\cred}{\textcolor{red}}
\newcommand{\cgreen}{\textcolor{green}}
\newcommand{\cmag}{\textcolor{magenta}}

% Definitions
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}


% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}


%Eli
\newcommand{\EF}[1]{{\color{green}{EF: #1}}}


\usepackage{xr}
% Helper code for cross-document references (xr): registers a file as a
% dependency of this document so that build tools such as latexmk can
% notice it. \makeatletter is required because the macro body uses the
% internal command \@addtofilelist.
\makeatletter

% \addFileDependency{<file>}
%   #1: file name including its extension.
%   Writes "(<file>)" via \typeout, appends <file> to the \listfiles list,
%   and writes "No file <file>." when <file> does not exist.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% However, in that case latexmk ignores #1 if it is a .aux or .pdf file
% (etc.) that already exists. If the file does not exist, it appears in
% the list of dependents regardless.
%
% Add the file to the list printed by \listfiles. This is not strictly
% necessary, and latexmk does not use it.
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<basename>}
%   #1: path of the external document without extension.
%   Calls \externaldocument on #1 and then registers both #1.tex and #1.aux
%   through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
%------------End of helper code--------------

\myexternaldocument{./el-laham_467}


\title{Deep Gaussian Mixture Ensembles\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Yousef El-Laham}
\author[1]{Niccol\`o Dalmasso}
\author[1]{Elizabeth Fons}
\author[1]{Svitlana Vyetrenko}
% Add affiliations after the authors
\affil[1]{%
    J.P. Morgan AI Research, New York, USA
}  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\appendix

\section{Theoretical Proofs}\label{sec:app-proofs}

This section includes the proofs of the propositions presented in Section 4.3. We have also included the proposition statements for readability purposes.

\begin{proposition} \label{prop:max-lower-bound-app}
Under the assumption that $\pi_i = 1/K$ for $i=1,\ldots,K$,
%$(\pi_1,\ldots,\pi_K)=(1/K, \ldots, 1/K)$,
maximizing the Gaussian mixture data likelihood directly achieves better or equal joint likelihood than maximizing each ensemble member's likelihood separately.
\end{proposition}

\begin{proof}
The EM algorithm maximizes the joint data log-likelihood as defined in equation~\eqref{eq: exact_data_likelihood_max}, which can be lower-bounded in the following way by using Jensen's inequality:

\begin{align*}
    \argmax_{\theta} \mathbb{E}_{X,Y} \left[ \log\left(\sum_{k=1}^K \pi_k p_k(y|x, \theta_k) \right) \right] &\geq \argmax_{\theta} \mathbb{E}_{X,Y} \left[ \sum_{k=1}^K \log\left(\pi_k \right) + \log \left( p_k(y|x, \theta_k) \right) \right] \\
    &= \argmax_{\theta} \sum_{k=1}^K  \mathbb{E}_{X,Y} \left[\log(\pi_k) \right] + \mathbb{E}_{X,Y} \left[ \ell_{\theta_k}(x, y)\right].
\end{align*}

By assumption, the first term is constant (of value $-\log(K)$), hence:

\begin{align*}
    &\argmax_{\theta} \mathbb{E}_{X,Y} \left[ \log\left(\sum_{k=1}^K \pi_k p_k(y|x, \theta_k) \right) \right] \geq \argmax_{\theta} \sum_{k=1}^K \mathbb{E}_{X,Y} \left[ \ell_{\theta_k}(x, y)\right],
\end{align*}

with the lower bound corresponding to maximizing the likelihood of each ensemble member separately, as performed in DEs~\citep{lakshminarayanan2017de}.

\end{proof}



\begin{proposition} \label{prop:em-convergence}
Under Assumptions 4.4--4.7, let the mean and variance in each ensemble model be estimated via a separate 2-layer deep ReLU network from a common feature extraction layer. Then the DGMEs EM algorithm converges to a stationary point that maximizes the data likelihood with high probability.
\end{proposition}


\begin{proof}
Using \citet[Theorem 4.1]{wu1983convergence}, to guarantee convergence of the EM algorithm it is enough to prove that at every round $t$:

\begin{equation} \label{app:diff-q-function}
\forall \theta^{(t)} \notin \mathcal{N}: Q(\theta^{(t+1)}; \theta^{(t)}) - Q(\theta^{(t)}; \theta^{(t)}) > 0,
\end{equation}

where $\mathcal{N}$ is the set of stationary points of the function $Q$. By writing the difference in equation~\eqref{app:diff-q-function} above we have that:

\begin{align*}
Q(\theta^{(t+1)}; \theta^{(t)}) - Q(\theta^{(t)}; \theta^{(t)}) &= \sum_{n=1}^N\sum_{k=1}^K \gamma_{k, n}(\log(\pi_k) + \ell_{\theta^{(t+1)}}(x_n, y_n)) - \sum_{n=1}^N\sum_{k=1}^K \gamma_{k, n}(\log(\pi_k) + \ell_{\theta^{(t)}}(x_n, y_n)) \\
&= \sum_{k=1}^K \left[ \sum_{n=1}^N \gamma_{k,n} \left( \ell_{\theta^{(t+1)}}(x_n, y_n)\right)  - \sum_{n=1}^N \gamma_{k,n} \left(\ell_{\theta^{(t)}}(x_n, y_n) \right) \right].
\end{align*}

By setting $\theta^{(t+1)} = \theta^{*}_k$ and using Assumption 4.4:


\begin{align*}
Q(\theta^{*}_k; \theta^{(t)}) - Q(\theta^{(t)}; \theta^{(t)}) \geq \sum_{k=1}^K \frac{\epsilon_{t,k}}{K} > \epsilon
\end{align*}

The result follows if every ensemble network can learn the maximum likelihood $\theta^*$ at every round. We will show that the above happens with high probability. Without loss of generality, fix the round $t$ and the ensemble member $k$, and suppose the mean and variance functions follow Assumptions 4.5 and 4.6. Let $\ell^* =\ell_{\theta^{*}_k}$ and $\hat{\ell}_\theta$ be the estimated likelihood. As the likelihood is Gaussian, the estimation problem is equivalent to estimating the true mean function $\mu^*(x)$ and variance function $\sigma^*(x)$. Assuming the mean and variance functions are learnt independently by using a pre-trained feature extraction layer, we can break down the estimation problem into:

\begin{align*}
    \| \ell(\mu^*, \sigma^*) - \ell(\widehat{\mu}, \widehat{\sigma})\|_2 &= \| \ell(\mu^*, \sigma^*) \pm \ell(\mu^*, \widehat{\sigma}) - \ell(\widehat{\mu}, \widehat{\sigma})\|_2 \\
    &\leq \underbrace{\| \ell(\mu^*, \widehat{\sigma}) - \ell(\widehat{\mu}, \widehat{\sigma})\|_2}_{(A)} + \underbrace{\| \ell(\mu^*, \sigma^*) - \ell(\mu^*, \widehat{\sigma})\|_2}_{(B)}.
\end{align*}

Provided $n > \mathcal{O}(\log(1/\delta)/ \epsilon^2)$ and using Assumption 4.7 to guarantee non-degenerate weights, the proposition follows since:

\begin{itemize}
    \item[(A)] For the mean function estimation, the likelihood reduces to a weighted least squares loss, which satisfies the assumptions in \citet[2.1]{farrell2021deep}. Hence, one would need at least $n > \mathcal{O}(\log(1/\delta)/ \epsilon)$ samples to estimate the mean function within an $\epsilon/2$ radius and with probability $1 - \delta$;
    \item[(B)] For the variance function estimation, the assumptions correspond to the requirements in \citet[Section 5]{arora2019fine}; hence, one would need at least $n > \mathcal{O}(\log(1/\delta)/ \epsilon^2)$ samples to estimate the variance function within an $\epsilon/2$ radius and with probability $1 - \delta$.
\end{itemize}


\end{proof}


\begin{proposition}
If the weights of each ensemble member are initialized to 0 with fixed bias terms, a single EM step for DGMEs is equivalent to performing DEs.
\end{proposition}

\begin{proof}
    If any ensemble member $f_k$ has all weights initialized to 0, then it follows that $p_{k}(y_n|x_n, \theta_k) = \delta$ for some constant $\delta \in \mathbb{R}$. In addition, $\mu_{\theta_k}(x_n) = \mu$ and $\sigma_{\theta_k}^2 (x_n) = \sigma^2$ for any $x_n$. Hence, in the expectation step all posterior probabilities are equal to:

    \begin{align*}
        \gamma_{k, n} &= \frac{p_{k}(y_n|x_n, \theta_k)P_{\theta}(z_n=k)}{\sum_{j=1}^K p_{j}(y_n|x_n, \theta_j)P_{\theta}(z_n=j)} = \frac{\delta{\cal N}(y_n; \mu, \sigma^2)}{\sum_{j=1}^K \delta{\cal N}(y_n; \mu, \sigma^2)} = \frac{1}{K}.
    \end{align*}

    Hence, the maximization in the M-step is equal to:

    \begin{align*}
        \theta_k^\star &= \argmax_{\theta_k\in\Theta_k} \sum_{n=1}^N \gamma_{k, n}\ell_{\theta_k}(x_n, y_n) = \argmax_{\theta_k\in\Theta_k} \sum_{n=1}^N \ell_{\theta_k}(x_n, y_n),
    \end{align*}

    which corresponds to maximizing the likelihood of each ensemble member separately, as performed in DEs~\citep{lakshminarayanan2017de}.
    
\end{proof}


\section{Additional Experimental Results and Ablation Studies}
\label{sec:exp-supp}

\subsection{Toy Regression}
\label{app: toy_regression}

In this subsection, we provide additional experimental results and ablation studies on the toy regression dataset that provide valuable insights on the role of each of the DGME hyperparameters.

\subsubsection{Ablation: Number of EM Rounds}

To study the effect that the number of EM rounds has on training of DGMEs, Figure \ref{fig: training_improves} shows DGMEs trained with 1, 2 and 5 rounds on the toy regression task with Gaussian noise (Case 1), where the number of epochs per round is fixed to $E=80$. We can see in this figure that after $J=5$ EM rounds, the algorithm has converged to a conditional distribution that represents the ground truth quite well. 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{./figures/training_improves_with_EM_rounds.eps}
    \caption{Results on a toy regression task with Gaussian noise for different numbers of EM rounds, as described in Section \ref{experiments:toy}. As $J$ increases, the predictive mean improves.}
    \label{fig: training_improves}
\end{figure}

Additionally, we can assess the joint impact of the number of epochs $E$ used in the M-Step per EM round and the total number of EM rounds $J$, while keeping the total computational budget constant (e.g., $E\times J =50$ total epochs). We test the following values of $E\in\{1, 2, 5, 10, 25, 50\}$ and report the average NLL over the training set and its corresponding standard error (computed over a total of 10 runs) in Table \ref{tab: em_rounds_ablation}. 

\begin{table}[t]
\centering
\resizebox{\textwidth}{!}{\begin{tabular}{@{}ccccccc@{}}
\toprule
                        & $(E=1, J = 50)$ & $(E=2, J = 25)$ & $(E=5, J = 10)$ & $(E=10, J = 5)$ & $(E=25, J = 2)$ & $(E=50, J = 1)$ \\ \midrule
Normal - Unimodal       & $2.71 \pm 0.06$ & $2.63 \pm 0.06$ & $2.58 \pm 0.03$ & $2.54 \pm 0.01$ & $2.54 \pm 0.01$ & $2.56 \pm 0.03$ \\
Heavy-Tailed - Unimodal & $2.98 \pm 0.03$ & $2.95 \pm 0.02$ & $2.88 \pm 0.02$ & $2.87 \pm 0.01$ & $2.91 \pm 0.01$ & $2.96 \pm 0.02$ \\
Normal - Bimodal        & $3.15 \pm 0.05$ & $3.09 \pm 0.07$ & $3.02 \pm 0.04$ & $3.13 \pm 0.08$ & $3.42 \pm 0.06$ & $3.53 \pm 0.04$ \\ \bottomrule
\end{tabular}}
\caption{Training NLL obtained for the toy regression dataset using DGMEs under different configurations of the number of epochs per EM round $E$ and the total number of EM rounds $J$ for a fixed computational budget $E\times J = 50$. }
\label{tab: em_rounds_ablation}
\end{table}

We can see empirically that for a fixed computational budget, there is a tradeoff between the performance and the effective number of EM rounds. The tradeoff is more apparent when considering the more difficult examples (i.e., heavy-tailed unimodal noise and normal bimodal noise). If the number of epochs in the M-step is $E=1$ and we train for $J=50$ rounds, not enough information is being propagated between the E- and M-Step in each round of training, making learning inefficient. In the other extreme, if the number of epochs per M-step is $E=50$ and we train for $J=1$ round, even if the optimization problem in the M-step is more accurately resolved, we do not run enough EM rounds to accurately learn the underlying conditional density function. If we balance the number of epochs per round and the total number of EM rounds (i.e., $(E=5, J=10)$ or $(E=10, J=5)$), we get much better performance in terms of training NLL.

\subsubsection{Ablation: Dropout and Adversarial Training}
In this ablation, our goal is to understand the effect of epistemic uncertainty estimation techniques in DGMEs. As a rough analysis, Figure \ref{fig: dropout_ablation} shows the effect of training with dropout, adversarial training and their combination. Here, the dropout probability is set to $p_d=0.05$. We can see that without dropout or adversarial training, the uncertainty estimates are well-calibrated for the training data (features taking values between $-4$ and $4$), but are underestimated for the test data (features taking absolute values between $4$ and $5$). By incorporating dropout and adversarial training, we can see that the uncertainty estimates become larger for the test examples.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{./figures/effects_of_dropout_and_AT.eps}
    \caption{Results on a toy regression task with Gaussian noise. The leftmost plot corresponds to the standard setup of DGMEs trained with $K=5$ networks. The second plot corresponds to incorporating dropout in the training. The third plot shows the effect of using adversarial training, and the final plot shows the effect of using both dropout and adversarial training.}
    \label{fig: dropout_ablation}
\end{figure}

To get a better understanding of the effect of dropout probability $p_d$ on the quantified uncertainty, we can evaluate the train and test NLL for different values of $p_d$ for each of the toy datasets. Results are shown in Table \ref{tab: dropout_ablation}. From this table, we observe that dropout creates a trade-off between performance on in-sample data and out-of-sample data in terms of NLL. Increasing the dropout probability in this case causes the average NLL to be worse for the training set, but improves it (up to a certain point) on the test set. In practice, we can choose the dropout probability to minimize the NLL on a validation set.

\begin{table}[t]
\resizebox{\textwidth}{!}{\begin{tabular}{@{}ccccccccccc@{}}
\toprule
                        & \multicolumn{2}{c}{$p_d = 0.0$}    & \multicolumn{2}{c}{$p_d = 0.05$}   & \multicolumn{2}{c}{$p_d = 0.1$}    & \multicolumn{2}{c}{$p_d = 0.15$}   & \multicolumn{2}{c}{$p_d = 0.2$}    \\ \midrule
                        & Train NLL       & Test NLL         & Train NLL       & Test NLL         & Train NLL       & Test NLL         & Train NLL       & Test NLL         & Train NLL       & Test NLL         \\ \midrule
Normal - Unimodal       & $2.55 \pm 0.01$ & $7.50 \pm 0.88 $ & $2.59 \pm 0.01$ & $4.86 \pm 0.28 $ & $2.63 \pm 0.01$ & $4.30 \pm 0.12 $ & $2.66 \pm 0.01$ & $4.17 \pm 0.08 $ & $2.70 \pm 0.02$ & $4.10 \pm 0.06 $ \\
Heavy-Tailed - Unimodal & $2.87 \pm 0.01$ & $6.31 \pm 0.49 $ & $2.90 \pm 0.01$ & $4.83 \pm 0.17 $ & $2.93 \pm 0.01$ & $4.44 \pm 0.15 $ & $2.97 \pm 0.02$ & $4.23 \pm 0.08 $ & $2.99 \pm 0.02$ & $4.19 \pm 0.06 $ \\
Normal - Bimodal        & $3.16 \pm 0.09$ & $6.81 \pm 1.08 $ & $3.18 \pm 0.08$ & $5.80 \pm 0.37 $ & $3.29 \pm 0.06$ & $5.44 \pm 0.24 $ & $3.31 \pm 0.07$ & $5.32 \pm 0.20 $ & $3.36 \pm 0.05$ & $5.34 \pm 0.09 $ \\ \bottomrule
\end{tabular}}
\caption{Train and test NLL of DGMEs for each toy regression dataset under different dropout probability values.}
\label{tab: dropout_ablation}
\end{table}

\subsubsection{Ablation: Number of Mixture Components}
The number of components in the assumed Gaussian mixture impacts how well the model can estimate more complex noise distributions (e.g., heavy-tailed or bimodal distributions). Gaussian mixtures (with infinite components) are universal approximators to smooth continuous density functions, so the more components assumed, the more flexible the model is. When choosing the number of mixture components, one should take into consideration the complexity of the data generating process and the amount of data in the training set. If the data generating process is known to be Gaussian, then choosing a large number of components is not beneficial. On the other hand, if the data generating process is thought to be multimodal, then using more components is the better choice. We can see this in the following two ablation studies.

Figure \ref{fig: components_ablation_heavy_tailed} shows the effect of the number of mixture components $K$ on the kurtosis of the learned predictive distribution. We observe that with more mixture components, the model learns a fatter-tailed distribution. This makes sense since a Student-t distribution can be viewed as a Gaussian mixture with an infinite number of components with different variances. 

\begin{figure}[h]
    \centering
    \includegraphics[width=\linewidth]{./figures/effects_of_components_heavy_tailed.eps}
    \caption{Effect of the number of mixtures on the learned kurtosis of the predictive distribution under heavy-tailed noise. }
    \label{fig: components_ablation_heavy_tailed}
\end{figure}

Figure \ref{fig: components_ablation_bimodal} shows the effect of the number of mixture components on the learned predictive distribution in the case of the bimodal Gaussian. We can see that when DGMEs assume only $K=1$ mixture component, DGMEs have a similar predictive distribution as DEs, since the model will attempt to explain the bimodal data with a single Gaussian by overestimating the aleatoric noise. An interesting insight is that when DGMEs assume too many components (i.e., $K>2$), the model is still able to accurately learn that the underlying predictive distribution is bimodal. 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{./figures/effects_of_components_bimodal.eps}
    \caption{Effect of the number of mixture components on the learned predictive distribution under bimodal noise.}
    \label{fig: components_ablation_bimodal}
\end{figure}

\subsubsection{Ablation: Weight Initialization Schemes and Data Standardization}

To test the impact of weight initialization of the neural network on the performance of DGME, we perform the following ablation study: we train a DGME for 5 rounds, where 10 epochs are used to resolve the M-Step in each round. We use the same architecture as in our toy experiments. We evaluate the NLL on the training set under five different initializations: PyTorch default initialization, initialization with uniform distribution with bounds $-0.01$ to $0.01$, initialization with normal distribution with mean 0 and standard deviation $10^{-6}$, Xavier uniform initialization~\citep{glorot2010understanding}, and Xavier normal initialization. As a note, the PyTorch default initialization for a linear layer is done via a uniform distribution ${\cal U}(-\frac{1}{\sqrt{a}}, \frac{1}{\sqrt{a}})$, where $a$ denotes the number of input features to the linear layer. Please refer to \citet{glorot2010understanding} for more information on these weight initialization schemes.

We train the model for each toy dataset over 20 total runs and report the average training NLL and its corresponding standard error. We run this ablation twice: once for training with non-standardized data and once for training with standardized data. The results are shown in Table \ref{tab: init_unstandardized} and Table \ref{tab: init_standardized}. We can see from the results that although weight initialization has some impact on the results, if the data is standardized, it becomes less important. We also see that across all datasets, the default PyTorch initialization gives the most favorable results for both non-standardized and standardized data.

\begin{table}[t]
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule
                        & PyTorch Default & Uniform         & Normal          & Xavier Uniform  & Xavier Normal   \\ \midrule
Normal - Unimodal       & $2.86 \pm 0.07$ & $3.08 \pm 0.04$ & $3.02 \pm 0.04$ & $2.88 \pm 0.05$ & $2.90 \pm 0.05$ \\
Heavy-Tailed - Unimodal & $3.16 \pm 0.05$ & $3.42 \pm 0.06$ & $3.36 \pm 0.05$ & $3.18 \pm 0.05$ & $3.20 \pm 0.05$ \\
Normal - Bimodal        & $3.36 \pm 0.17$ & $3.56 \pm 0.06$ & $3.61 \pm 0.06$ & $3.46 \pm 0.20$ & $3.47 \pm 0.20$ \\ 
\bottomrule
\end{tabular}
\caption{Impact of different weight initialization schemes on the train NLL when the data is not standardized. }
\label{tab: init_unstandardized}
\end{table}

\begin{table}[t]
\centering
\begin{tabular}{@{}clllll@{}}
\toprule
                        & \multicolumn{1}{c}{PyTorch Default} & \multicolumn{1}{c}{Uniform} & \multicolumn{1}{c}{Normal} & \multicolumn{1}{c}{Xavier Uniform} & \multicolumn{1}{c}{Xavier Normal} \\ \midrule
Normal - Unimodal       & $2.55 \pm 0.02$                     & $2.55 \pm 0.01$             & $2.56 \pm 0.01$            & $2.54 \pm 0.01$                    & $2.53 \pm 0.01$                    \\
Heavy-Tailed - Unimodal & $2.87 \pm 0.02$                     & $2.88 \pm 0.01$             & $2.88 \pm 0.01$            & $2.86 \pm 0.01$                    & $2.87 \pm 0.01$                    \\
Normal - Bimodal        & $3.13 \pm 0.07$                     & $3.63 \pm 0.01$             & $3.60 \pm 0.04$            & $3.27 \pm 0.09$                    & $3.24 \pm 0.09$                   \\ 
\bottomrule
\end{tabular}
\caption{Impact of different weight initialization schemes on the train NLL when the data is standardized. }
\label{tab: init_standardized}
\end{table}

\subsubsection{Illustrative Results: Additive Gaussian Noise}

We compare DGMEs with the baselines on the toy regression dataset with Gaussian noise. Figure \ref{fig: sota_comparison_gauss} shows the performance of DGMEs compared to MDNs, MCD and DEs. DGMEs have comparable performance to MCD and DEs and outperform MDNs.

\begin{figure}[h]
    \centering
    \includegraphics[width=\linewidth]{./figures/sota_comparison_gauss.eps}
    \caption{Performance on a toy regression task with Gaussian noise of DGMEs (right) compared with state-of-the-art methods MDNs, MCD and DEs.}
    \label{fig: sota_comparison_gauss}
\end{figure}




\subsection{Regression on Real Datasets} \label{app: regression}

For real data experiments on a regression task we use the following datasets: (a) Boston Housing dataset\footnote{\url{https://www.kaggle.com/datasets/schirmerchad/bostonhoustingmlnd}}, (b) Concrete compressive strength dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength}} \citep{yeh1998modeling}, (c) Energy efficiency dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/energy+efficiency}} \citep{tsanas2012accurate}, (d) Kinematics of an 8-link robot arm dataset\footnote{\url{https://www.openml.org/search?type=data&sort=runs&id=189&status=active}}, (e) 
Combined cycle power plant dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/combined+cycle+power+plant}} \citep{tufekci2014prediction}, (f) Wine dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/wine}} and (g) Yacht hydrodynamics dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/Yacht+Hydrodynamics}}.

In the main text, to provide a fair comparison with techniques that assume the conditional distribution of the data is Gaussian, we summarize the mixture distribution output in both MDNs and DGMEs into a single Gaussian and then evaluate the NLL. This is analogous to the way DEs compute the NLL. We also provide additional results for the test NLL under the assumption of a mixture of Gaussians in Table \ref{tab:regression_experiments_NLL_mixG} below.


\begin{table*}[t]
    \centering
    \caption{Test NLL for the regression experiments in the mixture of Gaussians case.}
    \label{tab:regression_experiments_NLL_mixG}
    \resizebox{\textwidth}{!}{
    \begin{tabular}{lccccccc}
    \toprule
    \multicolumn{8}{c}{\textbf{\textsc{Test NLL (Mixture of Gaussians)}}}\\
    \midrule
    Dataset                         &            MDNs &           MCD &            DEs &    DGMEs (J=1) &    DGMEs (J=2) &    DGMEs (J=5) &  DGMEs (J=10) \\
    \midrule
    Boston housing                  &    2.71 $\pm$ 0.45 &  2.46 $\pm$ 0.25  &   2.41 $\pm$ 0.25 &  \bf 2.33 $\pm$ 0.18 & \bf  2.33 $\pm$ 0.23 &   2.51 $\pm$ 0.33 &    2.74 $\pm$ 0.53 \\
    Concrete                        &    3.04 $\pm$ 0.22 &   3.04 $\pm$ 0.09 &   3.06 $\pm$ 0.18 &   3.03 $\pm$ 0.10 &   2.99 $\pm$ 0.14 &   2.97 $\pm$ 0.24 & \bf 2.94 $\pm$ 0.22 \\
    Energy                          &  \bf  0.70 $\pm$ 0.17 &   1.99 $\pm$ 0.09 &   1.38 $\pm$ 0.22 &   1.56 $\pm$ 0.14 &   1.31 $\pm$ 0.12 &   0.96 $\pm$ 0.20 &  0.92 $\pm$ 0.48 \\
    Kin8nm                          &  -1.17 $\pm$ 0.04  &  -0.95 $\pm$ 0.03 &  -1.20 $\pm$ 0.02 &  -1.20 $\pm$ 0.02 &  -1.23 $\pm$ 0.03 & \bf -1.24 $\pm$ 0.02 &  \bf  -1.24 $\pm$ 0.02 \\
    Power plant                     &  \bf  2.74 $\pm$ 0.04 &   2.80 $\pm$ 0.05 &   2.79 $\pm$ 0.04 &   2.81 $\pm$ 0.03 &   2.79 $\pm$ 0.03 &   2.77 $\pm$ 0.02 &     2.75 $\pm$ 0.02 \\
    % Protein                         &            NaN &   2.89 $\pm$ 0.01 &   2.83 $\pm$ 0.02 &           NaN &           NaN &           NaN &          NaN \\
    Wine                            &   0.43 $\pm$ 0.86 &   0.93 $\pm$ 0.06 &   0.94 $\pm$ 0.12 &   0.93 $\pm$ 0.12 &   0.90 $\pm$ 0.09 &   0.81 $\pm$ 0.11 &    \bf 0.18 $\pm$ 0.39 \\
    Yacht                           &   0.51 $\pm$ 0.37 &   1.55 $\pm$ 0.12 &   1.18 $\pm$ 0.21 &   0.94 $\pm$ 0.19 &   0.66 $\pm$ 0.18 &  0.51 $\pm$ 0.23 & \bf    0.42 $\pm$ 0.22 \\
    \bottomrule
    \end{tabular}
    }
\end{table*}


\subsection{Hyperparameter Tuning for Financial Forecasting}
We tuned the hyperparameters of the architecture (number of LSTM layers, number of fully-connected layers, number of LSTM hidden units, number of hidden units in fully-connected layers), optimization procedure (weight decay and learning rate), and the uncertainty quantification associated parameters (dropout probability, and homoscedastic variance value for MCD and MultiSWAG) for each of the approaches using cross-validation. We note that all methods use the same feature extractor (LSTM and fully-connected network), which is obtained by hyperparameter tuning each dataset to a single network. To hyperparameter tune, we took the full training period and split it into an ordered sequence of a 90\% training period and a 10\% validation period. We select the hyperparameters based on the combination that minimizes the NLL on the validation period for each dataset.


\section{Possible Extension to Classification Tasks}
\label{app: ood_detection}
Techniques like MDNs and DGMEs are not suited for dealing with classification tasks, since the output of both models is a mixture of Gaussian distributions. For classification tasks, we instead can consider a mixture of categorical distributions, rather than a mixture of Gaussian distributions.  In particular, the conditional distribution $p_{\theta}(y|x)$ is given by
\begin{equation*}
    p_{\theta}(y|x) = \sum_{k=1}^K \pi_k \prod_{i=1}^{d_y} p_{\theta_k}^i(x)^{\mathbb{I}(y=i)},
\end{equation*}
where $p_{\theta_k}^i(x)$ denotes the probability that $y$ belongs to the $i$-th class according to the $k$-th mixture. In this case, we assume MDNs and DGMEs output these probabilities rather than the mean and variance parameterizing a Gaussian distribution.  
\subsection{Entropy Calculation}
To evaluate uncertainty in classification tasks, we consider the average predictive entropy as the metric. To compute the average predictive entropy for a sample $x$, we use the following estimate:
\begin{equation*}
    \widehat{\rm Ent}(x) = -\frac{1}{M} \sum_{m=1}^M \sum_{i\in{\cal C}}\tilde{p}_{(m)}^{i}(x) \log \tilde{p}_{(m)}^{i}(x),
\end{equation*}
where $\tilde{p}_{(m)}^i(x)$ denotes the probability of class $i$ according to the $m$-th sample from the predictive distribution and ${\cal C}$ denotes the set of classes. For both MDNs and DGMEs, these samples are obtained by the following procedure:
\begin{align*}
    k^{(m)} &\sim {\rm Categorical}(\pi_1,\ldots,\pi_K), \\
    \tilde p_{(m)}^i &= p^i_{\theta_{k^{(m)}}}.
\end{align*}
Note that we incorporate dropout in the training procedure of MDNs and DGMEs for this experiment by applying a stochastic forward pass to the sampled network $k^{(m)}$.
\subsection{Example: Uncertainty Evaluation on MNIST}
As an example, we compare DGMEs' ability to reason about the underlying uncertainty of new samples with the baseline approaches with regard to the MNIST handwritten digits dataset. Specifically, for each method, we train an MLP network with 3 hidden layers and 200 hidden units per layer with ReLU activations on the MNIST dataset, including only digits 0-3 and 5-9. After the models are trained, we evaluate the average predictive entropy over three different datasets: the training dataset (known classes), a dataset containing only the digit 4 (unknown classes), and the Fashion-MNIST dataset (unrelated data). We use $M=100$ samples from the predictive distribution to form an estimate of the predictive entropy for each method. The computation of the average predictive entropy is described in the preceding subsection on entropy calculation. The results for this experiment are shown in Table \ref{tab: ood_results}, which are averaged over 10 independent runs of each method. The results indicate that DGMEs are able to appropriately reason about the uncertainty in each of the datasets and are competitive with the baseline approaches in each case. DGMEs appropriately obtain the lowest entropy on the training dataset (i.e., the digits they were trained on), obtain a slightly higher entropy on the MNIST dataset containing unknown classes, and the highest entropy on the Fashion-MNIST dataset, which contains examples unrelated to the original classification task.

\begin{table*}[!t]
\centering
\begin{tabular}{lcccc}
\hline
\multicolumn{5}{c}{\textbf{\textsc{Average Predictive Entropy}}}                                                  \\ \hline
Dataset         & MDNs               & MCD               & DEs                & DGMEs              \\ \hline
MNIST (Known)   & $0.019 \pm 0.005$ & $0.012 \pm 0.003$ & $0.012 \pm 0.002$ & $0.015 \pm 0.002$ \\
MNIST (Unknown) & $0.192 \pm 0.032$ & $0.180 \pm 0.020$ & $0.180 \pm 0.020$ & $0.193 \pm 0.016$ \\
Fashion-MNIST   & $0.663 \pm 0.110$ & $0.714 \pm 0.140$ & $0.706 \pm 0.067$ & $0.698 \pm 0.057$ \\
\hline
\end{tabular}
\caption{Average predictive entropy for classification datasets. DGMEs are able to appropriately reason about the underlying uncertainty of OOD samples (MNIST with unknown classes and Fashion-MNIST) and are competitive with state-of-the-art approaches.}
\label{tab: ood_results}
\end{table*}

\section{Comparison of Uncertainty Quantification Approaches}
Here, we provide an overall comparison of the benchmarks used in the experiments of this work with the proposed approach along several dimensions: the likelihood assumption, whether or not mixture weights are learned, how aleatoric uncertainty is quantified, and how epistemic uncertainty is quantified. This comparison is provided in Table~\ref{tab: compare_methods}.

\begin{table}[t]
\centering
\resizebox{\textwidth}{!}{\begin{tabular}{@{}lp{1in}p{1in}p{1.3in}p{1.35in}p{1.35in}@{}}
\toprule
\textbf{Method} &
  \textbf{Likelihood} &
  \textbf{Mixture Weights} &
  \textbf{Aleatoric Uncertainty} &
  \textbf{Epistemic Uncertainty} &
  \textbf{Other Notes} \\ \midrule
MDNs &
  Mixture of Gaussians &
  Learned and input dependent &
  Heteroscedastic &
  None in original implementation, but dropout is applied for fair comparison in this implementation &
  Off-the-shelf methods can be applied to account for epistemic uncertainty (e.g., dropout, Laplace approximation, SWAG, variational Bayes) \\
  \midrule
MCD &
  Gaussian &
  Each prediction made via a stochastic forward pass at test time is equally weighted. &
  Homoscedastic &
  Dropout &
   \\
   \midrule
DEs &
  Gaussian &
  Assumed uniform &
  Heteroscedastic &
 Adversarial training and weight initialization. Dropout is also applied in this implementation using hyperparameter optimization. &
   \\
   \midrule
MultiSWAG &
  Gaussian &
  Assumed uniform &
  Homoscedastic &
  Stochastic weight averaging Gaussian (SWAG) &
  One can also account for heteroscedasticity by applying SWAG training to a deep ensemble that outputs a mean and variance \\
  \midrule
DGMEs &
  Mixture of Gaussians &
  Learned and independent of input &
  Heteroscedastic &
  Dropout in this implementation &
  Other methods to account for epistemic uncertainty can be used off-the-shelf (e.g., Laplace approximation, SWAG, variational Bayes) \\ \bottomrule
\end{tabular}}
\caption{Summary of benchmarks as compared to DGMEs.}
\label{tab: compare_methods}
\end{table}

\section{Sampling from the Predictive Distribution}
To understand how sampling from the predictive distribution works in DGMEs, we begin with the standard formula for the predictive distribution in Bayesian models:
\begin{equation*}
    p(y|x, {\cal D}) = \int_{\Theta} p_{\theta}(y|x)p(\theta|{\cal D}) d\theta.
\end{equation*}
In the case of DGMEs, $p_{\theta}(y|x)$ is a mixture of Gaussian distributions and $p(\theta|{\cal D})$ is approximated via dropout.  An important property of the predictive distribution in the case of mixture distributions is that it can be expressed as a mixture of predictive distributions. This property can be derived as follows:
\begin{align*}
    \label{eq: mixture_predictive}
    p(y|x, {\cal D}) &= \int_{\Theta} p_{\theta}(y|x)p(\theta|{\cal D})d\theta \\
    &= \int_{\Theta} \left(\sum_{k=1}^K \pi_k p_k(y|x, \theta_k)\right) p(\theta|{\cal D}) d\theta \\
    &= \int_{\Theta} \sum_{k=1}^K \pi_k p_k(y|x, \theta_k)p(\theta|{\cal D}) d\theta \\
    &= \sum_{k=1}^K \pi_k \int_{\Theta} p_k(y|x, \theta_k)p(\theta|{\cal D}) d\theta \\
    &= \sum_{k=1}^K \pi_k \int_{\Theta_k} p_k(y|x, \theta_k) \underbrace{\left(\int_{\Theta_{-k}} p(\theta|{\cal D}) d\theta_{-k}\right)}_{p(\theta_k|{\cal D})} d\theta_k \\
    &= \sum_{k=1}^K \pi_k \int_{\Theta_k} p_k(y|x, \theta_k) p(\theta_k|{\cal D}) d\theta_k.
\end{align*}
Since $p_k(y|x, {\cal D}) = \int_{\Theta_k} p_k(y|x, \theta_k)p(\theta_k|{\cal D})d\theta_k$, we obtain the following expression for the predictive distribution:
\begin{equation*}
    p(y|x, {\cal D}) = \sum_{k=1}^K \pi_k p_k(y|x, {\cal D}).
\end{equation*}
This form implies that we can draw samples from the predictive distribution in DGMEs using the following procedure:
\begin{enumerate}
    \item Sample the mixture component:
    \[ k \sim {\rm Categorical}(\pi_1,\ldots,\pi_K). \]
    \item Sample the posterior parameters of the given mixture component. In this work, dropout was used to approximate each posterior $p(\theta_k|{\cal D})$:
    \begin{align*}
    a_{k, i} &\sim {\rm Bernoulli}(p_d), \quad i=1,\ldots, d_{\theta}, \\
    \theta_k &= a_k \odot \theta_k^\star.
    \end{align*}
    \item Draw the sample of $y$ from the appropriate predictive distribution:
    \[ y \sim p_k(y|x, \theta_k). \]
\end{enumerate}

% \subsubsection*{Disclaimer}

\paragraph{Disclaimer}
This paper was prepared for informational purposes by the Artificial Intelligence Research group of JPMorgan Chase \& Co. and its affiliates (``JP Morgan''), and is not a product of the Research Department of JP Morgan. JP Morgan makes no representation and warranty whatsoever and disclaims all liability, for the completeness, accuracy or reliability of the information contained herein. This document is not intended as investment research or investment advice, or a recommendation, offer or solicitation for the purchase or sale of any security, financial instrument, financial product or service, or to be used in any way for evaluating the merits of participating in any transaction, and shall not constitute a solicitation under any jurisdiction or to any person, if such solicitation under such jurisdiction or to such person would be unlawful.


\bibliography{references}

\end{document}
