%%%%%%%% ICML 2021 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{uai2022}
\usepackage{bm}
% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2018

% ready for submission
% \usepackage{neurips_2018}

% to compile a preprint version, e.g., for submission to arXiv, add add the
% [preprint] option:
  %  \usepackage[preprint]{neurips_2018}

% to compile a camera-ready version, add the [final] option, e.g.:csss

%\usepackage{single_col_aistats}

% to avoid loading the natbib package, add option nonatbib:
%     \usepackage[nonatbib]{neurips_2018}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{xr}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{bm}
\usepackage{comment}
\usepackage{wrapfig}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{floatrow}
\usepackage{subcaption}
\usepackage{bbold}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{natbib}
\newtheorem{theorem}{{\bf Theorem}}
\newtheorem{lemma}{{\bf Lemma}}
\newtheorem{proposition}{{\bf Proposition}}
\newtheorem{remark}{{\bf Remark}}
\newtheorem{corollary}{{\bf Corollary}}
\newtheorem{definition}{{\bf Definition}}
\newtheorem{assumption}{Assumption}
\newcommand{\E}{\mathop{\mathbb{E}}}
\newcommand{\tr}{\mathop{\rm tr}}
\newcommand{\clamp}{\operatorname{clamp}}
\usepackage{algorithm} 
\usepackage{array}
\usepackage{wrapfig}
\usepackage{multirow}
\usepackage{tabularx}

%\usepackage{algpseudocode} 
\newfloatcommand{capbtabbox}{table}[][\FBwidth]
\makeatletter
\def\BState{\State\hskip-\ALG@thistlm}

\newcommand*{\addFileDependency}[1]{% argument=file name and extension
	\typeout{(#1)}
	\@addtofilelist{#1}
	\IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

\newcommand*{\myexternaldocument}[2][]{%
	\externaldocument[#1]{#2}%
	\addFileDependency{#2.tex}%
	\addFileDependency{#2.aux}%
	\addFileDependency{#2.bbl}%
}

%\myexternaldocument{uai2022-submission}

\begin{document}
\bibliographystyle{plainnat}

\onecolumn
\title{Modeling Extremes with $d$-max-decreasing Neural Networks (Appendix)}
\maketitle
\appendix
% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2021
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
% \icmlsetsymbol{equal}{*}

% \begin{icmlauthorlist}
% \icmlauthor{Aeiau Zzzz}{equal,to}
% \icmlauthor{Bauiu C.~Yyyy}{equal,to,goo}
% \icmlauthor{Cieua Vvvvv}{goo}
% \icmlauthor{Iaesut Saoeu}{ed}
% \icmlauthor{Fiuea Rrrr}{to}
% \icmlauthor{Tateu H.~Yasehe}{ed,to,goo}
% \icmlauthor{Aaoeu Iasoh}{goo}
% \icmlauthor{Buiui Eueu}{ed}
% \icmlauthor{Aeuia Zzzz}{ed}
% \icmlauthor{Bieea C.~Yyyy}{to,goo}
% \icmlauthor{Teoau Xxxx}{ed}
% \icmlauthor{Eee Pppp}{ed}
% \end{icmlauthorlist}

% \icmlaffiliation{to}{Department of Computation, University of Torontoland, Torontoland, Canada}
% \icmlaffiliation{goo}{Googol ShallowMind, New London, Michigan, USA}
% \icmlaffiliation{ed}{School of Computation, University of Edenborrow, Edenborrow, United Kingdom}

% \icmlcorrespondingauthor{Cieua Vvvvv}{c.vvvvv@googol.com}
% \icmlcorrespondingauthor{Eee Pppp}{ep@eden.co.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
% \icmlkeywords{Machine Learning, ICML}

% \vskip 0.3in


% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
% \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.
%\section{Architecture Normalization}
\section{Background on Extreme Value Theory}
\label{sec:bg}
The main idea behind extreme value theory (EVT) is to establish a form of the central limit theorem for the  maxima of appropriately scaled random variables.
EVT characterizes the behavior of the maxima of $n$ independent and identically distributed (i.i.d.) random variables $X_1, \cdots, X_n$ with continuous distribution function $F$.
More precisely, let $M_n = \max_{1 \leq i \leq n} X_i$. If there exist sequences of real numbers $a_n > 0$ and $b_n$ such that $\mathbb{P} \left[ \left(M_n - b_n\right) / a_n \leq x\right] \to H(x)$ as $n \to \infty$ for a non-degenerate distribution function $H$,
then we say that $F$ is in the maximum domain of attraction of $H$, or equivalently $F \in \text{MDA}(H)$.
This limit is fully identified by the generalized extreme value (GEV) distribution given by:
\begin{equation}
    \label{gev}
\begin{split}
H_{\xi}(x) = \left\{\begin{matrix}
\exp \left( -(1+\xi x)^{-1/\xi}\right), & \text{if } \xi \neq 0 \\ 
\exp{(-e^{-x})}, & \text{if } \xi=0 
\end{matrix}\right.
\end{split}
\end{equation}
where $1+\xi x > 0$ and $\xi$ is the shape parameter indicating the thickness of the tail. 
The following theorem, due to Fisher, Tippett and Gnedenko and stated in~\cite{dehaan_book_supp}, gives the fundamental result of EVT. 
\begin{theorem}[Fisher--Tippett--Gnedenko Theorem, stated in~\cite{dehaan_book_supp}]
 If $F \in \text{MDA}(H)$, and if the limit $H$ exists then it belongs to the class of GEV distributions, i.e. $H = H_{\xi}$ for some real number $\xi$. 
\end{theorem}
\subsection{Main Definitions and Theorems}
\begin{theorem}[Extreme value copula]
If $C$ is a $d-$variate extreme value copula then there exists a tail dependence function $\ell: [0, \infty)^d \to [0, \infty)$ such that:
\begin{equation}
    C \left(u_1, \cdots, u_d \right) = e^{ -\ell\left(-\log u_1, \cdots, -\log u_d \right)}, 
\end{equation}
where $\left(u_1, \cdots,u_d \right) \in (0, 1]^d$.
Using the homogeneity property of $\ell$, the extreme value copula $C$ can be rewritten as: 
\begin{equation}
\label{eq:pickands_copula_supp}
    \begin{split}
    C \left(u_1, \cdots, u_d \right)  = e^{ \left( \sum_{k=1}^d \log u_k\right) A\left(\frac{\log u_1}{\sum_{k=1}^d \log u_k}, \cdots, \frac{\log u_d}{\sum_{k=1}^d \log u_k} \right)  },
    \end{split}
\end{equation}
where $A$ is known as the Pickands dependence function, which can be thought of as the restriction of $\ell$ to the unit simplex $\Delta_{d-1} = \{ \mathbf{w}=\left( w_1, \cdots, w_d\right) \in [0, \infty)^d: \sum_{k=1}^d w_k=1 \}$. The Pickands function $A$ is known to be d-max-decreasing and satisfies: \vspace{-0.2cm}
\begin{equation}\label{pickandsbounds}
\max_{1 \leq k  \leq d }  w_k \leq A(w_1, \cdots, w_d) \leq 1 ,
\end{equation}
for all $\mathbf{w} = \left(w_1, \cdots, w_d\right) \in \Delta_{d-1}$. 
%These properties characterize Pickands dependence functions for $d=2$ but not necessarily for general $d$ \cite{segers_copulas}.
\end{theorem}
\begin{definition}[Tail dependence function]
A function $\ell: [0, \infty)^d \to [0, \infty)$ is a tail dependence function if for all $\left(x_1, \cdots, x_d \right) \in [0, \infty)^d$, the following conditions are satisfied:
\begin{itemize}
    \item (i) $\ell$ is d-max-decreasing and homogeneous of order $1$, i.e. $\ell(cx_1, \cdots, c x_d) = c   \,\ell(x_1, \cdots, x_d)$, for all $c > 0$. 
    \item (ii) $\max_{ 1 \leq k \leq d} x_k \leq \ell(x_1, \cdots, x_d) \leq \sum_{k=1}^d x_k$.
\end{itemize}
\label{def:stdf}
\end{definition}
%\subsection{Pickands dependence and max-stable processes}
\begin{comment}
Here we introduce the notion of a max-stable process, which is intrinsically related to the Pickands dependence and the spectral density.
%\subsubsection{Definition of max-stable processes}
%We initially review the definition of a max-stable process.
\begin{definition}[Max-Stable Processes \citep{de1984spectral}]
Let $X_1(t), X_2(t), \cdots, X_n(t)$ be independent copies of a continuous stochastic process on a compact set $\mathcal{T}$. 
If there exists normalizing functions $a_n(t) > 0$ and $b_n(t) \in \mathbb{R}$ for all $t \in \mathcal{T}$ such that the limit
$
 \frac{ \max_{1 \leq i \leq n} X_i(t) - b_n(t) } { a_n(t) } \to M(t) \:\: \text{as} \:\: n \to \infty
$
is non degenerate then the limiting process $M(t)$ is a max-stable process.
\end{definition}

A key property of max-stable processes is that any univariate marginal is distributed as a GEV distribution and any subset of marginals is distributed as a MEV distribution. 
This motivates the relationship to MEV distributions parameterized by Pickands dependence functions.
We then consider this analysis with respect to stationary max-stable processes and the ensuing spectral decomposition of stationary max-stable processes.
\end{comment} 
\subsection{Spectral Decomposition of Stationary Max-Stable Processes}
Stationary max-stable processes can be intuitively interpreted as i.i.d. samples from infinite dimensional extreme value distributions (i.e. distributions over functions).
A stationary max-stable process can be decomposed by the spectral representation defined in \cite{de1984spectral} which we recall in Proposition~\ref{prop:spectral}. 
\begin{proposition}[Spectral Representation of Max-Stable Processes \citep{de1984spectral}]
\label{prop:spectral}
Suppose that $M(t)$ has unit Fr\'echet margins and is stationary.
Then, $M(t)$ can be written as:
\begin{equation}
    M(t) = \max_{i \geq 1 } \xi_i Y_i^{+}(t), \quad t \in \mathcal{T}.
\label{eqn:spectral_maxstable}
\end{equation}
$\{Y_i(t) \}_{i \geq 1}$ are i.i.d. copies of a continuous stochastic process $Y$ defined on $\mathcal{T}$ such that $\mathbb{E}[Y^{+}(t)]=1$ with $Y^{+}(t) = \max \{0, Y(t) \}$ and $\xi_i$ is the $i^\text{th}$ realization of an independent Poisson point process on $[0, \infty)$ with intensity $\xi^{-2} d \xi$.
\end{proposition}

\section{d-max-decreasing Functions}
\label{sec:fdmd}
We use the definition given in \cite{hofmann2009characterization_supp}.
A function $A(\mathbf{w}) : \mathbb{R}^d \to \mathbb{R}$ is d-max-decreasing if and only if for any $\mathbf{x} \leq \mathbf{y} \leq 0$ and any subset $E \subsetneq \{1, \ldots, d\}$:
$$
\sum_{\substack{\mathbf{m} \in \{0,1\}^d,\\ m_j = 1 \text{ if } j \in E}} \left [(-1)^{d + 1 -\sum_{j \leq d} m_j} \left ( \sum_{j \leq d} -y_j^{m_j} x_j^{1-m_j}\right) A \left (\frac{y_1^{m_1} x_1^{1-m_1}}{\sum_{j \leq d} y_j^{m_j} x_j^{1-m_j}}, \cdots, \frac{y_d^{m_d} x_d^{1-m_d}}{\sum_{j \leq d} y_j^{m_j} x_j^{1-m_j}} \right)\right ] \geq 0, 
$$
and $A(\mathbf{e}_i) = 1$ where $\mathbf{e}_i$ is the $i^\text{th}$ canonical basis vector. 
Moreover, from \citet[Theorem 3.1.1]{hofmann2009characterization_supp}, the following three characterizations are equivalent:
\begin{enumerate}
    \item The function $$\exp\left(-\sum_{j=1,\ldots,d}x_j A\left(\frac{x_1}{\sum_{j=1,\ldots,d}x_j}, \cdots, \frac{x_d}{\sum_{j=1,\ldots,d}x_j}\right)\right)$$ defines a multivariate extreme value distribution; 
    \item There exists a spectral measure $\Lambda$ such that $$A(\mathbf{w}) = \int_{\Delta_{d-1}} \max_{k=1,\ldots,d} w_k s_k \, \mathrm{d} \Lambda(\mathbf{s}), \quad \mathbf{w} \in \Delta_{d-1};$$
    \item $A(\mathbf{w})$ is $d$-max-decreasing.
\end{enumerate}
Next, we note the nesting property of $d$-max-decreasing functions, given in \citet[Section 2.2]{hofert2018hierarchical_supp}, that hierarchies of spectral measures define valid EVDs, i.e.
$$
A(\mathbf{w}) = \mathbb{E}\left [ \max \left\{ s^{(2,1)}_1 \mathbb{E}\left[ \max_{k=1,\ldots,d} s^{(1,1)}_k w_k \right], \ldots,  s^{(2,1)}_d\mathbb{E}\left[ \max_{k=1,\ldots,d} s^{(1,d)}_k w_k \right] \right\} \right].
$$
We use part of this property in the next section to define the $d$-max neural network. 

\section{Proofs}
\label{sec:proofs}
\subsection{d-Max-Decreasing Neural Networks}
\label{sec:proof_arch}

We partition the proof into the 1-layer case and the $n$-layer case. 
We assume that all parameters $\theta \in [0,1]$ for the purposes of the proof. 

\paragraph{ Background: $D-$norms.}
$D-$norms are norms defined as 
\begin{equation}
\| \mathbf{x} \|_D := d\; \mathbb{E}_{\bm{\theta}}\left[ \max_{k=1\ldots d}(|x_k| \theta_k)\right], \quad \mathbf{x} \in \mathcal{X} \subset \mathbb{R}^d,
\label{eq:dnorm}
\end{equation}
where $\bm{\theta} \in [0, 1]^{d}$ and $\mathbb{E}[\theta_k] = 1/d$, for $k=1\ldots d$. 
We note that the condition $\mathbb{E}[\theta_k] = 1/d$ is not necessary for the $d-\max$ decreasing property, though it leads to unit exponential margins for convenience during inference, see \citet[Section 2.1]{fougeres2013dense_supp} or \citet[Definition 5.4.1]{hofmann2009characterization_supp} for more on this property.
The key condition is that the expectation in \eqref{eq:dnorm} is taken with respect to the distribution of $\bm{\theta}$ which has support only on nonnegative real numbers. 
Taking $\mathcal{X}$ to be the unit simplex, we see that a $D-$norm defines a Pickands dependence function, and by the spectral representation of the Pickands function, all Pickands functions are $D-$norms.
The main property we will use throughout the proof is that compositions of $D-$norms are also $D-$norms. This property is well established in, for example, \citet{hofert2018hierarchical_supp} and \citet{hofmann2009characterization_supp}.
We finally note that all $D-$norms satisfy the $d-$max decreasing property, as shown in~\citet[Theorem 3.1.1]{hofmann2009characterization_supp}.


\paragraph{1-Layer Case.}
\begin{proof}
Recall that the 1-layer $d$MNN is given by
\begin{align}
\label{eq:A1}
A^{(1)}_\theta(\mathbf{w}) &= \max\left(L^{(1)}(\mathbf{w}) + (1-L^{(1)}(\mathbf{e})^T\mathbf{w}), \max_{k=1\ldots d} w_k\right), \quad \mathbf{w} \in \Delta_{d-1}  \\
L^{(1)}(\mathbf{w}) &= \frac1{n_1}\sum_{j=1}^{n_1}\left(\max_{k=1\ldots d} w_k\theta_{kj} \right)_j
\label{eq:L1}
\end{align}
The expression \eqref{eq:L1}, corresponding to the first term in \eqref{eq:A1}, is a valid $D-$norm since it is the expectation with respect to a nonnegative spectral measure. The second term $(1-L^{(1)}(\mathbf{e})^T \mathbf{w}) = \sum_{i=1}^d (1-\mathbb{E}[\theta_i]) w_i$ is also a $D-$norm since $(1-\mathbb{E}[\theta_i])$ is positive, for $i=1\ldots d$, and thus it is also an expectation with respect to a nonnegative spectral measure. In fact, it is equivalent to $\|\text{diag} (1-\mathbb{E}[{\bm\theta}]) \mathbf{w}\|_1$. The combination $L^{(1)}(\mathbf{w}) + (1 - L^{(1)}(\mathbf{e})^T \mathbf{w})$ is then the sum of two $D-$norms, equivalent to a composition with the $\| \, \cdot \, \|_1$ norm, which is again a $D-$norm. 
Finally, the outer $\max$ with $\max_{i=1\dots d} w_i$, a $D-$norm corresponding to complete dependence, is yet another composition of $D-$norms. This results in a function that is $d-\max$ decreasing and concludes the proof for the single layer case. 
\end{proof}

\paragraph{$n$-Layer Case.} 
\begin{proof}
We first show the base case (the 2-layer case), then show that the general $n$-layer case follows.
We focus on the composition of intermediate layers, since the technique for proving the output layer is a $D-$norm follows from the $1$-layer case. 
Recall that the 2-layer $d$MNN is given by
\begin{align*}
    A^{(2)}_\theta(\mathbf{w}) &= \max\left(L^{(2)}(\mathbf{w}) +(1 - L^{(2)}(\mathbf{e})^T \mathbf{w}), \max_{k=1\ldots d} w_k\right), \quad \mathbf{w} \in \Delta_{d-1} \\
    L^{(2)}(\mathbf{w}) &= \frac1{n_2} \sum_{j=1}^{n_2}\left( \ell^{(2)}\left(\ell^{(1)}(\mathbf{w})\right)\right)_j
\end{align*}
Let $\ell^{(1)}(\mathbf{w})$ have width $n_1$. 
Then the output of $\ell^{(1)}$ is given by the following vector
\begin{equation}
\label{eq:first_layer}
\ell^{(1)}(\mathbf{w}) = \begin{pmatrix}
\mathbb{E}_{\bm{\theta}^{(1,1)} \sim \lambda^{(1,1)} } \left [ \max_{k=1\ldots d} \theta_k^{(1, 1)} w_k \right ] \\
\vdots \\
\mathbb{E}_{\bm{\theta}^{(1, n_1)} \sim \lambda^{(1,n_1)}} \left [ \max_{k=1\ldots d} \theta_k^{(1, n_1)} w_k \right ] 
\end{pmatrix}, \quad {\bm \theta} \in [0,1]^{d}.
\end{equation}
Each row in~\eqref{eq:first_layer} is a $D-$norm where the expectation is taken over a delta function centered at $\bm \theta$, i.e. $\lambda^{(1,j)}=\delta(\bm{\theta}^{(1,j)})$ for $j=1,\ldots,n_1$. 
Therefore, the property of $D-$norms is preserved for each row of~\eqref{eq:first_layer}. By analogy to $\ell^{(1)}$ in~\eqref{eq:first_layer}, the property of $D-$norms is preserved for $\ell^{(2)}$ in~\eqref{eq:second_layer}:
\begin{equation}
\label{eq:second_layer}
\ell^{(2)}(\mathbf{w}) = \begin{pmatrix}
\mathbb{E}_{\bm{\theta}^{(2,1)} \sim \lambda^{(2,1)} } \left [ \max_{k=1\ldots n_1} \theta_k^{(2, 1)} w_k \right ] \\
\vdots \\
\mathbb{E}_{\bm{\theta}^{(2, n_2)} \sim \lambda^{(2,n_2)}} \left [ \max_{k=1\ldots n_1} \theta_k^{(2, n_2)} w_k \right ] 
\end{pmatrix}, \quad {\bm \theta} \in [0,1]^{n_1}.
\end{equation}

We then use the nesting property of $D-$norms given in \citet{hofert2018hierarchical_supp} such that $\ell^{(2)}(\ell^{(1)}(\bm{w}))$ is a $D-$norm, and by the same construction, $\ell^{(n)}(\ell^{(n-1)}(\cdots(\ell^{(1)}(\bm{w}))))$ is a $D-$norm and is thus $d-\max$-decreasing. Following the arguments in the $1$-layer case for the output layer then completes the proof.
\end{proof}
\begin{comment}
Here we prove that the architecture presented is $d-\max$-decreasing.
Recall that the architecture is given by
\begin{equation}
A_\theta(\mathbf{w}) := \max \left( L(\mathbf{w}) - L(\mathbf{e})^T \mathbf{w} + 1, \max_{i = 1 \ldots d} w_i \right)
    \label{eq:arch}
\end{equation}
where
\begin{align*}
\ell^{(i)}(\mathbf{h}^{(i-1)}) &= \max_{j=1\ldots n_{i-1}} \left [\Theta^{(i)} \mathrm{diag}( \mathbf{h}^{(i-1)} )\right ], \:\: \Theta^{(i)} \in  \mathbb{R}^{n_{i} \times n_{i-1}}_+ \\
L(\mathbf{w}) &= \frac1{n_m}\sum_{i=1}^{n_m}\left ( \ell^{(m)} \circ \ell^{(m-1)}\circ \cdots \circ \ell^{(1)}(\mathbf{w}) \right)_i  \\
L(\mathbf{e}) &=(L(\mathbf{e}_1),\ldots,L(\mathbf{e}_d))^T \\
\end{align*}

Let our estimator be of the form \eqref{eq:arch}.
We will use the notion of $D-$norms to form the basis of this proof.

%From this knowledge, we will study Pickands functions in terms of $d-$norms since there are existing properties that we can exploit. 
We now use the property that composition of $d-$norms are valid $d-$norms (see the review on $d-$norms given in \citet{falk2019multivariate}.)
Recall that $\| \cdot \|_\infty$ is a valid $D-$norm and that
$$
\mathbb{E}_{\Theta \sim \Lambda} \left [ \| \Theta \cdot \|_\infty \right]
$$
is also a $D-$norm when $\Theta \geq 0$ and $\mathbb{E}[\Theta_i] = 1$.
Note that the condition on the expectation is only to ensure margins are standardized and is not necessary to maintain the properties of a $D-$norm.
We will enforce this at the end so that the expectation holds. 
Let $\theta^{(j)}_i$ be the vector $i^\text{th}$ row of the weight matrix for the $j^\text{th}$ layer.
We have that $u^{(j+1)}_i = \| \theta^{(j)}_i \odot u^{(j)} \|_\infty$ where $\odot$ refers to elementwise multiplication. 
We can think of this as an expectation with respect to a constant generator since, when the generator is constant, we have that
$$
\mathbb{E} \left [ \| \theta^{(j)}_i \odot u^{(j)} \|_\infty \right] = \| \theta^{(j)}_i \odot u^{(j)} \|_\infty.
$$
Composing the resulting vector with another $d-$norm retains the property as shown in \citet{hofert2018hierarchical_supp} and therefore the network itself is a $D-$norm. 
\end{comment}

\subsection{Universal Approximation}
\label{sec:proof_pointwise}




Our proof that our architecture is a universal approximator of Pickands dependence functions is constructive.
Recall that every Pickands function has the form
\begin{equation}
\label{eq:pick_spec}
 A(\mathbf{w}) = \mathbb{E}_{\mathbf{s} \sim \lambda(\Delta_{d-1})}\left[\max_{k = 1 \ldots d} s_kw_k\right], \quad \mathbf{w} \in \Delta_{d-1},   
\end{equation} where $\lambda$ is a spectral measure with $\text{supp}(\lambda) = \Delta_{d-1}$. We now construct a single layer dMNN, with width $n$, by sampling $n$ independent and identically distributed (i.i.d.) samples $\mathbf{s}^{(1)}, \ldots, \mathbf{s}^{(n)} \sim \lambda(\Delta_{d-1})$, and setting
\begin{equation}
\tilde A_n (\mathbf{w}) = \frac1n \sum_{j=1}^n \max_{k=1\ldots d}(s_k^{(j)} w_k).
\end{equation}
Before showing that $\tilde A_n$ converges uniformly to $A$, we show that it converges point-wise. Although this intermediary result is not needed to show uniform convergence, its proof provides intuition while being less technical.

{\it
The estimator $\tilde A_n$ converges pointwise to $A$, almost surely.
}

\begin{proof}

Consider the discrete distribution $\mathbb{\Lambda}_n$ given by the $n$ i.i.d. samples $\mathbf{s}^{(1)}, \ldots, \mathbf{s}^{(n)} \sim \lambda(\Delta_{d-1})$:
$$
\mathbb{\Lambda}_n := \frac1n \sum_{i=1}^n \delta(\mathbf{s}^{(i)}),
$$
where $\delta(\mathbf{s})$ represents a Dirac measure at $\mathbf{s}$.
By the law of large numbers, for every $\mathbf{w} \in \Delta_{d-1}$,
\begin{align*}
\mathbb{E}_{\mathbf{s} \sim \mathbb{\Lambda}_n}\left[\max_{k=1\ldots d}s_kw_k\right] & \xrightarrow{a.s.} \mathbb{E}_{\mathbf{s} \sim \lambda}\left[\max_{k=1\ldots d}s_kw_k\right], \quad n \to \infty \\
\implies \tilde A_n(\mathbf{w}) & \xrightarrow{a.s.} A(\mathbf{w}).\qedhere
\end{align*}
%This completes the proof with $A_{\bm \theta}(\mathbf{w}) = \mathbb{E}_{\mathbf{s} \sim \mathbb{\Lambda}_n}\left[\max_{k=1 \ldots d} s_k w_k \right]$.
\end{proof}

%\subsection{Uniform Convergence}
%\label{sec:proof_convergence}

We now state and prove the main result regarding uniform convergence. 

{\it
The empirical process $$\mathbb{G}_n = \sqrt{n} \left( \tilde A_n - A \right )$$ weakly converges to a zero-mean Gaussian process as $n \to \infty$ where $\tilde A_n$ is a single layer dMNN with width $n$.
}
\begin{proof}
Let $\lambda$ be the law given by the spectral measure $\lambda(\Delta_{d-1})$ and the discrete empirical spectral measure be given by $\mathbb{\Lambda}_n :=\frac1n \sum_{j=1}^n \delta(\mathbf{s}^{(j)})$ for $n$ i.i.d. samples $\mathbf{s}^{(1)},\ldots, \mathbf{s}^{(n)}$ from $\lambda$. 
We additionally write $\lambda f := \mathbb{E}_{\mathbf{s} \sim \lambda}[f(\mathbf{s})]$ as the expectation with respect to the measure $\lambda$.
The empirical process $\mathbb{G}_n$ is defined by
\begin{align*}
\mathbb{G}_n &= \sqrt{n} \left ( \tilde A_n - A \right ) \\
&= \sqrt{n} \left( \mathbb{\Lambda}_n - \lambda \right)f \\
&= \sqrt{n} \left ( \frac1n \sum_{j=1}^n \max_{k=1\ldots d}(s_k^{(j)} w_k) - \mathbb{E}_{\mathbf{s} \sim \lambda}\left[\max_{k=1\ldots d}(s_kw_k)\right] \right ),
\end{align*}
where $f \in \mathcal{F}$ and
$$
\mathcal{F} := \{ f_w(s) := \max_{k = 1 \ldots d} ( s_k w_k ) : \mathbf{w} \in \Delta_{d-1} \}.
$$
By the classical central limit theorem, for a given $\mathbf{w}$, $\sqrt{n} \left (\tilde A_n(\mathbf{w})  - A(\mathbf{w})\right)\xrightarrow{d} \mathcal{N}(0, \sigma^2)$, with $\sigma^2 \leq (1 - A(\mathbf{w}))(A(\mathbf{w}) - \frac1{d^2})$, since the random variable $\max_{k= 1\ldots d}(w_k s_k) \in [1/d^2, 1]$ is bounded and has finite variance.
%That is, the estimator readily achieves pointwise convergence. 

To establish uniform convergence over $\mathbf{w}$, we claim that $\mathbb{G}_n \leadsto \mathbb{G}$, where $\mathbb{G}$ is a zero-mean Gaussian process.
We will now show that the function class given by $\mathcal{F}$ is $\lambda-$Donsker. 
To show this, we will show that the bracketing integral given by
\begin{equation}
\label{eq:brack_int}
\mathcal{J}_{[\,]}(1,\mathcal{F}, L_2(\lambda)) = \int_0^1 \sqrt{\log N_{[\,]}(\epsilon, \mathcal{F} \cup 0, L_2(\lambda))} \, \mathrm{d} \epsilon 
\end{equation}
converges where the $L_2(\lambda)$ norm is defined as $\|f\|_{\lambda, 2} = (\int f^2 d \lambda)^{1/2}$.
A sufficient condition for convergence of \eqref{eq:brack_int} is to show that the logarithm of the bracketing number $N_{[\,]}$ grows at a rate slower than $O(\frac{1}{\epsilon^2})$.
The function class $\mathcal{F}$ is indexed by $\mathbf{w} \in \Delta_{d-1}$ and is Lipschitz on $\mathbf{w}$.
From \citet[Lemma 2.14]{sen2018gentle}, the bracketing number of $\mathcal{F}$ is thus bounded above by the covering number of $\Delta_{d-1}$, i.e.
$$
N_{[\,]}(2\epsilon, \mathcal{F}, L_2(\lambda)) \leq N(\epsilon, \Delta_{d-1}, \| \cdot \|_2),
$$
where the covering number of the unit simplex is asymptotically $O\left(\frac{1}{\epsilon^{d-1}}\right)$. The logarithm of the bracketing number then grows at a rate $\log N_{[\,]}\leq O\left ( \left (d - 1 \right ) \log \frac1\epsilon \right) < O(\frac1{\epsilon^2})$. This proves that $\mathcal{F}$ is $\lambda-$Donsker and thus $\mathbb{G}_n \leadsto \mathbb{G}$.
\end{proof}
We can also make a statement on the covariance of the process.
For any two points $w^{(i)}, w^{(j)}$, the covariance of the Gaussian process converges to 
\begin{align*}
    \operatorname{Cov}\left(A\left(w^{(i)}\right), A\left(w^{(j)}\right) \right ) &= \mathbb{E} \left [ \max_k s_k w_k^{(i)} \max_k s_k w_k^{(j)} \right] - \mathbb{E}\left[\max_k s_k w_k^{(i)}\right]\mathbb{E}\left[\max_k s_k w_k^{(j)}\right] \\
    &= \mathbb{E} \left [ \max_k s_k w_k^{(i)} \max_k s_k w_k^{(j)} \right] - A(w^{(i)})A(w^{(j)}) \\
    &\leq \sqrt{\operatorname{Var}\left(A\left(w^{(i)}\right)\right)\operatorname{Var}\left(A\left(w^{(j)}\right)\right)} \\
    &\leq \frac14 \left(1 - \frac1d\right)^2.
\end{align*}
The final inequality comes from Popoviciu's inequality and the fact that $A(w) \in [1/d, 1]$. 
For an introduction to empirical processes, see the review given in~\citet{wellner2005empirical}. 

\section{Survival Probability Estimation}
\label{sec:survival}

One particularly useful task is estimating multi-dimensional survival probabilities rather than cumulative probabilities.
More precisely, let $\left(\gamma_1, \cdots, \gamma_d \right) \in \mathbb{R}^d$ be a $d-$dimensional vector of thresholds; we are interested in calculating the following survival probability:
\begin{align}
\label{survival_general}
   \mathbb{P} \left[M_{n}^{(1)} > \gamma_1, \cdots, M_{n}^{(d)} > \gamma_d \right] 
    = \mathbb{P} \left[\bar{M}_{n}^{(1)} > \bar{\gamma}_1 , \cdots, \bar{M}_{n}^{(d)} > \bar{\gamma}_d \right],
\end{align}
where $\bar{\gamma}_k = \frac{\gamma_k - b_n^{(k)}}{a_n^{(k)}}$, $k \in \{1, \cdots, d\}$.

To calculate this, we simply use a change-of-variable technique which we present in the following proposition. 
This approach is well known, and we only provide the proposition for completeness. 
\begin{proposition}[Survival Probability Computation]
Let $G_k(x):= F_k^{-1}(1-F_k(x))$ for $k~\in~\{1,\dots,d\}$, then the random variables $G_k(\bar{M}_{n}^{(k)})$ and $\bar{M}_{n}^{(k)}$ have the same marginal CDF $F_k$, for $k\in\{1,\dots,d\}$, and
\begin{align}
    \mathbb{P} \left[\bar{M}_{n}^{(1)} > \bar{\gamma}_1 , \cdots, \bar{M}_{n}^{(d)} > \bar{\gamma}_d \right] = \mathbb{P} \left[G_1(\bar{M}_{n}^{(1)}) < G_1(\bar{\gamma}_1), \cdots, G_d(\bar{M}_{n}^{(d)}) < G_d(\bar{\gamma}_d) \right].
    \label{survivalchgvareq}
\end{align}
\end{proposition}
\begin{proof}
With the change-of-variable $G_k(x):= F_k^{-1}(1-F_k(x))$ for $k \in \{1, \cdots, d\}$, it first follows that the random variables $G_k(\bar{M}_{n}^{(k)}) \sim F_k$: 
\begin{equation}
    \mathbb{P}(G_k(\bar{M}_{n}^{(k)})\leq x) = \mathbb{P}(F_k^{-1}(1-F_k(\bar{M}_{n}^{(k)}))\leq x)=\mathbb{P}(1-F_k(\bar{M}_{n}^{(k)})\leq F_k(x))=F_k(x),
\end{equation}
since $F_k(\bar{M}_{n}^{(k)})$ and $1-F_k(\bar{M}_{n}^{(k)})$ follow the unit uniform distribution.

Moreover, the survival probability can be written as:
\begin{align*}
     \mathbb{P} \left[\bar{M}_{n}^{(1)} > \bar{\gamma}_1 , \cdots, \bar{M}_{n}^{(d)} > \bar{\gamma}_d \right]
  & = \mathbb{P}\left[1- F_1(\bar{M}_{n}^{(1)}) < 1-F_1(\bar{\gamma}_1) , \cdots, 1-F_d(\bar{M}_{n}^{(d)}) < 1 - F_d(\bar{\gamma}_d) \right] \\ 
  & = \mathbb{P}\left[G_1(\bar{M}_{n}^{(1)}) < G_1(\bar{\gamma}_1) , \cdots, G_d(\bar{M}_{n}^{(d)}) < G_d(\bar{\gamma}_d) \right] \\
  & = C \left( 1 - F_1(\bar{\gamma}_1), \cdots, 1 - F_d(\bar{\gamma}_d) \right),
\end{align*}
where $C$ is the copula of $\left(G_1(\bar{M}_{n}^{(1)}), \cdots, G_d(\bar{M}_{n}^{(d)}) \right)$.
\end{proof}
%Instead, we calculate the survival probability with the following manipulations:
%\begin{align}
%  \nonumber &  \mathbb{P} \left[\bar{M}_{n}^{(1)} > \bar{\gamma}_1 , \cdots, \bar{M}_{n}^{(d)} > \bar{\gamma}_d \right] \\
%  \nonumber & = \mathbb{P} \left[F_1(\bar{M}_{n}^{(1)}) > F_1(\bar{\gamma}_1), \cdots, F_d(\bar{M}_{n}^{(d)}) > F_d(\bar{\gamma}_d) \right] \\ 
%  \nonumber & = \mathbb{P} \Biggl[ G_1^{-1} \circ F_1(\bar{M}_{n}^{(1)}) < G_1^{-1} \circ F_1(\bar{\gamma}_1), \\
%  & \quad \quad \quad \cdots, G_d^{-1} \circ F_d(\bar{M}_{n}^{(d)}) < G_d^{-1}  \circ F_d(\bar{\gamma}_d) \Biggr],\label{survivalGtransformation_old}
%\end{align}
%where $G_k=1-F_k$ is the marginal complementary CDF for $k \in \{1, \cdots, d \}$. This implies that $G_k^{-1} \circ F_k(\bar{M}_{n}^{(k)}) = F_k^{-1}(1 - F_k(\bar{M}_{n}^{(k)}))$. It follows easily that the random variables $G_k^{-1} \circ F_k(\bar{M}_{n}^{(k)}) \sim F_k$ for all $k \in \{1, \cdots, d\}$.
This proposition implies that the transformed variables $G_k(\bar{M}_{n}^{(k)})$ are samples from extreme value distributions. 
Then, we can fit a Pickands dependence function to these transformed variables, and finally evaluate the corresponding extreme value copula on
%$\left(F_1 \circ G_1 (\bar{\gamma}_1), \cdots,  F_d \circ G_d(\bar{\gamma}_d)\right)=
$\left(1-F_1(\bar{\gamma}_1), \cdots, 1-F_d(\bar{\gamma}_d)\right)$. %
% \footnote{Note that $F_k(G_k(x))= 1 -F_k (x)$}
% \begin{remark}
% No need to invert $G_k$ since the output of the intermediate step in \eqref{transform1} applied to $G_k^{-1} \circ F_k(\bar{M}_{n, b}^{(k)})$  is given by $- \log (1-F_k(\bar{M}_{n, b}^{(k)}))$. 
% \end{remark}
Details on how to estimate the survival probability in \eqref{survival_general} are given in Algorithm 2.

%\textcolor{magenta}{C proposition 1 seems to be a fancy way of writing ``fit the copula to the decreasing ranks...'' suggest writing it simply and remove the ``proposition''.}

\section{Additional Experiments and Figures}
\label{sec:more_experiments}
Code for all experiments can be found at \url{https://github.com/alluly/dMNN}.
\subsection{24 Width 3 Depth Architecture}

Here we repeat the experiments with a different architecture. 
All other hyperparameters are the same; the only difference is that we increase the depth to 3 and use a width of 24 for each layer.
Most of the results remain similar for the synthetic data but we see a change in the results for the real data, specifically, the Wind and Commodities data show a slight deterioration in performance. 
However, the variances are still high for the real experiments and not much can be said regarding the efficacy of any single method. 

\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_sl_50_w=24d=3.pdf}
  \caption{$A_\text{SL}$ MSE ($d=2$)}
  \label{fig:sl_survival32}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_asl_50_w=24d=3.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=2$)}
  \label{fig:asl_survival32}
\end{subfigure} 
\caption{Using 24 width 3 depth architecture: MSE of survival probabilities for $d=2$ with $100$ samples for $A_\text{SL}$ (\ref{fig:sl_survival32}) and $A_\text{ASL}$ (\ref{fig:asl_survival32}). Thresholds are above the $75$th percentile.}
\end{figure}

\begin{figure*}[ht!]
    \centering
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_sl_w=24d=3_Da.pdf} 
  \caption{$A_\text{SL}$ MSE ($d=256$)}
  \label{fig:sl_mse_est_all_a_a2}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=24d=3_Da.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=256$)}
  \label{fig:asl_mse_est_all_a_a2}
\end{subfigure} 
\begin{subfigure}{.48\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/mse_sl_alln_thick_zoomed.pdf}  
\includegraphics[width=\linewidth]{imgs/uai/box_sl_w=24d=3_Dd.pdf}  
  \caption{$A_\text{SL}$ MSE ($\alpha=0.5$)}
  \label{fig:sl_mse_est_all_d_a2}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/mse_asl_alln_thick_zoomed.pdf}  
    \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=24d=3_Dd.pdf}  
  \caption{$A_\text{ASL}$ MSE ($\alpha=0.5$)}
  \label{fig:asl_mse_est_all_d_a2}
\end{subfigure}  
\caption{Using 24 width 3 depth architecture: Comparison of $||\hat{A}(w) - A(w)||_2^2$ for different estimators $\hat{A}$ for different dependence $\alpha = \{0.25, 0.50, 0.75, 1.0\}$ with fixed $d=256$ (\ref{fig:sl_mse_est_all_a_a2}, \ref{fig:asl_mse_est_all_a_a2}) and for fixed  $\alpha=0.5$ with different $d = \{256, 512, 728, 1024\}$ (\ref{fig:sl_mse_est_all_d_a2}, \ref{fig:asl_mse_est_all_d_a2}). The truth models considered are $A_\text{SL}$ (\ref{fig:sl_mse_est_all_a_a2}, \ref{fig:sl_mse_est_all_d_a2}) and $A_\text{ASL}$ (\ref{fig:asl_mse_est_all_a_a2}, \ref{fig:asl_mse_est_all_d_a2}). Results are over 50 runs with 100 training samples for each run. } 
\end{figure*}
\begin{table*}[tbh!]
\newcommand{\timesten}{\text{\tiny $\times10$}}
%\newcommand{\timesten}{ \times10}
\centering
%\footnotesize
\begin{tabular}{@{}lclllll@{}} 
\toprule
 & $d$ & Train/Test Length & \textsc{Pickands} & \textsc{CFG} & \textsc{BDV} & \textsc{Proposed}  \\ 
\midrule
 Wind & 10 & day/week &
 ${4.48(18.6)}\timesten^{-4}$ & $\textit{4.15(15.1)}\timesten^{-4}$ &   $\bf 4.10(16.3)\timesten^{-4}$ &   ${ 4.80(20.6)}\timesten^{-4}$ \\
 Ozone & 4 & day/week &
 $3.06(4.66)\timesten^{-2}$ &  $3.86(6.10)\timesten^{-2}$ & $\textit{2.86(4.46)}\timesten^{-2}$ & $\bf 2.82(4.38)\timesten^{-2}$ \\
 Commodities & 10 & week/month &
 $4.34(5.82)\timesten^{-3}$  & $4.33(5.71)\timesten^{-3}$   & $\bf 1.60(1.96)\timesten^{-3}$  &  $\textit{ 2.20(3.41)}\timesten^{-3}$  \\
 S\&P 500 & 418 & week/month &
 $\textit{3.02(21.2)}\timesten^{-3}$ & $\textit{3.02(21.1)}\timesten^{-3}$ & $6.28(35.2)\timesten^{-3}$ & 
 $\bf 2.41(22.2)\timesten^{-3}$ \\
 Crypto & 100 & week/month &
 ${1.06(2.85)}\timesten^{-2}$ & $\textit{1.05(4.86)}\timesten^{-2}$ & ${1.34(3.44)}\timesten^{-2}$ & $\bf 8.42(26.1)\timesten^{-3}$ \\
  COVID (NC) & 100 & week/week & $4.04(7.21) \timesten^{-2}$ & $4.04(7.19) \timesten^{-2}$ & $\it 3.83(6.51) \timesten^{-2}$ & $\bf 5.22(12.8) \timesten^{-3}$ \\
  COVID (NY) & 58 & week/week & $2.74(10.4) \timesten^{-2}$ & $2.74(10.4) \timesten^{-2}$ & $\it 2.25(7.75) \timesten^{-2}$ & $\bf 3.52(7.92) \timesten^{-3}$ \\
  COVID (CA) & 58 & week/week & $1.17(3.98) \timesten^{-2}$ & $\it 1.19(3.87) \timesten^{-2}$ & $ 1.17(3.85) \timesten^{-2}$ & $\bf 3.27(9.28) \timesten^{-3}$ \\
\bottomrule
\end{tabular}
    \caption{MSE of different estimators in estimating maxima over two time scales for 24 width 3 depth architecture. Best and second best performances are marked in \textbf{bold} and \textit{italic} respectively.}
    \label{tab:real_data_a2}
    %\vspace{-0.8cm}
\end{table*}

\subsection{64 Width 4 Depth Architecture}

Here we repeat the experiments with a different architecture. 
All other hyperparameters are the same; the only difference is that we increase the depth to 4 and use a width of 64 for each layer.
Most of the results remain similar for the synthetic data but we see a change in the results for the real data, specifically, the Wind and Commodities data show a slight deterioration in performance. 
However, the variances are still high for the real experiments and not much can be said regarding the efficacy of any single method. 

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_sl_50_w=64d=4.pdf}
  \caption{$A_\text{SL}$ MSE ($d=2$)}
  \label{fig:sl_survival64}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_asl_50_w=64d=4.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=2$)}
  \label{fig:asl_survival64}
\end{subfigure} 
\caption{Using 64 width 4 depth architecture: (\ref{fig:sl_survival64}, \ref{fig:asl_survival64}) MSE of survival probabilities for $d=2$ with $100$ samples for $A_\text{SL}$ (\ref{fig:sl_survival64}) and $A_\text{ASL}$ (\ref{fig:asl_survival64}). Thresholds are above the $75$th percentile.}
\end{figure}

\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_sl_w=64d=4_Da.pdf} 
  \caption{$A_\text{SL}$ MSE ($d=256$)}
    \label{fig:sl_mse_est_all_a_a3}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=64d=4_Da.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=256$)}
      \label{fig:asl_mse_est_all_a_a3}
\end{subfigure} 
\begin{subfigure}{.48\textwidth}
  \centering
\includegraphics[width=\linewidth]{imgs/uai/box_sl_w=64d=4_Dd.pdf}  
  \caption{$A_\text{SL}$ MSE ($\alpha=0.5$)}
    \label{fig:sl_mse_est_all_d_a3}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
    \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=64d=4_Dd.pdf}  
  \caption{$A_\text{ASL}$ MSE ($\alpha=0.5$)}
      \label{fig:asl_mse_est_all_d_a3}
\end{subfigure}  
\caption{Using 64 width 4 depth architecture: Comparison of $||\hat{A}(w) - A(w)||_2^2$ for different estimators $\hat{A}$ for different dependence $\alpha = \{0.25, 0.50, 0.75, 1.0\}$ with fixed $d=256$ (\ref{fig:sl_mse_est_all_a_a3}, \ref{fig:asl_mse_est_all_a_a3}) and for fixed  $\alpha=0.5$ with different $d = \{256, 512, 728, 1024\}$ (\ref{fig:sl_mse_est_all_d_a3}, \ref{fig:asl_mse_est_all_d_a3})  for $A_\text{SL}$ (\ref{fig:sl_mse_est_all_a_a3}, \ref{fig:sl_mse_est_all_d_a3}) and $A_\text{ASL}$ (\ref{fig:asl_mse_est_all_a_a3}, \ref{fig:asl_mse_est_all_d_a3}). Results are over 50 runs with 100 training samples for each run. } 
\end{figure}
\begin{table}[tbh!]
\newcommand{\timesten}{\text{\tiny $\times10$}}
%\newcommand{\timesten}{ \times10}
\centering
%\footnotesize
\begin{tabular}{@{}lclllll@{}} 
\toprule
 & $d$ & Train/Test Length & \textsc{Pickands} & \textsc{CFG} & \textsc{BDV} & \textsc{Proposed}  \\ 
\midrule
 Wind & 10 & day/week &
 ${4.48(18.6)}\timesten^{-4}$ & $\textit{4.15(15.1)}\timesten^{-4}$ &   $\bf 4.10(16.3)\timesten^{-4}$ &   ${ 4.76(18.7)}\timesten^{-4}$ \\
 Ozone & 4 & day/week &
 $3.06(4.66)\timesten^{-2}$ &  $3.86(6.10)\timesten^{-2}$ & $\textit{2.86(4.46)}\timesten^{-2}$ & $\bf 2.73(4.25)\timesten^{-2}$ \\
 Commodities & 10 & week/month &
 $4.34(5.82)\timesten^{-3}$  & $4.33(5.71)\timesten^{-3}$  & $\bf 1.60(1.96)\timesten^{-3}$  &  $\textit{ 2.20(3.44)}\timesten^{-3}$  \\
 S\&P 500 & 418 & week/month &
 $\textit{3.02(21.2)}\timesten^{-3}$ & $\textit{3.02(21.1)}\timesten^{-3}$ & $6.28(35.2)\timesten^{-3}$ & 
 $\bf 2.39(22.1)\timesten^{-3}$ \\
 Crypto & 100 & week/month &
 ${1.06(2.85)}\timesten^{-2}$ & $\textit{1.05(4.86)}\timesten^{-2}$ & ${1.34(3.44)}\timesten^{-2}$ & $\bf 8.28(25.6)\timesten^{-3}$ \\
  COVID (NC) & 100 & week/week & $4.04(7.21) \timesten^{-2}$ & $4.04(7.19) \timesten^{-2}$ & $\it 3.83(6.51) \timesten^{-2}$ & $\bf 4.93(12.1) \timesten^{-3}$ \\
  COVID (NY) & 58 & week/week & $2.74(10.4) \timesten^{-2}$ & $2.74(10.4) \timesten^{-2}$ & $\it 2.25(7.75) \timesten^{-2}$ & $\bf 3.69(8.64) \timesten^{-3}$ \\
  COVID (CA) & 58 & week/week & $1.17(3.98) \timesten^{-2}$ & $\it 1.19(3.87) \timesten^{-2}$ & $ 1.17(3.85) \timesten^{-2}$ & $\bf 1.10(4.78) \timesten^{-3}$ \\
\bottomrule
\end{tabular}
    \caption{MSE of different estimators in estimating maxima over two time scales for 64 width 4 depth architecture. Best and second best performances are marked in \textbf{bold} and \textit{italic} respectively. }
    \label{tab:real_data_a3}
    %\vspace{-0.8cm}
\end{table}



\subsection{Estimation Comparison}
We finally add a few figures comparing the learned dependence functions between different architectures.
We additionally provide a table comparing the results for different architectures on the real data experiments in Table~\ref{tab:ablation_real}.
\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 2d Margins}
  \label{fig:net_512_wind}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 2d Margins}
  \label{fig:net_24_wind}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 2d Margins}
  \label{fig:net_64_wind}
\end{subfigure}
\caption{Margin comparison for wind dataset. }
\label{fig:net_wind}
\end{figure}

\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 2d Margins}
  \label{fig:net_512_ozone}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 2d Margins}
  \label{fig:net_24_ozone}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 2d Margins}
  \label{fig:net_64_ozone}
\end{subfigure}
\caption{Margin comparison for ozone dataset. }
\end{figure}
\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 3d Margins}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 3d Margins}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 3d Margins}
\end{subfigure}
\caption{Margin comparison for commodities dataset. }
\end{figure}

\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 2d Margins}
  \label{fig:net_512_comm}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 2d Margins}
  \label{fig:net_24_comm}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 2d Margins}
  \label{fig:net_64_comm}
\end{subfigure}
\caption{Margin comparison for commodities dataset. }
\end{figure}
\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 2d Margins}
  \label{fig:cfg_marg_spy}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 2d Margins}
  \label{fig:bdv_marg_spy}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 2d Margins}
  \label{fig:net_marg_spy}
\end{subfigure}
\caption{Margin comparison for S\&P 500 dataset. }
\end{figure}

\begin{figure}[tbh!]
    \centering
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_MaxLinear_w512d1.pdf}  
  \caption{512 width 1 depth 2d Margins}
  \label{fig:net_512_crypto}
\end{subfigure} 
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_MaxLinear_w24d3.pdf}  
  \caption{24 width 3 depth 2d Margins}
  \label{fig:net_24_crypto}
\end{subfigure}
\begin{subfigure}{.30\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_MaxLinear_w64d4.pdf}  
  \caption{64 width 4 depth 2d Margins}
  \label{fig:net_64_crypto}
\end{subfigure}
\caption{Margin comparison of different architectures for Cryptocurrencies dataset. }
\label{fig:net_crypto}
\end{figure}

\begin{table}[tbh!]
    \newcommand{\timesten}{\text{\tiny $\times10$}}
    \centering
    \begin{tabular}{cccc}
    \toprule
        & $512 \text{width} \times 1 \text{depth}$ & $24 \text{width} \times 3 \text{depth}$ & $64 \text{width} \times 4 \text{depth}$  \\ \midrule
         Wind & ${ 4.37(17.5)}\timesten^{-4}$ & ${ 4.80(20.6)}\timesten^{-4}$ & ${ 4.76(18.7)}\timesten^{-4}$ \\
         Ozone & $ 2.73(4.25)\timesten^{-2}$ & $2.82(4.38)\timesten^{-2}$ & $2.73(4.25)\timesten^{-2}$ \\
         Commodities & $1.56(2.21)\timesten^{-3}$ & ${ 2.20(3.41)}\timesten^{-3}$  & $2.20(3.44)\timesten^{-3}$ \\
         S\&P 500 & $ 2.41(22.2)\timesten^{-3}$ & $2.41(22.2)\timesten^{-3}$ & $2.39(22.1)\timesten^{-3}$ \\
         Crypto & $8.57(26.4)\timesten^{-3}$ & $8.42(26.1)\timesten^{-3}$ & $8.28(25.6)\timesten^{-3}$ \\
         \bottomrule
    \end{tabular}
    \caption{Comparison of 3 different architectures, in terms of MSE, on the real data experiments.}
    \label{tab:ablation_real}
\end{table}



\section{Data Description}
\label{sec:data}
\subsubsection*{Synthetic Data}
For the synthetic data experiments we consider samples of 100 points from each respective distribution. 
We use the full dataset for the batch size during training.
We additionally sample 1000 points from the simplex for each data point during training. 

\subsubsection*{Ozone Data}
We consider ozone levels measured at 4 different stations in Sequoia National Park from data that can be downloaded from the National Park Service website\footnote{\url{https://ard-request.air-resource.com/data.aspx}}. The 4 stations are located at Ash Mountain, Lower Kaweah, Grant Grove and Lookout Point. We train the different models on daily maxima of ozone levels at the 4 different stations for the period from January 1984 to December 1996. To reduce the effect of seasonality, we do not train over the whole period, but we train different models on a single month (training month, e.g., June of each year) and compute accuracy on the consecutive month (validation month, e.g., July of the same year). We additionally only look at summer months due to the increase of extreme events during that time. The accuracy is averaged with the specific validation month of each year over the whole period. We train on daily maxima and test on weekly maxima. For the experiments, we consider the following pairs of (training/test) months: (June/July), (July/August), and (August/September). 

\begin{figure}
    \centering
     \includegraphics[width=0.5\linewidth]{imgs/Wind_locations.png}
     \caption{Locations (yellow circles) of weather stations sampled for the wind speed experiments. Figure generated via Google Maps.}
     \label{fig:wind_locations}
\end{figure}

\subsubsection*{California Wind Data}
We are interested in modeling the extremal relationship of wind gusts between different locations in California during the summer months. 
We consider 10 locations in California, illustrated in Figure~\ref{fig:wind_locations}. 
We obtained the data from the Remote Automated Weather Station (RAWS) archive available at the online repository\footnote{\url{https://raws.dri.edu/index.html}}. 
The RAWS data are collected from various time intervals from December 1989 to December 2020. 
We consider only the time points that occur in the intersection of all the data collected and where all values are valid (i.e. not NaNs or missing) for the summer months.
Similarly to the ozone data, and in an effort to reduce seasonality, we consider the daily max wind gust for the different locations for a single month over all the years the data were collected.
To evaluate the proposed method, we train and test on data from consecutive months and repeat for multiple sets of months in our dataset.
Additionally, we train on daily max and test on monthly max using the following data splitting scheme (training/validation months): (June/July), (July/August), and (August/September). 

\subsubsection*{Commodities Data}
We consider the extreme dependency between different commodities such as Coffee, Copper, Corn, Crude Oil, Gold, Heating Oil, Natural Gas, Platinum, Silver and Wheat. We collect data of daily prices of the different commodities from January 2015 to December 2020 as published on Investing.com\footnote{\url{https://www.investing.com/commodities/}}. For training, we consider weekly max drawdown over a year. We validate the performance by evaluating accuracy of monthly max drawdown over the next three years. We consider the following pairs of ([training years],[validation years]): ([2015], [2016, 2017, 2018]), ([2016], [2017, 2018, 2019]), ([2017], [2018, 2019, 2020]).

 

\subsubsection*{S\&P 500 Data}
We obtain historical data from \url{https://www.alphavantage.co}\footnote{Alpha Vantage allows academic use as long as the website is cited.}. 
We choose the components of the S\&P 500 with sufficient history (resulting in 418 stocks). 
For training, we consider weekly max drawdown over a year. 
We validate the performance by evaluating accuracy of monthly max drawdown over the next three years. We consider the following pairs of ([training years],[validation years]): ([2015], [2016, 2017, 2018]), ([2016], [2017, 2018, 2019]), ([2017], [2018, 2019, 2020]).
For the full list of stocks, see the \verb|sp_names.txt| file in the supplementary materials. 

\subsubsection*{Cryptocurrencies Data}
We obtain historical data from \url{https://coinmarketcap.com}\footnote{Coin Market Cap allows academic use as long as the website is cited (see FAQ page).} for 100 coins with the longest history. 
For training, we consider weekly max drawdown over a year. 
We validate the performance by evaluating accuracy of monthly max drawdown over the next three years. We consider the following pairs of ([training years],[validation years]): ([2015], [2016, 2017, 2018]), ([2016], [2017, 2018, 2019]), ([2017], [2018, 2019, 2020]).
For the full list of coins, see the \verb|crypto_names.txt| file in the supplementary materials. 

\subsubsection*{COVID-19 Data}
We obtained COVID-19 case counts for the United States at the county level from~\url{https://github.com/nytimes/covid-19-data}. 
We chose the states of North Carolina, New York, and California for the analysis. 
We consider training on weeks in 2020 and testing on weeks in 2021. 
We choose the same time scale due to the fact that cases were poorly counted in 2020 whereas in 2021 case counts were more accurately reported. 
Additionally, 2021 saw an increase in cases due to the arrival of new variants such as Delta and Omicron.
For the conditional classification examples, we take the observation at the locations and compute the probability. 
If the probability is reported as $> 0.5$ then we consider it to be classified correctly. 
If it is $< 0.5$ then we consider it an incorrect classification.
For the North Carolina data, we condition on the counties of: Mecklenburg, Wake, Guilford, Forsyth,  and Cumberland and predict  Durham. 
For the New York data, we condition on Westchester, Nassau, New York City, Suffolk, and Erie and predict Monroe (the data aggregated all the NYC counties into a single datapoint).
For the California data, we condition on Los Angeles, San Diego, San Bernardino, Riverside, and Orange and predict Santa Clara.

\section{Pickands, CFG and BDV Estimators}
\label{sec:estimators}
\subsubsection*{Pickands Estimator}
The Pickands estimator \cite{pickands1981multivariate_supp} is built following the transformations (6) and (7) in the paper. The estimator is obtained by exactly maximizing the likelihood (Equation (8) in the paper) resulting in the following non-parametric estimate: 
\begin{align}
    \label{pickands_est}
    \widehat{A}_{\text{Pickands}}(\mathbf{w}) = \left( \frac{1}{B}\sum_{i=1}^B Z_{w, i} \right)^{-1}.
\end{align}

\subsubsection*{CFG Estimator}
The CFG estimator \cite{caperaa1997nonparametric_supp} is constructed following the observation: 
\begin{align*}
    \mathbb{E} \log Z_w = -\log A(\mathbf{w}) - \gamma,
\end{align*}
where $\gamma = -\int_{0}^{\infty} \log x e^{-x} dx$ denotes Euler's constant. The CFG estimator is thus given by:
\begin{align}
    \label{cfg_est}
    \widehat{A}_{\text{CFG}}(\mathbf{w}) = \exp \left[ -\gamma - \frac{1}{B}\sum_{i=1}^B \log Z_{w, i}\right].
\end{align}
In our main submission we use a similar estimator, with the correction term presented in \cite{gudendorf2011nonparametric}:
\begin{equation}
    \widehat{A}_{\text{CFG,C}}(\mathbf{w})= \exp\left( \log \widehat{A}_{\text{CFG}}(\mathbf{w}) - \sum_{k=1}^d w_k\log\left(\widehat{A}_{\text{CFG}}(\mathbf{e}_k)\right)\right), 
\end{equation}
where $\mathbf{e}_k$ is the $k$-th canonical basis vector.
\subsubsection*{BDV Estimator}
We propose a $d$-dimensional extension to the bivariate estimator described in~\cite{bucher2011new_supp}.
We begin by defining the minimum distance estimator between the true CDF, $C(\mathbf{u})$ and the one estimated by the Pickands function $A(\mathbf{w})$.
\begin{align}
&\int_{[0,1]^d} \left[\log C(\mathbf u)  - \sum_{k=1}^d \log u_k A\left(\frac{\log(\mathbf u)}{\sum_k \log u_k}\right)\right]^2 \,  d\mathbf{u} \\
&= \int_{\Delta_{d-1}}\int_{0}^1 (\log C(y^{w_1}, \dots, y^{w_{d}})  - \log (y) A(\mathbf{w}))^2 (-\log(y))^{d-1}\, dy\, d\mathbf{w}.
\end{align}
We have
\begin{align}
\hat C(y^{w_1}, \dots, y^{w_d}) 
&= \frac{1}{B} \sum_{i=1}^B \mathbf{1}( F_{1}(\bar{M}_{n, i}^{(1)})\le y^{w_1}, \cdots, F_{d}(\bar{M}_{n, i}^{(d)})\le y^{w_d})\\
&= \frac{1}{B} \sum_{i=1}^B \mathbf{1}(F_{1}(\bar{M}_{n, i}^{(1)})^{\frac1{w_1}}\le y, \cdots, F_{d}(\bar{M}_{n, i}^{(d)})^{\frac1{w_d}}\le y)\\
&= \frac{1}{B} \sum_{i=1}^B \mathbf{1}(\max_{1 \leq k \leq d} F_{k}(\bar{M}_{n, i}^{(k)})^{\frac1{w_k}}\le y)\\
&= \frac{1}{B} \sum_{i=1}^B \mathbf{1}\left(\Gamma_{w, i} \leq y \right),
\end{align}
where $\Gamma_{w, i} = \exp(-Z_{w,i})$. Now, if we reorder these so that $\Gamma_{w, 1}\le \cdots \le \Gamma_{w, B}$, we have that
\begin{equation}
\hat C(y^{w_1}, \dots, y^{w_d})=\left\{\begin{aligned}
0&\;\:\text{if}\;\: y < \Gamma_{w, 1},\\
\frac{i}{B}&\;\:\text{if}\;\: \Gamma_{w, i}\le y < \Gamma_{w, i+1},\, i\in \{1,\dots,B-1\},\\
1&\;\:\text{if}\;\: \Gamma_{w, B}\le y.
\end{aligned}\right.
\end{equation}
Because $\log \hat C(\cdots)$ is not defined if $y < \Gamma_{w, 1}$, the following modified estimator is considered in~\cite{bucher2011new_supp}.
\begin{equation}
\tilde C(y^{w_1}, \dots, y^{w_d}):= \max\left\{\hat C(y^{w_1}, \dots, y^{w_d}), B^{-\gamma}\right\},
\end{equation}
where $\gamma$ is any positive real number greater than or equal to $\frac12$. For convenience, we choose $\gamma=1$ so that:
\begin{equation}
\tilde C(y^{w_1}, \dots, y^{w_d})=\left\{\begin{aligned}
\frac{1}{B}&\;\:\text{if}\;\: y < \Gamma_{w, 2},\\
\frac{i}{B}&\;\:\text{if}\;\: \Gamma_{w, i}\le y < \Gamma_{w, i+1},\, i\in \{2,\dots,B-1\},\\
1&\;\:\text{if}\;\: \Gamma_{w, B}\le y.
\end{aligned}\right.
\end{equation}
Finally, as in \cite{bucher2011new_supp}, for any positive weight function $h:(0,1)\to \mathbb{R}_0^+$, let $h^*(y):= h(y) (\log y)^2$,
\begin{equation}
B_h:= \int_{0}^1 h^*(y)\,dy\quad\text{and}\quad g(x) := - B_h^{-1} \int_{0}^x \frac{h^*(y)}{\log y}\,dy.
\end{equation}
Then, letting $\Gamma_{w,0}= 0$, $\Gamma_{w,B+1}= 1$, we define the BDV estimator $\widehat{A}_{\text{BDV,}h}$ as follows
\begin{align}
\widehat{A}_{\text{BDV,}h}(\mathbf{w}) &= B_h^{-1} \int_{0}^1 \frac{\log \tilde C(y^{w_1}, \dots, y^{w_d})}{\log y} h^*(y)\, dy \\
&= B_h^{-1} \sum_{i=0}^B \int_{\Gamma_{w,i}}^{\Gamma_{w,i+1}} \frac{\log \tilde C(y^{w_1}, \dots, y^{w_d})}{\log y} h^*(y)\, dy\\
&= -\log\frac{1}{B}\, g(\Gamma_{w, 2}) -\sum_{i=2}^B \log\frac{i}{B} \left(g(\Gamma_{w, i+1})-g(\Gamma_{w, i})\right)\\
&= -\sum_{i=2}^B \log\frac{i-1}{B}\, g(\Gamma_{w, i}) + \sum_{i=2}^B \log\frac{i}{B}\, g(\Gamma_{w, i})\\
&= \sum_{i=2}^B \log\left(1+\frac1{i-1}\right) g(\Gamma_{w, i}).
\end{align}
In our main submission, we use a slightly modified estimator, which proved to have superior performance in our experiments. Recall that, if $A$ is a Pickands dependence function, we have $\max(\mathbf{w})\le A(\mathbf{w})\le 1$, which implies that the true copula satisfies:
\begin{equation}\label{iusethisjustbelow}
\max(\mathbf{w}) \le \frac{\log C(y^{w_1}, \dots, y^{w_d})}{\log y} = A(\mathbf{w}) \le 1.
\end{equation}
Accordingly, we let
\begin{equation}
    \clamp_{a,b}(x):=
    \left\{\begin{array}{rl}
    a&\text{if }x\le a,\\
    x&\text{if }a<x<b,\\
    b&\text{if }x\ge b,
    \end{array}\right.
\end{equation}
and define
\begin{equation}\breve{C}(y^{w_1}, \dots, y^{w_d}) = \exp\left(\clamp_{\log y, \max(\mathbf{w}) \log y}\log\hat C(y^{w_1}, \dots, y^{w_d})\right),
\end{equation}
and
\begin{align}
\widehat{A}_{\text{BDV,MM,}h}(\mathbf{w}) &= B_h^{-1} \int_{0}^1 \frac{\log \breve C(y^{w_1}, \dots, y^{w_d})}{\log y} h^*(y)\, dy \\
&= B_h^{-1} \sum_{i=0}^B \int_{\Gamma_{w,i}}^{\Gamma_{w,i+1}} \frac{\log \breve C(y^{w_1}, \dots, y^{w_d})}{\log y} h^*(y)\, dy
\end{align}
Letting $\Gamma_{w,i}^{(\ell)} = \clamp_{\Gamma_{w,i},\Gamma_{w,i+1}}\left(\left(\frac{i}{B}\right)^{\frac1{\max(\mathbf{w})}}\right)$, $\Gamma_{w,i}^{(u)} = \clamp_{\Gamma_{w,i},\Gamma_{w,i+1}}\left(\frac{i}{B}\right)$ for $i\in \{0,\dots,B\}$ and $\eta(x)= B_h^{-1}\int_0^x h^*(y)\,dy$, we have
\begin{align*}
\widehat{A}_{\text{BDV,MM,}h}(\mathbf{w}) &= B_h^{-1} \sum_{i=0}^B \int_{\Gamma_{w,i}}^{\Gamma_{w,i+1}} \frac{\log \breve C(y^{w_1}, \dots, y^{w_d})}{\log y} h^*(y)\, dy\\
&= B_h^{-1} \sum_{i=0}^B \Biggl[ \int_{\Gamma_{w,i}}^{\Gamma_{w,i}^{(\ell)}} \max(\mathbf{w}) h^*(y)\, dy + \int_{\Gamma_{w,i}^{(\ell)}}^{\Gamma_{w,i}^{(u)}} \log \frac{i}{B} \frac{h^*(y)}{\log y}\, dy +  \int_{\Gamma_{w,i}^{(u)}}^{\Gamma_{w,i+1}} h^*(y)\, dy \Biggr] \\
&= \sum_{i=0}^B \Bigl[ \max(\mathbf{w})\left(\eta(\Gamma_{w,i}^{(\ell)}) - \eta(\Gamma_{w,i})\right) - \log\frac{i}{B} \left(g(\Gamma_{w,i}^{(u)})-g(\Gamma_{w,i}^{(\ell)})\right) + \eta(\Gamma_{w,i+1}) \\ & - \eta(\Gamma_{w,i}^{(u)}) \Bigr].
\end{align*}
In our main submission, we use $\widehat{A}_{\text{BDV,MM,}h}$  with $h(y)=\frac{1}{\log(y)}$.

\section{Further Details on Experiments}
\label{sec:exp_details}
\subsubsection*{Architecture Details}
For learning the Pickands dependence function, in all experiments in the manuscript we used 512 width and 1 depth $d$MNNs.
Only the input layer was changed according to the input dimension.
In order to force the weights to be positive, we use weight clipping during training.

For the generative model experiments, we model $p_z$ as a 128-dimensional Gaussian random variable. 
The generator is a basic multi layer perceptron (MLP) with ReLU activations and batch norm. 
For all experiments, we use a width 256 and depth 2 MLP for the generator. 
The output is ensured to be positive through a final ReLU operation.

\subsubsection*{Hyperparameter Tuning}
For the experiments on learning the Pickands dependence function, we used the Adam~\cite{adam_supp} optimizer for optimizing all parameters with learning rate $1 \times 10^{-2}$ with a decay according to the \texttt{ReduceLROnPlateau} decay algorithm with a patience of 100 epochs. 
Each model was trained for 2000 epochs for the survival experiments and 4000 for the sampling experiments.
For the sampling experiments, the generator was trained using Adam with learning rate $1 \times 10^{-3}$, $\beta_1 = 0.5$ and $\beta_2 = 0.99$ with exponential decay on the learning rate of $0.99998$.
Models for the generator were trained for 4000 epochs.

\subsubsection*{Computational Resources}
All experiments were run on an Nvidia RTX Titan GPU with an Intel Core i9-7900X CPU @ 3.30GHz and 64 GB of RAM. 

\newpage

\section{Larger Figures}
\label{sec:large_figs}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.24\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_NaiveEstimator.pdf}  
  \caption{Pickands 3d Margins}
\end{subfigure}
\begin{subfigure}{.24\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_CFGEstimator.pdf}  
  \caption{CFG 3d Margins}
\end{subfigure}
\begin{subfigure}{.24\textwidth}
  \centering
  \includegraphics[width=\linewidth, trim=2pt 0pt 2pt 0pt]{imgs/margins/commodities_3d_BDVEstimatorMM.pdf}  
  \caption{BDV 3d Margins}
\end{subfigure}
\begin{subfigure}{.24\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/commodities_3d_MaxLinear.pdf}  
  \caption{$d$MNN 3d Margins}
\end{subfigure}
\caption{(Larger figures from main text) Qualitative comparison of 3d margins from learned 10d MEV for the commodities dataset. The $d$MNN is the method that retains margins that are valid Pickands dependence functions as the others are non-convex and outside the required bounds. Contours plotted with solid line.}
\end{figure}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_sl_50_w=512d=1.pdf} 
  \caption{$A_\text{SL}$ MSE ($d=2$)}
  \label{fig:sl_survival_512}
\end{subfigure}
\begin{subfigure}{.48\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/survival_asl_50_w=512d=1.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=2$)}
  \label{fig:asl_survival_512}
\end{subfigure} 
\caption{(Larger figures from main text) (\ref{fig:sl_survival_512}, \ref{fig:asl_survival_512}) MSE of survival probabilities for $d=2$ with $100$ samples for $A_\text{SL}$ (\ref{fig:sl_survival_512}) and $A_\text{ASL}$ (\ref{fig:asl_survival_512}). Thresholds are above the $75$th percentile.}
\vspace{-10pt}
\end{figure}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_NaiveEstimator.pdf}  
  \caption{Pickands 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_CFGEstimator.pdf}  
  \caption{CFG 2d Margins}
\end{subfigure} 
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth, trim=2pt 0pt 2pt 0pt]{imgs/margins/ozone_extremal_BDVEstimatorMM.pdf}  
  \caption{BDV 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/ozone_extremal_MaxLinear_w512d1.pdf}  
  \caption{$d$MNN 2d Margins}
  %\label{fig:net_marg_spy}
    \label{fig:net_marg_ozone}
\end{subfigure}
\caption{(Additional figure.) Qualitative comparison of 6 2d margins from learned 4d MEV for the Ozone dataset. The $d$MNN is the method that retains margins that are valid Pickands dependence functions as the others are non-convex and outside the required bounds.}
\label{fig:marg_ozone}
\end{figure}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_NaiveEstimator.pdf}  
  \caption{Pickands 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_CFGEstimator.pdf}  
  \caption{CFG 2d Margins}
\end{subfigure} 
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth, trim=2pt 0pt 2pt 0pt]{imgs/margins/comm_extremal_BDVEstimatorMM.pdf}  
  \caption{BDV 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/comm_extremal_MaxLinear_w512d1.pdf}  
  \caption{$d$MNN 2d Margins}
  %\label{fig:net_marg_spy}
\end{subfigure}
\caption{(Additional figure.) Qualitative comparison of 10 2d margins from learned 10d MEV for the commodities dataset. The $d$MNN is the only method that retains margins that are valid Pickands dependence functions, as the others are non-convex and outside the required bounds.}
\label{fig:marg_comm}
\end{figure}
\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_NaiveEstimator.pdf}  
  \caption{Pickands 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_CFGEstimator.pdf}  
  \caption{CFG 2d Margins}
\end{subfigure} 
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth, trim=2pt 0pt 2pt 0pt]{imgs/margins/spy_extremal_BDVEstimatorMM.pdf}  
  \caption{BDV 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/spy_extremal_MaxLinear_w512d1.pdf}  
  \caption{$d$MNN 2d Margins}
    \label{fig:net_marg_comm}
  %\label{fig:net_marg_spy}
\end{subfigure}
\caption{(Additional figure.) Qualitative comparison of 28 2d margins from learned 418d MEV for the S\&P dataset. The $d$MNN is the only method that retains margins that are valid Pickands dependence functions, as the others are non-convex and outside the required bounds.}
\label{fig:marg_spy}
\end{figure}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_NaiveEstimator.pdf}  
  \caption{Pickands 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_CFGEstimator.pdf}  
  \caption{CFG 2d Margins}
\end{subfigure} 
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth, trim=2pt 0pt 2pt 0pt]{imgs/margins/crypto_extremal_BDVEstimatorMM.pdf}  
  \caption{BDV 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/crypto_extremal_MaxLinear_w512d1.pdf}  
  \caption{$d$MNN 2d Margins}
  %\label{fig:net_marg_spy}
    \label{fig:net_marg_crypto}
\end{subfigure}
\caption{(Additional figure.) Qualitative comparison of 28 2d margins from learned 100d MEV for the Crypto dataset. The $d$MNN is the only method that retains margins that are valid Pickands dependence functions, as the others are non-convex and outside the required bounds.}
\label{fig:marg_crypto}
\end{figure}
\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_sl_w=512d=1_Da.pdf} 
  \caption{$A_\text{SL}$ MSE ($d=256$)}
  \label{fig:sl_mse_est_all_a_supp}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=512d=1_Da.pdf}
  \caption{$A_\text{ASL}$ MSE ($d=256$)}
  \label{fig:asl_mse_est_all_a_supp}
\end{subfigure} 
\begin{subfigure}{.4\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/mse_sl_alln_thick_zoomed.pdf}  
\includegraphics[width=\linewidth]{imgs/uai/box_sl_w=512d=1_Dd.pdf}  
  \caption{$A_\text{SL}$ MSE ($\alpha=0.5$)}
  \label{fig:sl_mse_est_all_d_supp}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/mse_asl_alln_thick_zoomed.pdf}  
    \includegraphics[width=\linewidth]{imgs/uai/box_asl_w=512d=1_Dd.pdf}  
  \caption{$A_\text{ASL}$ MSE ($\alpha=0.5$)}
  \label{fig:asl_mse_est_all_d_supp}
\end{subfigure}  

\caption{(Larger figures from main text) Comparison of $\lVert\hat{A}(\mathbf{w}) - A(\mathbf{w})\rVert_2^2$ for different estimators $\hat{A}$, for different dependence parameters $\alpha \in \{0.25, 0.50, 0.75, 1.0\}$ with $d=256$ (\ref{fig:sl_mse_est_all_a_supp}, \ref{fig:asl_mse_est_all_a_supp}) and for fixed $\alpha=0.5$ with $d \in \{256, 512, 728, 1024\}$ (\ref{fig:sl_mse_est_all_d_supp}, \ref{fig:asl_mse_est_all_d_supp}), for $A_\text{SL}$ (\ref{fig:sl_mse_est_all_a_supp}, \ref{fig:sl_mse_est_all_d_supp}) and $A_\text{ASL}$ (\ref{fig:asl_mse_est_all_a_supp}, \ref{fig:asl_mse_est_all_d_supp}). Results are over 50 runs with 100 training samples for each run.}
    \vspace{-10pt}
\end{figure}
\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_NaiveEstimator.pdf}  
  \caption{Pickands 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_CFGEstimator.pdf}  
  \caption{CFG 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_BDVEstimatorMM.pdf}  
  \caption{BDV 2d Margins}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/margins/cali_extremal_MaxLinear_w512d1.pdf}  
  \caption{$d$MNN 2d Margins}
\end{subfigure}
\caption{(Larger figures from main text) Qualitative comparison of 10 out of 45 total 2d margins from learned 10d MEV for the California Winds dataset. The $d$MNN is the only method that retains margins that are valid Pickands dependence functions. }
\vspace{-10pt}
\end{figure}

\begin{figure}[ht!]
    \centering
\begin{subfigure}{.4\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/sl_225_neurips_thick.pdf}   
  \includegraphics[width=\linewidth]{imgs/uai/sl_CFG_sampling_da.pdf}  

  \caption{SL CFG MSE $\Delta \alpha$}
  \label{fig:sl_mse_gen_supp}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  %\includegraphics[width=\linewidth]{imgs/asl_225_neurips_thick.pdf}  
    \includegraphics[width=\linewidth]{imgs/uai/asl_CFG_sampling_da.pdf}  
  \caption{ASL CFG MSE $\Delta \alpha$}
  \label{fig:asl_mse_gen_supp}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/sl_CFG_sampling_dd.pdf}  
  \caption{SL CFG MSE $\Delta d$}
  \label{fig:sl_mse_gen_all_d_supp}
\end{subfigure}
\begin{subfigure}{.4\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/uai/asl_CFG_sampling_dd.pdf}
  \caption{ASL CFG MSE $\Delta d$}
  \label{fig:asl_mse_gen_all_d_supp}
\end{subfigure}
\caption{(Larger figures from main text) MSE of the CFG estimate for 1000 samples and 1000 simplex points, at $d=225$ for various $\alpha \in (0,1)$ (\ref{fig:sl_mse_gen_supp}, \ref{fig:asl_mse_gen_supp}) and at $\alpha=0.5$ for $d \in \{64, 128, 256, 784, 1024\}$ (\ref{fig:sl_mse_gen_all_d_supp}, \ref{fig:asl_mse_gen_all_d_supp}), for $A_\text{SL}$ (\ref{fig:sl_mse_gen_supp}, \ref{fig:sl_mse_gen_all_d_supp}) and $A_\text{ASL}$ (\ref{fig:asl_mse_gen_supp}, \ref{fig:asl_mse_gen_all_d_supp}), for data sampled from the generative model (blue), the $d$MNN (orange), and exact samples (green). Both models were trained with 1000 data points.}
\vspace{-5pt}
\end{figure}

\newpage
\section{Algorithms}
\label{sec:algs}
Here we provide algorithms for the estimation and sampling procedures presented in the main text.
We recall the transformations on $\bar{M}_k^{(n)}$:
\begin{align}
    \label{transform1_supp}
    \widetilde{M}_k^{(n)} & = - \log (F_k(\bar{M}_k^{(n)})), \: \forall k \in \{1, \ldots, d\}, \\ 
    \label{transform2_supp}
    Z_w & = \min_{k=1,\ldots,d} \widetilde{M}_k^{(n)} / w_k.
\end{align}
Then, we have: $\mathbb{P} \left[ Z_w > z\right] = e^{- z A(\mathbf{w})}$.
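Additionally, recall the definition of the copula in terms of $A$:
\begin{equation}
    \label{eq:pickands_copula_supp}
    C(u_1, \ldots, u_d) = \exp \left( - \left( \sum_{k=1}^d \tilde{u}_k \right) A \left( \frac{\tilde{u}_1}{\sum_{k=1}^d \tilde{u}_k}, \ldots, \frac{\tilde{u}_d}{\sum_{k=1}^d \tilde{u}_k} \right) \right), \qquad \tilde{u}_k = - \log u_k.
\end{equation}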
\begin{algorithm}[ht!]
	\caption{Fitting the Pickands-$d$MNN to Data} 
	\begin{algorithmic}[1]
	\STATE \textbf{Input:} $\left \{ \left(X_1^{(i)}, \ldots, X_d^{(i)} \right) \right \}_{i=1}^N$, $N=B \times n$ samples of i.i.d. random vectors where $B$ is the number of blocks of data and $n$ is the size of each block.
	\STATE Take component-wise maxima over each block: $\left \{ \left(M_{1}^{(n,b)}, \ldots, M_{d}^{(n,b)} \right)\right \}_{b=1}^B$ where $M_{k}^{(n,b)}=\max_{i=(b-1)n+1,\ldots,bn}X_k^{(i)}$, $(k, b) \in \{1, \ldots, d\} \times \{1, \ldots, B\}$.
	\STATE Fit a GEV distribution to the maxima $\{ M_{k}^{(n,b)} \}_{b=1}^B$ of each component, obtain $\{\bar{M}_{k}^{(n,b)} \}_{b=1}^B$, then estimate the marginals $F_k$ for each $k \in \{1, \ldots, d \}$.
	\STATE \textbf{Initialize} the parameters ${\bm \theta} \geq 0$ of the $d$MNN \\ 
    \textbf{Repeat}: 
    \STATE Randomly sample a minibatch of training data $\{\bar{M}_{k}^{(n,b)} \}_{b \in \text{batch}}$
    and uniformly sample $\mathbf{w} \in \Delta_{d-1}$.
    \STATE  Transform samples according to Equations \eqref{transform1_supp} and \eqref{transform2_supp} to obtain transformed samples $\{Z_{w, b} \}_{b \in \text{batch}}$. 
    \STATE Compute gradient
     $\nabla_{{\bm\theta}}  \sum_{b \in \text{batch}} \mathcal{L}\left(Z_{w, b}; {\bm \theta} \right)$.\\
    \STATE Update $\bm \theta$ with Adam \citep{adam_supp}. \\
    \textbf{Until} convergence \\ 
    \textbf{Output:} $A^{\star}_{\bm \theta}(\mathbf{w})$. 
	\end{algorithmic} 
	\label{alg_train_supp}
\end{algorithm}

\begin{algorithm}[ht!]
	\caption{Estimating survival probabilities with the Pickands dependence function} 
	 \label{alg_survival}
	\begin{algorithmic}[1]
	\STATE \textbf{Input:} $\{\bar{M}_{n, b}^{(k)} \}_{b=1}^B$, thresholds: $\left(\gamma_1, \cdots, \gamma_d \right)$. \\ 
	\STATE Train a model $A(\mathbf{w}; \theta)$ with the transformed variables $\{(G_1(\bar{M}_{n, b}^{(1)}),\dots,G_d(\bar{M}_{n, b}^{(d)})) \}_{b=1}^B$ using Algorithm~\ref{alg_train_supp} and obtain $A(\mathbf{w}; \theta_*)$.
	\STATE Evaluate the Pickands copula:
	\begin{align*}
	    C\left(1- F_1(\bar{\gamma}_1), \cdots,  1- F_d(\bar{\gamma}_d) \right),
	\end{align*}
	where $C$ is calculated as in Equation \eqref{eq:pickands_copula_supp} with $A = A(\mathbf{w}; \theta_*)$. 
	\end{algorithmic}  
\end{algorithm}

\begin{algorithm}[ht!]
	\caption{Training a Generator for a Pickands Copula} 
	 \label{alg:train_gen}
	\begin{algorithmic}[1]
	\STATE \textbf{Input:} $A(\mathbf{w})$, $p_z$, tolerance parameter $\epsilon$ \\ 
	\STATE Initialize parameters $\phi$ of generator $G( \cdot ; \phi)$ \\
	\STATE Sample $\{ \mathbf{w}^{(j)}\}_{j=1}^{N_\text{simplex}}$ samples uniformly over $\Delta_{d-1}$.\\
	\WHILE{$ \sum_{j =1}^{N_\text{simplex}} \mathcal{L}(\mathbf{w}^{(j)}; \phi) > \epsilon$}
	\STATE Sample $\{\mathbf{w}^{(j)}\}_{j=1}^{N_\text{simplex}}$ samples uniformly over $\Delta_{d-1}$. \\
	\STATE Sample $\{\mathbf{y}^{(i)}\}_{i=1}^{N_\text{gen}}$ where $\mathbf{y}^{(i)} = G(\mathbf{z}^{(i)}; \phi), \mathbf{z}^{(i)} \sim p_z$ for $1 \leq i \leq N_{\text{gen}}$. \\
	\STATE Define $\eta(\mathbf{w}, \mathbf{y}) = \max \{ \mathbf{w} \odot \mathbf{y} \}$ with $\odot$ denoting the point-wise multiplication. \\
	\STATE Compute the gradient w.r.t.\ $\phi$ of $\sum_{j=1}^{N_{\text{simplex}}}\mathcal{L}(\mathbf{w}^{(j)}; \phi)$ where:
	\vspace{-10pt}
	\begin{equation*}
	    \mathcal{L}(\mathbf{w}^{(j)}; \phi) = \left( A(\mathbf{w}^{(j)}) - \frac{1}{N_\text{gen}} \sum_{i=1}^{N_\text{gen}} \eta(\mathbf{w}^{(j)}, \mathbf{y}^{(i)}) \right)^2  + \left\| \frac{1}{N_\text{gen}}\sum_{i=1}^{N_\text{gen}}\mathbf{y}^{(i)} - \mathbf{1} \right\|_2^2 .
	\end{equation*}
	\STATE Update $\phi$ using Adam \citep{adam_supp}.
	\ENDWHILE
	\STATE \textbf{Output:} $G(\cdot; \phi_*)$.
	\end{algorithmic}  
\end{algorithm}

\begin{algorithm}[ht!]
	\caption{Heuristic for Sampling From a Given Pickands Copula \citep[Algorithm 1]{hofert2018hierarchical_supp}}
	 \label{alg:sampling}
	\begin{algorithmic}
	\STATE \textbf{Input:} $A(\mathbf{w})$, $N_\text{max} \in \mathbb{N}$ with $N_\text{max} > 1$ \\
	\STATE Optimize a generator $G(\cdot; \phi)$ using Algorithm~\ref{alg:train_gen}. \\
	\FOR{$i \in \{1, \ldots, N_\text{max}\}$}
	\STATE Generate $\mathbf{y}^{(i)}$ where $\mathbf{y}^{(i)} = G(\mathbf{z}^{(i)}; \phi_*), \mathbf{z}^{(i)} \sim p_z$. 
	\STATE Sample $\epsilon_i \sim \text{Exp}(1)$ and set $\xi^{(i)} = 1 / \sum_{k=1}^{i} \epsilon_k$, so that $\{\xi^{(i)}\}_{i=1}^{N_\text{max}}$ are points of the Poisson process.
	\ENDFOR
	\STATE Compute the component-wise maxima as: $M = \max_{1 \leq i \leq N_\text{max}} \{ \xi^{(i)} \odot \mathbf{y}^{(i)} \}$.
	\STATE \textbf{Output:} $M$.
	\end{algorithmic}  
\end{algorithm}

\begin{comment}
\section{Percentage Accuracy}\label{sec:perc_acc}

\begin{table*}
\newcommand{\timesten}{\text{\footnotesize $\times10$}}
\centering
\begin{tabular}{@{}lllll@{}} 
\toprule
 & \textbf{Pickands} & \textbf{CFG} & \textbf{BDV} & \textbf{Proposed}  \\ 
\midrule
 Wind &  $11.6(8.3)\%$  & $9.6(7.4)\%$    &   $ 38.3(32.5) \%$  &   $\bf 40.5(30.9)\%$      \\
 Ozone & $3.3(4.5)\%$ &  $29.2(15.0)\%$   & $ 29.2(38.9)\%$ & $\bf 38.2(24.0)\%$      \\
 Commodities & $30.7(32.4)\%$  & $8.1(8.8)\%$     & $14.5(9.2)\%$    &  $\bf 46.6(34.4)\%$     \\
 S\&P 500 & $ \bf 55.5(46.9)\%$ & $ 1.3(1.9)\%$ & $40.2(40.8)\%$ & $ 2.9 (4.2) \%$ \\
 Crypto & $ 15.0(2.2)\%$ & $2.4 (4.2)\%$ &$ 31.9(34.6)\%$ & $ \bf 59.7(39.2)\%$  \\
\bottomrule
\end{tabular}
    \caption{\% accuracy for data presented in main submission. Values are based on computations in Section~\ref{sec:perc_acc}.}
    \label{tab:pct_error}
\end{table*}

Here we complement the results in Section 4 by adding an additional performance measure that we call percentage accuracy as shown in Table \ref{tab:pct_error}. The percentage accuracy quantifies the percentage of time a given estimator has the closest estimated survival probability to the empirical estimator. This is quantified as: $$\text{Acc}(\text{est}) =  \frac{1}{|Q|} \sum_{\gamma \in Q} \mathbb{1}\{ \text{est} = \arg \min_{\text{est}' \in \text{estimators}} (\mathbb{P}_{\text{est}'}(\gamma) -  \mathbb{P}_{\text{emp}}(\gamma))^2\}$$, where $\mathbb{P}_{\text{est}}(\gamma)$ is the survival probability estimated by an estimator: $\text{est} \in \text{estimators}= \{\text{Pickands, CFG, BDV, Proposed}\}$ for a given threshold $\gamma$ corresponding to a specific quantile. 
$\mathbb{P}_{\text{emp}}(\gamma) = \frac{1}{B} \sum_{b=1}^B \mathbb{1}\{ M_{n, b} \geq \gamma\}$ with $M_{n, b} = \left(M_{n, b}^{(1)}, \cdots, M_{n, b}^{(d)} \right)$ the $d-$dimensional vector of point-wise maxima (or point-wise maximum drawdown over a period of interest) denotes the empirical survival probability for a given quantile. For most datasets (except for S\&P 500), we achieve the highest percentage accuracy as compared to other estimators. 
In the S\&P dataset, the proposed method provided the best results in terms of MSE, which suggests that a few points had a large influence on the final metric.
\end{comment}
\section{Sampling Examples}
\begin{comment}
\subsection{Sampling from the $d$MNN}
Here we present a few qualitative results on sampling using the weights of the $d$MNN.


\begin{figure}
    \centering
\begin{subfigure}{.23\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/d=2_a=0.0.pdf}  
  \caption{$\alpha = 0$}
\end{subfigure} 
\begin{subfigure}{.23\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/d=2_a=1.0.pdf}  
  \caption{$\alpha = 1$}
\end{subfigure}
\begin{subfigure}{.23\textwidth}
  \centering
  \includegraphics[width=\linewidth]{imgs/d=2_a=0.5.pdf}  
  \caption{$\alpha = 0.5$}
\end{subfigure}   
    \caption{Comparison of samples for $A = A_{SL}$. Blue points are from the proposed method and orange points are true samples.}
    \label{fig:samples}
\end{figure}
In Figure~\ref{fig:samples} we show examples of samples from the symmetric logistic distribution for different values of the dependence parameter $\alpha$. 
Samples from our heuristic and learned generator (blue) are compared to the exact samples (orange) provided by the method in \cite{stephenson2003simulating}. 
\end{comment}

% \section{Direct sampling from data}
% We build on the method proposed in Section 3.2 and detailed in Algorithm 3 to propose a learning procedure that directly samples from data. In more concrete terms, given samples $\{ M_i\}_{i=1}^n$ assumed to be distributed according to a MEV distribution $F_M$, we train a generator $G(.)$ such that a generated sample $\tilde{M} = G(\mathbf{z}) \sim F$ for $\mathbf{z} \sim p_z$. 

% We start by training a generator parameterized by $\phi$, $G(.; \phi)$ to learn the data underlying Pickands dependence function. This is done by modeling 
% \begin{align*}
%     & A(w; \phi) = \mathbb{E}_y \max_{k=1, ..., d} w_k y_k, \:\:\: y_k = G(z_k; \phi), \quad \text{s.t.} \:\: \mathbb{E}[y] = 1.  \\
%     & \max_{\phi} \mathbb{E}_{Z_w \sim \text{real} } \log \text{Exp}(z | A(w; \phi)).
% \end{align*}

% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite

% \pagebreak 
%\bibliography{refs}
% \bibliographystyle{icml2021}
%\bibliographystyle{numeric}
% \appendix

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DELETE THIS PART. DO NOT PLACE CONTENT AFTER THE REFERENCES!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\bibliography{hasan_123}


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021. Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
