\section{Extended Background}
\label{sec:extended_background}

\subsection{Bernstein Polynomials}
\label{sec:bernstein_poly}
Bernstein polynomials of order $M$ are defined as
%
\begin{equation}
h(z) = \frac{1}{M+1}\sum_{i=0}^M \operatorname{Be}_i(z) \vartheta_i,
\end{equation}
where $\operatorname{Be}_i(z)$ is the density of a Beta distribution with parameters $i+1$ and $M-i+1$, and $\vartheta_0,\ldots,\vartheta_M$ are the Bernstein coefficients~\cite{Farouki2012}.
As $M$ increases, Bernstein polynomials become a \emph{universal approximator} for smooth functions in $[0,1]$ \cite{Farouki1988}.
In practice, $M \ge 10$ is often sufficient~\cite{Hothorn2018}.

Bernstein polynomials are defined for values of $z$ within the range $[0,1]$.  Outside this interval, linear extrapolation is performed.
To guarantee invertibility, transformation functions must be bijective, achieved by strict monotonicity. When using Bernstein polynomials, monotonicity is enforced by constraining the Bernstein coefficients $\vartheta_0,\ldots,\vartheta_M$ to be increasing. This is achieved by recursively applying a strictly positive function like $\softplus$ to an unconstrained vector $\tilde\vartheta_0,\ldots,\tilde\vartheta_M$, such that $\vartheta_0=\tilde\vartheta_0$ and $\vartheta_{k}=\vartheta_{k-1} + \softplus(\tilde\vartheta_k)$ for $k=1,\ldots,M$.
Parameter estimation can depend on initialization, as $\vartheta_0$ is directly derived from unconstrained parameters.
This recursive parameterization of increasing coefficients follows~\cite{Sick2021}.
The inverse transformation can be found using a root-finding algorithm~\cite{Chandrupatla1997}.

We require transformations to cover at least the range $[-3,3]$ (e.g., $\pm 3\sigma$ of a standard Gaussian). Since the boundary values of a Bernstein polynomial are given by its first and last coefficients ($h(0)=\vartheta_{0}$ and $h(1)=\vartheta_{M}$), we determine these from unrestricted parameters $\tilde{\vartheta}_0$ and $\tilde{\vartheta}_{M+1}$ via $\vartheta_0 = -\softplus(\tilde{\vartheta}_0) - 3.0 \le -3$ and $\vartheta_M =  \softplus(\tilde{\vartheta}_{M+1}) + 3.0 \ge 3$.
To ensure $\sum_{k=1}^M{(\vartheta_k - \vartheta_{k-1})}=\vartheta_M - \vartheta_0 =: \Delta$, the remaining coefficients $\vartheta_k$ for $k=1,\ldots,M$ are calculated as:
\begin{equation}
\vartheta_k = \vartheta_{k-1} + \Delta\cdot\softmax\left(\left[\tilde\vartheta_1, \tilde\vartheta_2,\ldots, \tilde\vartheta_M\right]\right)_{k-1}.
\end{equation}
Since $\Delta$ and all $\softmax$ components are non-negative, $\vartheta_k - \vartheta_{k-1} \ge 0$, ensuring monotonicity.

\subsection{Maximum Likelihood Estimation for Parameter Inference}
%Transformations are parameterized functions ranging from affine transformations \citep{Dinh2017,Papamakarios2018,Kingma2016} to complex spline \citep{Durkan2019,Durkan2019a} or polynomial functions \citep{Sick2021,Arpogaus2023,Jaini2019}.
Parameter estimation usually involves minimizing a divergence between the true distribution $p_{y}(y)$ and the transformation model $p_y(\mathbf{y}|\theta)$ regarding the model's parameters $\theta$ \citep{Papamakarios2021}.
Since $p_y(y)$ is unknown, we minimize the negative log-likelihood of the empirical distribution $p_{\mathcal{D}}$ obtained from a finite set $\mathcal{D} = \{\mathbf{y}_1,\ldots,\mathbf{y}_N\}$ of $N$ i.i.d.\ observations:
\begin{equation}\label{eq:nll}
\nll = - \sum_{y \in \mathcal{D}}\log \left(p_y(y;\theta)\right)
\end{equation}
which is equivalent to minimizing the KL divergence between $p_\mathcal{D}$ and the flow-based model $p_y (y; \theta)$ \citep{Papamakarios2021}.

\subsection{Multivariate Conditional Transformation Models}
\label{sec:mctm}

\glspl{MCTM} \citep{Klein2022} use element-wise transformations $\tilde{h}_{j}\left(y_{j}\right), j=1, \ldots, J$ and a lower triangular $(J \times J)$ matrix $\Lambda$ to model correlations:
\begin{equation}
h_j(y_1,\ldots,y_j) = \lambda_{j 1} \tilde{h}_{1}(y_1)+\ldots+\lambda_{j,j-1} \tilde{h}_{j-1}(y_{j-1}) + \tilde{h}_j(y_j)
\end{equation}
For conditional distributions, $\Lambda$ and transformation parameters $\theta$ can depend on covariates $\mathbf{x}$:
\begin{equation}
h_j(\mathbf{y}|\mathbf{x}) = \sum_{i=1}^{j-1}\lambda_{ji}(\mathbf{x})\tilde{h}_i(y_i;\theta_i(\mathbf{x})) + \tilde{h}_j(y_j;\theta_j(\mathbf{x}))
\end{equation}

\subsection{Autoregressive Transformation Models}
\label{sec:ar_models}
Autoregressive flows factorize multivariate distributions based on the chain rule of probability:
\begin{equation}
p_y(\mathbf{y}) = \prod_{i=1}^D p_y(y_i|\mathbf{y}_{<i})
\end{equation}
Applying the change of variables formula yields:
\begin{equation}
p_y(\mathbf{y}) = p_z\left(h(y)\right) \left|\det\nabla{h}(y)\right| = \prod_{i=1}^D p_z\left(h_i(y_{i},\mathbf{y}_{<i})\right)\left|\det\nabla{h}_i(y_{i},\mathbf{y}_{<i})\right|
\end{equation}
where $h_i$ is a diffeomorphism applied to the $i$-th element of $\mathbf{y}$, conditioned on preceding elements $\mathbf{y}_{<i}$.
The lower triangular Jacobian determinant is the product of its diagonal elements:
\begin{equation}
\det\nabla{h}(y) = \prod_{i=1}^D \frac{\partial h_i}{\partial y_i}
\end{equation}
This is implemented as $z_i=h_i(y_i, \theta_i)$ with $\theta_i=c_i(\mathbf{y}_{<i})$, where the conditioner $c_i$, often a neural network, captures dependencies and enforces the autoregressive property \citep{Papamakarios2021}. New samples are obtained via the inverse transformation $y_i=h_i^{-1}(z_i, \theta_i)$.

Two common architectures for the conditioner are coupling layers and masked autoregressive networks. Coupling flows \citep{Dinh2017} split the response $\mathbf{y}\in\mathbb{R}^D$ into subsets $(\mathbf{y}_A, \mathbf{y}_B) \in (\mathbb{R}^{d},\mathbb{R}^{D-d})$, with one subset conditioning the transformation of the other:
\begin{equation}
\mathbf{z} = \begin{cases}
\mathbf{z}_A = \mathbf{y}_A \\
\mathbf{z}_B = h(\mathbf{y}_B; \Theta(\mathbf{y}_A))
\end{cases}
\end{equation}
\gls{MADE} \citep{Germain2015}, as used in \glspl{MAF} \citep{Papamakarios2018} and \glspl{IAF} \citep{Kingma2016}, generalizes this using masked neural networks to enforce autoregressive constraints.

In \gls{MADE}, the output $d$ depends only on inputs $\mathbf{y}_{<d}$. This is achieved by element-wise multiplying weight matrices by binary masks, zeroing connections that violate the autoregressive property. Whether the masked network uses $\mathbf{y}$ or the latent representation $\mathbf{z}$ as input affects only whether inference or sampling is iterative \citep{Papamakarios2021}.

\subsection{Structured Additive Predictors for Enhanced Flexibility}
\label{sec:sap}

Structured Additive Predictors \citep[\gls{SAP};][]{Fahrmeir2013,Fahrmeir2009} allow for (non-)linear effects, interactions, and other structured terms. An exemplary predictor is given by
\begin{equation}
\eta(\mathbf{x}) = \beta_0 + \sum_{u=1}^{U} f_u(x_u) + \sum_{u < v} f_{uv}(x_u, x_v) + \dots 
\end{equation}
Here, $f_u(x_u)$ is a linear or non-linear function, where the latter is usually specified using regression splines to stay in the context of parametric regression. $f_{uv}(x_u,x_v)$ are linear or smooth interaction effects. The order of interaction determines the degree of interpretability. In our hybrid approach, \gls{SAP} can be used to parameterize marginal shifts $\beta_j(\mathbf{x})$ in $H_1$.

% \subsection{Relation to Copula Methods}
% \label{sec:copula}
% 
% Copulas model multivariate distributions by separating marginal distributions from the dependence structure.
% Sklar's Theorem states that a copula function can express any \gls{CDF}. 
% Let $F_{\mathbf{Y} | \mathbf{X}}(\mathbf{y} | \mathbf{x})$ be the joint \gls{CDF} of the response vector $\mathbf{Y} = (Y_1, \dots, Y_J)^\top$ given features $\mathbf{X}$.
% Sklar's theorem implies a copula function $C(u_1, \dots, u_J | \mathbf{x})$ such that:
% \begin{align*}
%   &F_{\mathbf{Y} | \mathbf{X}}(y_1, \dots, y_J | \mathbf{x})\\
%   &= C(F_{Y_1 | \mathbf{X}}(y_1 | \mathbf{x}), \dots, F_{Y_J | \mathbf{X}}(y_J | \mathbf{x}) | \mathbf{x}),
% \end{align*}
% where $u_j = F_{Y_j | \mathbf{X}}(y_j | \mathbf{x})$ are uniform marginal \glspl{CDF}.
% The copula $C$ is a multivariate \gls{CDF} on $[0,1]^J$ with uniform marginals.
% Our hybrid approach directly relates to this.
% The first step ($H_1$) models marginals $F_{Y_j|\mathbf{X}}$ and transforms them to the base distribution $F_Z$.
% Applying the \gls{PIT}, $u_j = F_Z(z_{1j})$, obtains uniform marginals.
% 
% \begin{figure}[ht]%
%   \begin{subfigure}[t]{0.32\linewidth}
%     \centering
%     \includegraphics[width=\linewidth]{gfx/moons.pdf}
%     \caption{\tiny Original Data}
%   \end{subfigure}
%   \hfil%
%   \begin{subfigure}[t]{0.32\linewidth}
%     \centering
%     \includegraphics[width=\linewidth]{moons_w.pdf}
%     \caption{\tiny Normalized Marginals}
%   \end{subfigure}
%   \hfil%
%   \begin{subfigure}[t]{0.32\linewidth}
%     \centering
%     \includegraphics[width=\linewidth]{moons_pit.pdf}
%     \caption{\tiny Uniform Marginals}
%   \end{subfigure}
%   \caption{Illustration of the hybrid approach on the Moons dataset.
%     (a) The original data exhibits a non-linear dependency structure.
%     (b) After applying $H_1$, the marginal distributions are normalized.
%     (c) The autoregressive flow $H_2$ further transforms the data to obtain approximately independent uniform marginals, implicitly modeling the copula function.}
%   \label{fig:copula_illustration}
% \end{figure}
% 
% The copula density $c$ of $\bm{u}=(u_1, \dots, u_J)^\top$ is the ratio of the joint density to the product of the marginal densities:
% \begin{equation*}
%   c(\bm{u} | \mathbf{x}) = \frac{f_{\mathbf{Y} | \mathbf{X}}(F^{-1}_{Y_1 | \mathbf{X}}(u_1 | \mathbf{x}), \dots, F^{-1}_{Y_J | \mathbf{X}}(u_J | \mathbf{x}) | \mathbf{x})}{\prod_{j=1}^J f_{Y_j | \mathbf{X}}(F^{-1}_{Y_j | \mathbf{X}}(u_j | \mathbf{x}) | \mathbf{x})}.
% \end{equation*}

\clearpage

\section{Extended Results}

\subsection{Negative Log-Likelihood on 2D Simulation Datasets}
\label{sec:sim-nll}

\begin{table*}[ht!]
\centering
\begin{tabular}{lllll}
  \toprule
  dataset name & \multicolumn{2}{c}{circles} & \multicolumn{2}{c}{moons}                                           \\
  conditional  & False                       & True                      & False              & True               \\
  model        &                             &                           &                    &                    \\
  \midrule
  MVN          & -0.204 $\pm$ 0.000          & -0.423 $\pm$ 0.002        & -0.151 $\pm$ 0.002 & -0.704 $\pm$ 0.004 \\
  MCTM         & -0.490 $\pm$ 0.002          & -0.489 $\pm$ 0.002        & -0.536 $\pm$ 0.000 & -1.046 $\pm$ 0.006 \\
  MAF (S)      & -1.123 $\pm$ 0.022          & -1.123 $\pm$ 0.022        & -1.611 $\pm$ 0.042 & -1.611 $\pm$ 0.042 \\
  MAF (B)      & -1.179 $\pm$ 0.014          & -1.179 $\pm$ 0.014        & -1.625 $\pm$ 0.016 & -1.625 $\pm$ 0.016 \\
  CF (S)       & -1.045 $\pm$ 0.132          & -1.861 $\pm$ 0.056        & -1.587 $\pm$ 0.052 & -2.306 $\pm$ 0.050 \\
  CF (B)       & -0.651 $\pm$ 0.202          & -1.657 $\pm$ 0.038        & -1.350 $\pm$ 0.106 & -2.186 $\pm$ 0.116 \\
  \midrule
  HCF (S)      & -1.175 $\pm$ 0.012          & -1.870 $\pm$ 0.018        & -1.628 $\pm$ 0.018 & -2.332 $\pm$ 0.014 \\
  HCF (B)      & -1.071 $\pm$ 0.024          & -1.826 $\pm$ 0.078        & -1.583 $\pm$ 0.042 & -2.332 $\pm$ 0.032 \\
  \bottomrule
\end{tabular}
\caption{
    Test negative log-likelihood on 2D simulation datasets (lower is better).
    Log-likelihoods are averaged over 20 trials, and their spread is reported as two standard deviations.
}
\label{tab:simulated_results}
\end{table*}

\clearpage

\subsection{Scatter plots of Samples from Malnutrition Models}
\label{sec:malnutrition_samples}

\autoref{fig:malnutrition_samples} shows that \gls{MCTM} captures marginals well but fails to model dependencies.
The \gls{HMAF} models, especially with spline transformations (\gls{HMAF}~(S)), show a greater resemblance to the observed data in the pairwise density plots.

\begin{figure}[h!]
\centering
\begin{subfigure}[t]{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{malnutrition_data.pdf}
\caption{Random samples from the validation dataset.}
\end{subfigure}%
~
\begin{subfigure}[t]{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{malnutrition_conditional_multivariate_transformation_model_samples.pdf}
\caption{Random samples from the \gls{MCTM} model.}
\end{subfigure}
\\
\begin{subfigure}[t]{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{malnutrition_conditional_hybrid_masked_autoregressive_flow_quadratic_spline_samples.pdf}
\caption{Random samples from the hybrid model using quadratic splines as the transformation function.}
\end{subfigure}%
~
\begin{subfigure}[t]{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{malnutrition_conditional_hybrid_masked_autoregressive_flow_bernstein_poly_samples.pdf}
\caption{Random samples from the hybrid model using Bernstein polynomials as the transformation function.}
\end{subfigure}
\caption{Scatter plots of the three target variables—\texttt{stunting}, \texttt{wasting}, and \texttt{underweight}.
         In the upper triangular section, random data points are illustrated, either from the validation dataset or sampled from the three models.
         The diagonal displays the marginal kernel density estimation (KDE) plots, while the lower triangular region contains the two-dimensional KDE plots.
         The data is categorized by the variable \texttt{cage}.}
\label{fig:malnutrition_samples}
\end{figure}

\subsection{Interpretability of Covariate Effects}
\label{sec:interpretation_details}

Interpretable covariate effects are important in many regression tasks. \glspl{CTM} offer a balance between interpretability and flexibility \citep{Hothorn2014}. The base distribution and transformation function parameterization determine the interpretability.

In a univariate \gls{CTM} with covariate $x$:
\begin{equation}
\mathbb{P}(Y \leq y | X = x) = F_Z(h(y | \theta_x)),
\end{equation}
where $h(y | \theta_x) = \alpha(y)^\top \bm{\vartheta} + \beta x$, with $\beta$ being the covariate effect on the transformed response and $\alpha(y)^\top \bm{\vartheta}$ controlling the distribution's shape.

\begin{itemize}
\item \textbf{Logistic Distribution:} $\beta$ is the log-odds ratio, quantifying the change in the odds of $Y \le y$ associated with a unit increase in $x$.
\item \textbf{Minimum Extreme Value Distribution:} $\beta$ is the log-hazard ratio, representing the influence of $x$ on the instantaneous risk of an event.
\item \textbf{Gaussian Distribution:} With a linear transformation $h(y|\theta_x)$, $\beta$ is the change in $Y$'s conditional mean per unit change in $x$ (scaled by the standard deviation).
With non-linear $h$, $\beta$'s interpretation is less direct, affecting multiple moments.
\end{itemize}

\subsection{Interpretable Covariate Effect on the Dependency Structure in MCTMs}

As described in \cite{Klein2022}, the covariate effect on the dependency in the \gls{MCTM} model can be interpreted as Spearman's rank correlation from the covariance matrix

\begin{equation}
\Sigma = \Lambda^{-1} \Lambda^{-\top}
\end{equation}

via the Pearson correlation coefficient,

\begin{equation}
\rho_{ij} = \frac{\Sigma_{ij}}{\sigma_{i} \sigma_{j}} = \frac{\Sigma_{ij}}{\sqrt{\Sigma_{ii} \Sigma_{jj}}}
\end{equation}

where $\Sigma_{ij}$ represents the covariance between variables $y_i$ and $y_j$ and $\Sigma_{ii}$ the variance $\sigma_i^2$. To convert Pearson correlations to Spearman's rank correlation, the following transformation is applied:

\begin{equation}
\rho_s = \frac{6}{\pi} \arcsin\left(\frac{\rho}{2}\right)
\end{equation}

The resulting covariate-dependent rank correlations are shown in \autoref{fig:rank_correlation_comparison}.
\begin{figure}[htb!]
\centering
\includegraphics{malnutrition_conditional_multivariate_transformation_model_rank_corr.pdf}
\caption{Comparison of Spearman's rank correlation $\rho^S$ estimates between \texttt{stunting}, \texttt{wasting}, and \texttt{underweight} with respect to \texttt{cage}.}
\label{fig:rank_correlation_comparison}
\end{figure}

% \section{Calibration of marginal distributions}
% 
% \begin{figure}[htb!]
%   \centering
%   \includegraphics{malnutrition_conditional_multivariate_transformation_model_reliability_diagram.pdf}
%   \caption{Reliability diagram for marginal CDFs using consistency bars, as proposed by \cite{Brocker2007a}.
%     The x-axis represents the model-implied marginal \gls{CDF} values, and the y-axis represents the \gls{ECDF} values computed from generated samples.
%     The observed relative frequencies all fall within the $2.5\% - 97.5\%$ quantiles (indicated by vertical bars).
%     \Gls{ECDF} values for samples have combined into 10 equidistant bins: $[0., 0.1,\ldots, 0.9, 1.]$.
%     A perfectly calibrated model would lie on the diagonal line.}
%   \label{fig:reliability_diagram}
% \end{figure}
% 
% Reliability diagrams are a common choice to assess the relationship between predicted probabilities and observed frequencies. To construct a reliability diagram, we divide the interval $[0, 1]$ into a number of bins. For each bin, we calculate the proportion of observations that fall within the probability range corresponding to the bin's predicted probability range. We then plot these observed proportions against the midpoints of the probability bins. For perfectly calibrated forecasts, the points on the reliability diagram should lie on the diagonal line, indicating that the predicted probabilities accurately reflect the observed frequencies~\cite{Wilks2011, Brocker2007a}.
% 
% \autoref{fig:reliability_diagram} shows the reliability diagrams for the conditional marginal \glspl{CDF} $F(y_j|\text{age})$, including consistency bars, indicating the $2.5\% - 97.5\%$ confidence interval of the observed frequencies. Wider consistency bars indicate greater uncertainty in the observed proportions; deviations from the diagonal suggest biases in the predicted probabilities. Overall, the observations seem to fall just about on the diagonal, indicating a good marginal fit of the model.

\subsection{Benchmark Datasets}
\label{sec:benchmark_data}

We evaluate our method on five common benchmark datasets:
POWER\footnotehyperlink{https://archive.ics.uci.edu/dataset/235},
GAS\footnotehyperlink{https://archive.ics.uci.edu/dataset/322},
HEPMASS\footnotehyperlink{https://archive.ics.uci.edu/dataset/347},
MINIBOONE\footnotehyperlink{https://archive.ics.uci.edu/dataset/199/},
and BSDS300\footnotehyperlink{https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/bsds/}.
We follow the preprocessing steps from \citet{Papamakarios2018}\footnote{\url{https://github.com/francois-rozet/uci-datasets}}.

\autoref{tab:benchmark-nll-box-plot} shows the distribution of \gls{NLL} scores for \gls{HMAF} and \gls{MAF} across 20 different random initializations.
The figure visualizes performance robustness.
Consistently lower \gls{NLL} values indicate better performance, smaller IQRs suggest less sensitivity to initialization, and a lower spread of outliers suggests more stable training.

\begin{figure}[hb!]
\centering
\includegraphics[height=0.6\textheight]{gfx/seed_test_nll.pdf}
\caption{Distribution of negative log-likelihood (NLL) scores for the \gls{HMAF} and \gls{MAF} models, resulting from 20 runs with different random weight initializations.
         The box plots show the median and the interquartile range (IQR); the whiskers extend to the most extreme data points within 1.5 times the IQR beyond the quartiles.
         The swarm plots above the boxes show the individual NLL scores for each run.}
\label{tab:benchmark-nll-box-plot}
\end{figure}

\section{Complexity Estimates (Runtime and Number of Parameters)}

\subsection{Runtime of Model Variants}

The tables below present the runtime for training and evaluating our models on the HPC Cluster at the University of Applied Sciences Esslingen, utilizing NVIDIA L40S GPUs with 48 GB VRAM. 
Note that our code is not fully optimized for runtime, and variations in training and inference times are significantly influenced by hyperparameters such as the number of epochs and the use of early stopping.

\begin{table}[h!]
  \centering
  \caption{Runtime in minutes for training and evaluation of models on benchmark data.
    Variation across 20 runs is reported as the standard deviation.}
  \begin{tabular}{llll}
    \toprule
    model                    & dataset name & train                  & evaluation          \\
    \midrule
    \multirow[c]{5}{*}{HMAF} & bsds300      & 1191.992 $\pm$ 537.944 & 481.991 $\pm$ 0.650 \\
                             & gas          & 319.809 $\pm$ 132.472  & 14.931 $\pm$ 0.043  \\
                             & hepmass      & 229.736 $\pm$ 155.484  & 15.094 $\pm$ 0.047  \\
                             & miniboone    & 82.882 $\pm$ 58.692    & 3.933 $\pm$ 0.012   \\
                             & power        & 437.108 $\pm$ 63.707   & 11.321 $\pm$ 0.091  \\
    \midrule
    \multirow[c]{5}{*}{MAF}  & bsds300      & 261.977 $\pm$ 70.957   & 16.716 $\pm$ 0.022  \\
                             & gas          & 68.993 $\pm$ 0.073     & 1.858 $\pm$ 0.005   \\
                             & hepmass      & 34.774 $\pm$ 0.003     & 1.540 $\pm$ 0.004   \\
                             & miniboone    & 16.486 $\pm$ 1.404     & 0.279 $\pm$ 0.001   \\
                             & power        & 136.796 $\pm$ 0.004    & 4.979 $\pm$ 0.120   \\
    \bottomrule
  \end{tabular}
\end{table}

\begin{table}[h!]
  \centering
  \caption{Mean runtime in seconds for training and evaluation of models on simulated data.
    Variation across 20 runs is reported as the standard deviation.}
  \begin{tabular}{lllll}
    \toprule
                                 &                           &         & train                 & evaluation         \\
    dataset name                 & conditional               & model   &                       &                    \\
    \midrule
    \multirow[c]{16}{*}{circles} & \multirow[c]{8}{*}{False} & CF (B)  & 35.043 $\pm$ 15.008   & 16.782 $\pm$ 0.238 \\
                                 &                           & CF (S)  & 317.228 $\pm$ 233.129 & 17.428 $\pm$ 0.181 \\
                                 &                           & HCF (B) & 110.039 $\pm$ 11.230  & 57.759 $\pm$ 0.830 \\
                                 &                           & HCF (S) & 76.960 $\pm$ 37.909   & 29.489 $\pm$ 0.191 \\
                                 &                           & MAF (B) & 91.213 $\pm$ 0.354    & 19.562 $\pm$ 2.876 \\
                                 &                           & MAF (S) & 145.815 $\pm$ 36.820  & 17.369 $\pm$ 1.585 \\
                                 &                           & MCTM    & 42.597 $\pm$ 0.724    & 29.009 $\pm$ 0.375 \\
                                 &                           & MVN     & 44.892 $\pm$ 0.099    & 10.722 $\pm$ 0.067 \\
    \cline{2-5}
                                 & \multirow[c]{8}{*}{True}  & CF (B)  & 41.655 $\pm$ 20.906   & 20.423 $\pm$ 0.223 \\
                                 &                           & CF (S)  & 329.650 $\pm$ 211.377 & 17.864 $\pm$ 0.267 \\
                                 &                           & HCF (B) & 66.384 $\pm$ 40.931   & 73.782 $\pm$ 1.620 \\
                                 &                           & HCF (S) & 171.494 $\pm$ 107.205 & 31.913 $\pm$ 0.128 \\
                                 &                           & MAF (B) & 91.153 $\pm$ 0.337    & 20.306 $\pm$ 2.840 \\
                                 &                           & MAF (S) & 145.765 $\pm$ 36.843  & 17.912 $\pm$ 1.554 \\
                                 &                           & MCTM    & 304.875 $\pm$ 112.109 & 31.375 $\pm$ 2.577 \\
                                 &                           & MVN     & 56.360 $\pm$ 36.401   & 11.708 $\pm$ 0.055 \\
    \midrule
    \multirow[c]{16}{*}{moons}   & \multirow[c]{8}{*}{False} & CF (B)  & 33.991 $\pm$ 12.603   & 16.842 $\pm$ 0.145 \\
                                 &                           & CF (S)  & 443.163 $\pm$ 80.343  & 17.129 $\pm$ 0.312 \\
                                 &                           & HCF (B) & 184.718 $\pm$ 71.683  & 67.237 $\pm$ 2.418 \\
                                 &                           & HCF (S) & 76.214 $\pm$ 39.760   & 29.368 $\pm$ 0.106 \\
                                 &                           & MAF (B) & 91.252 $\pm$ 0.538    & 19.872 $\pm$ 3.607 \\
                                 &                           & MAF (S) & 134.682 $\pm$ 62.895  & 14.719 $\pm$ 3.458 \\
                                 &                           & MCTM    & 42.544 $\pm$ 0.742    & 28.949 $\pm$ 0.141 \\
                                 &                           & MVN     & 44.823 $\pm$ 0.024    & 11.497 $\pm$ 0.046 \\
    \cline{2-5}
                                 & \multirow[c]{8}{*}{True}  & CF (B)  & 44.995 $\pm$ 20.047   & 20.477 $\pm$ 0.313 \\
                                 &                           & CF (S)  & 368.668 $\pm$ 250.469 & 17.644 $\pm$ 0.415 \\
                                 &                           & HCF (B) & 48.243 $\pm$ 17.635   & 40.735 $\pm$ 0.516 \\
                                 &                           & HCF (S) & 169.016 $\pm$ 62.618  & 31.744 $\pm$ 0.146 \\
                                 &                           & MAF (B) & 91.215 $\pm$ 0.529    & 20.623 $\pm$ 3.563 \\
                                 &                           & MAF (S) & 135.603 $\pm$ 68.688  & 15.308 $\pm$ 3.487 \\
                                 &                           & MCTM    & 475.688 $\pm$ 1.217   & 35.501 $\pm$ 7.524 \\
                                 &                           & MVN     & 48.118 $\pm$ 18.322   & 11.030 $\pm$ 0.071 \\
    \bottomrule
  \end{tabular}
\end{table}

\begin{table}[h!]
  \centering
\caption{Mean runtime in seconds for training and evaluation of models on malnutrition data.
Variation across 20 runs is reported as the standard deviation.}
\begin{tabular}{lll}
\toprule
 & train & evaluation \\
model &  &  \\
\midrule
HMAF (B) & 260.752 $\pm$ 121.895 & 20.823 $\pm$ 0.535 \\
HMAF (S) & 1993.317 $\pm$ 717.933 & 19.649 $\pm$ 0.110 \\
MCTM & 4106.187 $\pm$ 725.136 & 16.877 $\pm$ 0.847 \\
\bottomrule
\end{tabular}
\end{table}

%%% Local Variables:
%%% mode: LaTeX
%%% TeX-master: "uai_main"
%%% End:
