\begin{table}[!h]
    \centering
    % Shrink the tabular only when it would exceed the text width.
    \begin{adjustbox}{max width=\textwidth}
    % Layout: one column for the sample size, then four metric columns
    % (accuracy, precision, recall, adjustment-set fraction) per benchmark.
    \begin{tabular}{l cccc cccc cccc}
        \toprule
        % Benchmark headers: network name followed by exposure $\to$ outcome.
        & \multicolumn{4}{c}{\fontfamily{cmr}\textsc{\textbf{Barley}} \textsc{aks\_m2} $\to$ \textsc{keraks}}
        & \multicolumn{4}{c}{\fontfamily{cmr}\textsc{\textbf{Mildew}} \textsc{mikro\_1} $\to$ \textsc{meldug\_2} ($d = 16$)}
        & \multicolumn{4}{c}{\fontfamily{cmr}\textsc{\textbf{Hailfinder}} \textsc{area} $\to$ \textsc{clds}} \\
        % One partial rule under each four-column benchmark group.
        \cmidrule(lr){2-5} \cmidrule(lr){6-9} \cmidrule(lr){10-13}
        % Metric sub-headers, repeated once per benchmark.
        $n$
        & $\z$ \textsc{Acc} & $\z_1$ \textsc{Pre} & $\z_1$ \textsc{Rec} & \cmark
        & $\z$ \textsc{Acc} & $\z_1$ \textsc{Pre} & $\z_1$ \textsc{Rec} & \cmark
        & $\z$ \textsc{Acc} & $\z_1$ \textsc{Pre} & $\z_1$ \textsc{Rec} & \cmark \\
        % booktabs rule (not \hline) so weight and spacing match \toprule/\bottomrule.
        \midrule
        % Data rows. Intervals in parentheses use an en-dash (--) as a range.
        % NOTE(review): the `x` cells are kept exactly as in the original;
        % presumably placeholders for results not yet filled in -- confirm.
        $10k$  & x & x & x & x & 71.4 (53.2--89.7) & 53.3 (9.0--97.6) & 60.0 (12.0--100) & 2/5 & x & x & x & x \\
        $25k$  & x & x & x & x & 88.6 (76.4--100)  & 80.0 (40.8--100) & 80.0 (40.8--100) & 4/5 & x & x & x & x \\
        $50k$  & x & x & x & x & 92.9 (92.9--92.9) & 100 (100--100)   & 100 (100--100)   & 5/5 & x & x & x & x \\
        $100k$ & x & x & x & x & 92.9 (88.4--97.3) & 100 (100--100)   & 100 (100--100)   & 5/5 & x & x & x & x \\
        % Disabled row, kept from the original for a possible larger sample size:
        %$250k$ & x & x & x & x & x & x & x & x & x  & x & x & x \\
        % Disabled header rows, kept from the original; they look like leftovers
        % from an earlier layout that stacked the benchmarks vertically:
        %\toprule
        %& \multicolumn{4}{c}{\fontfamily{cmr}\textsc{\textbf{Mildew}}  \textsc{mikro\_1} $\to$ \textsc{meldug\_2}} \\
        %& \multicolumn{4}{c}{\fontfamily{cmr}\textsc{\textbf{Hailfinder}} \textsc{area} $\to$ \textsc{clds}}  \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{Performance of Algorithm~\ref{alg:method} on discrete benchmark data from \texttt{bnlearn}~\citep{scutari_learning_2010}. Sample size is denoted by $n$ and dimensionality by $d$ (including exposure and outcome). Each exposure--outcome pair is written as $X \to Y$, with exposure $X$ and outcome $Y$. All values are averaged over five independent data samples from the underlying DAG. Metrics reported are accuracy of all labels ($\z$ \textsc{Acc}), precision for partition $\z_1$ ($\z_1$ \textsc{Pre}), recall for partition $\z_1$ ($\z_1$ \textsc{Rec}), and fraction of replicates that admit a sufficient adjustment set (\cmark). The 95\% confidence interval over the five replicates is reported in parentheses. Independence was determined by permutation testing with 20 permutation surrogates for $n = 100k$, 30 surrogates for $n = 50k$, and 50 surrogates for $n < 50k$.}
    \label{tab:benchmarks}
\end{table}