%\documentclass{uai2023} % for initial submission
 \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%\usepackage{comment}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument[prefix]{nguyen_358}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% our packages
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{stmaryrd}
\usepackage{subfigure}
\usepackage{multirow}
\usepackage{pgfplots}
\usetikzlibrary{patterns}
\pgfplotsset{compat=1.18} 

% Our Theorems
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{example}[theorem]{Example}

%% Our self-defined macros
\renewcommand{\vec}[1]{\boldsymbol{#1}}
\newcommand{\Prob}{\vec{p}}
\newcommand{\Dens}{\vec{p}}
\newcommand{\given}{\, | \,}
\newcommand*{\defeq}{\mathrel{\vcenter{\baselineskip0.5ex \lineskiplimit0pt
			\hbox{\footnotesize.}\hbox{\footnotesize.}}}%
	=}
\newcommand{\defi}{\defeq}

\newcommand{\fromto}{\longrightarrow}

\newcommand{\cX}{\mathcal{X}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cD}{\mathcal{D}}

\newcommand{\bX}{\mathbf{X}}
\newcommand{\bY}{\mathbf{Y}}

\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\by}{\boldsymbol{y}}
\newcommand{\bv}{\boldsymbol{v}}
\newcommand{\bh}{\boldsymbol{h}}
\newcommand{\bs}{\mathbf{s}}
\newcommand{\bi}{\mathbf{i}}

\newcommand{\Probm}{\mathbf{P}}

\newcommand{\indep}{\perp \!\!\! \perp}

\newcommand{\GBNCs}{GBNCs}
\newcommand{\GBNC}{GBNC}

\DeclareMathOperator*{\argmax}{arg\,max}

\DeclareMathOperator*{\maximize}{Maximize}

%% xr package
\usepackage{xr,refcount}
\externaldocument{nguyen_358}



\setcounter{equation}{\getrefnumber{eq:BOP_Subset}}
\addtocounter{equation}{+1}
\setcounter{algorithm}{\getrefnumber{eq:learn_GBNC}}
\addtocounter{algorithm}{+1}
\setcounter{table}{\getrefnumber{tab:more_res_tab_data}}
\addtocounter{table}{+1}
\setcounter{figure}{\getrefnumber{tab:fig:scatter_plots_K_more_than_7}}
\addtocounter{figure}{+1}

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistic Multi-Dimensional Classification\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<vu-linh.nguyen@hds.utc.fr>?Subject=Probabilistic Multi-Dimensional Classification}{Vu-Linh Nguyen$^{\ast}$}{}}
\author[2]{\href{mailto:<yang.yang@kuleuven.be>?Subject=Probabilistic Multi-Dimensional Classification}{Yang Yang$^{\ast}$}}{}
\author[3]{\href{mailto:<c.decampos@tue.nl>?Subject=Probabilistic Multi-Dimensional Classification}{Cassio de Campos}}{}
% Add affiliations after the authors
\affil[1]{%
   Heudiasyc Laboratory, University of Technology of Compi\`egne, France
}
\affil[2]{%
    Department of Computer Science, KU Leuven, Belgium
}
\affil[3]{%
    Eindhoven University of Technology, The Netherlands
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\def\thefootnote{*}\footnotetext{These authors contributed equally to this work.}
\def\thefootnote{\arabic{footnote}}

\appendix
\section{Notation and acronyms}
\begin{table}[H]
\begin{center}
\caption{Notation and acronyms}
\begin{tabular}{ll}
\hline
symbol/acronym & meaning \\
\hline
$\cX$, $\bx$ & instance space, instance \\
$\cY$, $\by$ & output space, outcome \\
$X^p$, $Y^k$ & feature, class variable\\
\hline
$K$, $Q$ & number of class variables, number of features\\
$\llbracket \cdot \rrbracket$ & indicator function \\
$[n]$ & set $\{ 1,\ldots, n\}$ of natural numbers \\
\hline
$\Prob(\by \given \bx)$ & probability of outcome $\by$ given $\bx$ \\
$\Prob(y^k \given \bx)$ & marginal probability of relevance for outcome $Y^k = y^k $ given $\bx$\\ 
\hline
$\cD$ & training data \\
\hline
$\ell$, $\ell_S$, $\ell_H$ & MDC loss function, Subset 0/1 loss, Hamming loss\\
\hline
$G$, $\theta$ & Structure (i.e., a DAG) of a BN, Parameter set of a BN \\
$\Delta^Y_G$ & Parent set of $Y$ in $G$ \\
$\Pi^Y_d$ & Set with all possible configurations of the discrete parents of $Y$\\
$\pi$ & Configuration of the parents of a variable, stored as pairs (variable, value)\\
$\mathcal{G}$ & Set of the $R(K+Q)$ possible DAGs \\
$\mathcal{G}^1$ & Set of the $R(K) 2^{KQ} R(Q)$ DAGs which contain no edge of the form $Y \longrightarrow X$\\  
$\mathcal{G}^2$ & Set of the $R(K) 2^{KQ}$ DAGs, whose elements contain no edge between features\\
$\mathcal{G}^3$ & Set of the $R(K)2^{K|\mathbf{X}_d|}$ DAGs such that, $\forall G\in \mathcal{G}^3$ and $\forall Y\in \mathbf{Y}$, we have $\mathbf{X}_c \subset \Delta^Y$\\
\hline
DAG & Directed acyclic graph\\
MDC & Multi-dimensional classification\\
BN & Bayesian network\\
BOP & Bayes-optimal prediction\\
CP & class powerset \\
CCs & classifier chains \\
BR & binary relevance \\
\hline
\end{tabular}
\end{center}
\end{table}

\section{Proofs of Propositions}\label{sec:Proofs_of_Propositions_appendix}

This section presents proofs for the propositions stated in the main paper. When it is necessary, we recall related notions and results in the literature before presenting proofs. 

\subsection{Proposition \ref{pro:Learning_problem_BND}}

We first present Lemmas which are necessary to complete the proof of Proposition \ref{pro:Learning_problem_BND}.

\begin{lemma}\label{lem:I-maps_2_CLL}
Assume there is a $G'\in \mathcal{G}$ such that $G'$ is an I-map for $\Prob \in \mathcal{P}^0$. All the I-maps $G$ of $\Prob$ induce the same CLL score \eqref{eq:CLL}.
\end{lemma}
\begin{proof}
Recall that the conditional joint probability distribution is 
\begin{align}
    \Prob(\by \given \bx) \defi \frac{\Prob(\bx, \by)}{\sum_{\by' \in \cY} \Prob(\bx, \by')} \,, \forall (\bx,\by) \in \cX \times \cY \,. 
\end{align}
Assume $G \in \mathcal{G}$ is an I-map of $\Prob$. Because of that, we have
$\Prob(\bx,\by) =  \Prob^G_\theta(\bx, \by)$ for a certain $\theta$, with factorization respecting $G$. This implies
\begin{align}
    \Prob(\by \given \bx) = \frac{\Prob(\bx, \by)}{\sum_{\by' \in \cY} \Prob(\bx, \by')} = \frac{\Prob^G_{\theta}(\bx, \by)}{\sum_{\by' \in \cY} \Prob^G_{\theta}(\bx, \by')} =  \Prob^G_{\theta}(\by \given \bx) \,, \forall (\bx,\by) \in \cX \times \cY \,. 
\end{align}
Then it is clear that
\begin{align*}
    C(\Prob \given \cD) =  \log \prod_{n=1}^N \Prob(\by_n \given \bx_n)  =  \log \prod_{n=1}^N \Prob^G_{\theta}(\by_n \given \bx_n) = C(\Prob^G_{\theta} \given \cD)\,.
\end{align*}
Thus, all the I-maps $G$ of $\Dens$ have the same CLL score on $\cD$. 
\end{proof}

\begin{lemma}\label{lem:I-map_in_G'}
Assume elements of $\mathbf{X}$ are always made available. Assume there is a $G\in \mathcal{G}$ such that $G$ is an I-map for $\Prob \in\mathcal{P}^0$. Then, there is at least one $G'\in \mathcal{G}^1$ which is an I-map for $\Prob$. 
\end{lemma}
\begin{proof}

As long as the chain rule of probability is valid, we can lazily pick any topological ordering $t'(1), \ldots, t'(K +Q)$ on $\mathbf{Z} = \mathbf{X} \cup \mathbf{Y}$ in which the $Q$ features occupy the first $Q$ places (and the $K$ class variables occupy the next $K$ places) and add arcs from each feature/variable to the ones that succeed it until we obtain a fully connected DAG $G'$. It is clear that $G'\in \mathcal{G}^1$ because we never add any arc of the form $Y\longrightarrow X$. Moreover, $G'$ is an I-map\footnote{While this is already enough to complete the proof, fully connected DAGs are not really the goal of learning BNs. Sparser I-maps can be easily constructed by only adding arcs which preserve conditional dependencies when following the ordering. Also, during the execution of the main algorithms that we use, we naturally find small graphs because of the penalizations that are used (similar guarantees as those that exist for learning BNs).} of $G$ (and $\Prob$) because $\mathcal{I}(G') = \emptyset$. 

\end{proof}

In the following, we present a proof of Proposition \ref{pro:Learning_problem_BND}. 
\begin{proof}
There is always an I-map $G \in \mathcal{G}$ of $\Prob \in \mathcal{P}^0$ which maximizes the CLL function \eqref{eq:CLL} on $\cD$ (if chain rule of probability applies, as we can use a full graph). Lemma \ref{lem:I-maps_2_CLL} tells us that all the I-maps $G$ of $\Prob$ maximize the CLL function. 
Lemma \ref{lem:I-map_in_G'} tells us that at least one of the I-maps belongs to $\mathcal{G}^1$. Hence, there is at least one I-map $G'\in \mathcal{G}^1$ which maximizes the CLL function \eqref{eq:CLL} on $\cD$. Or, equivalently, the relation \eqref{eq:Learning_problem_BND} holds.
\end{proof}

\subsection{Proposition \ref{pro:Decomposability}}

\begin{proof}
The fact that, for any $G \in \mathcal{G}^1$, the joint conditional distribution \eqref{eq:jointConditionalProbabilities} can be factorized as 
\begin{align*}
    \Prob^G_{\theta}(\by \given \bx) = \prod_{Y \in \mathbf{Y}} \Prob_{\theta} \left(y \given \pi_y\right) \,, \forall (\bx,\by) \in \cX \times \cY
\end{align*}
can be checked easily.  Since $Y \notin \Delta^X$, for any $Y \in \mathbf{Y}$ and for any $X \in \mathbf{X}$, we have 
\begin{align}
     \Prob^G_\theta(\by \given \bx) &= \frac{\Prob^G_\theta(\bx,\by)}{\sum_{\by' \in \mathcal{Y}}\Prob^G_\theta(\bx,\by')} = \frac{\prod_{X\in \mathbf{X}} \Prob_\theta(x\given \pi_{x}) \prod_{Y\in \mathbf{Y}} \Prob_\theta(y\given \pi_{y})}{\sum_{\by'}\prod_{X\in \mathbf{X}} \Prob_\theta(x\given \pi_{x}) \prod_{Y\in \mathbf{Y}} \Prob_\theta(y'\given \pi_{y'})} \nonumber\\
    &= \frac{\prod_{X\in \mathbf{X}} \Prob_\theta(x\given \pi_{x}) \prod_{Y\in \mathbf{Y}} \Prob_\theta(y\given \pi_{y})}{\prod_{X\in \mathbf{X}} \Prob_\theta(x\given \pi_{x})\sum_{\by'} \prod_{Y\in \mathbf{Y}} \Prob_\theta(y'\given \pi_{y'})} = \frac{\prod_{Y\in \mathbf{Y}} \Prob_\theta(y\given \pi_{y})}{\sum_{\by'}\prod_{Y\in \mathbf{Y}} \Prob_\theta(y'\given \pi_{y'})} \label{eq:remove_feature_impact_11} \\
    &= \prod_{Y\in \mathbf{Y}} \Prob_\theta(y\given \pi_{y}) \,.\label{eq:remove_feature_impact_12}
\end{align}
The transition from \eqref{eq:remove_feature_impact_11} to \eqref{eq:remove_feature_impact_12} is straightforward because, by the definition of BNs, we have 
\begin{align}\label{eq:variable_elimination}
 \sum_{\by'}\prod_{Y\in \mathbf{Y}} \Prob_\theta(y'\given \pi_{y'}) = 1\,.
\end{align}

We now prove that the following relation holds:
\begin{align*}
 \max_{(G,\theta) \in \mathcal{P}^1} C(\Prob^G_\theta\given \cD) = \max_{(G,\theta) \in \mathcal{P}^2} C(\Prob^G_\theta\given \cD) \,.
\end{align*}
We first partition $\mathcal{G}^1$ into $R(K) 2^{KQ}$ groups where each group consists of $R(Q)$ DAGs whose edges among $\mathbf{Y}$ and edges from features to class variables are the same. The relation 
\begin{align*}
    \Prob^G_\theta(\by \given \bx) = \prod_{Y \in \mathbf{Y}} \Prob_\theta \left(y \given \pi_y\right) \,, \forall (\bx,\by) \in \cX \times \cY 
\end{align*}  
ensures that all the members of each group have the same CLL score. Moreover, each group contains exactly one member of $\mathcal{G}^2$, i.e., the DAG with no edge among features. Therefore, the maximal CLL score attained over $\mathcal{G}^1$ equals the maximal score attained over $\mathcal{G}^2$.       
\end{proof}



\subsection{Proposition \ref{pro:Learning_problem_P3}}
Proof of Proposition \ref{pro:Learning_problem_P3} is trivial and is written down for completeness.
\begin{proof}
    For any $G \in \mathcal{G}^2$, there is at least one I-map $G' \in \mathcal{G}^3$ (to see that, simply add the extra arcs to $G$ to complete the parent sets of any class variable with all the continuous feature variables, leading to a graph $G' \in \mathcal{G}^3$ -- adding arcs will keep the I-map property). Thus, Proposition \ref{pro:Learning_problem_P3} comes as a consequence of Proposition \ref{pro:Decomposability}. 
\end{proof}

\subsection{Corollary \ref{cor:optimality}}

We would like to re-emphasize that our assumptions of having the optimality of learned parameters in the local models are not too strong. These are much weaker assumptions than those one finds in the literature when investigating the optimality of PGM learning frameworks: that is typically the assumption that the hypothesis space contains the possible distributions from some given family and the best estimate(s) converge to the optimal distribution(s) asymptotically. 

We do not require any asymptotic results, and the requirement of optimally learned parameters (given data) can be met by many standard estimation methods. Yet, this cannot be always guaranteed in practice, in particular if someone decides to use complicated models connecting the input feature variables and class variables, so our assumption is necessary for the proof of global optimality of the framework (which is a strong result and obviously cannot be achieved if base local models are not optimal themselves). 

With this in mind, the following (short) proof would satisfactorily inform readers of the significance of the proposed framework regarding the optimality. 

\begin{proof}

Assume the chain rule of probability holds (which is arguably a mild assumption) and the parameter learning problem is optimally solved. As a combination of Proposition \ref{pro:Learning_problem_BND}, Proposition \ref{pro:Decomposability} and Proposition \ref{pro:Learning_problem_P3}, we have 
\begin{align*}
\max_{\Prob\in\mathcal{P}^0} C(\Prob \given \cD) =\!\!\! \max_{(G, \theta) \in \mathcal{P}} C(\Prob^G_\theta \given \cD) =\!\!\! \max_{(G, \theta) \in \mathcal{P}^1} C(\Prob^G_\theta \given \cD) =\!\!\! \max_{(G, \theta) \in \mathcal{P}^2} C(\Prob^G_\theta \given \cD) =\!\!\! \max_{(G, \theta) \in \mathcal{P}^3} C(\Prob^G_\theta \given \cD)\,.
\end{align*}

Thus, Algorithm~\ref{alg:learn_GBNC} should return an I-map of the optimal distribution in $\mathcal{P}^0$. In other words, the learning procedure is universal, as $(G^*, \theta^*)$ is optimal with respect to $\mathcal{P}$, and with enough data would match the true conditional $\Prob(\mathbf{Y} \given \mathbf{X})$ for $\Prob$ in $\mathcal{P}^0$.
\end{proof}


\subsection{Proposition \ref{pro:monotonicity}}

Enlarging parent sets (with discrete features) in our setting is analogous to further partitioning the input space in local supervised learning parts~\citep{wang2012local}. A representative of such approaches is the top-down construction of decision trees \citep{landwehr2005logistic,rokach2005top}. In such approaches, it is well-known that further partitioning the input space leads to higher predictive performance on the training data sets~\citep{landwehr2005logistic,rokach2005top}, as long as they are optimally learned. We can expect a similar phenomenon in our setting because CLL \eqref{eq:CLL} is indeed a performance measure for our probabilistic classifiers, and the way we encode each local distribution using $|\Pi^Y_d|$ distributions $\Prob_{\theta}\left(Y \given \pi, \mathbf{X}_c \right)$, $\forall \pi \in \Pi^Y_d$, makes our approach an input space partitioning approach, where $\pi \in \Pi^Y_d$ are used to partition the space formed by $\mathbf{X}_c$. 

We now present a proof for Proposition \ref{pro:monotonicity}.

\begin{proof}
Let $\Prob_{Y,\pi}$ be the local models used for $Y$ with parent set $\Delta$, for $\pi\in\Pi^Y_d$, and $\Prob'_{Y,\pi'}$ be the local models for parent set $\Delta'\supset\Delta$ such that $\pi\subset\pi'$. Let $\theta$ be the optimal parameters used by $\Prob_{Y,\pi}$. Because the local models remain as models from continuous features
$\mathbf{X}_c$ to the class variable $Y$, $\theta$ is still a valid solution (albeit non-optimal) of the parameter learning of $\Prob'_{Y,\pi'}$, for each $\pi'$ extending $\pi$. If we use such a $\theta$ and sum together the CLL of all $\Prob'_{Y,\pi'}$ with $\pi'\supset\pi$ (that is, the extended parent set configurations that are compatible with $\pi$ over the variables they have in common), then we achieve the very same score~\eqref{eq:CLL}. Repeating this for all extensions of all $\pi\in\Pi^Y_d$, the same overall CLL score is reached. This means that the CLL obtained after adding the parents in $\Delta'\setminus\Delta$ has to be equal or larger (as it is assumed to be optimally learned) than before adding the parents. This proves Inequality \eqref{eq:monotonicity}. Now, \eqref{eq:monotonicity} guarantees that enlarging parent sets cannot decrease the CLL score \eqref{eq:CLL}. This ensures that at least one solution of Algorithm \ref{alg:learn_GBNC} is a fully connected BN in the sense that the DAG over the class variables induced by its structure $G$ is fully connected. Such a solution can be found by finding a topological order of a solution $G$ from Algorithm~\ref{alg:learn_GBNC} and then adding arcs to $G$ (respecting that topological order) until the DAG over the class variables induced by the structure $G$ is fully connected (side comment: obviously it is not our goal to have fully connected networks; this is just to prove the theoretical results). 
\end{proof}

\section{Detailed Algorithms}\label{sec:Algorithms_appendix}

\subsection{Algorithm \ref{alg:learn_GBNC}}\label{sec:Naive_Algorithm_appendix}

In this section, we show that Algorithm \ref{alg:learn_GBNC} can be revised to find a GBNC $(G^*,\theta^*) \in \mathcal{P}^3$ of any regularized variant \eqref{eq:regularized_CLL} of the CLL function. We first compute $ C(\Prob_{\theta^*_{Y,\pi}}\given Y,\pi,\cD)$ from $\mathcal{D}$ by solving \eqref{eq:P1} in line 5 of Algorithm \ref{alg:learn_GBNC}, which can be done by extracting the data set
\begin{align}
    \mathcal{D}_\pi \defi \left\{ (\bx_n,\by_n) \in \mathcal{D}\given \pi_{y_n } = \pi \right\} 
\end{align}
and calling any base learner to learn the optimal parameter set of $\Prob_{\theta}\left(Y \given \pi, \mathbf{X}_c\right)$ on $\mathcal{D}_\pi$ with respect to \eqref{eq:P1}.


For any given regularized variant \eqref{eq:regularized_CLL} of the CLL function, we denote by 
\begin{align}\label{eq:local_score_regularized_variants}
    S(Y, \Delta^Y_d)= C(Y, \Delta^Y_d) -  \text{pen}(\Delta^Y_d, |\mathcal{D}|) \,.
\end{align}
Clearly, the problem of learning a best $G \in \mathcal{G}^3$ can be re-expressed as an Integer Programming (IP) problem:
\begin{align}
    \maximize  & \sum_{Y\in \mathbf{Y}}  \sum_{\Delta^Y_d\in \mathcal{F}^Y}  \gamma(\Delta^Y_d)  \cdot S(Y, \Delta^Y_d)   \,, \label{eq:regularized_target_function_GOBNILP}\\
     \text{Subject to }& 
     \sum_{\Delta^Y_d\in \mathcal{F}^Y} \gamma(\Delta^Y_d) =1 \,, \forall Y \in \mathbf{Y}  \, ,\nonumber \\
     &\sum_{Y \in \mathbf{Y}'} \sum_{\substack{\Delta^Y_d \in \mathcal{F}^Y \\ \Delta^Y_d \cap \mathbf{Y}' = \emptyset}} \gamma(\Delta^Y_d) \geq 1  \,, \forall \mathbf{Y}' \subseteq \mathbf{Y}\,, |\mathbf{Y}'| >1 \, , \nonumber\\
     &\gamma(\Delta^Y_d)  \in \{0,1\} \,, \forall Y \in \mathbf{Y}, \forall \Delta^Y_d \in \mathcal{F}^Y \,.\nonumber
\end{align}  
Altogether, we end up with the implementation given in Algorithm \ref{alg:learn_GBNC_for_regularized_CLL}, which returns a GBNC $(G^*, \theta^*) \in \mathcal{P}^3$ of \eqref{eq:Learning_problem_BND_relaxed}.
   
	\begin{algorithm} [!ht]
	\caption{Learning a GBNC of \eqref{eq:Learning_problem_BND_relaxed} under the presence of regularization}\label{alg:learn_GBNC_for_regularized_CLL}
	\begin{algorithmic}[1]
   \STATE {\bfseries Input:} Data $\mathcal{D}$, Probabilistic hypothesis spaces encoding $\Prob_{\theta}\left(Y \given \pi, \mathbf{X}_c \right)$, $\forall \pi \in \Pi^Y_d$, $\forall \Delta^Y_d \in \mathcal{F}^Y$,  $\forall Y \in \mathbf{Y}$ \;
   \FOR{$Y \in \mathbf{Y}$}
        \FOR{$\Delta^Y_d \in \mathcal{F}^Y$}
            \FOR{$\pi \in \Pi^Y_d$}
                \STATE Solve \eqref{eq:P1} and store it in a proper data structure
                %Compute $C(G,Y,\pi \given \cD_\pi)$; Store optimal parameter set of $\Prob^{G}_{\theta}\left(Y \given \pi, \mathbf{X}_c\right)$\;
            \ENDFOR
            \STATE Compute $S(Y,\Delta^Y_d)$ by \eqref{eq:local_score_regularized_variants} using stored values\;
        \ENDFOR
   \ENDFOR
 \STATE Find a best collection $\{\Delta^Y_d\given Y \in \mathbf{Y}\}$ which optimizes \eqref{eq:regularized_target_function_GOBNILP} using GOBNILP \;
   \STATE {\bfseries Output:} A GBNC $(G^*, \theta^*) \in \mathcal{P}^3$ of \eqref{eq:Learning_problem_BND_relaxed} \;
   \end{algorithmic}
   \end{algorithm}
   

\subsection{A Refinement of Algorithm \ref{alg:learn_GBNC}}\label{sec:Refine_Algorithm_appendix}

In this section, we show how pruning rules \citep{de2018entropy} can be employed to find GBNCs which optimize regularized variants \eqref{eq:regularized_CLL} without losing any optimality. 

We first generalize the pruning rule \citep[Lemma~3]{de2018entropy} to any regularized variant of the form \eqref{eq:regularized_CLL}. 
\begin{lemma}\label{eq:lem_regularized_CLL}
Let $Y\in \mathbf{Y}$ be a node in $G \in \mathcal{G}^3$ with $\Delta \subset \Delta' \in \mathcal{F}^Y$, such that 
\begin{align}\label{eq:pruning_rule}
S(Y,\Delta) \geq - \text{pen}(\Delta', |\mathcal{D}|) \,.   
\end{align}
Then $\Delta'$ and all its supersets can be safely discarded from $\mathcal{F}^Y$ without decreasing the maximum score of \eqref{eq:Learning_problem_BND_relaxed}.
\end{lemma}
\begin{proof}
Using the shorthand notation \eqref{eq:local_score_regularized_variants}, a regularized variant of the CLL function can be rewritten as 
\begin{equation*}
S(\Prob^G_{\theta^*} \given \mathcal{D}) =  \sum_{Y \in \mathbf{Y}}  S(Y,\Delta^Y_d) =  \sum_{Y \in \mathbf{Y}} \left( C(Y,\Delta^Y_d)  -  \text{pen}(\Delta^Y_d, |\mathcal{D}|) \right)\,.
\end{equation*}
For any $G, G' \in \mathcal{G}^3$ such that $\Delta$ and $\Delta'$ are respectively the parent set of $Y$ in $G$ and $G'$, and the parent sets of all $Y'\neq Y$ are the same, we have the relation 
\begin{align*}
 S(\Prob^G_{\theta^*} \given \mathcal{D}) -  S(\Prob^{G'}_{\theta^*} \given \mathcal{D}) &=     S(Y,\Delta) -  S(Y,\Delta') = S(Y,\Delta)- C(Y,\Delta') +  \text{pen}(\Delta', |\mathcal{D}|) \\
 &\geq -  \text{pen}(\Delta', |\mathcal{D}|) -  C(Y,\Delta') +  \text{pen}(\Delta', |\mathcal{D}|) = -  C(Y,\Delta') \geq 0\,.
\end{align*}
Thus, we can safely discard  $\Delta'$ from $\mathcal{F}^Y$ without decreasing the maximum score of \eqref{eq:Learning_problem_BND_relaxed}. 

For any $\Delta' \subset \Delta'' \in \mathcal{F}^Y$, we have $ -  \text{pen}(\Delta', |\mathcal{D}|)  \geq   -  \text{pen}(\Delta'', |\mathcal{D}|)$, and assumption \eqref{eq:monotonicity} ensures that $C(Y,\Delta) \leq C(Y,\Delta'')$. Thus, we have the relation 
\begin{align*}
S(Y,\Delta) \geq - \text{pen}(\Delta', |\mathcal{D}|)  \geq   -  \text{pen}(\Delta'', |\mathcal{D}|)  \,,   
\end{align*}
which ensures that we can also safely discard $\Delta''$ from $\mathcal{F}^Y$. 
\end{proof}
Intuitively, Lemma \ref{eq:lem_regularized_CLL} provides us with a ``stopping criterion'' for enlarging parent sets by exploiting the fact that regularized variants \eqref{eq:regularized_CLL} of the CLL \eqref{eq:CLL} seek a trade-off between the predictive performance provided by more complex classifiers and the simplicity of classifiers. More precisely, condition \eqref{eq:pruning_rule} allows one to safely discard (some/many large) possible parent sets $\Delta'$ and their supersets $\Delta''$ without the need of learning local probabilistic classifiers \eqref{eq:Local_Classifiers} for these parent sets. This is very beneficial because, without such a stopping criterion, we would need to evaluate all the possible parent sets, and evaluating each $\Delta^Y \in \mathcal{F}^Y$ requires one to learn a possibly large number of local probabilistic classifiers \eqref{eq:Local_Classifiers}, which is exponential in the cardinality of $\Delta^Y$.   

Ideally, we would expect that enlarging the parent sets (or increasing the model complexity) gives us a better score $S(Y,\Delta^Y)$, i.e., assumption \eqref{eq:monotonicity} should hold. However, in practice, it may happen that the learning algorithm fails to converge and returns unreliable (and inaccurate) local probabilistic classifiers \eqref{eq:Local_Classifiers}. In such a case, we would keep adding redundant parents and end up with unreliable local probabilistic classifiers \eqref{eq:Local_Classifiers} in the final GBNC. In other words, we would pick an unnecessarily complex GBNC which contains unreliable local probabilistic classifiers \eqref{eq:Local_Classifiers}. To avoid this unexpected behavior, we propose a variant of the pruning rule \eqref{eq:pruning_rule}.  

\begin{definition}\label{def:pruning_rule_1}
Let $Y\in \mathbf{Y}$ be a node in $G \in \mathcal{G}^3$ with $\Delta \subset \Delta' \in \mathcal{F}^Y$, such that 
\begin{align}\label{eq:pruning_rule_1}
\max_{\Delta'' \subset \Delta} S(Y, \Delta'') \geq -  \text{pen}(\Delta', |\mathcal{D}|)  \,.   
\end{align}
Then all the supersets of $\Delta^*$, where 
\begin{align}\label{eq:best_subset}
\Delta^* = \argmax_{\Delta'' \subset \Delta} S(Y, \Delta'') \,,    
\end{align}
will be discarded from $\mathcal{F}^Y$.
\end{definition}
Intuitively, the pruning rule \eqref{eq:pruning_rule_1}--\eqref{eq:best_subset} allows us to prune all the supersets of $\Delta^*$. For example, if $\Delta^* \subsetneq \Delta$, we discard all of its supersets, such as $\Delta$, $\Delta'$ and their supersets. Adopting the pruning rule \eqref{eq:pruning_rule_1}--\eqref{eq:best_subset}, we propose a refinement of Algorithm \ref{alg:learn_GBNC} which is summarized in Algorithm \ref{alg:learn_GBNC_refined}. To simplify the pseudocode, for any $Y \in \mathbf{Y}$, we define 
\begin{align}
    \mathcal{F}^Y_k = \left\{\Delta \in \mathcal{F}^Y \given |\Delta| = k \right\}\,, \forall k = |\mathbf{X}_c|, \ldots, Q + K \,.
\end{align}
Algorithm \ref{alg:learn_GBNC_refined} only learns a local classifier which estimates the local distributions $\Prob_{\theta}\left(Y \given \pi , \mathbf{X}_c\right)$, $\pi \in \Pi^Y_d$, if the parent set $\Delta$ is still included in $\mathcal{F}^Y$ and its complexity is not too high according to \eqref{eq:pruning_rule_1}. In practice, we observed that large $\Delta \in \mathcal{F}^Y$ are usually discarded.  

	\begin{algorithm} [!ht]
	\caption{Learning a GBNC of \eqref{eq:Learning_problem_BND_relaxed} under the presence of regularization}\label{alg:learn_GBNC_refined}
	\begin{algorithmic}[1]
   \STATE {\bfseries Input:} Data $\mathcal{D}$, Probabilistic hypothesis spaces encoding $\Prob_{\theta}\left(Y \given \pi, \mathbf{X}_c \right)$, $\forall \pi \in \Pi^Y_d$, $\forall \Delta^Y_d \in \mathcal{F}^Y$,  $\forall Y \in \mathbf{Y}$  \;
   \FOR{$Y \in \mathbf{Y}$}
        \FOR{$k = |\mathbf{X}_c|, \ldots, Q + K$}
            \FOR{$\Delta^Y_d \in \mathcal{F}^Y_k$}
            \IF{Condition \eqref{eq:pruning_rule_1} holds}
            \STATE Determine $\Delta^*$ using \eqref{eq:best_subset}; Discard all the $\Delta^Y_d \supset \Delta^*$ from $\mathcal{F}^Y$ \;
            \ELSE
            \STATE  Compute $S(Y,\Delta^Y_d)$ defined in \eqref{eq:local_score_regularized_variants}; Store $\Prob_{\theta^*}\left(Y \given \pi, \mathbf{X}_c\right)$, $\forall \pi \in \Pi^Y_d$ in a proper data structure \;
            \ENDIF
            \ENDFOR
        \ENDFOR
   \ENDFOR
   \STATE Find a best collection $\{\Delta^Y_d\given Y \in \mathbf{Y}\}$ which optimizes \eqref{eq:regularized_target_function_GOBNILP} using GOBNILP \;
   \STATE {\bfseries Output:} A GBNC $(G, \theta) \in \mathcal{P}^3$ of \eqref{eq:Learning_problem_BND_relaxed} \;
   \end{algorithmic}
   \end{algorithm}


\subsection{Inference Algorithms}

Practical procedures for finding BOPs of $\ell_S$ \eqref{eq:BOP_Subset} and $\ell_H$ \eqref{eq:BOP_Hamming} are presented in Algorithm \ref{alg:BOP_Subset} and Algorithm \ref{alg:BOP_Hamming}, respectively.

	\begin{algorithm} [!ht]
	\caption{Find a BOP of the Subset $0/1$ loss \eqref{eq:BOP_Subset}}\label{alg:BOP_Subset}
	\begin{algorithmic}[1]
    \STATE {\bfseries Input:} A GBNC $(G^*, \theta^*) \in \mathcal{P}^3$ of \eqref{eq:Learning_problem_BND_relaxed}: $\Prob_{\theta^*}\left(Y \given \pi, \mathbf{X}_c \right)$, $\forall \pi \in \Pi^Y_d$, $\forall \Delta^Y_d \in \mathcal{F}^Y$,  $\forall Y \in \mathbf{Y}$, a test instance $\bx$ \;
    \STATE Extract the sub-DAG $\mathcal{K}$ over $\mathbf{Y}$ from $G^*$\;
    \FOR{$Y \in \mathbf{Y}$}
    \STATE Extract parent set of $Y$ in $\mathcal{K}$: $\Delta^Y_\mathbf{Y} = \Delta^Y_d \cap \mathbf{Y}$; Form the set $\Pi^Y_\mathbf{Y}$ of the possible configurations of $\Delta^Y_\mathbf{Y}$\;
        \FOR{$\pi^Y_\mathbf{Y} \in \Pi^Y_\mathbf{Y}$}
            \STATE Predict $\Prob^{\mathcal{K}}_{\theta^*}\left(Y \given \pi^Y_\mathbf{Y}\right)$ using $\Prob_{\theta^*}\left(Y \given \pi, \mathbf{X}_c \right)$ which are specified by $\bx_d$\;
        \ENDFOR
   \ENDFOR
   \STATE Find a MPE $\hat{\by} \in \mathcal{Y}$ given $\mathcal{K}$ and $\Prob^{\mathcal{K}}_{\theta^*}\left(Y \given \pi^Y_\mathbf{Y}\right)$, $\forall \pi^Y_\mathbf{Y} \in \Pi^Y_\mathbf{Y}$, $\forall Y \in \mathbf{Y}$\;
   \STATE {\bfseries Output:} A BOP $\hat{\by}$ of the Subset $0/1$ loss \eqref{eq:BOP_Subset} \;
   \end{algorithmic}
   \end{algorithm}
   
   
   	\begin{algorithm} [!ht]
	\caption{Find a BOP of the Hamming loss \eqref{eq:BOP_Hamming}}\label{alg:BOP_Hamming}
	\begin{algorithmic}[1]
    \STATE {\bfseries Input:} A GBNC $(G^*, \theta^*) \in \mathcal{P}^3$ of \eqref{eq:Learning_problem_BND_relaxed}: $\Prob_{\theta^*}\left(Y \given \pi, \mathbf{X}_c \right)$, $\forall \pi \in \Pi^Y_d$, $\forall \Delta^Y_d \in \mathcal{F}^Y$,  $\forall Y \in \mathbf{Y}$, a test instance $\bx$ \;
    \STATE Extract the sub-DAG $\mathcal{K}$ over $\mathbf{Y}$ from $G^*$\;
    \FOR{$Y \in \mathbf{Y}$}
    \STATE Extract parent set of $Y$ in $\mathcal{K}$: $\Delta^Y_\mathbf{Y} = \Delta^Y_d \cap \mathbf{Y}$; Form the set $\Pi^Y_\mathbf{Y}$ of the possible configurations of $\Delta^Y_\mathbf{Y}$\;
        \FOR{$\pi^Y_\mathbf{Y} \in \Pi^Y_\mathbf{Y}$}
            \STATE Predict $\Prob^{\mathcal{K}}_{\theta^*}\left(Y \given \pi^Y_\mathbf{Y}\right)$ using $\Prob_{\theta^*}\left(Y \given \pi, \mathbf{X}_c \right)$ which are specified by $\bx_d$\;
        \ENDFOR
   \ENDFOR
   \STATE Find $K$ marginals $\hat{y}^1, \ldots, \hat{y}^K$ given $\mathcal{K}$ and $\Prob^{\mathcal{K}}_{\theta^*}\left(Y \given \pi^Y_\mathbf{Y}\right)$, $\forall \pi^Y_\mathbf{Y} \in \Pi^Y_\mathbf{Y}$, $\forall Y \in \mathbf{Y}$\;
   \STATE {\bfseries Output:} A BOP $\hat{\by} \defi (\hat{y}^1, \ldots, \hat{y}^K)$ of the Hamming loss \eqref{eq:BOP_Hamming} \;
   \end{algorithmic}
   \end{algorithm}

\section{The Case of Partial/Missing Data}\label{sec:Missing_data}

The structural Expectation-Maximization (structural EM) approach has been used in several works on BN learning from missing data \citep{adel2017learning,rancoita2016,friedman1998bayesian}. Recall that, in BN learning with incomplete training data, the structural EM approach \citep{friedman1998bayesian} can be employed to find a pair, consisting of a possible precise/complete data set and a possible BN, which optimizes some given target function. 

The structural EM approach can be implemented as a two-step algorithm, which should be iterated until either the algorithm converges or some stopping criterion is met. 

\begin{itemize}
    \item Expectation step (E): we complete the data by imputing partial/missing data from a fitted BN;
    \item Maximization step (M): we learn a BN by optimizing the given target function over the completed data.
\end{itemize}

In principle, we can adapt the M step of the structural EM approach to the setting of probabilistic MDC straightforwardly. 

However, depending on the concrete type of missing data we are dealing with, handling the E step may require more attention. In the case of partially specified class variables and precise features \citep{wang2021learning}, GBNCs given by Algorithms~\ref{alg:learn_GBNC} and~\ref{alg:learn_GBNC_for_regularized_CLL} are estimates of $\Prob(\cY \vert \cX)$ and can be used to impute partial/missing data during the E step. In the general case where both the features and class variables can be partially specified \citep{hullermeier2014learning,nguyen2021racing}, estimates of $\Prob(\cY \vert \cX)$ alone seem to be inadequate, because estimates of $\Prob(\cX, \cY)$ may be needed for imputation if one wishes to use exact/approximate inference. However, we leave this problem for future work because it is beyond the scope of this paper.    

\section{Experiments}\label{sec:Experiments_appendix}

\subsection{Experimental Setting}\label{sec:Experimental_Setting_appendix}

We evaluate our approaches on both tabular and image data sets. \autoref{tab:stats_tab_data} summarizes the detailed statistics of all tabular data sets, which were originally collected by \citet{jia2021decomposition}. 
From left to right, the columns give the number of class variables (\#CV), the number of states of each class variable (\#States/CV), the number of samples (\#Samples), and the number of features (\#Features), respectively.  Among the 20 tabular data sets, there are three data sets (Adult, Default and Thyroid) which contain mixed features. If all class variables contain the same number of states, only this number is reported; otherwise, the number of states of each class variable is listed. For example, the Flickr data set has five class variables, which have 3, 4, 3, 4, and 4 states, respectively. 

\begin{table}[!ht]
  \centering
\caption{Statistics of the tabular benchmark data sets.}
\label{tab:stats_tab_data}
\begin{tabular}{lcccc}
\toprule 
Data Set     & \#CV & \#States/CV             & \#Samples & \#Features\\ \hline
Edm          & 2    & 3                        & 154      & $16 n$    \\
Jura         & 2    & 4,5                      & 359      & $9 n$     \\
Enb          & 2    & 2,4                      & 768      & $6 n$     \\
Voice        & 2    & 4,2                      & 3136     & $19 n$    \\
Song         & 3    & 3                        & 785      & $98 n$    \\
Adult        & 4    & $7,7,5,2$                & 18419    & $5 n, 5 x$  \\
Default      & 4    & $2,7,4,2$                & 28779    & $14 n, 6 x$ \\ 
Flickr       & 5    & $3,4,3,4,4$              & 12198    & $1536 n$  \\
Fera         & 5    & 6                        & 14052    & $136 n$   \\
\hline
WQplants     & 7    & 4                        & 1060     & $16 n$    \\
WQanimals    & 7    & 4                        & 1060     & $16 n$    \\
Thyroid      & 7    & $5,5,3,2,4,4,3$          & 9172     & $7 n, 22 x$ \\
\hline
Rf1          & 8    & $4,4,3,4,4,3,4,3$        & 8987     & $64 n$    \\
Pain         & 10   & $2,5,4,2,2,5,2,5,2,2$    & 9734     & $136 n$   \\
Disfa        & 12   & $5,5,6,3,4,4,5,4,4,4,6,4$& 13095    & $136 n$   \\
WaterQuality & 14   & 4                        & 1060     & $16 n$    \\
Oes97        & 16   & 3                        & 334      & $263 n$   \\
Oes10        & 16   & 3                        & 403      & $298 n$   \\
Scm20d       & 16   & 4                        & 8966     & $61 n$    \\
Scm1d        & 16   & 4                        & 9803     & $280 n$   \\
\bottomrule
\end{tabular}
\end{table}

We compare two instantiations of \GBNCs{} (\GBNC-S which optimizes \eqref{eq:regularized_CLL} and produces BOP \eqref{eq:BOP_Subset} of $\ell_S$, and \GBNC-H which optimizes \eqref{eq:regularized_CLL} and produces BOP \eqref{eq:BOP_Hamming} of $\ell_H$) with BR, CP, and CCs \citep[Sections II--III]{jia2021decomposition}. Recall that $\text{pen}(\Delta^Y_d, |\mathcal{D}|)$ is the penalty term of the Bayesian Information Criterion
(BIC) \citep{schwarz1978estimating} in our experiments. 

It is known that the chain order of CCs can (significantly) affect their performance and that choosing the best order is one of the toughest problems in learning CCs \citep{read2021classifier}. Although choosing good orders of CCs is not a focus of our work, randomly choosing orders would make CCs too weak. We thus sample 11 orders (the original order of the class variables from the data source and 10 other orders generated randomly) and pick the best chain order in terms of validation performance, i.e., we use 80\% of the training data to learn CCs, pick the most promising order with the highest performance on the validation set consisting of 20\% of the training data, and report its test performance. When running the experiments, we observed that this often improves the performance of CCs, compared to randomly choosing one chain order. We follow the suggestion of \citet{jia2021decomposition} and convert discrete features/variables into continuous variables using one-hot encoding whenever they appear as parts of the input of local classifiers of BR, CP and CCs. While there are other multi-dimensional classifiers with promising predictive performances, such as those of \citet{jia2021decomposition} and \citet{jia2023multi}, we find it hard to interpret such classifiers as probabilistic classifiers. Thus, we do not include them in our experimental comparison, which specifically focuses on probabilistic classifiers. 

For each tabular data set, we do a 10-fold cross-validation, and report the mean and standard deviation of the performance of the classifiers. For the image data set, we do a 3-fold cross-validation, and report the mean and standard deviation of the performance of the classifiers. 

We implement all approaches in Python and use the pgmpy framework \citep{ankan2015pgmpy}. 
We use the PyTorch framework \citep{paszke2019pytorch} to implement neural networks. The source code to replicate experiments is provided as supplementary materials and has been made public at \url{https://github.com/yangyang-pro/probabilistic-mdc}.

\subsection{Results}\label{sec:Results_appendix}

This appendix provides detailed experimental results which are summarized in Section~\ref{sec:Experiments} of the main text.

Hamming losses and their ranks provided by the classifiers are given in Tables~\ref{tab:lr_hs} and~\ref{tab:nb_hs}. Subset 0/1 losses and their ranks provided by the classifiers are given in Tables~\ref{tab:lr_ss} and~\ref{tab:nb_ss}. Scatter plots for the losses provided by pairs of classifiers are given in Figures~\ref{fig:Hamming loss for pairs with LR}--\ref{fig:Subset 0/1 loss for pairs with NB}. Each black point illustrates the losses provided by the classifiers labeled on the horizontal axis and the vertical axis on one data set. The differences provided by pairs of classifiers are illustrated by the horizontal distances between (black) points and the blue line $y = x$. Points lying on the left side of $y = x$ indicate that classifiers labeled on the horizontal axis are better than ones labeled on the vertical axis, and points lying on the right side of $y = x$ indicate that classifiers labeled on the vertical axis are better than ones labeled on the horizontal axis. Points lying far away from $y = x$ suggest visible differences. 


\begin{table}[!ht]
    \centering
    \caption{Hamming loss (mean $\pm$ std.) of each MDC approach (\textbf{base learner: \textit{logistic regression}}).}
    \begin{tabular}{p{5em}|p{7.15em}p{7.45em}p{7.65em}p{7.15em}}
    \toprule
    \multirow{2}{*}{Data Set} & \multicolumn{4}{c}{Hamming loss (in \%)} \\ \cmidrule(r){2-5}
    & \multicolumn{1}{c}{BNCH} & BR & CC & CP  \\ \hline
Edm	& \bfseries	26.54	$	\pm	$	9.57	(1.0)	&	27.65	$	\pm	$	7.95	(3.0)	&	27.23	$	\pm	$	6.95	(2.0)	&	28.23	$	\pm	$	8.10	(4.0) \\
Jura	&	37.32	$	\pm	$	4.48	(2.0)	&	\bfseries 35.51	$	\pm	$	5.06	(1.0)	&	39.81	$	\pm	$	8.10	(3.0)	&	67.84	$	\pm	$	7.23	(4.0) \\
Enb	&	\bfseries 22.20	$	\pm	$	4.59	(1.5)	&	24.16	$	\pm	$	4.44	(3.0)	&	\bfseries 22.20	$	\pm	$	5.29	(1.5)	&	31.77	$	\pm	$	1.89	(4.0) \\
Voice	&	8.21	$	\pm	$	1.18	(3.0)	&	\bfseries 8.08	$	\pm	$	1.05	(1.5)	&	\bfseries 8.08	$	\pm	$	0.67	(1.5)	&	41.69	$	\pm	$	1.30	(4.0) \\
Song	&\bfseries 	24.33	$	\pm	$	2.35	(1.0)	&	25.74	$	\pm	$	2.86	(2.0)	&	26.46	$	\pm	$	5.49	(3.0)	&	49.30	$	\pm	$	3.32	(4.0) \\
Adult	&	32.46	$	\pm	$	0.77	(3.0)	& \bfseries	28.26	$	\pm	$	0.47	(1.0)	&	28.46	$	\pm	$	0.42	(2.0)	&	41.40	$	\pm	$	0.61	(4.0) \\
Default	&	\bfseries 33.29	$	\pm	$	0.42	(1.0)	&	33.48	$	\pm	$	0.59	(3.0)	&	33.39	$	\pm	$	0.42	(2.0)	&	43.94	$	\pm	$	0.31	(4.0) \\

Flickr	&	21.74	$	\pm	$	0.57	(3.0)	&	\bfseries 20.22	$	\pm	$	0.69	(1.0)	&	20.46	$	\pm	$	0.47	(2.0)	&	49.21	$	\pm	$	0.72	(4.0) \\
Fera	&	\bfseries 37.76	$	\pm	$	0.64	(1.0)	&	38.82	$	\pm	$	0.96	(2.0)	&	39.23	$	\pm	$	0.70	(3.0)	&	52.97	$	\pm	$	2.41	(4.0) \\

\hline

WQplants	& \bfseries	34.61	$	\pm	$	1.67	(1.0)	&	34.65	$	\pm	$	2.18	(2.0)	&	35.42	$	\pm	$	2.02	(3.0)	&	38.06	$	\pm	$	3.05	(4.0) \\
WQanimals	&	\bfseries 36.85	$	\pm	$	1.35	(1.0)	&	36.98	$	\pm	$	1.97	(2.0)	&	38.23	$	\pm	$	0.87	(3.0)	&	43.07	$	\pm	$	2.68	(4.0) \\

Thyroid	&	\bfseries 3.38	$	\pm	$	0.14	(1.0)	&	3.52	$	\pm	$	0.19	(3.0)	&	3.44	$	\pm	$	0.17	(2.0)	&	3.89	$	\pm	$	0.16	(4.0) \\

\hline
Rf1	&	\bfseries 9.53	$	\pm	$	0.65	(1.0)	&	16.10	$	\pm	$	0.67	(2.0)	&	16.33	$	\pm	$	0.42	(3.0)	&	36.49	$	\pm	$	0.79	(4.0) \\
Pain	&	\bfseries 4.70	$	\pm	$	0.33	(1.0)	&	4.74	$	\pm	$	0.32	(2.0)	&	4.91	$	\pm	$	0.37	(3.0)	&	5.24	$	\pm	$	0.46	(4.0) \\
Disfa	&	\bfseries 10.30	$	\pm	$	0.36	(1.0)	&	10.58	$	\pm	$	0.37	(2.0)	&	10.63	$	\pm	$	0.30	(3.0)	&	13.08	$	\pm	$	0.60	(4.0) \\
WaterQuality	&	\bfseries 35.53	$	\pm	$	1.24	(1.0)	&	36.10	$	\pm	$	1.12	(2.0)	&	36.50	$	\pm	$	0.95	(3.0)	&	40.92	$	\pm	$	1.58	(4.0) \\
Oes97	&	\bfseries 27.61	$	\pm	$	1.41	(1.0)	&	28.24	$	\pm	$	1.68	(2.0)	&	29.35	$	\pm	$	1.78	(3.0)	&	45.59	$	\pm	$	3.36	(4.0) \\
Oes10	&	19.55	$	\pm	$	1.80	(2.0)	& \bfseries	19.21	$	\pm	$	1.98	(1.0)	&	20.80	$	\pm	$	1.73	(3.0)	&	38.40	$	\pm	$	3.30	(4.0) \\
Scm20d	&	\bfseries 31.45	$	\pm	$	0.84	(1.0)	&	36.04	$	\pm	$	0.71	(2.0)	&	38.15	$	\pm	$	0.96	(3.0)	&	57.87	$	\pm	$	0.65	(4.0) \\
Scm1d	&	\bfseries 18.07	$	\pm	$	0.38	(1.0)	&	23.42	$	\pm	$	0.79	(2.0)	&	25.63	$	\pm	$	0.95	(3.0)	&	55.59	$	\pm	$	0.80	(4.0) \\
\hline 
Ave. rank	&				\bfseries			1.43		&							1.98		&							2.60 		&						4.00	\\
    \bottomrule
    \end{tabular}
    \label{tab:lr_hs}
\end{table}


\begin{table}[!ht]
    \centering
    \caption{Hamming loss (mean $\pm$ std.) of each MDC approach (\textbf{base learner: \textit{Naive Bayes}}).}
    \label{tab:nb_hs}
    \begin{tabular}{p{5em}|p{7.15em}p{7.45em}p{7.65em}p{7.15em}}
    \toprule
    \multirow{2}{*}{Data Set} & \multicolumn{4}{c}{Hamming loss (in \%)} \\ \cmidrule(r){2-5}
    & \multicolumn{1}{c}{BNCH} & BR & CC & CP  \\ \hline
Edm	&	32.17	$	\pm	$	6.55	(2.0)	&	33.08	$	\pm	$	3.67	(3.0)	&	34.19	$	\pm	$	7.64	(4.0)	&	\bfseries 27.88	$	\pm	$	7.41	(1.0) \\
Jura	&	\bfseries 43.01	$	\pm	$	6.83	(1.0)	&	45.40	$	\pm	$	4.45	(3.0)	&	43.45	$	\pm	$	4.45	(2.0)	&	69.06	$	\pm	$	4.25	(4.0) \\
Enb	&	\bfseries 22.60	$	\pm	$	2.21	(1.0)	&	29.62	$	\pm	$	2.81	(3.0)	&	29.49	$	\pm	$	3.18	(2.0)	&	31.25	$	\pm	$	2.91	(4.0) \\
Voice	&	\bfseries 9.34	$	\pm	$	0.85	(1.0)	&	11.62	$	\pm	$	1.50	(2.0)	&	12.42	$	\pm	$	1.75	(3.0)	&	45.54	$	\pm	$	1.52	(4.0) \\
Song	& \bfseries	34.24	$	\pm	$	3.17	(1.0)	&	38.22	$	\pm	$	3.14	(3.0)	&	38.01	$	\pm	$	2.36	(2.0)	&	56.91	$	\pm	$	5.67	(4.0) \\
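% NOTE(review): the BR entry below (76.69 +- 3.14) is identical to the BR entry for Adult in the Naive Bayes Subset 0/1 table; verify that it was not copied by mistake.
Adult	&	44.38	$	\pm	$	1.03	(2.0)	&	76.69	$	\pm	$	3.14	(4.0)	& \bfseries	32.60	$	\pm	$	0.79	(1.0)	&	60.63	$	\pm	$	1.19	(3.0) \\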
Default	&	\bfseries 41.62	$	\pm	$	2.01	(1.0)	&	48.17	$	\pm	$	2.74	(3.0)	&	48.04	$	\pm	$	2.49	(2.0)	&	63.32	$	\pm	$	1.08	(4.0) \\
Flickr	& \bfseries	31.04	$	\pm	$	0.65	(1.0)	&	35.18	$	\pm	$	0.69	(3.0)	&	35.15	$	\pm	$	0.73	(2.0)	&	51.83	$	\pm	$	0.56	(4.0) \\
Fera	& \bfseries	55.50	$	\pm	$	0.76	(1.0)	&	57.54	$	\pm	$	0.38	(2.0)	&	57.82	$	\pm	$	0.38	(3.0)	&	59.11	$	\pm	$	0.55	(4.0) \\

\hline

WQplants	&	52.94	$	\pm	$	4.31	(2.0)	&	60.46	$	\pm	$	3.35	(3.0)	&	62.75	$	\pm	$	3.86	(4.0)	&	\bfseries 49.72	$	\pm	$	2.39	(1.0) \\
WQanimals	&	52.37	$	\pm	$	1.43	(2.0)	&	61.77	$	\pm	$	0.89	(3.0)	&	61.90	$	\pm	$	1.04	(4.0)	&	\bfseries 48.48	$	\pm	$	1.79	(1.0) \\
Thyroid	&	\bfseries 3.50	$	\pm	$	0.35	(1.0)	&	24.01	$	\pm	$	1.68	(4.0)	&	20.03	$	\pm	$	0.68	(3.0)	&	7.82	$	\pm	$	0.34	(2.0) \\

\hline

Rf1	&	\bfseries 17.10	$	\pm	$	0.60	(1.0)	&	23.39	$	\pm	$	0.40	(2.0)	&	23.51	$	\pm	$	0.39	(3.0)	&	37.69	$	\pm	$	0.90	(4.0) \\
Pain	&	22.53	$	\pm	$	0.49	(2.0)	&	34.52	$	\pm	$	1.03	(3.0)	&	39.71	$	\pm	$	2.07	(4.0)	&	\bfseries 18.61	$	\pm	$	0.54	(1.0) \\
Disfa	&	21.19	$	\pm	$	1.03	(2.0)	&	33.44	$	\pm	$	0.57	(3.0)	&	36.89	$	\pm	$	0.68	(4.0)	&	\bfseries 20.09	$	\pm	$	0.22	(1.0) \\
WaterQuality	&	46.91	$	\pm	$	1.88	(2.0)	&	61.26	$	\pm	$	1.20	(3.0)	&	64.45	$	\pm	$	1.25	(4.0)	&	\bfseries 40.65	$	\pm	$	1.57	(1.0)\\
Oes97	& \bfseries	29.47	$	\pm	$	1.95	(1.0)	&	33.26	$	\pm	$	1.60	(2.0)	&	33.29	$	\pm	$	1.62	(3.0)	&	70.07	$	\pm	$	4.16	(4.0)\\
Oes10	& \bfseries	22.71	$	\pm	$	1.55	(1.0)	&	25.65	$	\pm	$	2.15	(2.0)	&	25.87	$	\pm	$	2.08	(3.0)	&	57.32	$	\pm	$	5.16	(4.0) \\
Scm20d	& \bfseries	48.72	$	\pm	$	0.75	(1.0)	&	53.36	$	\pm	$	0.68	(2.0)	&	57.20	$	\pm	$	0.76	(3.0)	&	59.21	$	\pm	$	0.98	(4.0)\\
Scm1d	&	33.94	$	\pm	$	0.67	(2.0)	& \bfseries	33.29	$	\pm	$	0.64	(1.0)	&	34.10	$	\pm	$	0.68	(3.0)	&	57.52	$	\pm	$	1.13	(4.0) \\
\hline
Ave. rank	&			\bfseries				1.40		&							2.70		&							2.95		&							2.95	\\
    \bottomrule
    \end{tabular}
\end{table}

\begin{table}
    \centering
    \caption{Subset 0/1 loss (mean $\pm$ std.) of each MDC approach (\textbf{base learner: \textit{logistic regression}}).}
    \begin{tabular}{p{5em}|p{7.15em}p{7.45em}p{7.65em}p{7.15em}}
    \toprule
    \multirow{2}{*}{Data Set} & \multicolumn{4}{c}{Subset 0/1 loss (in \%)} \\ \cmidrule(r){2-5}
    & \multicolumn{1}{c}{BNCS} & BR & CC & CP  \\ \hline
Edm	& \small 	\bfseries 40.83	$	\pm	$	11.73	(1.0)	&	 \small 47.54	$	\pm	$	12.49	(2.0)	&	 \small 50.54	$	\pm	$	15.58	(3.0)	&	 \small  55.17	$	\pm	$	14.99	(4.0) \\
Jura	&	60.71	$	\pm	$	5.75	(2.0)	& \bfseries	59.05	$	\pm	$	5.86	(1.0)	&	63.45	$	\pm	$	8.29	(3.0)	&	98.33	$	\pm	$	4.16	(4.0) \\
Enb	&	44.27	$	\pm	$	8.86	(2.0)	&	48.32	$	\pm	$	8.88	(3.0)	&	\bfseries 42.96	$	\pm	$	6.33	(1.0)	&	63.54	$	\pm	$	3.78	(4.0) \\
Voice	&	16.07	$	\pm	$	1.81	(3.0)	&	\bfseries 15.69	$	\pm	$	1.98	(1.0)	&	15.85	$	\pm	$	1.61	(2.0)	&	81.44	$	\pm	$	2.32	(4.0) \\
Song	&	\bfseries 57.57	$	\pm	$	4.27	(1.0)	&	60.38	$	\pm	$	4.93	(3.0)	&	58.98	$	\pm	$	5.67	(2.0)	&	94.26	$	\pm	$	2.09	(4.0) \\
Adult	&	75.67	$	\pm	$	0.99	(3.0)	&	72.91	$	\pm	$	1.33	(2.0)	&	\bfseries 71.76	$	\pm	$	0.84	(1.0)	&	86.40	$	\pm	$	0.95	(4.0) \\
Default	&	\bfseries 81.23	$	\pm	$	0.61	(1.0)	&	82.43	$	\pm	$	0.79	(3.0)	&	82.31	$	\pm	$	1.00	(2.0)	&	94.01	$	\pm	$	0.33	(4.0) \\

Flickr	&	70.93	$	\pm	$	1.03	(3.0)	&	67.74	$	\pm	$	1.33	(2.0)	&	\bfseries 67.40	$	\pm	$	1.14	(1.0)	&	95.97	$	\pm	$	0.54	(4.0) \\
Fera	& \bfseries	80.15	$	\pm	$	0.92	(1.0)	&	80.72	$	\pm	$	1.09	(2.0)	&	80.86	$	\pm	$	1.03	(3.0)	&	83.56	$	\pm	$	2.56	(4.0) \\

\hline

WQplants	& \bfseries	90.75	$	\pm	$	2.86	(1.0)	&	90.85	$	\pm	$	2.27	(2.0)	&	91.79	$	\pm	$	2.83	(3.0)	&	92.36	$	\pm	$	3.88	(4.0) \\
WQanimals	&	95.19	$	\pm	$	2.62	(2.0)	&	95.47	$	\pm	$	2.22	(3.0)	&	\bfseries 95.09	$	\pm	$	3.01	(1.0)	&	97.83	$	\pm	$	2.15	(4.0) \\
Thyroid	&	\bfseries 21.63	$	\pm	$	0.94	(1.0)	&	22.76	$	\pm	$	1.32	(2.0)	&	23.15	$	\pm	$	1.41	(3.0)	&	25.29	$	\pm	$	1.16	(4.0) \\

\hline

Rf1	&	\bfseries 47.51	$	\pm	$	2.11	(1.0)	&	70.88	$	\pm	$	1.69	(2.0)	&	72.36	$	\pm	$	1.92	(3.0)	&	93.58	$	\pm	$	0.81	(4.0) \\
Pain	& \bfseries	24.07	$	\pm	$	1.50	(1.0)	&	24.74	$	\pm	$	1.41	(2.0)	&	24.76	$	\pm	$	1.40	(3.0)	&	24.86	$	\pm	$	1.55	(4.0) \\
Disfa	& \bfseries	60.24	$	\pm	$	1.42	(1.0)	&	60.96	$	\pm	$	1.15	(3.0)	&	60.51	$	\pm	$	1.50	(2.0)	&	63.40	$	\pm	$	1.62	(4.0) \\
WaterQuality	&	99.43	$	\pm	$	0.46	(2.0)	&	\bfseries 99.06	$	\pm	$	0.60	(1.0)	&	99.72	$	\pm	$	0.43	(3.5)	&	99.72	$	\pm	$	0.43	(3.5) \\
Oes97	& \bfseries	95.24	$	\pm	$	4.02	(1.0)	&	95.84	$	\pm	$	3.53	(2.0)	&	97.03	$	\pm	$	3.73	(3.0)	&	100	$	\pm	$	0.00	(4.0) \\
Oes10	&	90.58	$	\pm	$	3.45	(2.0)	&	\bfseries 90.33	$	\pm	$	4.62	(1.0)	&	92.30	$	\pm	$	3.26	(3.0)	&	100	$	\pm	$	0.00	(4.0) \\
Scm20d	& \bfseries	87.79	$	\pm	$	1.25	(1.0)	&	95.46	$	\pm	$	0.96	(4.0)	&	93.89	$	\pm	$	1.29	(3.0)	&	92.58	$	\pm	$	0.98	(2.0) \\
Scm1d	& \bfseries	80.75	$	\pm	$	1.02	(1.0)	&	89.86	$	\pm	$	2.17	(3.0)	&	88.81	$	\pm	$	0.88	(2.0)	&	90.81	$	\pm	$	1.01	(4.0) \\

\hline
Ave. rank		&			\bfseries			1.55		&							2.20		&							2.38		&							3.88	\\
    \bottomrule
    \end{tabular}
    \label{tab:lr_ss}
\end{table}


\begin{table}
    \centering
    \caption{Subset 0/1 loss (mean $\pm$ std.) of each MDC approach (\textbf{base learner: \textit{Naive Bayes}}).}
    \label{tab:nb_ss}
    \begin{tabular}{p{5em}|p{7.15em}p{7.45em}p{7.65em}p{7.15em}}
    \toprule
    \multirow{2}{*}{Data Set} & \multicolumn{4}{c}{Subset 0/1 loss (in \%)} \\ \cmidrule(r){2-5}
    & \multicolumn{1}{c}{BNCS} & BR & CC & CP  \\ \hline
Edm	& \bfseries 48.79	$	\pm	$	8.25	(1.0)	&	 57.12	$	\pm	$	4.34	(4.0)	& 52.13	$	\pm	$	9.22	(2.0)	&	\small 55.08	$	\pm	$	15.36	(3.0) \\
Jura	&	65.44	$	\pm	$	7.07	(2.0)	&	69.33	$	\pm	$	5.45	(3.0)	& \bfseries	64.05	$	\pm	$	6.26	(1.0)	&	98.89	$	\pm	$	1.36	(4.0) \\
Enb	& \bfseries	45.20	$	\pm	$	4.43	(1.0)	&	59.25	$	\pm	$	5.62	(3.0)	&	58.99	$	\pm	$	6.36	(2.0)	&	62.51	$	\pm	$	5.81	(4.0) \\
Voice	&	\bfseries 17.92	$	\pm	$	1.23	(1.0)	&	21.62	$	\pm	$	2.46	(2.0)	&	22.71	$	\pm	$	2.94	(3.0)	&	84.82	$	\pm	$	2.11	(4.0) \\
Song	&	\bfseries 70.72	$	\pm	$	4.57	(1.0)	&	78.60	$	\pm	$	4.13	(3.0)	&	77.71	$	\pm	$	3.23	(2.0)	&	93.63	$	\pm	$	3.26	(4.0) \\
Adult	&	92.75	$	\pm	$	0.68	(3.0)	&	76.69	$	\pm	$	3.14	(2.0)	& \bfseries	74.65	$	\pm	$	2.80	(1.0)	&	98.46	$	\pm	$	0.35	(4.0) \\
Default	& \bfseries	92.11	$	\pm	$	1.89	(1.0)	&	96.10	$	\pm	$	2.23	(2.0)	&	96.47	$	\pm	$	1.91	(3.0)	&	99.94	$	\pm	$	0.03	(4.0) \\
Flickr	& \bfseries	82.94	$	\pm	$	1.33	(1.0)	&	86.03	$	\pm	$	1.11	(3.0)	&	85.89	$	\pm	$	1.08	(2.0)	&	96.56	$	\pm	$	0.55	(4.0) \\
Fera	&	\bfseries 93.34	$	\pm	$	0.37	(1.0)	&	97.94	$	\pm	$	0.35	(3.0)	&	98.04	$	\pm	$	0.31	(4.0)	&	95.69	$	\pm	$	0.53	(2.0) \\

\hline

WQplants	& \bfseries	99.15	$	\pm	$	0.66	(1.5)	&	100	$	\pm	$	0.00	(4.0)	&	99.91	$	\pm	$	0.28	(3.0)	&	\bfseries 99.15	$	\pm	$	0.89	(1.5) \\
WQanimals	& \bfseries	99.15	$	\pm	$	0.66	(1.0)	&	99.62	$	\pm	$	0.86	(3.5)	&	99.43	$	\pm	$	0.96	(2.0)	&	99.62	$	\pm	$	0.63	(3.5) \\
Thyroid	& \bfseries	20.39	$	\pm	$	1.92	(1.0)	&	90.96	$	\pm	$	0.95	(4.0)	&	88.79	$	\pm	$	1.15	(3.0)	&	47.24	$	\pm	$	2.33	(2.0)\\

\hline

Rf1	&	\bfseries 69.12	$	\pm	$	1.09	(1.0)	&	83.82	$	\pm	$	0.97	(2.5)	&	83.82	$	\pm	$	0.65	(2.5)	&	93.05	$	\pm	$	0.60	(4.0) \\
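% NOTE(review): BR (93.08) has a higher loss than CP (91.56), yet BR is ranked 3.0 and CP 4.0; check the losses or the ranks (the Ave. rank row follows the ranks as printed).
Pain	&	89.14	$	\pm	$	0.77	(2.0)	&	93.08	$	\pm	$	0.80	(3.0)	& \bfseries	87.49	$	\pm	$	1.38	(1.0)	&	91.56	$	\pm	$	1.10	(4.0) \\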
Disfa	& \bfseries	88.62	$	\pm	$	2.03	(1.0)	&	99.66	$	\pm	$	0.14	(4.0)	&	99.47	$	\pm	$	0.17	(3.0)	&	94.58	$	\pm	$	1.02	(2.0) \\
WaterQuality	&	100	$	\pm	$	0.00	(3.0)	&	100	$	\pm	$	0.00	(3.0)	&	100	$	\pm	$	0.00	(3.0)	& \bfseries	99.72	$	\pm	$	0.43	(1.0) \\
Oes97	&	96.69	$	\pm	$	3.12	(3.0)	& \bfseries	94.30	$	\pm	$	4.50	(1.0)	&	94.60	$	\pm	$	4.36	(2.0)	&	100	$	\pm	$	0.00	(4.0) \\
Oes10	&	91.79	$	\pm	$	3.91	(2.0)	& \bfseries	91.54	$	\pm	$	4.09	(1.0)	&	92.53	$	\pm	$	4.21	(3.0)	&	99.01	$	\pm	$	1.22	(4.0) \\
Scm20d	&	98.63	$	\pm	$	0.32	(4.0)	&	98.27	$	\pm	$	0.30	(3.0)	&	97.60	$	\pm	$	0.43	(2.0)	&	\bfseries 96.07	$	\pm	$	0.43	(1.0) \\
Scm1d	&	95.54	$	\pm	$	1.25	(3.0)	&	91.66	$	\pm	$	0.70	(2.0)	&	\bfseries 91.03	$	\pm	$	0.91	(1.0)	&	96.59	$	\pm	$	0.58	(4.0) \\
\hline
Ave. rank	&			\bfseries				1.73		&							2.80		&							2.28		&							3.20	\\
    \bottomrule
    \end{tabular}
\end{table}

\begin{figure}[!ht]
\centering
\begin{tabular}{cc}
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BNCH,
ylabel = BR]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0.02,0.02) (0.4,0.4) };
\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2654	0.2765	w
0.3732	0.3551	b
0.2220	0.2416	c
0.0821	0.0808	d
0.2433	0.2574	e
0.2174	0.2022	f
0.3776	0.3882	g
0.3461	0.3465	h
0.3685	0.3698	i
0.0953	0.1610	j
0.0470	0.0474	k
0.1030	0.1058	l
0.3553	0.3610	m
0.2761	0.2824	n
0.1955	0.1921	o
0.3145	0.3604	p
0.1807	0.2342	q
0.3246	0.2826	r
0.3329	0.3348	s
0.0338	0.0352	t
    };
\end{axis}
\end{tikzpicture} 
&
%%%%%%%%%%%%%%%%%%%%%% 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\centering
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BR,
ylabel = CC]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0.02,0.02) (0.42,0.42) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2765	0.2723	w
0.3551	0.3981	b
0.2416	0.2220	c
0.0808	0.0808	d
0.2574	0.2646	e
0.2022	0.2046	f
0.3882	0.3923	g
0.3465	0.3542	h
0.3698	0.3823	i
0.1610	0.1633	j
0.0474	0.0491	k
0.1058	0.1063	l
0.3610	0.3650	m
0.2824	0.2935	n
0.1921	0.2080	o
0.3604	0.3815	p
0.2342	0.2563	q
0.2826	0.2846	r
0.3348	0.3339	s
0.0352	0.0344	t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BNCH,
ylabel = CC]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0.02,0.02) (0.4,0.4) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2654	0.2723	w
0.3732	0.3981	b
0.2220	0.2220	c
0.0821	0.0808	d
0.2433	0.2646	e
0.2174	0.2046	f
0.3776	0.3923	g
0.3461	0.3542	h
0.3685	0.3823	i
0.0953	0.1633	j
0.0470	0.0491	k
0.1030	0.1063	l
0.3553	0.3650	m
0.2761	0.2935	n
0.1955	0.2080	o
0.3145	0.3815	p
0.1807	0.2563	q
0.3246	0.2846	r
0.3329	0.3339	s
0.0338	0.0344	t
    };
\end{axis}
\end{tikzpicture} 
&
%%%%%%%%%%%%%%%%%%%%%% 4 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BR,
ylabel = CP]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0,0) (0.7,0.7) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2765	0.2823	w
0.3551	0.6784	b
0.2416	0.3177	c
0.0808	0.4169	d
0.2574	0.4930	e
0.2022	0.4921	f
0.3882	0.5297	g
0.3465	0.3806	h
0.3698	0.4307	i
0.1610	0.3649	j
0.0474	0.0524	k
0.1058	0.1308	l
0.3610	0.4092	m
0.2824	0.4559	n
0.1921	0.3840	o
0.3604	0.5787	p
0.2342	0.5559	q
0.2826	0.4140	r
0.3348	0.4394	s
0.0352	0.0389	t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BNCH,
ylabel = CP]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0,0) (0.7,0.7) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2654	0.2823	w
0.3732	0.6784	b
0.2220	0.3177	c
0.0821	0.4169	d
0.2433	0.4930	e
0.2174	0.4921	f
0.3776	0.5297	g
0.3461	0.3806	h
0.3685	0.4307	i
0.0953	0.3649	j
0.0470	0.0524	k
0.1030	0.1308	l
0.3553	0.4092	m
0.2761	0.4559	n
0.1955	0.3840	o
0.3145	0.5787	p
0.1807	0.5559	q
0.3246	0.4140	r
0.3329	0.4394	s
0.0338	0.0389	t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = CC,
ylabel = CP]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0,0) (0.7,0.7) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.2723	0.2823	w
0.3981	0.6784	b
0.2220	0.3177	c
0.0808	0.4169	d
0.2646	0.4930	e
0.2046	0.4921	f
0.3923	0.5297	g
0.3542	0.3806	h
0.3823	0.4307	i
0.1633	0.3649	j
0.0491	0.0524	k
0.1063	0.1308	l
0.3650	0.4092	m
0.2935	0.4559	n
0.2080	0.3840	o
0.3815	0.5787	p
0.2563	0.5559	q
0.2846	0.4140	r
0.3339	0.4394	s
0.0344	0.0389	t
    };
\end{axis}
\end{tikzpicture}
\\
\end{tabular}
\caption{Hamming loss (\textbf{base learner: \textit{Logistic regression}})}
\label{fig:Hamming loss for pairs with LR}
\end{figure}


\begin{figure}[!ht]
%\captionsetup[subfigure]{labelformat=empty}
\centering
\begin{tabular}{cc}
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BNCS,
ylabel = BR]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0.13,0.13) (1,1) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.4083	0.4754	w
0.6071	0.5905	b
0.4427	0.4832	c
0.1607	0.1569	d
0.5757	0.6038	e
0.7093	0.6774	f
0.8015	0.8072	g
0.9075	0.9085	h
0.9519	0.9547	i
0.4751	0.7088	j
0.2407	0.2474	k
0.6024	0.6096	l
0.9943	0.9906	m
0.9524	0.9584	n
0.9058	0.9033	o
0.8779	0.9546	p
0.8075	0.8986	q
0.7567	0.7291	r
0.8123	0.8243	s
0.2163	0.2276	t
    };
\end{axis}
\end{tikzpicture} 
&
%%%%%%%%%%%%%%%%%%%%%% 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale = 1]
\begin{axis}[%
scatter/classes={scatter src = explicit symbolic,%
    w={mark=*,draw=black},
    b={mark=*,draw=black},
    c={mark=*,draw=black},
    d={mark=*,draw=black},
    e={mark=*,draw=black},
    f={mark=*,draw=black},
    g={mark=*,draw=black},
    h={mark=*,draw=black},
    i={mark=*,draw=black},
    j={mark=*,draw=black},
    k={mark=*,draw=black},
    l={mark=*,draw=black},
    m={mark=*,draw=black},
    n={mark=*,draw=black},
    o={mark=*,draw=black},
    p={mark=*,draw=black},
    q={mark=*,draw=black},
    r={mark=*,draw=black},
    s={mark=*,draw=black},
    t={mark=*,draw=black}}, xlabel = BR,
ylabel = CC]
\addplot[draw=blue,pattern=horizontal lines light blue]
 coordinates { (0.13,0.13) (1,1) };

\addplot[scatter,only marks,%
    scatter src=explicit symbolic]%
table[meta=label] {
x       y       label
0.4754	0.5054	w
0.5905	0.6345	b
0.4832	0.4296	c
0.1569	0.1585	d
0.6038	0.5898	e
0.6774	0.6740	f
0.8072	0.8086	g
0.9085	0.9179	h
0.9547	0.9509	i
0.7088	0.7236	j
0.2474	0.2476	k
0.6096	0.6051	l
0.9906	0.9972	m
0.9584	0.9703	n
0.9033	0.9230	o
0.9546	0.9389	p
0.8986	0.8881	q
0.7291	0.7176	r
0.8243	0.8231	s
0.2276	0.2315	t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCS on the x-axis against CC on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCS,
    ylabel=CC,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.13,0.13) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x       y       label
0.4083  0.5054  w
0.6071  0.6345  b
0.4427  0.4296  c
0.1607  0.1585  d
0.5757  0.5898  e
0.7093  0.6740  f
0.8015  0.8086  g
0.9075  0.9179  h
0.9519  0.9509  i
0.4751  0.7236  j
0.2407  0.2476  k
0.6024  0.6051  l
0.9943  0.9972  m
0.9524  0.9703  n
0.9058  0.9230  o
0.8779  0.9389  p
0.8075  0.8881  q
0.7567  0.7176  r
0.8123  0.8231  s
0.2163  0.2315  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 4 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BR on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BR,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.1,0.1) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x       y       label
0.4754  0.5517  w
0.5905  0.9833  b
0.4832  0.6354  c
0.1569  0.8144  d
0.6038  0.9426  e
0.6774  0.9597  f
0.8072  0.8356  g
0.9085  0.9236  h
0.9547  0.9783  i
0.7088  0.9358  j
0.2474  0.2486  k
0.6096  0.6340  l
0.9906  0.9972  m
0.9584  1.0000  n
0.9033  1.0000  o
0.9546  0.9258  p
0.8986  0.9081  q
0.7291  0.8640  r
0.8243  0.9401  s
0.2276  0.2529  t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCS on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCS,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.15,0.15) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x       y       label
0.4083  0.5517  w
0.6071  0.9833  b
0.4427  0.6354  c
0.1607  0.8144  d
0.5757  0.9426  e
0.7093  0.9597  f
0.8015  0.8356  g
0.9075  0.9236  h
0.9519  0.9783  i
0.4751  0.9358  j
0.2407  0.2486  k
0.6024  0.6340  l
0.9943  0.9972  m
0.9524  1.0000  n
0.9058  1.0000  o
0.8779  0.9258  p
0.8075  0.9081  q
0.7567  0.8640  r
0.8123  0.9401  s
0.2163  0.2529  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: CC on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=CC,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.1,0.1) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x       y       label
0.5054  0.5517  w
0.6345  0.9833  b
0.4296  0.6354  c
0.1585  0.8144  d
0.5898  0.9426  e
0.6740  0.9597  f
0.8086  0.8356  g
0.9179  0.9236  h
0.9509  0.9783  i
0.7236  0.9358  j
0.2476  0.2486  k
0.6051  0.6340  l
0.9972  0.9972  m
0.9703  1.0000  n
0.9230  1.0000  o
0.9389  0.9258  p
0.8881  0.9081  q
0.7176  0.8640  r
0.8231  0.9401  s
0.2315  0.2529  t
    };
\end{axis}
\end{tikzpicture}
\\
\end{tabular}
\caption{Subset 0/1 loss (\textbf{base learner: \textit{Logistic regression}})}
\label{fig:Subset 0/1 loss for pairs with LR}
\end{figure}

\begin{figure}[!ht]
%\captionsetup[subfigure]{labelformat=empty}
\centering
\begin{tabular}{cc}
\begin{tikzpicture}[scale=1]
% Panel: BNCH on the x-axis against BR on the y-axis, one mark per table row.
% NOTE(review): row r has BR = 0.766925106, the exact value plotted for r/BR in
% the Subset 0/1 loss (Naive Bayes) figure -- confirm this entry is not a
% copy-paste error.
\begin{axis}[
    xlabel=BNCH,
    ylabel=BR,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.02,0.02) (.8,.8)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.321666667  0.330833333  w
0.430119048  0.453968254  b
0.22598257   0.296232057  c
0.093432673  0.116238477  d
0.342367197  0.382197339  e
0.310443753  0.351827499  f
0.554967283  0.575405254  g
0.529380054  0.60458221   h
0.523719677  0.617654987  i
0.170954829  0.233893791  j
0.225313305  0.345232035  k
0.21193929   0.334365403  l
0.469137466  0.612601078  m
0.294747103  0.332586898  n
0.22710747   0.256543445  o
0.487201907  0.533571401  p
0.339368148  0.332947729  q
0.443807455  0.766925106  r
0.416223095  0.481687082  s
0.035028159  0.240079968  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BR on the x-axis against CC on the y-axis, one mark per table row.
% NOTE(review): row r has BR = 0.766925106, the exact value plotted for r/BR in
% the Subset 0/1 loss (Naive Bayes) figure -- confirm this entry is not a
% copy-paste error.
\begin{axis}[
    xlabel=BR,
    ylabel=CC,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.1,0.1) (.8,.8)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.330833333  0.341875     w
0.453968254  0.434484127  b
0.296232057  0.294933356  c
0.116238477  0.124210435  d
0.382197339  0.380071405  e
0.351827499  0.351483102  f
0.575405254  0.578180751  g
0.60458221   0.627493261  h
0.617654987  0.619002695  i
0.233893791  0.2351037    j
0.345232035  0.397136178  k
0.334365403  0.368945935  l
0.612601078  0.644541779  m
0.332586898  0.332948975  n
0.256543445  0.258708079  o
0.533571401  0.571951403  p
0.332947729  0.34099991   q
0.766925106  0.325981519  r
0.481687082  0.480410271  s
0.240079968  0.200284541  t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCH on the x-axis against CC on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCH,
    ylabel=CC,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.02,0.02) (.65,.65)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.321666667  0.341875     w
0.430119048  0.434484127  b
0.22598257   0.294933356  c
0.093432673  0.124210435  d
0.342367197  0.380071405  e
0.310443753  0.351483102  f
0.554967283  0.578180751  g
0.529380054  0.627493261  h
0.523719677  0.619002695  i
0.170954829  0.2351037    j
0.225313305  0.397136178  k
0.21193929   0.368945935  l
0.469137466  0.644541779  m
0.294747103  0.332948975  n
0.22710747   0.258708079  o
0.487201907  0.571951403  p
0.339368148  0.34099991   q
0.443807455  0.325981519  r
0.416223095  0.480410271  s
0.035028159  0.200284541  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 4 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BR on the x-axis against CP on the y-axis, one mark per table row.
% NOTE(review): row r has BR = 0.766925106, the exact value plotted for r/BR in
% the Subset 0/1 loss (Naive Bayes) figure -- confirm this entry is not a
% copy-paste error.
\begin{axis}[
    xlabel=BR,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.1,0.1) (.8,.8)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.330833333  0.27875      w
0.453968254  0.690634921  b
0.296232057  0.31254272   c
0.116238477  0.455363139  d
0.382197339  0.569133398  e
0.351827499  0.518297785  f
0.575405254  0.591060944  g
0.60458221   0.497169811  h
0.617654987  0.484770889  i
0.233893791  0.376948496  j
0.345232035  0.18610062   k
0.334365403  0.200948406  l
0.612601078  0.406469003  m
0.332586898  0.700729724  n
0.256543445  0.573193598  o
0.533571401  0.59207667   p
0.332947729  0.575226601  q
0.766925106  0.6062903    r
0.481687082  0.633161969  s
0.240079968  0.078188256  t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCH on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCH,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.02,0.02) (.65,.65)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.321666667  0.27875      w
0.430119048  0.690634921  b
0.22598257   0.31254272   c
0.093432673  0.455363139  d
0.342367197  0.569133398  e
0.310443753  0.518297785  f
0.554967283  0.591060944  g
0.529380054  0.497169811  h
0.523719677  0.484770889  i
0.170954829  0.376948496  j
0.225313305  0.18610062   k
0.21193929   0.200948406  l
0.469137466  0.406469003  m
0.294747103  0.700729724  n
0.22710747   0.573193598  o
0.487201907  0.59207667   p
0.339368148  0.575226601  q
0.443807455  0.6062903    r
0.416223095  0.633161969  s
0.035028159  0.078188256  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: CC on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=CC,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.15,0.15) (.81,.81)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.341875     0.27875      w
0.434484127  0.690634921  b
0.294933356  0.31254272   c
0.124210435  0.455363139  d
0.380071405  0.569133398  e
0.351483102  0.518297785  f
0.578180751  0.591060944  g
0.627493261  0.497169811  h
0.619002695  0.484770889  i
0.2351037    0.376948496  j
0.397136178  0.18610062   k
0.368945935  0.200948406  l
0.644541779  0.406469003  m
0.332948975  0.700729724  n
0.258708079  0.573193598  o
0.571951403  0.59207667   p
0.34099991   0.575226601  q
0.325981519  0.6062903    r
0.480410271  0.633161969  s
0.200284541  0.078188256  t
    };
\end{axis}
\end{tikzpicture}
\end{tabular}
\caption{Hamming loss (\textbf{base learner: \textit{Naive Bayes}})}
\label{fig:Hamming loss for pairs with NB}
\end{figure}

\begin{figure}[!ht]
%\captionsetup[subfigure]{labelformat=empty}
\centering
\begin{tabular}{cc}
\begin{tikzpicture}[scale=1]
% Panel: BNCS on the x-axis against BR on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCS,
    ylabel=BR,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.15,0.15) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.487916667  0.57125      w
0.654365079  0.693333333  b
0.45196514   0.592464115  c
0.179210842  0.216211514  d
0.707189224  0.786043492  e
0.829399198  0.860303931  f
0.933390148  0.97936262   g
0.991509434  1            h
0.991509434  0.996226415  i
0.691218528  0.838213581  j
0.891411435  0.930757875  k
0.886218896  0.996563777  l
1            1            m
0.966934046  0.942959002  n
0.917926829  0.915365854  o
0.9862824    0.982711842  p
0.955422934  0.916556825  q
0.927466278  0.766925106  r
0.921122605  0.961047238  s
0.20387714   0.909617774  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BR on the x-axis against CC on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BR,
    ylabel=CC,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.2,0.2) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.57125      0.52125      w
0.693333333  0.64047619   b
0.592464115  0.589866712  c
0.216211514  0.227054801  d
0.786043492  0.777052905  e
0.860303931  0.858910018  f
0.97936262   0.980430084  g
1            0.999056604  h
0.996226415  0.994339623  i
0.838213581  0.838212218  j
0.930757875  0.874866572  k
0.996563777  0.994731017  l
1            1            m
0.942959002  0.945989305  n
0.915365854  0.925304878  o
0.982711842  0.976020017  p
0.916556825  0.910333583  q
0.766925106  0.746456807  r
0.961047238  0.964660847  s
0.909617774  0.887922158  t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCS on the x-axis against CC on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCS,
    ylabel=CC,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.15,0.15) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.487916667  0.52125      w
0.654365079  0.64047619   b
0.45196514   0.589866712  c
0.179210842  0.227054801  d
0.707189224  0.777052905  e
0.829399198  0.858910018  f
0.933390148  0.980430084  g
0.991509434  0.999056604  h
0.991509434  0.994339623  i
0.691218528  0.838212218  j
0.891411435  0.874866572  k
0.886218896  0.994731017  l
1            1            m
0.966934046  0.945989305  n
0.917926829  0.925304878  o
0.9862824    0.976020017  p
0.955422934  0.910333583  q
0.927466278  0.746456807  r
0.921122605  0.964660847  s
0.20387714   0.887922158  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 4 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BR on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BR,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.2,0.2) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.57125      0.550833333  w
0.693333333  0.988888889  b
0.592464115  0.625085441  c
0.216211514  0.848214322  d
0.786043492  0.936335605  e
0.860303931  0.965567786  f
0.97936262   0.956944766  g
1            0.991509434  h
0.996226415  0.996226415  i
0.838213581  0.93045527   j
0.930757875  0.915554468  k
0.996563777  0.945781466  l
1            0.997169811  m
0.942959002  1            n
0.915365854  0.990060976  o
0.982711842  0.96074141   p
0.916556825  0.965928665  q
0.766925106  0.984581268  r
0.961047238  0.9994093    s
0.909617774  0.472416091  t
    };
\end{axis}
\end{tikzpicture}
\\
\\
\\
%%%%%%%%%%%%%%%%%%%%%% 5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: BNCS on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=BNCS,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.15,0.15) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.487916667  0.550833333  w
0.654365079  0.988888889  b
0.45196514   0.625085441  c
0.179210842  0.848214322  d
0.707189224  0.936335605  e
0.829399198  0.965567786  f
0.933390148  0.956944766  g
0.991509434  0.991509434  h
0.991509434  0.996226415  i
0.691218528  0.93045527   j
0.891411435  0.915554468  k
0.886218896  0.945781466  l
1            0.997169811  m
0.966934046  1            n
0.917926829  0.990060976  o
0.9862824    0.96074141   p
0.955422934  0.965928665  q
0.927466278  0.984581268  r
0.921122605  0.9994093    s
0.20387714   0.472416091  t
    };
\end{axis}
\end{tikzpicture}
&
%%%%%%%%%%%%%%%%%%%%%% 6 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tikzpicture}[scale=1]
% Panel: CC on the x-axis against CP on the y-axis, one mark per table row.
\begin{axis}[
    xlabel=CC,
    ylabel=CP,
    scatter/classes={scatter src = explicit symbolic,
        w={mark=*,draw=black}, b={mark=*,draw=black}, c={mark=*,draw=black}, d={mark=*,draw=black},
        e={mark=*,draw=black}, f={mark=*,draw=black}, g={mark=*,draw=black}, h={mark=*,draw=black},
        i={mark=*,draw=black}, j={mark=*,draw=black}, k={mark=*,draw=black}, l={mark=*,draw=black},
        m={mark=*,draw=black}, n={mark=*,draw=black}, o={mark=*,draw=black}, p={mark=*,draw=black},
        q={mark=*,draw=black}, r={mark=*,draw=black}, s={mark=*,draw=black}, t={mark=*,draw=black}}]
% Reference diagonal y = x; rows plotted above it have the larger y-axis value.
\addplot[draw=blue,pattern=horizontal lines light blue]
    coordinates {(0.2,0.2) (1,1)};
% The third column (label) selects the scatter class used for each mark.
\addplot[scatter,only marks,scatter src=explicit symbolic]
    table[meta=label] {
x            y            label
0.52125      0.550833333  w
0.64047619   0.988888889  b
0.589866712  0.625085441  c
0.227054801  0.848214322  d
0.777052905  0.936335605  e
0.858910018  0.965567786  f
0.980430084  0.956944766  g
0.999056604  0.991509434  h
0.994339623  0.996226415  i
0.838212218  0.93045527   j
0.874866572  0.915554468  k
0.994731017  0.945781466  l
1            0.997169811  m
0.945989305  1            n
0.925304878  0.990060976  o
0.976020017  0.96074141   p
0.910333583  0.965928665  q
0.746456807  0.984581268  r
0.964660847  0.9994093    s
0.887922158  0.472416091  t
    };
\end{axis}
\end{tikzpicture}
\end{tabular}
\caption{Subset 0/1 loss (\textbf{base learner: \textit{Naive Bayes}})}
\label{fig:Subset 0/1 loss for pairs with NB}
\end{figure}

\bibliography{uai2023-template}

\end{document}
