\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools, amsthm} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xspace}
\usepackage{bm} % bold math symbols
\usepackage[ruled,vlined,linesnumbered]{algorithm2e}
\usepackage{caption}
\usepackage{subcaption}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

% \newtheorem{hypothesis}{Hypothesis}
%% Provided macros

% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\input{macros}

\title{Sample Boosting Algorithm (SamBA) - An Interpretable Greedy Ensemble Classifier Based On Local Expertise For Fat Data\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<baptiste.bauvin@lis-lab.fr>?Subject=Your UAI 2023 paper}{Baptiste~Bauvin}{}}
% \author[1]{Harry~Q.~Bovik}
\author[2]{Cécile~Capponi}
\author[3]{Florence~Clerc}
\author[1]{Pascal~Germain}
\author[2]{Sokol~Koço}
\author[4]{Jacques~Corbeil}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
	Computer Science and Software Engineering Dept.\\
	Laval University\\
	Qu\'ebec, QC, Canada
}
\affil[2]{%
	Laboratoire d'Informatique et Systèmes\\
	Aix-Marseille University \\
	Marseille, France
}
\affil[3]{%
	Department of Computer Science\\
	McGill University\\
	Montreal, QC, Canada
}
% \affil[4]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
\affil[4]{%
	Molecular Medicine Dept.\\
	Laval University\\
	Qu\'ebec, QC, Canada
}



  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle



\appendix

\section{Re-Writing Adaboost with Local Knowledge}

In this section, we show how Adaboost fits our generalized algorithm (see Algorithm \ref{supp:alg:skeleton}). We then rewrite the Adaboost algorithm to explicitly fit that generalized skeleton in Algorithm \ref{supp:alg:adasmb}.

%In this section, we propose the pseudo-code of Adaboost, re-written to respect the skeleton provided in the main text, that we remind here in Algorithm \ref{supp:alg:skeleton}. 

\begin{algorithm}[t]
	\SetAlgoLined
 \footnotesize
	\textbf{Iterations} : $\T$ ; 
	\textbf{Train data} : $\train = \{(x_i, y_i)\}_{i=1}^m$ ; \textbf{Voter space} : $\voters$ ; $\eF_0 = \emptyset$ ; \textbf{Prior distribution} : $\prior$\\
	$\wei_{1} \leftarrow \prior$ \hfill {\scriptsize$\#$  Prior distribution }\\
	% $\voter_1 \leftarrow \argmaxlim{\voter \in \voters}\left[\mgs(\voter,\wei_{1}(i))\right]$ \hfill {\scriptsize$\#$ Learn the best margin voter}\\
	% $\dis_{1,i} \leftarrow \disf\left(\voter_1, (x_i, y_i)\right))$ \hfill {\scriptsize$\#$ Compute its relevance on each sample}\\
	% $\eF_{1} \leftarrow \left\{(\voter_1, \dis_{1})\right\}$  \hfill {\scriptsize$\#$ Store the voter and relevance}\\
	\For{$t = 1 .. T$}{
		$\voter_t \leftarrow \argmaxlim{\voter \in \voters}\left[\mgs(\voter,\wei_{t})\right]$ \hfill {\scriptsize$\#$ Learn the best voter on them}\\
		$\dis_{t}[i] \leftarrow \disf\left(\voter_t, (x_i, y_i)\right)$ \hfill {\scriptsize$\#$ Compute its relevance on each sample}\\
		$\eF_{t} \leftarrow \eF_{t-1} \bigcup \left\{(\voter_t, \dis_{t})\right\}$  \hfill {\scriptsize$\#$ Store the voter and relevance}\\
        $\wei_{t+1}(i) \leftarrow \frac{\weiti \weif(\voter_{t}, \xyi)}{\sumlims{i=1}{m}\weiti \weif(\voter_{t}, \xyi)}$   \hfill {\scriptsize$\#$ Find the difficult samples}\\
	}
	\KwResult{$\F^{\eF_T}(\cdot) = \sumlims{t=1}{T}  \voter_t(\hx)  \hdis_{\train}^{\eF_T}(\voter_t,\cdot)$}
	\caption{A general skeleton for boosting with local expertise.}
	\label{supp:alg:skeleton}
\end{algorithm}

%To do so, we provide the following definition for the abstract functions. 

In order to derive an ensemble classifier from the generalized framework presented in the main paper, we have to give explicit values to the ``abstract'' functions:
\begin{itemize}
	\item The relevance function is defined as $\disf(\voter_t, \xyi) = \one{\voter_{t}(x_i) = y_i}$
	\item The difficulty function is defined as $\weif(\voter_{t}, \xyi) = \exp\left(-\usd \ln\left( \frac{1- \sumlimsm \wei_{t}(i) \dis_{t}[i]}{\sumlimsm \wei_{t}(i) \dis_{t}[i]}\right) \voter_{t}(x_i) y_i\right)$
\end{itemize}

%Then, in Algorithm \ref{supp:alg:adasmb}, we present Adaboost, written as an instance of the previously introduced skeleton.


\begin{algorithm}[t]
	\SetAlgoLined
	\textbf{Number of iterations} : $\T$ ; 
	\textbf{Train data} : $\train = \{(x_i, y_i) | i=1..\m \}$ ; \textbf{Voter space} : $\voters$\\
	$\wei_{1}(i) \leftarrow \frac{1}{m}$ \hfill {\scriptsize$\#$ Uniform distribution}\\
	\For{$t = 1 .. T$}{
		$\voter_t \leftarrow \argmaxlim{\voter \in \voters}\left[\mgs(\voter,\weiti)\right]$ \hfill {\scriptsize$\#$ Select the best voter on them}\\
		$\disti \leftarrow\one{\voter_t(x_i) = y_i}$ \hfill {\scriptsize$\#$ Compute its relevance on each sample}\\
		$\eF_{t} \leftarrow \eF_{t-1} \bigcup \left\{(\voter_t, \dis_{t,:})\right\}$  \hfill {\scriptsize$\#$ Store the voter and relevance}\\
        $\wei_{t+1}(i) \leftarrow \weiti \frac{\exp\left(-\usd \ln\left( \frac{1- \sumlimsm \weiti \disti}{\sumlimsm \weiti \disti}\right) \voter_{t}(x_i) y_i\right)}{\Z_{t}}$   \hfill {\scriptsize$\#$ Find the difficult samples}\\
	}
	\KwResult{$\F^{\eF_T}(.)$}
	\caption{Adaboost re-written as an instance of the skeleton in Algorithm \ref{supp:alg:skeleton}}
	\label{supp:alg:adasmb}
\end{algorithm}

For Adaboost, the decision function $\F^{\eF_T}(.)$ is computed as follows:
$$\F^{\eF_T}(x) = \sumlims{t=1}{T} \usd \ln\left( \frac{1- \sumlimsm \weiti \disti}{\sumlimsm \weiti \disti}\right) \voter_{t}(x)$$

This means that if we define $\hdis^\text{Ada}_{t}$ to be a constant coefficient over the whole space, $$\hdis^\text{Ada}_{t}(x) = \hdis^\text{Ada}_{t} =  \usd \ln\left( \frac{1- \sumlimsm \weiti \disti}{\sumlimsm \weiti \disti}\right),$$ we get $$\F^{\eF_T}(x) = \sumlims{t=1}{T} \hdis^\text{Ada}_{t} \voter_{t}(x).$$

As explained in the main paper and as explicitly shown here, this decision function uses for each voter a scalar that is computed on the whole dataset. The weight of each voter does not encapsulate local knowledge. 

\section{Pseudo Code for \algo}

In this section, we present the pseudo-code of \algo in Algorithm \ref{alg:samba}, based on the functions provided in the main paper. 

\begin{algorithm}[t]
\footnotesize
	\SetAlgoLined
	\textbf{Iterations} : $\T$ ; 
	\textbf{Train data} : $\train = \{(x_i, y_i)\}_{i=1}^m$ ; \textbf{Voter space} : $\voters$ (decision stumps). ; \textbf{Hyper-parameters}: $\ha, \hb$\\
	$\wei_{1}(i) \leftarrow \usm$  \\
	\For{$t = 1.. T$}{
		$\voter_t \leftarrow \argmaxlim{\voter \in \voters}\left[\mgs(\voter,\weiti)\right]$ \\
		$\disti \leftarrow \exp\left(\voter_t(x_i) y_i\right)$\\
		$\eF_{t} \leftarrow \eF_{t-1} \bigcup \left\{(\voter_t, \dis_{t})\right\}$  \hfill \\
        $\wei_{t+1}(i) \leftarrow \weiti * \frac{\exp\left(-\disti \voter_{t}(x_i) y_i\right)}{\Z_{t}}$ \\
	}
	$\dis \leftarrow \frac{\dis}{\sumlims{i,t}{} \disti}$ \\
	\KwResult{$\sumlims{t=1}{T}  \voter_t(.)  \left(\sumlims{i=1}{m} \frac{\disti m}{\ha^\hb + \dist(x_i, .)^\hb} \right)$}
	\caption{\algo, the empirically valid training algorithm based on the skeleton of Algorithm \ref{supp:alg:skeleton}.}
	\label{alg:samba}
\end{algorithm}

\section{Analysis of the Influence of the Hyperparameters on the Weight Estimation Function}

In this section, we aim at providing intuitive understanding on the influence of $\ha$ and $\hb$ on the weight estimation function of \algo. In Equation \ref{supp:eq:estim} we recall the way \algo approximates the weight for classifier $\voter_t$ when evaluating on a test sample $(x, y)$. 

\begin{equation}
\hdis_{\voter_t}^{\train}(x) := \begin{cases}\disti \text{ if }x = x_i \text{ and } \ha = 0,\\
\sumlims{i=1}{m} \frac{\disti m}{\ha^\hb + \dist(x_i, \hx)^\hb}\text{ else.}
\end{cases}
\label{supp:eq:estim}
\end{equation}

As one may understand when looking closely at this expression, the value $\hdis_{\voter_t}^{\train}(x)$ is highly influenced by the values of $\ha$ and $\hb$. 
In the following, we illustrate this on a toy example where $\X$ is the real line, and we explore the impact of those hyperparameters on the function $\hdis_{\voter_t}^{\train}$. The considered toy example is a one dimensional random dataset of $200$ samples, for which we provide the values of $\disti$. In this experiment, we consider that all the samples where $x \leq 0.5$ have a relevance of $0.44$ and the others of $0.006$. We will call this the \emph{pure} dataset. To understand the usefulness of $\ha$, we also generate a random dataset where three samples of each group are provided with the wrong relevance and we will call it the \emph{noisy} dataset. Since this is a toy example, the concept of noise can be debated, however, this is useful to mimic what happens in real life for outlier or mislabelled samples.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/weight_viz_1d_noisy_b1}
    \caption{Weight estimation functions on a one dimensional dataset. We plotted the relevance weight (ordinates) of a hypothetical classifier $\voter_t$ as a function of the sample values on the single feature (abscissa). The black dots represent the empirical weights learned by \algo. The straight green line represents the estimated weight if no local knowledge is used. The two red and blue curves represent \algo's weight estimation with two values of $\ha$. }
    \label{fig:non_noisy}
\end{figure}

Figure \ref{fig:non_noisy} highlights (on our example) the behavior of the weight estimation function of \algo, compared to the standard method, of discarding local knowledge. In the first row, we analyze the differences on a \textit{pure} dataset, where the relevance on the samples is well distributed, with all samples $x \leq 0.5$ voting for a high relevance for our imaginary classifier, and the others, voting for a low relevance. 

\subsection{Case where $\ha = 0$}

In this subsection, we only analyze the behavior of the blue curves, for $\ha=0$.
Even if this case is purely synthetic, it is interesting to note the impact that $\hb$ and $\ha$ have on the weight estimation. Indeed, note how the value of the weight estimation function decreases between all the high values of $\disti$ in the case where $\hb = 1.5$. This is because $\disf_t^{a, b}(x)$ takes into account even the vote of samples that are far:
 all these parabolas on the $\ha = 0$ curve (in blue) are due to the fact that all the samples $x_i \geq 0.5$ vote for a low relevance.

In the case where $\hb=4$, the distance is much more important, therefore the curve is much smoother, as the opinion of distant samples becomes negligible. 

In the case of a more noisy dataset, such as the lower graphs, we see that the fact that $\ha = 0$ is highly problematic, as it forces the curve $\omega_t^{a, b}(x)$ to reach each value of $\disti$. This case is typical of an over-fitting algorithm.

\subsection{Case where $\ha = 0.02$}

We now decipher the information provided by the red curves, for which $\ha = 0.02$.
Setting $\ha$ to anything other than $0$ on \textit{pure} datasets does not improve the classification; it might even lower the confidence of the classifier, as it smooths the curves when it is not needed.

However, in the case where there is some labelling noise, it is mandatory to avoid overfitting. Mathematically, $\ha$ can be interpreted as a distance lower threshold for which \algo considers that training samples too close have too much influence on $\disf_t^{a,b}(x)$. It is clear in the graph for $b=1.5$ that even with $\ha = 0.02$, \algo takes into account the fact that some samples disagree on the classifier relevance, but it does not greatly impact the  estimated relevance. 

By analyzing these graphs, we gave some insight on the impact of the hyper-parameters of \algo. The tool used to generate those figures is available with the code of \algo. We recommend exploring with it to better understand the inner workings of the algorithm and its hyperparameters.



\section{Theoretical Analysis : Proofs.}
In this section, we provide the proofs for the convergence and generalization guarantees provided in the main paper. 

\subsection{Convergence guarantee of Algorithm \ref{alg:samba}}

We start by recalling the convergence theorem, provided in the main paper.
\begin{theorem}[Error exponential decrease]
 We assume $\ha=0$. We denote $\epsilon_t$ the error of voter $\voter_t$ on the training set weighted by $\wei_{t}$. We write $\edge_t = \frac{1}{2} - \error_t$. We also consider $\wei_1$ an arbitrary distribution over the training set. Then, the weighted training error of the combined classifier outputted by Algorithm \ref{alg:samba}, with respect to $\wei_1$ is bounded by:
	$$\Probalim{i \sim \wei_1}{\sg\left(\predxi\right) \neq y_i} \leq \prodlims{t=1}{T} \left(1-\edge_t\right) \leq \exp\left(-\sumlims{t=1}{T}\edge_t\right)$$
    \label{th:converg}
\end{theorem}
\begin{proof}
    To prove such a result, we heavily rely on the methodology used in the error decrease proof of Adaboost, presented in \cite{adaboost}.
    
    First, recall that in \algo, for $a=0$, we have $\predxi = \sumlims{t=1}{T}\disti \voter_t(x_i)$. 
    
    

    We write $Z_t =\sumlimsm \exp\left(-y_i \voter_t(x_i) \disti \right) \weiti$, similarly to the boosting framework. Thus, if we rewrite the value of $\wei_{T+1}(i)$ at iteration $T+1$, for the sample $x_i$, we obtain that
    \begin{align*}
        \wei_{T+1}(i) 
        & = \wei_{T}(i) \frac{\exp\left(-\disti \voter_{T}(x_i) y_i\right)}{\Z_{T}}\\
        & = \wei_{1}(i) \frac{\exp\left(- \sum_{t = 1}^T \disti \voter_{t}(x_i) y_i\right)}{\prodlims{t=1}{T}\Z_t}\\
        & = \wei_{1}(i) \frac{\exp(-y_i \predxi)}{\prodlims{t=1}{T}\Z_t}
    \end{align*}
    which can be rewritten as
    \begin{equation}
    \label{eq:weights}
        \wei_{1}(i) \exp(-y_i \predxi) = \wei_{T+1}(i) \prodlims{t=1}{T}\Z_t.
    \end{equation}
 
	Moreover, $\one{\sg\left(\predxi\right) \neq y_i} \leq \exp\left(-y_i \predxi\right)$, therefore the (weighted) training error is given by:
	\begin{equation}
		\begin{split}
			\Probalim{i \sim \wei_{1}}{\predxi  y_i < 0 }&= \sumlimsm \wei_{1}(i) \one{\sg\left(\predxi \right) \neq y_i}\\
			& \leq \sumlimsm \wei_{1}(i) \exp\left(-y_i \predxi\right)\\
            & = \sumlimsm \wei_{T+1}(i) \left(\prodlims{t=1}{T}Z_t \right)  ~~~ \text{using Equation \ref{eq:weights}}\\
			& = \left( \prodlims{t=1}{T} \Z_t \right)\sumlimsm\wei_{T+1}(i) ~~~ \text{by factoring the term in parenthesis} \\
			& = \prodlims{t=1}{T} \Z_t ~~~\text{since $\wei_{T+1}$ sums to 1.}
		\end{split}
	\end{equation}
	
	Analyzing $\Z_t$, we have 
	\begin{equation}
		\begin{split}
			\Z_t &= \sumlimsm \exp\left(-y_i \voter_t(x_i) \disti \right) \weiti\\
			& = \sumlims{i:y_i \neq \voter_t(x_i)}{} \exp\left(\disti \right) \weiti + \sumlims{i:y_i = \voter_t(x_i)}{} \exp\left(-\disti \right) \weiti
		\end{split}
	\end{equation}
	Thanks to our definition of $\dis$, we have $\disti = \exp\left(\voter_t(x_i) y_i\right) \in \left[\frac{1}{e}, e\right]$\footnote{Note that in most cases, the voters $\voter_t$ take values in $\{-1, 1\}$. In that case, the following computation also holds, but the inequality becomes an equality.}.
	So, if we note $\epsilon_t$ the error of classifier $\voter_t$ on the training set, weighted by $\wei_{t,\cdot}$, we obtain the following.
	\begin{equation}
		\begin{split}
			\Z_t &= \sumlims{i:y_i \neq \voter_t(x_i)}{} \exp\left(\disti \right) \weiti + \sumlims{i:y_i = \voter_t(x_i)}{} \exp\left(-\disti \right) \weiti \\
			& \leq \sumlims{i:y_i \neq \voter_t(x_i)}{} \exp\left(\frac{1}{e} \right) \weiti + \sumlims{i:y_i = \voter_t(x_i)}{} \exp\left(-e \right) \weiti\\
			& = \exp\left(\frac{1}{e}\right) \error_t + \exp\left(-e\right) (1 - \error_t)\\
			&= \exp\left(\frac{1}{e}\right) \left(\frac{1}{2} - \edge_t\right) + \exp\left(-e\right) \left(\frac{1}{2} + \edge_t\right)\\
			&= \left[\exp\left(\frac{1}{e}\right) + \exp\left(-e\right)\right] \frac{1}{2} - \left[\exp\left(\frac{1}{e}\right) - \exp\left(-e\right)\right] \edge_t
		\end{split}
	\end{equation}
	We then use that
	\begin{itemize}
		\item $\exp\left(\frac{1}{e}\right) + \exp\left(-e\right) \simeq 1.51 <2$
		\item $\exp\left(\frac{1}{e}\right) - \exp\left(-e\right) \simeq 1.38 > 1$
	\end{itemize}
	
	and we obtain that $\Z_t \leq 1-\edge_t \leq \exp(-\edge_t)$.
	
	Therefore, we obtain $\Probalim{i \sim \wei_1}{\sg\left(\predxi\right) \neq y_i} \leq \prodlims{t=1}{T} 1-\edge_t \leq \exp(-\sumlims{t=1}{T}\edge_t)$
\end{proof}

\subsection{Generalization Guarantees}





The decision function of SamBA has been defined as follows:
\begin{equation}
\predx = \sumlims{t=1}{T}  \voter_t(\hx)  \left(\sumlims{i=1}{m} \frac{\disti m}{\ha^\hb + \dist(x_i, \hx)^\hb} \right)
\label{eq:pred}
\end{equation}

This decision function relies on the training set and a distance function to compute the weights of each classifier on an unknown test sample. 
We therefore decided to use the sample compress framework in order to provide generalization guarantees on such a decision function.


We introduce a new index $s$ that varies from $1$ to $T \times m$. We use this index $s$ to fuse both sums in Equation \ref{eq:pred} into a single sum by considering $t = \lceil\frac{s}{m}\rceil$ and $$i = \begin{cases}
s \bmod m \text{ if } s \bmod m \neq 0, \\
m \text{ else.}
\end{cases}$$

We can then rewrite Equation \ref{eq:pred} as follows.
%, considering that changing $\normd$ by $m$ does not modify the output of the classifier.

\begin{equation}
\predx = \sumlims{s=1}{T \times m}  \voter_s(\hx)  \frac{\dis_{s} m}{\ha^\hb + \dist(x_s, \hx)^\hb}
\end{equation}

Therefore, if we rearrange the terms to fit the framework of sample compress (SC) classifiers, we obtain the following.

\begin{equation}
\predx = \sumlims{s=1}{T \times m}  \dis_{s} \frac{ \voter_s(\hx) m}{\ha^\hb + \dist(x_s, \hx)^\hb}
\end{equation}
While the simple rearrangement of the terms $\dis_{s}$ and $\voter_s(\hx)$ may seem innocuous, it reveals a further change in perspective. Indeed, now $\dis_{s}$ is the distribution over the SC-classifiers, and $\frac{ \voter_s(\cdot) m}{\ha^\hb + \dist(x_s, \cdot)^\hb}$ are the SC-classifiers.
These classifiers only rely on a single training sample for the similarity measure. In addition, they rely on at most two samples to set the threshold for the stumps $\voter$.  Therefore, the compression size of these classifiers is at most 3. 



In the sample compress point of view, we consider that the classifiers are drawn from the set $$\voters_{\train, \lambda}^\mathcal{R} = \left\{\mathcal{R}(S_{\bm{i}}, \bm{\sigma}) : \bm{i} \in \mathcal{I}_\lambda, \bm{\sigma} \in \Sigma_{\lambda} \right\},$$ with $\mathcal{R}$ a reconstruction function, outputting sample-compressed classifiers from a compression sequence $S_{\bm{i}}$ and a message $\bm{\sigma}$ both of size $\lambda$. Therefore, the Gibbs and Bayes risks are defined as follows. 

\begin{definition}[Gibbs Risk]
    For any probability distribution $Q$ on a set of SC-voters, the Gibbs risk $R_\basedi(G_Q)$ of the Gibbs classifier $G_Q$ on distribution $\basedi$ is defined as follows: 
    \footnotesize
    $$R_\basedi(G_{Q, \train}) = \usd \left(1- \uesp{(x,y)\sim\basedi}\left[\uesp{(\bm{i}, \bm{\sigma}) \sim Q}\left(y\mathcal{R}(\train_{\bm{i}}, \bm{\sigma})(x)\right)\right]\right).$$
    \normalsize
    Its empirical version, on the dataset $\train$ is noted:
    \footnotesize
    $$R_\train(G_{Q, \train}) = \usd \left(1- \usm \sumlimsm\left[\uesp{(\bm{i}, \bm{\sigma}) \sim Q}\left(y_i\mathcal{R}(\train_{\bm{i}}, \bm{\sigma})(x_i)\right)\right]\right).$$
    \normalsize
\end{definition}

\begin{definition}[Theoretical Bayes Risk]
    For any probability distribution $Q$ on a set of SC-voters, the Bayes risk $R_\basedi(B_Q)$ of the majority vote classifier $B_Q$, relatively to $\basedi$ is defined as follows. 
    \footnotesize
    $$R_\basedi(B_{Q, \train}) = \uesp{(x,y)\sim\basedi}\left[I\left(\uesp{(\bm{i}, \bm{\sigma}) \sim Q}\left[y \sg\left(\mathcal{R}(\train_{\bm{i}}, \bm{\sigma})(x)\right)\right]<0\right)\right],$$
    \normalsize
    with $I(p) = 1$ if predicate $p$ is true, and $0$ else.
\end{definition}

Based on this point of view, we can apply the theorem provided in \cite{germain15a} to bound the generalization error of SamBA. 

% \includegraphics[width=\linewidth]{figures/th_39}
\begin{theorem}[Sample compress theorem from \cite{germain15a}]
    Let $\mathcal{R}$ be a reconstruction function that outputs SC-classifiers of size at most $\lambda$ (where $\lambda < m$). For any distribution $\basedi$ on $\X \times \{-1,1\}$, for any prior distribution $P$ on $\mathcal{I}_\lambda \times \Sigma_\lambda$, and any $\delta \in (0,1]$, we have:
    \begin{equation*}
	\uprob{\train \sim \basedi^m}\left(
	    \text{For all posteriors }Q, ~~
     R_\basedi(G_{Q,\train}) \leq R_\train(G_{Q,\train}) + \sqrt{\frac{1}{2(m - \lambda)}\left[KL(Q||P) + 4 \lambda + \ln\left(\frac{\xi(m-\lambda)}{\delta}\right)\right]}
	\right)\geq 1- \delta
 \end{equation*}
 with 
	\begin{equation}
	\sqrt{m} \leq \xi(m) := \sumlims{k=0}{m} {m \choose k} \left(\frac{k}{m}\right)^k \left(1- \frac{k}{m}\right)^{m-k} \leq 2 \sqrt{m}
	\end{equation}
 \label{th:germ}
\end{theorem}




% To apply this theorem, we need to provide the value for $\lambda$. 

\paragraph{Value for $\lambda$}
As our sc-classifiers are based on decision stumps, they need at most 2 samples to find a threshold, and potentially one more to compute $\dist(x_i, \hx)$. Therefore, their compression size is $ l \leq 3$. Thus, we set the $Q$-average to be $\lambda \leq 3$.


We then derive our bound directly from Theorem \ref{th:germ}, by simply applying the usual bound on the Gibbs risk: $R_\basedi(B_{Q,\train}) \leq 2 R_\basedi(G_{Q,\train})$ .

\begin{theorem}[SamBA's sample compress bound]
	For any distribution $\basedi$, any set of sc-classifiers of form $\frac{ \voter_s(\cdot) m}{\ha^\hb + \dist(x_s, \cdot)^\hb}$, any prior $\mathcal{P}$, and any $\delta \in (0, 1]$, we have, for $Q$ the distribution found by the sc-version of \algo,
	\footnotesize
    \begin{equation*}
    \uprob{\train \sim \basedi^m}\left(R_\basedi(B_{Q,\train}) \leq 2 R_\basedi(G_{Q,\train}) \leq 2\left(R_\train(G_{Q,\train}) + \sqrt{\frac{1}{2(m - 3)}\left[KL(Q||P) + 12 + \ln\left(\frac{2\sqrt{m-3}}{\delta}\right)\right]}\right) \right)\geq 1- \delta.    
    \end{equation*}
	\normalsize
 \label{th:proof_gen}
\end{theorem}

\subsection{A Version with Smaller KL-Divergence}

In the previous bound, we have no control on the value of the KL-divergence. Indeed, for each iteration of \algo, the distribution $Q$ is modified for $m$ sc-classifiers. This might be an issue for the KL-divergence, as it measures the difference between $\mathcal{P}$ and $Q$. Let us recall the definition of KL-divergence.

\begin{definition}[KL-Divergence]
The KL-divergence of two finite distributions $\mathcal{P} = \left\{p_s\right\}_{s=1}^{T \times m}$ and $Q = \left\{q_s\right\}_{s=1}^{T \times m}$ is defined as follows.
\begin{equation}
    KL(Q||\mathcal{P}) = \sumlims{s=1}{T\times m}q_s \ln\left(\frac{q_s}{p_s}\right)
\end{equation}
\end{definition}

From that definition, it follows that modifying $m$ values of $q_s$ at each iteration might be an issue as the KL-divergence may increase significantly in the end. To overcome this issue, we provide a new bound relying on a slight variation of \algo. That variation of \algo considers $K$ random samples during its learning process instead of all the $m$ available. Therefore, our sc-classifiers become the sum of the $K$ random samples taken into account by this new version of \algo. 
\begin{equation}
    \sumlims{k=1}{K}\frac{ \voter_t(\cdot) \wei_{t}[k] K}{\ha^\hb + \dist(x_k, \cdot)^\hb}    
\end{equation}

Their compression sequence is now of size $K+3$ which leads to the following modified bound.


\begin{theorem}[Modified sample compress bound]
	For any distribution $\basedi$, any set of sc-classifiers of the form $\sumlims{k=1}{K}\frac{ \voter_t(\cdot)\wei_{t}[k] K}{\ha^\hb + \dist(x_k, \cdot)^\hb}$, any prior $\mathcal{P}$, and any $\delta \in (0, 1]$, we have, for $Q$ the distribution found by the sc-version of \algo,
	\scriptsize
	\begin{align*}
	\uprob{\train \sim \basedi^m}\left(R_\basedi(B_{Q,\train}) \leq 2 R_\basedi(G_{Q,\train}) \leq 2\left(R_\train(G_{Q,\train}) + \sqrt{\frac{1}{2(m - K-3)}\left[KL(Q||P) + 4(K+3) + \ln\left(\frac{2\sqrt{m-K-3}}{\delta}\right)\right]}\right) \right)\geq 1- \delta
	\end{align*}
	\normalsize
\end{theorem}

The bound in this theorem is a slight modification of Theorem \ref{th:proof_gen}, but it allows us to control the KL-divergence, as it is now defined on distributions that weight the sc-voters $ \sumlims{k=1}{K}\frac{ \voter_t(\cdot) \wei_{t}[k]K}{\ha^\hb + \dist(x_k, \cdot)^\hb}$, which are $m$ times less numerous than in the previous bound of Theorem \ref{th:proof_gen}.

\section{Experimental Protocols}

In this section, we provide all the experimental details needed to reproduce the experiments.

All the baseline implementations have been imported from scikit-learn, except for XG-Boost, which has been imported from the Python library xgboost.

In the following section, we use the argument names from the libraries to name the hyper-parameters, to avoid any confusion.

\subsection{Time Consumption Experiment}

The time consumption experiment has been realized on synthetic datasets with every approach. As we work with ensemble methods, we had to set the number of base classifiers (or iterations) for each method. To do so, we used two strategies: one limiting all ensemble methods to 10 base classifiers, to compute their durations on an equivalent number of base classifiers, and one setting the number of base classifiers to the one outputted by the real-life experiments of Section 5.2.2, to take into account the fact that the methods have different levels of sparsity.

\subsubsection{Equal number of base classifiers}

In this experiment, presented in Supplementary Material E.1, we use 10 iterations or base classifiers for each ensemble method.
Each approach has been parameterized as follows:
\begin{itemize}
    \item Random Forest:
    \begin{itemize}
        \item n\_estimators: 10
        \item All the other parameters: default
    \end{itemize}
    \item KNN:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item SVM-RBF:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item Decision Tree:
    \begin{itemize}
        \item max\_depth: 10
    \end{itemize}
    \item Adaboost:
    \begin{itemize}
        \item n\_estimators: 10
        \item base\_estimator: decision tree of depth 1
        \item all the other parameters: default
    \end{itemize}
    \item \algo:
    \begin{itemize}
        \item n\_estimators: 10
        \item base\_estimator: decision tree of depth 1
        \item all the other parameters: default.
    \end{itemize}
\end{itemize}
    
\subsubsection{Different number of base classifiers}

In this experiment, presented in Section 5.1, we use a variable number of iterations or base classifiers for each ensemble method.
Each approach has been parameterized as follows:
\begin{itemize}
    \item Random Forest:
    \begin{itemize}
        \item n\_estimators: 127
        \item All the other parameters: default
    \end{itemize}
    \item KNN:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item SVM-RBF:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item Decision Tree:
    \begin{itemize}
        \item max\_depth: int(log2(17))
    \end{itemize}
    \item Adaboost:
    \begin{itemize}
        \item n\_estimators: 146
        \item base\_estimator: decision tree of depth 1
        \item all the other parameters: default
    \end{itemize}
    \item \algo:
    \begin{itemize}
        \item n\_estimators: 7
        \item base\_estimator: decision tree of depth 1
        \item all the other parameters: default.
    \end{itemize}
\end{itemize}
    

\subsection{Experiments on Generated Datasets}

This experiment was used to showcase the different models learned by the studied methods, highlighting the fact that SamBA is a hybrid method between ensemble methods and similarity-based ones.

\subsubsection{Dataset generation}

To generate the datasets, we used the make\_moons function of scikit-learn and a custom function to generate the spirals, available on GitHub. We used a random seed for reproducibility and used 1000 samples, uniformly distributed in two classes. The datasets were then split into a train and a test set, with 80\% train and 20\% test.

\subsubsection{Algorithm parametrization}

On those datasets, we mainly used preset hyper-parameters, as the goal of this study is simply to show the advantages and drawbacks of each approach on the synthetic dataset, and not to compare their classification relevance. Therefore, we parameterized the algorithms as follows:

\begin{itemize}
    \item Random Forest:
    \begin{itemize}
        \item All parameters: default
    \end{itemize}
    \item KNN:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item SVM-RBF:
    \begin{itemize}
        \item probability: True (to be able to generate the contour plots)
        \item C: 0.1
        \item gamma: 1.1
        \item All the other parameters: default
    \end{itemize}
    \item Decision Tree:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item Adaboost:
    \begin{itemize}
        \item All the parameters: default
    \end{itemize}
    \item \algo:
    \begin{itemize}
        \item n\_estimators: 4
        \item b: 20
        \item a: 1e-15
        \item All the other parameters: default.
    \end{itemize}
\end{itemize}

\subsection{Performance Study on Real-life Datasets}

For this performance study, we respected the most robust experimental standards in Machine Learning, to ensure that our results were trustworthy.

\subsubsection{General protocol}

For each of the presented datasets, we:
\begin{itemize}
    \item Split the dataset in 80\% train and 20\% test samples 10 times
    \item For each train/test split:
    \begin{itemize}
        \item For each algorithm:
        \begin{itemize}
            \item Optimized the hyper-parameters on the training set with random search (50 iterations) and 5-fold cross-validation,
            \item Learned on the full training set,
            \item Predicted on both the training and testing sets,
            \item Computed balanced accuracy on the test set.
        \end{itemize}
    \end{itemize}
\end{itemize}

This protocol ensures that the results are not based on a lucky train-test split and that the hyper-parameters were fairly optimized for each algorithm. As we use random search for our hyper-parameters, we provided distributions for each parameter to draw from.

\subsubsection{Hyper-parameter distributions}


\begin{itemize}
    \item Random Forest:
    \begin{itemize}
        \item n\_estimators: [1,300],
        \item max\_depth: [1,10],
        \item criterion: \{gini, entropy\}
        \item All the other parameters: default
    \end{itemize}
    \item KNN:
    \begin{itemize}
        \item n\_neighbors: [1,10],
        \item weights: \{uniform, distance\}
        \item algorithm: \{auto, ball\_tree, kd\_tree, brute\}
        \item p: \{1,2\}
        \item All the other parameters: default
    \end{itemize}
    \item SVM-RBF:
    \begin{itemize}
        \item C: $[10^{-3},10^{3}]$
        \item All the other parameters: default
    \end{itemize}
    \item Decision Tree:
    \begin{itemize}
        \item max\_depth: [1,300]
        \item criterion: \{gini, entropy\}
        \item splitter: \{best, random\}
        \item class\_weight: \{balanced, None\}
        \item All the other parameters: default
    \end{itemize}
    \item Adaboost:
    \begin{itemize}
        \item n\_estimators: [1,100]
        \item base\_estimator: Decision Tree of depth [1,3]
        \item class\_weight: \{balanced, None\}
        \item All the other parameters: default
    \end{itemize}
    \item \algo:
    \begin{itemize}
        \item n\_estimators: [1,70]
        \item  b: [0.1,6]
        \item a: [1e-10, 1]
        \item class\_weight: \{balanced, None\}
        \item All the other parameters: default.
    \end{itemize}
    \item Gradient Boosting:
    \begin{itemize}
        \item n\_estimators: [1,300],
        \item base\_estimator: Decision Tree of depth [1,10]
        \item loss: \{log\_loss, deviance, exponential\}
        \item learning\_rate: [0,1]
        \item All the other parameters: default.
    \end{itemize}
    \item XG-Boost:
    \begin{itemize}
        \item n\_estimators: [1,300],
        \item base\_estimator: Decision Tree of depth [1,10]
        \item objective: \{binary:logistic, binary:hinge\}
        \item learning\_rate: [0,1]
        \item All the other parameters: default.
    \end{itemize}
\end{itemize}

All the files used to build the graphs are available on GitHub, alongside a sklearn-compatible version of \algo that is installable.


\section{Additional Experimental Information}

\subsection{Computational Time Comparison}
In this experiment, we repeat the one presented in Section 5.1, with a slight modification. Indeed, we set the number of base estimators for each ensemble method to $10$. We therefore assess their efficiency in the hypothetical case that they are equivalently sparse.

In Figure~\ref{supp:fig:consump}, we present the results of this experiment. We see that they do not differ for the similarity-based methods. However, it is clear in this setup that \algo has been designed for fat datasets. Indeed, it takes much longer to predict than standard ensemble methods when the number of features is small. This result is to be nuanced by the fact that we plot with log scales, and therefore, the apparent difference is less than $0.74$ seconds for a dataset of $2000$ samples by $100$ features.

\begin{figure}[t]
	\centering
	\begin{subfigure}[b]{0.56\linewidth}
		\centering
		\includegraphics[width=\linewidth]{figures/train_duration_pred_nest}
		\caption{Log train duration for each algorithm}
		\label{supp:fig:consump:train}
	\end{subfigure}
	\begin{subfigure}[b]{0.40\linewidth}
		\centering
		\includegraphics[width=\linewidth]{figures/test_duration_pred_nest}
		\caption{Log test duration for each algorithm}
		\label{supp:fig:consump:test}
	\end{subfigure}
	\caption{Learning and predicting log duration comparison on two datasets: one with 500 samples, the other with 2000, each sample being described by an increasing number of features, ranging from 10 to 50k. The ensemble methods are limited to 10 base estimators, and the KNN to 5 neighbors.}
	\label{supp:fig:consump}
\end{figure}

\subsection{Metagenome Dataset Description}


The metagenome dataset\footnote{Note that this section does not provide an in-depth description of the biological study that led to the dataset, but provides insights for the reader to get a better understanding of the dataset.} presented in the main paper was first introduced in \cite{metagenomes}. It consists of 640 patients, divided into 221 males, 270 females and 149 of unknown gender. It labels the patients as \textit{obese} or \textit{not-obese}, based on their BMI. As in the general population, there is an imbalance in the dataset, with approximately $88\%$ of the dataset being labelled as obese and the remaining $12\%$ labelled as not obese. The samples of this dataset were selected from multiple countries, originating from four different studies, with no drug use nor additional medical condition.

From these patients, gut metagenome was extracted thanks to Whole Genome Sequencing (WGS). Then, the multiple data types were derived as follows.

\begin{itemize}
    \item \textit{ec}, \textit{go}, and \textit{cog} are extracted from the abundances of protein annotation features, respectively, their Enzyme Commission (EC) numbers, their Gene Ontology (GO) characterization, and their Clusters of Orthologous Groups (COG) association.
    \item \textit{kegg.module} and \textit{kegg.pathway} were extracted from the KEGG \citep{kegg} database.
    \item \textit{taxa.family}, \textit{taxa.phylum} and \textit{taxa.genus} are abundances of the standard phylogenetic groups in the gut metagenomes.
\end{itemize}

This diversity in the data acquisition leads to a large range of dimensions, allowing us to assess the relevance of our approaches on datasets with a wide range of dimensionalities. As this dataset in itself is a contribution too complex to describe here, for more information on its construction, we highly encourage the reader to read \cite{metagenomes}.

\subsection{Full Size Contour Plots}

The full-size contour plots are available in Figure~\ref{supp:fig:dsets}.

\begin{figure}[t]
 \newlength{\plotsize}
 \setlength{\plotsize}{0.33\linewidth}
	\centering
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_NeighborHoodClassifier}
		\caption{\scriptsize\algo - Moons}
		\label{supp:fig:dsets:mosa}
	\end{subfigure} 
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_SVC}
		\caption{\scriptsize SVM-RBF - Moons}
		\label{supp:fig:dsets:mosv}
	\end{subfigure}
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_AdaboostClassifier}
		\caption{\scriptsize Adaboost - Moons}
		\label{supp:fig:dsets:moad}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_KNeighborsClassifier}
		\caption{\scriptsize KNN - Moons}
		\label{supp:fig:dsets:mokn}
	\end{subfigure}
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_RandomForestClassifier}
		\caption{\scriptsize RF - Moons}
		\label{supp:fig:dsets:morf}
	\end{subfigure}
 \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_DecisionTreeClassifier}
		\caption{\scriptsize DT - Moons}
		\label{supp:fig:dsets:modt}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_NeighborHoodClassifier}
		\caption{\scriptsize \algo - Spirals}
		\label{supp:fig:dsets:spsa}
	\end{subfigure}
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_SVC}
		\caption{\scriptsize SVM-RBF - Spirals}
		\label{supp:fig:dsets:spsv}
	\end{subfigure}
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_AdaboostClassifier}
		\caption{\scriptsize Adaboost - Spirals}
		\label{supp:fig:dsets:spad}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_KNeighborsClassifier}
		\caption{\scriptsize KNN - Spirals}
		\label{supp:fig:dsets:spkn}
	\end{subfigure}
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_RandomForestClassifier}
		\caption{\scriptsize RF - Spirals}
		\label{supp:fig:dsets:sprf}
	\end{subfigure}
 \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_DecisionTreeClassifier}
		\caption{\scriptsize DT - Spirals}
		\label{supp:fig:dsets:spdt}
	\end{subfigure}\\
	\caption{Decision functions contour plots for the six considered algorithms, on the two \textit{pure} generated datasets. The small dots are train samples, the big ones test samples. The color represents the predicted class and its intensity, the certainty of the decision function on the 2D space.}
	\label{supp:fig:dsets}
\end{figure}

\begin{figure}[t]
\setlength{\plotsize}{0.33\linewidth}
	\centering
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_LassoClf}
		\caption{\scriptsize Lasso - Moons}
		\label{supp:fig:dsetsadd:mosa}
	\end{subfigure} 
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_LinearTreeClassifier}
		\caption{\scriptsize Linear Tree - Moons}
		\label{supp:fig:dsetsadd:mosv}
	\end{subfigure}\\
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_GradientBoostingClassifier}
		\caption{\scriptsize Gradient Boosting - Moons}
		\label{supp:fig:dsetsadd:mogb}
	\end{subfigure}
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Moons_XGBClassifier}
		\caption{\scriptsize XGBoost - Moons}
		\label{supp:fig:dsetsadd:morf}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_LassoClf}
		\caption{\scriptsize Lasso - Spirals}
		\label{supp:fig:dsetsadd:mokn}
	\end{subfigure}
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_LinearTreeClassifier}
		\caption{\scriptsize Linear Tree - Spirals}
		\label{supp:fig:dsetsadd:splt}
	\end{subfigure}\\
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_GradientBoostingClassifier}
		\caption{\scriptsize Gradient Boosting - Spirals}
		\label{supp:fig:dsetsadd:spgb}
	\end{subfigure}
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/Spirals_XGBClassifier}
		\caption{\scriptsize XGBoost - Spirals}
		\label{supp:fig:dsetsadd:spxg}
	\end{subfigure}
	\caption{Decision functions contour plots for the four additional algorithms, on the two \textit{pure} generated datasets. The small dots are train samples, the big ones test samples. The color represents the predicted class and its intensity, the certainty of the decision function on the 2D space.}
	\label{supp:fig:dsetsadd}
\end{figure}

\subsection{Statistical Results}
In Table~\ref{tab:my_label}, we present the balanced accuracies of Section 5.2.2, with their standard deviations.
\begin{table}[t]
    \resizebox{\linewidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|} \hline
                  Dataset &            \algo &         Adaboost &          XGBoost & Grad. Boost. &          SVM-RBF &              KNN &    Rand. Forest &    Dec. Tree &            Lasso \\ \hline
          cog \hfill (24) & .83 $\pm$ 0.06  & .77 $\pm$ 0.05  & .78 $\pm$ 0.06  & .76 $\pm$ 0.04  & .75 $\pm$ 0.05  & .77 $\pm$ 0.07  & .83 $\pm$ 0.04  & .72 $\pm$ 0.11  & .58 $\pm$ 0.05  \\ \hline
         ec \hfill (2736) & .84 $\pm$ 0.04  & .70 $\pm$ 0.05  & .70 $\pm$ 0.06  & .65 $\pm$ 0.06  & .74 $\pm$ 0.06  & .72 $\pm$ 0.05  & .84 $\pm$ 0.04  & .70 $\pm$ 0.1  & .65 $\pm$ 0.06  \\ \hline
        go \hfill (11946) & .85 $\pm$ 0.04  & .73 $\pm$ 0.07  & .76 $\pm$ 0.1  & .71 $\pm$ 0.08  & .62 $\pm$ 0.09  & .75 $\pm$ 0.06  & .86 $\pm$ 0.03  & .73 $\pm$ 0.09  & .67 $\pm$ 0.06  \\ \hline
 kegg.module \hfill (682) & .85 $\pm$ 0.05  & .70 $\pm$ 0.04  & .68 $\pm$ 0.06  & .69 $\pm$ 0.03  & .71 $\pm$ 0.03  & .70 $\pm$ 0.06  & .83 $\pm$ 0.04  & .72 $\pm$ 0.06  & .62 $\pm$ 0.06  \\ \hline
kegg.pathway \hfill (414) & .82 $\pm$ 0.03  & .67 $\pm$ 0.05  & .69 $\pm$ 0.08  & .67 $\pm$ 0.07  & .73 $\pm$ 0.06  & .73 $\pm$ 0.07  & .84 $\pm$ 0.03  & .69 $\pm$ 0.06  & .61 $\pm$ 0.08  \\ \hline
 taxa.family \hfill (101) & .82 $\pm$ 0.04  & .68 $\pm$ 0.05  & .66 $\pm$ 0.06  & .68 $\pm$ 0.08  & .65 $\pm$ 0.08  & .65 $\pm$ 0.04  & .82 $\pm$ 0.05  & .65 $\pm$ 0.06  & .61 $\pm$ 0.08  \\ \hline
  taxa.phylum \hfill (37) & .84 $\pm$ 0.04  & .70 $\pm$ 0.07  & .67 $\pm$ 0.1  & .66 $\pm$ 0.07  & .57 $\pm$ 0.1  & .63 $\pm$ 0.04  & .84 $\pm$ 0.03  & .74 $\pm$ 0.07  & .55 $\pm$ 0.06  \\ \hline
   taxa.genus \hfill (72) & .80 $\pm$ 0.06  & .63 $\pm$ 0.08  & .70 $\pm$ 0.06  & .70 $\pm$ 0.05  & .68 $\pm$ 0.05  & .68 $\pm$ 0.05  & .80 $\pm$ 0.06  & .73 $\pm$ 0.06  & .64 $\pm$ 0.06  \\ \hline
\end{tabular}
}
    \caption{Test balanced accuracies with standard deviations for the algorithms presented in Section 5.2.2.}
    \label{tab:my_label}
\end{table}


\subsection{Feature Efficiency Results}
In Figure~\ref{fig:eff}, we present the full study of the feature efficiency of \algo, compared to the other ensemble methods studied in this paper.
\begin{figure*}[t]
 \def\plotsize{0.45\linewidth}
 % \setlength{\plotsize}{}
	\centering
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_cog.pdf}
		\caption{\scriptsize cog}
		\label{fig:eff:mosa}
	\end{subfigure} 
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_ec.pdf}
		\caption{\scriptsize ec}
		\label{fig:eff:mosv}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_go.pdf}
		\caption{\scriptsize go}
		\label{fig:eff:moad}
	\end{subfigure}
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_kegg_module.pdf}
		\caption{\scriptsize kegg.module}
		\label{fig:eff:mokn}
	\end{subfigure}\\
    \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_kegg_pathway.pdf}
		\caption{\scriptsize kegg.pathway}
		\label{fig:eff:morf}
	\end{subfigure}
 \begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_taxa.family.pdf}
		\caption{\scriptsize taxa.family}
		\label{fig:eff:modt}
	\end{subfigure}\\
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_taxa.genus.pdf}
		\caption{\scriptsize taxa.genus}
		\label{fig:eff:spsa}
	\end{subfigure}
	\begin{subfigure}[b]{\plotsize}
		\centering
		\includegraphics[width=\linewidth]{figures/perf_and_feats_taxa.phylum.pdf}
		\caption{\scriptsize taxa.phylum}
		\label{fig:eff:spsv}
	\end{subfigure}\\
	\caption{Feature efficiency results for all the studied ensemble methods, on all the available data types of the \textit{metagenomes} dataset.}
	\label{fig:eff}
\end{figure*}

In Table~\ref{tab:feat}, we present the mean support size of each approach alongside its standard deviation.
\begin{table*}
	\resizebox{\linewidth}{!}{
 \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|} \hline
                  Dataset &            \algo &            Adaboost &            XGBoost &     Grad. Boost. & SVM-RBF & KNN &       Rand. Forest &     Dec. Tree &               Lasso \\ \hline
          cog \hfill (24) &  10.7 $\pm$ 2.0  &     9.4 $\pm$ 7.27  &    9.3 $\pm$ 8.49  &       17.4 $\pm$ 8.0  &     all & all &    21.3 $\pm$ 4.45  &  17.4 $\pm$ 6.64  &   12.75 $\pm$ 2.33  \\ \hline
         ec \hfill (2736) & 22.2 $\pm$ 4.79  &   58.6 $\pm$ 49.62  & 145.1 $\pm$ 75.24  &   963.8 $\pm$ 651.62  &     all & all &  137.3 $\pm$ 94.01  & 19.0 $\pm$ 13.44  & 265.6 $\pm$ 124.65  \\ \hline
        go \hfill (11946) & 21.5 $\pm$ 10.9  & 168.1 $\pm$ 290.98  &  62.7 $\pm$ 59.54  & 1394.4 $\pm$ 1707.67  &     all & all & 191.3 $\pm$ 127.98  &  11.3 $\pm$ 8.12  & 574.1 $\pm$ 644.79  \\ \hline
 kegg.module \hfill (682) &  20.1 $\pm$ 3.3  &   44.9 $\pm$ 25.83  &   94.4 $\pm$ 88.9  &   333.0 $\pm$ 200.31  &     all & all &  113.4 $\pm$ 67.39  & 16.2 $\pm$ 13.54  & 185.1 $\pm$ 140.51  \\ \hline
kegg.pathway \hfill (414) & 22.9 $\pm$ 3.24  &   87.6 $\pm$ 63.71  &  82.1 $\pm$ 49.52  &   208.0 $\pm$ 123.65  &     all & all &  186.3 $\pm$ 41.01  & 28.3 $\pm$ 17.27  &   73.2 $\pm$ 61.59  \\ \hline
 taxa.family \hfill (101) &  11.9 $\pm$ 1.7  &   48.3 $\pm$ 25.21  &  48.1 $\pm$ 20.21  &     79.4 $\pm$ 17.56  &     all & all &    85.9 $\pm$ 12.0  & 27.5 $\pm$ 10.47  &   57.4 $\pm$ 12.82  \\ \hline
  taxa.phylum \hfill (37) &  7.7 $\pm$ 3.72  &   22.4 $\pm$ 13.55  &   27.9 $\pm$ 6.07  &      35.7 $\pm$ 2.15  &     all & all &    32.4 $\pm$ 9.72  &  21.3 $\pm$ 9.43  &   18.9 $\pm$ 11.79  \\ \hline
   taxa.genus \hfill (72) & 14.8 $\pm$ 2.93  &   49.5 $\pm$ 25.03  &  42.4 $\pm$ 18.19  &     63.1 $\pm$ 10.87  &     all & all &    69.1 $\pm$ 6.52  & 19.1 $\pm$ 15.14  &   47.6 $\pm$ 12.15  \\ \hline
\end{tabular}
}
		\caption{Mean and standard deviation of the support size for each approach over 10 train/test splits.}
		\label{tab:feat}
	\end{table*}

\section{Interpretability Context}

Interpretability and explainability are notions that are currently at the center of a large number of debates. Therefore, any discussion about those concepts is very interesting. In this work, we mainly rely on the work of \cite{rudininterpret} and \cite{interpretableml}, and hence we consider that interpretability can be seen as a multi-dimensional space, containing approaches with varying:

\begin{itemize}
    \item \textbf{Sparsity}: The number of features on which the model relies is a mandatory criterion for the interpretation of its decision.
    \item \textbf{Decision simplicity}: The complexity of the decision function. Even if a decision function relies on, for example, four decision stumps, combining those with logical functions, or weighted majority votes spans a wide spectrum of different function complexities.
    \item \textbf{Learning transparency}: The learning process is similarly important in the interpretation of the decision function. Algorithms such as decision trees are easy to understand in essence, but understanding the Gini score requires a sound mathematical background.
\end{itemize}
    

Those characteristics are examples of features on which interpretability relies, and justify why we consider Random Forest (RF) and Boosting partially interpretable. Indeed, in our case, RF and Boosting have the great advantage of natively proposing quantifications of the importance of each feature that has been included in their algorithm. If one considers explainability as the range of post-hoc methods used to understand classifiers, Random Forests and Boosting do not require such methods.

Random forests are problematic in the sense that they output very dense decision functions. However, they have the great advantage of relying on a uniform majority vote, which is much simpler than the linear combination on which boosting relies.

In contrast, Boosting approaches are usually sparser than Random Forests. Therefore, in this paper, we consider that Boosting and Random Forests are both partially interpretable, as they provide better-than-post-hoc methods to understand their decision, but they still output either dense or mathematically complex decision functions. 

% \paragraph{Issue with the bound}
% The number of modified weights at the end of a greedy run is at most $T \times m$, therefore, the KL-Divergence might skyrocket.
% To avoid this issue we propose two workarounds : 
% \begin{itemize}
% 	\item Find a way to bound the KL, considering the fact that $\frac{ \voter_s(\cdot) m}{\ha^\hb + \dist(x_s, \cdot)^\hb}$ is very small if $ \dist(x_s, \cdot)$ is big, but it does not impact the KL as it does not change $\dis_{s}$.
% 	\item Consider that we use only the $k$ nearest classifiers with $k<<m$ during the learning process. This would mean that $\dis_{s}$ is only used for $k$, and limits the number of sc-classifiers to $T \times k$ 
% \end{itemize}

% \subsection{Considering sc-classifiers that are $K$-nn}

% The $T$ sc-classifiers are $\sumlims{k=1}{K}\dis_{s_k}\frac{ \voter_{s_k}(\cdot) m}{\ha^\hb + \dist(x_{s_k}, \cdot)^\hb}$, then we just combine them with a uniform distribution.  This barely modifies the KL-div

% \paragraph{The KL-div}
% $$KL(Q||P) = \uesp{f \sim Q}\ln\left(\frac{Q(f)}{P(f)}\right) $$

% In never ending paper :

% \includegraphics[width=0.8\linewidth]{figures/sc_sets}

% In our case, if we consider three samples in the compression sequence : $\mathcal{I_\lambda} = \left\{\langle i_1, i_2, i_3 \rangle \text{ s.t. } i_j \in \{1 .. m\}\}\right\}$ and $\Sigma_\lambda = \left\{-1, +1\right\}^\lambda$. 

% First issue, the message might be bigger than expected still constant so no big issue). 

% Second issue, if $\mathcal{P}$ is a distribution on $\mathcal{I_\lambda} \times \Sigma_\lambda$, then the uniform distribution is not just on the sc-classifiers chosen by \algo. Therefore, even if our distribution si uniform, it is on a subset of the classifiers of $\mathcal{I_\lambda} \times \Sigma_\lambda$ so $KL(\mathcal{Q}||\mathcal{P}) \neq 0$ 


% $$\hat{\hy} = \sumlims{t=1}{T}\sumlims{k=1}{K}\dis_{t, i_k}\frac{ \voter_{t}(x) m}{\ha^\hb + \dist(x_{i_k}, x)^\hb}$$


% \section{Kernel POV}

% During th training phase, \algo does not rely at all on the notion of distance. It is only introduced during the testing phase, to approximate the weight of each classifier on the test sample. 

% $$\hy = \predx = \sumlims{t=1}{T}  \voter_t(\hx)  \left(\sumlims{i=1}{m} \frac{\disti m}{\ha^\hb + \dist(x_i, \hx)^\hb} \right)$$

% So $\hdist(x) = \sumlims{i=1}{m} \frac{\disti}{\dist(x_i, \hx)^b}$ is the only usage of the distance in \algo. 

% This can be seen as applying a kernel that was learnt during the training phase to the test samples. 

% \textit{Note that we never write that $\kernel$ is a kernel, as we did not have time to prove it yet, but it really looks like one.}

% So if we define $$\kernel_{\supp(\pred)} (x_i, x) = \frac{\normd}{\ha^\hb + \dist(x_i, x)^\hb},$$ with $d$ the distance computed on the support $\supp(\pred)$ of the prediction function. Let us denote $\vker = [\kernel_{\supp(\F^{\eF_T}_b)}(x_:, x)]_{i=1}^{m}$ 

% We can rewrite $\hdis_{t}^{b}$ as $\hdis_{t}^{b} = \dis_{t, :} \cdot \vker$.

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

% \newpage

% \section{Generalization proof}




% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\bibliography{bauvin_496}











































\end{document}
