\section{Preliminaries} \label{sec:preliminaries}

\begin{lemma}[Chernoff Bounds] Let $X = \sum_{i=1}^{n} X_i$, where $X_i = 1$ with probability $p_i$ and $X_i = 0$ with probability $1 - p_i$, and all $X_i$ are independent. Let $\mu = \mathbb{E}(X) = \sum_{i=1}^{n} p_i$. Then (i) Lower Tail: $\Pr[X \leq (1 - \eta)\mu] \leq e^{-\eta^2\mu/2} \ \forall \ 0 < \eta < 1$, and (ii) Upper Tail: $\Pr[X \geq (1 + \eta)\mu] \leq e^{-\eta^2\mu/(2 + \eta)} \ \forall \ 0 \leq \eta$.
\label{lemma:chernoff_bounds}
\end{lemma}

\begin{lemma}[Littlewood-Offord-Erd\H{o}s Lemma~\cite{littlewood_offord_Erds1945OnAL}] Let $X_1, X_2, \dots , X_n$ be \emph{i.i.d.}\ $\{0, 1\}$-Bernoulli random variables with $\Pr[1] = 1/2$, and let $a_1, a_2, \dots , a_n \in \mathbb{R}$ s.t. $|a_i| \geq 1, \ \forall i \in[n]$. Then, there exists an absolute constant $C > 0$ such that
\begin{equation*}
	\underset{X_1, \dots, X_n}{\Pr} \left[ \left| \sum_{i \in [n]} a_i X_i + \theta \right| \leq 1 \right] \leq \frac{C}{\sqrt{n}}
\end{equation*}
for any constant $\theta$.
\label{lemma:littlewood_offord}
\end{lemma}

\begin{theorem}[Theorem 3.7 from \cite{anthony_bartlett_1999neural}] For a $\{0,1\}$-valued class $\mc{H}$ of functions with VC-dimension $\tn{VC-dim}(\mc{H}) = v$, let $\Pi_{\mc{H}}(n)$ denote the maximum number of possible $\{0,1\}$-labelings to any set of $n$ points from the domain of $\mc{H}$. If $n \leq v$, then $\Pi_{\mc{H}}(n) \leq 2^n$, and for $n > v$, $\Pi_{\mc{H}}(n) \leq \left(\frac{en}{v}\right)^v$. Refer to Section 3.3 of \cite{anthony_bartlett_1999neural} for more details on VC Dimension.
\label{theorem:vcdim_growth_function}
\end{theorem}

\subsection{Boosting meta algorithm for aggregate label setting}
\label{sec:preliminaries_boosting}
Given a collection of bags and aggregate labels, a prototypical boosting algorithm (given in Figure~\ref{algo:boosting}) in the aggregate label setting involves repeating certain steps over some number of rounds: in each round the training data is reweighted, and a weak classifier is computed for the reweighted data. The final output is some function over the ensemble of computed weak classifiers.
\begin{figure}[!htb]
\begin{mdframed}
\small
\textbf{Input:} $\mc{B} = (B_i, \bar{y_i})_{i=1}^m$: Collection of bags and aggregate labels, $D_1(i) = 1/m$: initial weight distribution associated with each bag, $T$: Number of steps of boosting.
\\ \textbf{1.} for  $t \in [T]$: \\
 \hspace*{2em} \textbf{1.1} Train a weak classifier $h_t: \bm{\mc{X}} \xrightarrow[]{} \{0, 1\}$ for the bag distribution $D_t$. \\
\hspace*{2em} \textbf{1.2} Using $\{h_r\}_{r=1}^t$, compute a new distribution $D_{t+1}$ over $\mc{B}$. \\ %
\\ \textbf{2.} For some $g$, output $h^* = g(h_1, \dots, h_T)$ as a (presumably) strong classifier for $\mc{B}$.  
\end{mdframed}
\caption{Boosting for aggregate label setting}\label{algo:boosting}
\end{figure}

Note that the bootstrap aggregation (bagging) ensemble method~\citep{surveyemsemble} can also be framed as a boosting algorithm. This is because the weak learners in bagging are trained in parallel using independent samples from the training data. This fits the iterative framework of boosting, where each iteration can be made independent of the rest, using an independent random sample of the training data which is a special reweighting of the dataset. Stacking ensemble methods~\citep{surveyemsemble}, which are more general than bagging as they allow heterogeneous parallel weak learners, also align with the boosting meta-algorithm. Therefore, the impossibility results apply to bagging and stacking as well. 




