\documentclass[accepted]{uai2025} %
                        

\usepackage[american]{babel}

\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{siunitx} %
\usepackage{booktabs} %
\usepackage{tikz} %


\usepackage{amsthm ,amsmath, amssymb, natbib, graphicx, url, algorithm2e}
\newtheorem{rdef}{Definition}
\newtheorem{rlemma}{Lemma}
\newtheorem{rthm}{Theorem}
\newenvironment{repdefinition}[1]
  {\renewcommand\therdef{\ref*{#1}}\rdef}
  {\endrdef}
\newenvironment{replemma}[1]
  {\renewcommand\therlemma{\ref*{#1}}\rlemma}
  {\endrlemma}
\newenvironment{reptheorem}[1]
  {\renewcommand\therthm{\ref*{#1}}\rthm}
  {\endrthm}


\usepackage{amssymb}
\usepackage{amsfonts,amsmath, amsthm}
\usepackage{mathtools,braket}
\usepackage{multirow, multicol}
\usepackage{thmtools, thm-restate}
\usepackage{algpseudocode}
\usepackage{subcaption}
\usepackage{soul}
\usepackage{varwidth}
\usepackage{url}

\usepackage{soul} %
\usepackage{mdframed}
\usepackage{footmisc}









\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{claim}{Claim}

\newtheorem*{theorem*}{Theorem}
\newtheorem*{proposition*}{Proposition}

\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}

\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem*{remark*}{Remark}

\usepackage{soul, cleveref}

\DeclareMathOperator*\Argmax{Argmax}
\DeclareMathOperator*\Argmin{Argmin}

\newcommand{\weight}{\mathcal{W}}
\newcommand{\uniform}{\mathcal{U}}
\newcommand{\graph}{\mathcal{G}}
\newcommand{\vertex}{\mathcal{V}}
\newcommand{\edge}{\mathcal{E}}
\newcommand{\domain}{\mathcal{X}}
\newcommand{\range}{\mathcal{Y}}
\newcommand{\dist}{\mathcal{P}}
\newcommand{\avgdist}{\mathcal{Q}}
\newcommand{\expectation}{\mathbb{E}}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\kthmoment}{\sigma^k_k}
\newcommand{\normal}{\mathcal{N}}
\newcommand{\clipnorm}{\rho}
\newcommand{\bias}{\beta}
\newcommand{\bernoulli}{\text{Ber}}
\newcommand{\Lap}{\text{Lap}}
\newcommand{\Var}{\text{Var}}
\newcommand{\Diag}{\text{Diag}}

\newcommand{\hth}{\hat{\theta}}
\newcommand{\tth}{\theta^{*}}
\newcommand{\bX}{X}
\newcommand{\bS}{S}
\newcommand{\eps}{\gamma}
\newcommand{\ty}{\Tilde{y}}
\newcommand{\tby}{\Tilde{y}}
\newcommand{\beps}{\epsilon}
\newcommand{\Iden}{\mathcal{I}}
\newcommand{\kmeansopt}{OPT}
\newcommand{\tmu}{\Tilde{\mu}}




\newcommand{\group}{{\mathcal{Z}}}
\newcommand{\labelset}{{\mathcal{Y}}}
\newcommand{\advantaged}{{A}}
\newcommand{\disadvantaged}{{D}}
\newcommand{\distribution}{{\mathcal{P}}}
\newcommand{\score}{{\mathcal{S}}}
\newcommand{\cell}{{\mathcal{C}}}
\newcommand{\classifier}{f}
\newcommand{\ex}{\mathbb E}
\newcommand{\E}{\mathbb E}
\newcommand{\repspace}{\mathcal{\Tilde{X}}}
\newcommand{\repfunc}{R}
\newcommand{\candidaterates}{\mathcal{I}}
\newcommand{\fDP}{f^{\text{r-DP}}}
\newcommand{\fFP}{f_r}
\newcommand{\fFN}{f^{\text{r-FN}}}
\newcommand{\fOpt}{f^{*}}
\newcommand{\fDPopt}{f^{\text{r'-DP}}}
\newcommand{\fFPopt}{f^{\text{r'-FP}}}
\newcommand{\fFNopt}{f^{\text{r'-FN}}}
\newcommand{\RDP}{R_{DP}}
\newcommand{\RFP}{R_{PE}}
\newcommand{\RFN}{R_{FN}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\positivityrate}{\pi}
\newcommand{\threshold}{\mathcal{T}}
\newcommand{\hypoclass}{\Tilde{\mathcal{H}}}
\newcommand{\hypoclassrep}{\Tilde{\mathcal{H}_R}}
\newcommand{\hypoclassdet}{\mathcal{H}}
\newcommand{\thresholdmass}{\Tilde{\mathcal{T}}}
\newcommand{\classifierDP}{f_r}
\newcommand{\accepted}{\mathcal{A}}
\newcommand{\rejected}{\mathcal{R}}
\newcommand{\costFR}{\text{CFR}}
\newcommand{\repfuncset}{R_{\text{fair}}}


\newcommand{\cqed}{\hfill$\lrcorner$}
\newcommand{\mdp}{{\textsc{GapDist}}}
\newcommand{\sncp}{{\textsc{GapSNC}}}
\newcommand{\odd}{{\textsc{GapOddSet}}}
\newcommand{\kvec}{\textsc{MLD}}
\newcommand{\gapvec}{\textsc{GapVectorSum}}
\newcommand{\csp}{\textsc{Gap2CSP}}
\newcommand{\kclique}{k\textsc{-clique}}
\newcommand{\kdense}{k\textsc{-Densest}}
\newcommand{\snvp}{\textsc{GapSNV}}
\newcommand{\svp}{GapSVP}
\newcommand{\den}{\textsc{Den}}
\newcommand{\bA}{{\mathbf{A}}}
\newcommand{\bB}{{\mathbf{B}}}
\newcommand{\bx}{{\mathbf{x}}}
\newcommand{\by}{{\mathbf{y}}}
\newcommand{\bL}{{\mathbf{L}}}
\newcommand{\bT}{{\mathbf{T}}}
\newcommand{\bs}{{\mathbf{s}}}
\newcommand{\bz}{{\mathbf{z}}}
\newcommand{\bp}{{\mathbf{p}}}
\newcommand{\bzero}{{\mathbf{0}}}
\newcommand{\uni}{\text{unique}}
\newcommand{\Id}{{\rm Id}}

\newcommand{\card}[1]{\ensuremath{|#1|}}
\DeclareMathOperator*{\val}{val}
\DeclareMathOperator*{\var}{var}
\DeclareMathOperator*{\cl}{cl}
\DeclareMathOperator*{\opt}{OPT}
\DeclareMathOperator*{\disa}{disagr}
\DeclareMathOperator*{\rdisa}{rdisagr}
\DeclareMathOperator*{\lp}{LP}
\DeclareMathOperator*{\lpc}{\mathcal{L}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\polyloglog}{polyloglog}
\DeclareMathOperator*{\supp}{supp}
\DeclareMathOperator{\sos}{sos}
\DeclareMathOperator{\cli}{CLIQUE}
\DeclareMathOperator*{\maj}{Majority}
\DeclareMathOperator*{\unsat}{UNSAT}
\DeclareMathOperator*{\EX}{\mathbb{E}}
\newcommand{\etal}{\text{et al.}}
\newcommand{\etals}{\text{et al. }}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\tcW}{\widehat{\cW}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cX}{{\rm{X}}} %
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\tcS}{\widetilde{\cS}}
\newcommand{\tcT}{\widetilde{\cT}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\tGamma}{\tilde{\Gamma}}
\newcommand{\tG}{\tilde{G}}
\newcommand{\tV}{\tilde{V}}
\newcommand{\tE}{\tilde{E}}
\newcommand{\tw}{\tilde{w}}
\newcommand{\tC}{\tilde{C}}
\newcommand{\tO}{\tilde{O}}
\newcommand{\tcF}{\tilde{\cF}}
\newcommand{\tSigma}{\tilde{\Sigma}}
\newcommand{\ttau}{\tilde{\tau}}
\newcommand{\tP}{\tilde{P}}
\newcommand{\tS}{\tilde{S}}
\newcommand{\tU}{\tilde{U}}
\newcommand{\tu}{\tilde{u}}
\newcommand{\tv}{\tilde{v}}
\newcommand{\tcP}{\tilde{\cP}}
\newcommand{\tPhi}{\widetilde{\Phi}}
\newcommand{\x}{\times}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\Fp}{\mathbb{F}_p}
\newcommand{\F}{\mathbb{F}_2}
\newcommand{\bbA}{\mathbb{A}}
\newcommand{\scrA}{\mathscr{A}}
\newcommand{\scrP}{\mathscr{P}}
\newcommand{\Rz}{\mathbb{R}_{\geq 0}}
\newcommand{\ind}{\mathds{1}}
\newcommand{\vone}{\mathbf{1}}
\newcommand{\cost}{\cC}
\newcommand{\geqs}{\geqslant}
\newcommand{\leqs}{\leqslant}
\newcommand{\ow}{\overline{w}}
\newcommand{\f}{\frac}
\newcommand{\reg}{\text{reg}}
\newcommand{\greedy}{\text{greedy}}
\renewcommand{\bar}{\overline}
\renewcommand{\epsilon}{\varepsilon}
\newcommand{\nullset}{\emptyset}
\newcommand{\DYES}{\mathcal{D}_{YES}}
\newcommand{\DNO}{\mathcal{D}_{NO}}
\newcommand{\YES}{{\sc YES}~}
\newcommand{\NO}{{\sc NO}~}
\newcommand{\loglog}{\log\log}
\newcommand{\psim}{\psi_{\text{maj}}}
\newcommand{\agr}{\text{agree}}
\newcommand{\napprox}{\not\approx}

\newcommand{\sa}[1]{{\color{blue} #1}}
\newcommand{\sushant}[1]{}
\newcommand{\appex}[1]{}
\newcommand{\rishi}[1]{{\color{red} #1}}
\newcommand{\yukti}[1]{{\color{cyan}Y: #1}}

\usepackage{amsmath,amsfonts,bm}

\newcommand{\swap}[3][-]{#3#1#2} %

\title{Aggregating Data for Optimal Learning}

\author[1]{\href{mailto:<agarwal.sus@northeastern.edu>?Subject=Your UAI 2025 paper}{Sushant~Agarwal\thanks{Work done during an internship at Google DeepMind}}}
\author[2]{Yukti~Makhija}
\author[2]{Rishi~Saket}
\author[2]{Aravindan~Raghuveer}
\affil[1]{%
    Northeastern University\\
    \href{mailto:agarwal.sus@northeastern.edu}{agarwal.sus@northeastern.edu}\\
}
\affil[2]{%
    Google DeepMind\\
    \href{mailto:rishisaket@google.com}{\{yuktimakhija, rishisaket, araghuveer\}@google.com}
}
  
  \begin{document}
\maketitle

\begin{abstract}
  Multiple Instance Regression (MIR) and Learning from Label Proportions (LLP) are useful learning frameworks, where the training data is partitioned into disjoint sets or \emph{bags}, and only an aggregate label, i.e., a \emph{bag-label}, for each bag is available to the learner. In the case of MIR, the bag-label is the label of an undisclosed instance from the bag, while in LLP, the bag-label is the mean of the bag's labels. In this paper, for various loss functions in MIR and LLP, we study the optimal way to partition the dataset into bags such that the utility for downstream tasks like linear regression is maximized. We theoretically provide utility guarantees, and show that in each case, the optimal bagging strategy (approximately) reduces to finding an optimal clustering of the feature vectors and/or the labels with respect to natural objectives such as $k$-means. We also show that our bagging mechanisms can be made \emph{label-differentially private}, incurring an additional utility error. We then generalize our results to the setting of Generalized Linear Models (GLMs). Finally, we experimentally validate our theoretical results.
\end{abstract}


\section{Introduction}








In traditional supervised learning, the training dataset is a set of $n$ tuples of the form $(\mathbf{x}, y)$, where $\mathbf{x}$ is an instance or feature-vector with label $y$ (denote the sets of instances and labels by $X$ and $Y$, respectively). The objective is to train a model on the training data $(X,Y)$ that predicts the labels of unseen test instances.
In this paper, we study the paradigm of \emph{learning from aggregate labels}, in which $X$ is partitioned into $m$ disjoint sets or \emph{bags}
of instances $B =\{B_1, \ldots, B_m\}$, and for each bag $B_l$ only one \emph{bag-label} ($\overline{y}_l$) is available to the learner. $\overline{y}_l$ is derived from the instance-labels present in the bag
via some aggregation function depending on the scenario. The goal, similar to standard supervised learning, is to train a model that predicts the labels of individual instances. This paradigm of learning from aggregate labels directly generalizes traditional supervised learning, the latter being the special case of unit-sized bags. The two formalizations of our focus are (i) Multiple Instance Regression (MIR), where $\overline{y}_l$ is one of the instance-labels of $B_l$\footnote{We consider the popular case where $\overline{y}_l$ is uniformly random.}, and the instance whose label is chosen as the bag-label is not revealed, and (ii) Learning from Label Proportions (LLP), in which $\overline{y}_l$ is the average of $B_l$'s instance-labels.

The MIR and LLP frameworks are becoming increasingly prevalent, and we briefly discuss two use cases (see Section \ref{sec:relatedwork} for a more detailed discussion). There are many practical scenarios (e.g., medical tests) in which labels are much more private than the features, and we wish to protect the privacy of individual labels from the learner (and any downstream observer of the learner's output). In the MIR and LLP setups, if the bags are of large size, revealing only the aggregate bag-label to the learner provides a layer of privacy protection for individual labels. Due to increasing concerns over data privacy, recent regulations on sharing user-level signals across platforms have led to aggregation of data, resulting in LLP and MIR formulations for predictive model training on revenue critical advertising datasets (e.g., Apple SKAN and Chrome Privacy Sandbox, see \cite{o2022challenges}).

In addition to privacy, in many applications, obtaining labeled data is very costly, but unlabeled data is relatively easy to acquire. This is especially relevant as training data is getting increasingly complex, and skilled human annotators are required for data-labeling, leading to semi-supervised learning settings \citep{van2020survey}. Given a large amount of unlabeled data, and a limited labeling budget, one could partition the data into bags, and query an annotator for the label of one of the instances in each bag. This setting naturally lends itself to the MIR formulation that we study. 

In some scenarios, the bags of instances may already be fixed, whereas in other scenarios like semi-supervised learning, there might be flexibility in curating the bags. We study the question of finding the \emph{optimal bagging strategy}, for the purpose of maximising utility of downstream tasks trained on these bags and corresponding bag-labels. We distinguish between baggings based on whether or not labels are available for constructing the bags. We call them (i) label-agnostic bagging, which occur in settings like semi-supervised learning, and (ii) label-dependent bagging, which occur naturally in privacy motivated scenarios. 



We consider a regression setting, where instances $\mathbf{x}$ lie in $\reals^d$, with labels $y \in \reals$. We adopt a standard way to model linear regression, where label $y_i = \bx_i^T \tth + \eps_i\,, \eps_i\sim\normal(0,\sigma^2)$, for a fixed underlying model $\tth$. Given the bags and corresponding bag-labels, the learner's task is to find an estimator $\hth$ with minimal estimation error, by minimizing some loss function. A common loss function is \emph{instance-level loss}, that basically assigns the aggregate label of the bag to each point in the bag. An estimator $\hth$ minimizes instance-level loss, if
\begin{equation}
    \label{eq:instanceloss}
        \hth
        :=
        \argmin_{\theta} \frac{1}{n} \sum_{l=1}^m \sum_{i\in B_l} \ell(\overline{y}_l,f_{\theta}(x_i))\,,
\end{equation}
where $\ell$ is the squared loss. Another popular loss function is \emph{bag-level loss}, which measures the mismatch between the bag-label and mean of the bag's instance level predictions. An estimator $\hth$ minimizes bag-level loss, if
\begin{equation}
    \label{eq:bagloss}
        \hth
        :=
        \argmin_{\theta} \frac{1}{m} \sum_{l=1}^m  \ell\left(\overline{y}_l, \frac{\sum_{i\in B_l} f_{\theta}(x_i)}{|B_l|}\right)\,.
\end{equation} We also consider \emph{aggregate-level loss}, which
penalises the difference between the bag-label and prediction of the mean of the bag instances. An estimator $\hth$ minimizes aggregate-level loss, if
\begin{equation}
    \label{eq:aggloss}
        \hth
        :=
        \argmin_{\theta} \frac{1}{m} \sum_{l=1}^m  \ell\left(\overline{y}_l,f_{\theta}\left(\frac{\sum_{i\in B_l} x_i}{|B_l|}\right)\right).\,
\end{equation}Given the learning setup (either MIR or LLP, and a loss function), the optimal bagging strategy involves finding the bagging configuration that maximizes the utility of $\hth$ trained using the loss function, with utility defined in terms of closeness to $\tth$. Note that each bag has size at least $k$ which is a fixed value.
\begin{remark*}
Given a dataset with a certain number of samples, the minimum bag size constraint implicitly upper bounds the number of bags or clusters. Moreover, since smaller bags lead to better utility (they provide more information about the labels), the number of bags in an optimal bagging is equal to this upper bound. The minimum bag size constraint is essential to define a meaningful problem, since otherwise the optimal bagging would be the trivial strategy of putting each point in a separate bag. At the same time, larger bags are more suitable in cases where MIR and LLP are deployed, such as privacy motivated and semi-supervised learning scenarios, since larger bags provide more privacy, and require fewer labels, respectively.
\end{remark*}




\subsection{Our Results}

We briefly summarize our contributions below.

\paragraph{1) Label-dependent Bagging:} Intuitively, a bagging provides good utility if the bags are \emph{homogeneous}, i.e., the instances and/or instance-labels within a bag are similar. We formalize this intuition below, and study the following learning setups.

 \textit{a) MIR, Instance-level loss:} By deriving a sharp upper bound on the estimation error in Theorem \ref{thm:MIR-event}, we show that finding the optimal bagging reduces to the following $k$-means clustering over the labels,
\begin{align}\label{scaled_kmeans}
    \min_{\mathcal{B}}\sum_{l=1}^m\sum_{\ty_i\in B_l} (\ty_i - \mu_l)^2,\text{ with }|B_l| \geq  k, \forall l \in [m] 
\end{align}
where $\mu_l$ is the mean of the labels in $B_l$, $\ty_i$ denotes the expected value of the label of $x_i$, i.e.,  $\ty_i:= \bx_i^T \tth$, and $\mathcal{B}$ denotes the set of all baggings of the~$n$ samples. This is just a (size-constrained) $k$-means clustering of $\ty$\footnote{$\ty$ is unavailable, but one can instead use $y$ as a proxy, leading to an additional utility error of $n\left(1-\frac{1}{k}\right)\sigma^2$, see Lemma \ref{noisyclustering}.}, and intuitively creates bags that are homogeneous w.r.t. labels. The $1d$ clustering problem above can be solved exactly in polynomial time, and turns out to result in a bagging that just sorts the labels in order, and partitions contiguous segments into bags (see Lemma \ref{l1}).

\textit{b) LLP, Bag-level loss:} By deriving an upper bound on the error in Theorem \ref{thm:LLP-bag-UB1}, we show that finding the optimal bagging reduces to the following optimization problem.
\begin{align}\label{condition}
    &\min_{\mathcal{B}} \frac{\lambda_{max}(f(X))}{\lambda_{min}(f(X))},\text{ subject to } |B_l| =  k, \forall l \in [m], 
\end{align}
where $\lambda_{max}/\lambda_{min}$ denote the maximum/minimum eigenvalues of a matrix, and $f(X) = g(X)g(X)^T$, for
$    g(X) = \left[\left(\frac{\sum_{i\in B_1} x_i}{|B_1|}\right), \dots, \left(\frac{\sum_{i\in B_m} x_i}{|B_m|}\right)\right].$ Essentially, $f(X)$ is the (sample) covariance matrix of each bag's instance-mean. The optimal bagging strategy involves minimizing the condition number ($\lambda_{max}/\lambda_{min}$ ratio) of $f(X)$, and intuitively creates bags that are homogeneous w.r.t. instances. The above discusses equal sized bags, and in Theorem \ref{Bag-LLP-full} we show a corresponding result without the equality constraint.

\textit{c) MIR, Aggregate-level loss:} As seen from the error bound in Theorem \ref{thm:MIR-agg-UB1}, the optimal bagging strategy here involves simultaneously minimizing the condition number of $f(X)$, and minimizing the $k$-means clustering objective of $\ty$, intuitively creating bags that are homogeneous w.r.t. both instances and their labels.


\paragraph{2) Label-agnostic Bagging:} As seen above, a good bagging has bags that are homogeneous w.r.t. instances and/or labels. A label-agnostic bagging can create baggings that are homogeneous w.r.t. instances, but is not able to directly optimize for homogeneity w.r.t. labels. We consider the following 2 label-agnostic bagging strategies.


\textit{a) Instance $k$-means} We justify that the optimal $k$-means clustering of the instances $X$ is an effective label-agnostic bagging strategy for each learning setup we consider. In Instance-MIR, the optimal strategy is a $k$-means clustering of the labels $Y$. We use the fact that $\tilde{Y} =X\tth$ to justify that $k$-means of the instances $X$ is a good heuristic for $k$-means of the labels $Y$ (see Section \ref{instance_k-means}). In the case of Bag-LLP, the optimal bagging strategy does not involve knowledge of the labels, and minimizes the condition number of the sample covariance matrix of the instance-means of each bag. An eigenvalue of a covariance matrix measures the variance along the corresponding eigenvector. In order to minimize the condition number, we intuitively maximize variance in every direction. We show that maximizing the variance of bag-centroids along a direction is equivalent to finding an optimal $k$-means on $X$ projected on that direction. Hence, we want to reduce the $k$-means objective along every direction, and we justify that $k$-means of $X$ is a good heuristic for the same. For Aggregate-MIR, we must simultaneously minimize the condition number of $f(X)$, and the $k$-means objective over the labels $\ty$, and $k$-means of  $X$ is a good heuristic for both objectives.

   
\textit{b) Random bagging} We analyse random bagging in Section \ref{random-bagging}. Random bagging serves as a good baseline to compare our proposed bagging strategies with, and has been experimentally evaluated in many previous works \citep{LWQTS19, YLKJC13}. In addition, unlike data-dependent bagging strategies, random bagging leaks no information about the data, so it can be useful in privacy-motivated deployments, such as in online advertising, where incoming user interactions can be partitioned into random bags [Section 2.1 of \citet{o2022challenges}]. We upper bound the error of random bagging in both Bag-LLP and Aggregate-MIR. Since bounding the condition number term in Equation \eqref{condition} as a whole is challenging, we provide an upper bound for $\lambda_{\text{max}}$, and a lower bound for $\lambda_{\text{min}}$. As shown in Lemma \ref{lem:lambda_max_bound} via an application of Cauchy-Schwarz, aggregating feature vectors does not increase $\lambda_{\text{max}}$. For lower bounding $\lambda_{\text{min}}$ we consider a partitioning strategy where the instances are randomly divided into $2k$-sized \textit{super-bags}. Independently from each super-bag, one $k$-sized bag is sampled, resulting in a collection of $m/2$ bags which are distributed identically to a random collection of $m/2$ disjoint bags and therefore a lower bound on $\lambda_{\text{min}}$ for these bags is sufficient. Observing that bags are independent in this collection (after fixing the super-bags), we compute $\mu_{\text{min}}$, which is the minimum eigenvalue of the corresponding expected matrix for these bags, and use Matrix Chernoff to find a high probability lower bound for $\lambda_{\text{min}}$, as stated in Lemma \ref{lem:lambda_min_bound}.


\paragraph{3) Privacy} Apart from the inherent privacy that MIR and LLP offer, we can perturb the labels to obtain formal privacy guarantees in the sense of \emph{label differential privacy}, a popular notion of privacy that measures and prevents the leakage of label information \citep{chaudhuri2011sample}. This incurs an additional utility error, that we formally quantify in Section \ref{privacy}. A larger minimum bag-size $k$ intuitively provides more privacy, and as expected, the error increases with a decrease in $k$. 

\paragraph{4) GLMs} Subsequently, in Appendix \ref{GLM}, we generalize the previous results for linear regression to the setting of Generalized Linear Models (GLMs), which includes popular paradigms such as logistic regression. We study both instance-level and aggregate-level losses for MIR under the GLM framework. For Instance-MIR, we derive an upper bound that leads to label $k$-means clustering as the optimal bagging strategy. This holds across all distributions within the exponential family. For Aggregate-MIR, our objective suggests minimizing the difference between the maximum and minimum instance-labels within a bag, implying that features with similar labels should be grouped together, yielding a clustering-based objective. This holds for exponential distributions which have a monotonic first derivative.


\paragraph{5) Experiments} To corroborate our theoretical results, we study the proposed bagging mechanisms through extensive experimentation in Section \ref{experiments}, and demonstrate their effectiveness on each learning setup we consider. We analyse trends obtained by varying various parameters such as the minimum bag size, and privacy budget.

\subsection{Related Work}\label{sec:relatedwork}

LLP started with the work of \cite{FK05} and has been studied in the context of privacy concerns~\citep{R10}, lack of supervision due to cost~\citep{CHR}, or coarse instrumentation~\citep{DNRS}. While previous works \citep{QSCL09,YLKJC13,KDFS15,LWQTS19,SZ20,SRR} have developed specialized techniques for model training on LLP training data, \cite{YCKJC14} defined it in the PAC framework, while \cite{Saket21,Saket22} have shown worst case algorithmic and hardness bounds, and recently \cite{brahmbhatt2023pac} gave PAC learning algorithms for Gaussian feature vectors and random bags. 

MIR, introduced in \cite{RP01}, has mostly been studied in applied settings related to remote sensing and image analysis. Popular baseline techniques apply Aggregate-MIR, or Instance-MIR \citep{WRHOV08,RC05}, whereas several expectation-maximization (EM) based methods have also been proposed \citep{RP01,WRHOV08,WLV7,WLR08,TF18}. Recent work of \cite{KSABGR} proved bag-to-instance generalization error bounds as well as hardness results for MIR, in the first theoretical exploration of this problem. 

Both the above problems, LLP and MIR, have gained renewed interest due to recent restrictions on user data on advertising platforms leading to aggregate conversion labels in reporting systems \citep{o2022challenges}. With the goal of preserving the utility of models trained on the aggregate labels, model training techniques for either randomly sampled~\citep{busafekete2023easy} or curated bags~\citep{chen2023learning, javanmard2024priorboostadaptivealgorithmlearning} have been proposed. 







\paragraph{Comparison with \cite{javanmard2024priorboostadaptivealgorithmlearning}:} The case of instance-level loss for LLP has been studied in \cite{javanmard2024priorboostadaptivealgorithmlearning}, where they show that the optimal bagging strategy reduces to finding the best $k$-means clustering of the labels, very similar to our Instance-MIR objective. This is not very surprising, as LLP and MIR are closely related. Indeed, the expected label of each bag in the MIR setup is exactly the label of the bag in the LLP case. Our focus is on MIR, which has not been studied before, and in addition we analyse the popular bag-level loss \citep{ArdehalyC17} for LLP. They provide an adaptive label-agnostic bagging heuristic, which assumes access to an oracle that provides bag-labels in an online setting. Our work provides a label-agnostic bagging algorithm in each case, without assuming access to an online oracle. We provide formal privacy guarantees for each of our methods. They also discuss privacy guarantees for their heuristic algorithm; however, their approach does not provide formal privacy guarantees for label-dependent bagging, which we circumvent by using a private clustering algorithm.




\section{Label-dependent Bagging}\label{results}

We assume that $X$ has rank $d$, and all expectations henceforth are conditioned on a fixed $X$, unless otherwise stated. The results below provide upper bounds on the error of the estimator $\hth$, in terms of a bagging $B$.  Most proofs are deferred to Appendix \ref{proofs}.

\begin{theorem}[Error Upper Bound, Instance-MIR]\label{thm:MIR-event}
For $\hth$ as in \eqref{eq:instanceloss}, for a given bagging $B$,
\begin{align}\label{eq:MIR-event-UB1}
    \E\left[ ||\hth-\tth||_2^2 \right] \le 
    \ C_1
      \left ( C_2 - \sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|} \right ),
\end{align}
where constants $C_1, C_2$ are independent of $B$.
\end{theorem}
In Lemma \ref{l1} in Appendix \ref{appendix:proofs}, we show that finding the optimal $k$-means clustering of the (expected) labels $\ty$ exactly maximizes $\sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|}$. Hence, minimizing the bound in \eqref{eq:MIR-event-UB1} over the set of all baggings
amounts to the $k$-means optimization problem in \eqref{scaled_kmeans}.

\begin{theorem}[Error Upper Bound, Bag-LLP]\label{thm:LLP-bag-UB1}
For $\hth$ as in \eqref{eq:bagloss}, for a given bagging $B$ $~\text{such that }  |B_l| = k, \forall l \in [m]$,
\begin{align}\label{eq:LLP-bag-UB}
    \E\left[ \|\hth-\tth\|_2^2  \right] \leq 
    \sigma^2 \frac{m}{k} \left(\frac{\lambda_{max}(f(X))}{\lambda_{min}(f(X))}\right)^2.
\end{align}
\end{theorem}

Minimizing the bound in \eqref{eq:LLP-bag-UB} over the set of all baggings
amounts to the optimization problem in \eqref{condition}. Theorem \ref{thm:LLP-bag-UB1} is for equal sized bags, and we also show a corresponding result without the equality constraint in Theorem \ref{Bag-LLP-full}.


\begin{theorem}[Error Upper Bound, Aggregate-MIR]\label{thm:MIR-agg-UB1}
For $\hth$ in \eqref{eq:aggloss}, given a bagging $B$ $~\text{such that }  |B_l| = k, \forall l \in [m]$, 
\begin{align}\label{eq:MIR-agg-UB1}
    &\E\left[ \|\hth-\tth\|_2^2  \right] \leq \nonumber \\
    &C_1\left(\frac{\lambda_{max}(f(X))}{\lambda_{min}(f(X))}\right)^2 \left(C_2  +  \sum_{l=1}^m \sum_{\ty_i\in B_l} (\ty_i - \mu_l)^2 \right)
\end{align}
where constants $C_1, C_2$ are independent of $B$.

\end{theorem}

Minimizing the first term in \eqref{eq:MIR-agg-UB1} corresponds to minimizing the condition number of $f(X)$, and minimizing the second term corresponds to finding the optimal $k$-means clustering of $\ty$. Theorem \ref{thm:MIR-agg-UB1} is for equal sized bags, and we also show a corresponding result without the equality constraint in Theorem \ref{Aggregate-MIR-full}.

\section{Label-agnostic Bagging}

\subsection{Instance \texorpdfstring{$k$}{k}-means}\label{instance_k-means}
We justify that $k$-means of the instances $X$ is an effective label-agnostic bagging heuristic for each setting we consider.

\paragraph{Instance-MIR} Note that in our setting of linear regression, $\tilde{Y} =X\tth$. In other words, $\tilde{Y}$ is just the projection of $X$ along the axis normal to the hyperplane determined by $\tth$. Hence, finding an optimal $k$-means clustering of $\tilde{Y}$ is equivalent to minimizing the $k$-means objective of projections along this axis. However, the labels are not given, and this axis is unknown, since $\tth$ is unknown. Hence, in order to do a label-agnostic bagging, one must minimize some objective that simultaneously reduces the $k$-means objective along every direction. In Lemma \ref{mir-k-means}, we show that for a given clustering, the $k$-means objective of a dataset is the sum of $k$-means objective of the dataset projected along each coordinate. 
Given an arbitrary clustering $C$ over $X$ drawn from an isotropic distribution $D$, in expectation the $k$-means clustering objective over $X$ will split equally into $d$ components along each axis (due to symmetry), i.e.,
\begin{equation*}
\expectation[ \text{$k$-means}(C(X_{i}))] = \frac{1}{d}\expectation \left[ \text{$k$-means}(C(X))\right], \forall i,
\end{equation*}
where the expectation is over $X$ drawn from $D$. Hence, for an isotropic distribution $D$, we would expect the $k$-means clustering objective along each direction to be roughly equal. Consequently, we would also expect that setting $C$ to be the optimal $k$-means clustering over $X$ would simultaneously keep the $k$-means clustering objective low along each direction.

However, the above reasoning holds only for an isotropic distribution. For a non-isotropic distribution, directions with large variance will dominate the $k$-means objective, and therefore directions with small variance might then have a relatively large $k$-means objective. For an isotropic distribution, we avoid the above problem of directions with large variance dominating. However, note that even for a non-isotropic distribution, $\Sigma^{-\frac{1}{2}}X$ is isotropic, where $\Sigma$ is the covariance matrix of the distribution. Essentially, we stretch each direction so that each direction has the same variance. We can now find an optimal $k$-means clustering over $\Sigma^{-\frac{1}{2}}X$. We will then avoid the problem of directions in $X$ with large variance dominating, while also keeping the $k$-means objective along each direction low. 

\paragraph{Bag-LLP} We want to minimize the condition number of $f(X)$. $\lambda_{\max}/\lambda_{\min}$ of a covariance matrix measures the variance along the direction of most/least variance. In Lemma \ref{llp-k-means}, we show that maximizing the variance of the bags' instance-centroids along a direction is equivalent to finding an optimal $k$-means on $X$ projected on that direction. Since we want to minimize the condition number of $f(X)$, we want the variance to be roughly balanced across all directions. Hence, we must simultaneously reduce the $k$-means objective along every direction, and in the preceding discussion of Instance-MIR, we justified that $k$-means of the instances $X$ is an effective heuristic for this.



\paragraph{Aggregate-MIR} Note that in order to minimize the error bound, we must simultaneously minimize the condition number of $f(X),$ and the $k$-means objective over the labels $\tilde{Y}$. Earlier, we justified that $k$-means of the instances $X$ is a good heuristic for both objectives.

\subsection{Random bagging}\label{random-bagging}
We first state the Matrix Chernoff bound, which we use heavily in this section.
\begin{lemma}[Matrix Chernoff (Corollary 5.2 \citep{tropp2012user})]
\label{lem:matrix_chernoff}
Consider a finite sequence $\{ X_k \}$ of independent, random, self-adjoint matrices that satisfy
$
X_k \succeq 0$
and
$\lambda_{\max}(X_k) \leq R$ almost surely. Compute the minimum eigenvalue of the sum of expectations,
$
\mu_{\min} := \lambda_{\min}\left( \sum\nolimits_{k} \E X_k \right).
$
Then, for $\delta \in [0, 1]$
\begin{align*}
&\prob \left[ \lambda_{\min}\left(\sum\nolimits_k X_k \right) \leq (1 - \delta) \mu_{\min} \right]
	\leq \\ &d \cdot \left[ \frac{e^{-\delta}}{(1 - \delta)^{1-\delta}} \right]^{\mu_{\min}/R}.
\end{align*}

\end{lemma}

\paragraph{Bag-LLP}


We prove the following bound.

\begin{theorem}[Random Bagging Upper Bound, Bag-LLP]\label{thm:LLP-bag-matrix-chernoff}
For $\hth$ as in \eqref{eq:bagloss} and random bagging given by random partitioning into $k$-sized bags, %
\begin{align}
     \E\left[ \|\hth-\tth\|_2^2  \right] \leq 
    \frac{16\sigma^2nk^2}{(1-\delta)^2} \left( \frac{\lambda_{max}(X^TX)}{\lambda_{min}(X^TX)}\right)^2 \nonumber
\end{align}
with probability greater than $1 - d \cdot \left[ \frac{e^{-\delta}}{(1 - \delta)^{1-\delta}} \right]^{\frac{\mu_{\min}}{k\beta}}$.
\end{theorem}

\begin{proof}
The proof follows from Theorem \ref{thm:LLP-bag-UB1} and Lemmas \ref{lem:lambda_max_bound} and \ref{lem:lambda_min_bound}.
\end{proof}
\begin{lemma}[$\lambda_{max}$ Upper Bound]
\label{lem:lambda_max_bound}
\begin{equation*}
    \lambda_{max}\left(f(X)\right) \leq \lambda_{max}(X^TX).
\end{equation*}
\end{lemma}
\begin{lemma}[$\lambda_{min}$ Lower Bound]
\label{lem:lambda_min_bound}
\begin{align*}
    &\prob \left[ \lambda_{min}\left(f(X)\right) > (1 - \delta) \frac{\lambda_{min}(X^TX)}{4k^2} \right] 
	\geq \\
	&1 - d \cdot \left[ \frac{e^{-\delta}}{(1 - \delta)^{1-\delta}} \right]^{\frac{\mu_{\min}}{k\beta}}. 
\end{align*}

\end{lemma}

\begin{proof}
Let $X_l$ represent the feature matrices of $B_l$ for $l \in [m]$. We consider the randomized Algorithm \ref{algo:random_bag_eigenvalue} which outputs a collection of $m/2$ disjoint bags which are distributed identically to a random subset of $m/2$ disjoint bags, and thus a lower bound for this collection suffices. We have, $\lambda_{min}\left(f(X)\right) =  \frac{1}{k^2} \lambda_{min}\left(\sum_{l=1}^{m}X_l^TX_l\right)$.
\begin{figure}[!htb]
\begin{mdframed}
\small
\textbf{Algorithm 1: Random Bagging, Bag-LLP}\\
\textbf{Input:} Instances $\mathcal{X}$, fixed bag size $k$. \\
\textbf{Steps:}
\begin{enumerate}
    \item Randomly partition $\mathcal{X}$ into $r$ $2k$-sized \emph{super}-bags, where $r = n/2k.$
    \begin{align*}
    \mathcal{X} = \cup_{l = 1}^{r} \mathcal{X}_l \text{  and  } \mathcal{X}_l \cap \mathcal{X}_{l'} = \emptyset \text{  for all } l \neq l'%
    \end{align*}
    \item For $l = 1, \dots, r$, a $k$-sized bag $B'_l$ is sampled  $u.a.r$ from $\mathcal{X}_l$.
    \item Output $\mathcal{B'}$ where $\mathcal{B'} = \{ B'_l \}_{l \in [r]}$
\end{enumerate}
\end{mdframed}
\caption{Algorithm 1: Random Bagging, Bag-LLP}\label{algo:random_bag_eigenvalue}
\end{figure}
The feature matrix for bag $B'_l$ sampled using Algorithm \ref{algo:random_bag_eigenvalue} can be represented by $X'_l$ for all $l \in [r]$. 
\begin{equation}
    \frac{1}{k^2} \lambda_{min}\left(\sum_{l=1}^{m}X_l^TX_l\right)  \geq \frac{1}{k^2} \lambda_{min}\left(\sum_{l=1}^{r}{X'_l}^TX'_l\right)
    \label{eq:bound_covariance_x'}
\end{equation}
Let $\mu_{min} = \lambda_{min}\left(\sum_{l=1}^{r}\E\left[{X'_l}^TX'_l\right]\right)/k^2$. We expand ${X'_l}^TX'_l$ and find $\mu_{min}$:
\begin{align*}
    \mu_{min} &= \frac{1}{k^2} \lambda_{min} \left(\sum_{l=1}^{r}\E\left[\sum_{x_i, x_j \in B'_l} x_ix_j^T\right]\right)\\
    &= \frac{1}{k^2} \lambda_{min} \left(\sum_{l=1}^{r}\E\left[\sum_{x_i \in B'_l}x_ix_i^T\right] + \E\left[\sum_{i \neq j} x_ix_j^T\right]\right)
\end{align*}
\newcommand*{\Comb}[2]{{}^{#1}C_{#2}}%
In Algorithm \ref{algo:random_bag_eigenvalue}, each $x_i \in \mathcal{X}_l$ is sampled into $B'_l$ with probability $1/2$. Similarly, the probability of sampling the ordered pair $(x_i, x_j)$ is $2\Comb{2k-2}{k-2}/\Comb{2k}{k} = (k-1)/(2k-1)$. Let $\Hat{x} = \sum_{x_i \in \mathcal{X}_l} x_i$.
\begin{align*}
    &\mu_{min} =\\ 
    &\frac{\lambda_{min}}{k^2}  \left(\sum_{l=1}^{r}\sum_{x_i \in \mathcal{X}_l}\frac{1}{2}x_ix_i^T + \sum_{(x_i, x_j) \in \mathcal{X}_l}\frac{k-1}{2k-1} x_ix_j^T\right)= \\
    &\frac{\lambda_{min}}{k^2} \left(\sum_{l=1}^{r}\frac{1}{2} \left(1 - \frac{k-1}{2k-1}\right)\sum_{x_i \in \mathcal{X}_l}x_ix_i^T + \frac{k-1}{2(2k-1)} \Hat{x}\Hat{x}^T \right)\\
    &= \frac{\lambda_{min}}{k^2} \left(\sum_{l=1}^{r} \left(\frac{k}{2(2k-1)}\right)\sum_{x_i \in \mathcal{X}_l}x_ix_i^T + \frac{k-1}{2(2k-1)} \Hat{x}\Hat{x}^T \right)\\
    &= \frac{\lambda_{min}}{2k^2(2k-1)}  \left(k X^TX + (k-1)\sum_{l=1}^{r} \Hat{x}\Hat{x}^T \right)
\end{align*}
Since the second term is a sum of positive semidefinite (p.s.d.) matrices, we get $\mu_{min} > \lambda_{min}(X^TX)/(4k^2)$. We assume $\|x\|_2^2 \leq \beta$ for all $x \in \mathcal{X}$.
\begin{lemma} \label{lem:random_bag_lambda_max}
$\lambda_{max}({X'_l}^TX'_l) \leq k\beta.$
\end{lemma}
Using Lemma \ref{lem:matrix_chernoff} and Lemma \ref{lem:random_bag_lambda_max}, we get
\begin{align*}
&\prob \left[ \frac{1}{k^2}\lambda_{min}\left(\sum_{l=1}^{r} {X'_l}^TX'_l \right) \leq (1 - \delta) \mu_{min} \right] 
	\leq \\ &d \cdot \left[ \frac{e^{-\delta}}{(1 - \delta)^{1-\delta}} \right]^{\frac{\mu_{min}}{k\beta}} 
\end{align*}
Using Equation \eqref{eq:bound_covariance_x'} we get
\begin{align*}
&\prob \left[ \lambda_{min}\left(f(X)\right) > (1 - \delta) \frac{\lambda_{min}(X^TX)}{4k^2} \right] 
	\geq \\ &1 - d \cdot \left[ \frac{e^{-\delta}}{(1 - \delta)^{1-\delta}} \right]^{\frac{\mu_{\min}}{k\beta}} 
\end{align*}
\end{proof}


\paragraph{Aggregate-MIR}

We consider a random bagging algorithm similar to the one for Bag-LLP (Algorithm \ref{algo:random_bag_eigenvalue}) for Aggregate-MIR. The upper bound for Aggregate-MIR (Theorem \ref{thm:MIR-agg-UB1}) is the product of the label $k$-means objective and the condition number of the bag's instance-centroids. Algorithm \ref{algo:random_bag_agg_mir} takes both these objectives into account. We first sort the instances in increasing order of $\tilde{y}$ and then partition them into contiguous \emph{super-bags} of sizes $2k$. From each super bag, one $k$-sized bag is independently sampled, resulting in a collection of $m/2$ bags. In Theorem \ref{thm:LLP-bag-matrix-chernoff}, we derive an error bound (Lemma \ref{lem:lambda_min_bound}) for any arbitrary partitioning of instances into super-bags, and the same bound holds for Algorithm \ref{algo:random_bag_agg_mir}. Next, we show that arbitrarily dividing the \emph{super}-bag into two equal sized bags leads to a decrease in the $k$-means objective in Proposition \ref{thm:kmeans-mir-random}.

\begin{figure}[!htb]
\begin{mdframed}
\small
\textbf{Algorithm 2: Random Bagging, Aggregate-MIR}\\
\textbf{Input:} Instances $\mathcal{X}$, fixed bag size $k$, true labels $\Tilde{y}$. \\
\textbf{Steps:}
\begin{enumerate}
    \item Sort points $\mathcal{X}$ in increasing order of $\Tilde{y}$.
    \item Partition sorted points into $r$ contiguous \emph{super}-bags of sizes $2k$, where $r = n/2k.$
    \begin{align*}
    \mathcal{X} = \cup_{l = 1}^{r} \mathcal{X}_l \text{  and  } \mathcal{X}_l \cap \mathcal{X}_{l'} = \emptyset \text{  for all } l \neq l'%
    \end{align*}
    \item For $l = 1, \dots, r$, a $k$-sized bag $B'_l$ is sampled  $u.a.r$ from $\mathcal{X}_l$.
    \item Output $\mathcal{B'}$ where $\mathcal{B'} = \{ B'_l \}_{l \in [r]}$
\end{enumerate}
\end{mdframed}
\caption{Algorithm 2: Random Bagging, Aggregate-MIR}\label{algo:random_bag_agg_mir}
\end{figure}

Let $B'_l$ denote a super-bag of size $2k$ for $l \in [r]$ as defined in Algorithm \ref{algo:random_bag_agg_mir}. We arbitrarily sample $k$ instances to create a bag $B_l^{(1)}$ and the remaining instances form another bag $B_l^{(2)}$. We know $B_l^{(1)} \cap B_l^{(2)} = \emptyset$, and $|B_l^{(1)}| = |B_l^{(2)}| = k$.

\begin{proposition}[Optimizing $k$-means in Equation \ref{eq:MIR-agg-UB1}]
\label{thm:kmeans-mir-random} 
For super-bags $B_l'$ as defined in Algorithm \ref{algo:random_bag_agg_mir} with arbitrary non-overlapping partitions $B_l^{(1)}$ and $B_l^{(2)}$,
\begin{align*}
&\sum_{l=1}^{r}\text{kmc}\left(\{\Tilde{y}_i\}_{i \in B_l'}\right) \geq \\
&\sum_{l=1}^{r} \text{kmc}\left(\{\Tilde{y}_i\}_{i \in B_l^{(1)}}\right) + \text{kmc}\left(\{\Tilde{y}_i\}_{i \in B_l^{(2)}}\right)
\end{align*}
where kmc($C$) is the $k$-means clustering loss for cluster $C$. kmc($C$) $= \sum_{y_i \in C} (y_i - \mu)^2$, where $\mu$ denotes the mean of cluster $C$.
\end{proposition} 
We defer the proof to Appendix \ref{appendix:random}. The error for Aggregate-MIR, as described in Equation \eqref{eq:MIR-agg-UB1}, is the product of the condition number of the bag centroids and a label $k$-means objective. Since the analysis of Theorem \ref{thm:LLP-bag-matrix-chernoff} in Section \ref{random-bagging} holds for any arbitrary partitioning of instances into \emph{super}-bags, we obtain a corresponding bound on the condition number. Proposition \ref{thm:kmeans-mir-random} shows that the loss of the $k$-bagging will be at most that of the optimal $2k$ clustering.  

\section{Differential Privacy}\label{privacy} In each of the previous scenarios, the aggregator can modify the bagging procedure to obtain formal label-differential privacy guarantees \citep{chaudhuri2011sample}, defined below.

\begin{definition}[Label DP]
A randomized algorithm $A$ taking a dataset as an input is $(\epsilon, \delta)$-label-DP if
for two datasets $D$ and $D'$ which differ only on the label of one instance, for any subset $S$ of outputs
of $A$, 
\begin{equation*}
    \prob[A(D) \in S] \leq e^{\epsilon} \prob[A(D') \in S] + \delta.
\end{equation*}
\end{definition}

To guarantee label-DP, it is necessary to assume a sensitivity bound on labels, which we achieve by bounding the norm of the labels by a constant $R$. The results below quantify the additional loss in utility that is incurred due to private bagging in the cases of Instance-MIR, and Bag-LLP. We discuss the corresponding result for Aggregate-MIR in Appendix \ref{appendix:privacy}, along with the proofs.

    \begin{theorem}[Private Error Upper Bound, Instance-MIR]\label{thm:instance-mir-loss-priv}
There exists a bagging $B~\text{with }  |B_l| = k, \forall l \in [m]$, satisfying $(\epsilon,\delta)$ label-DP, such that for $\hth$ in \eqref{eq:instanceloss}, we have
\begin{align}
    &\E\left[ ||\hth-\tth||_2^2 \right]
    \leq \\
    &C_1\left(C_2 + \kmeansopt + n\left(1-\frac{1}{k}\right)\alpha^2  + \frac{d\alpha^2}{k^2} \right), \nonumber
\end{align}
where $\alpha^2 = \frac{16R^2\log\left(\frac{1.25}{\delta/2}\right) }{\epsilon^2}$, $OPT$ is the objective value of the optimal $k$-means clustering over $\ty$, and constants $C_1, C_2$ are independent of $B$.
\end{theorem}
In the label-agnostic setting, one would just need to add noise to the bag-labels. MIR outputs one label at random, hence the sensitivity of the output is $2R$. Due to privacy amplification via subsampling \citep{balle2018privacyamplificationsubsamplingtight}, we add $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ noise to the label value to ensure $(\frac{\epsilon}{2},\frac{\delta}{2})$ label-DP, where $\alpha^2 = \frac{16R^2\log\left(\frac{1.25}{\delta/2}\right) }{\epsilon^2}$, leading to an additional error of $\frac{d\alpha^2}{k^2}$. In addition, since the objective here is a label-dependent clustering, we must use a differentially private $k$-means algorithm, leading to additional loss in utility. We show that the simple approach of adding $\normal\left(0,\alpha^2 \right)$ noise to each label, and then finding an optimal clustering over the noisy labels, leads to an additional error of $n\left(1-\frac{1}{k}\right) \alpha^2$. In Appendix \ref{appendix:privacy}, we discuss how it is possible to achieve better utility, since the above method satisfies the more stringent notion of local-DP\sushant{cite}, while we only need to satisfy the standard notion of central-DP.

\begin{theorem}[Private Error Upper Bound, Bag-LLP]\label{thm:bag-llp-loss-priv}
There exists a bagging $B~\text{with }  |B_l| = k, \forall l \in [m]$, satisfying $(\epsilon,\delta)$ label-DP, such that for $\hth$ in \eqref{eq:bagloss}, we have
\begin{align}
    &\E\left[ \|\hth-\tth\|_2^2 \right]
    \leq OPT \left( \frac{d}{k}\alpha^2 + \sigma^2 \frac{m}{k} \right),\nonumber
\end{align}
where $\alpha^2 = \frac{4R^2\log\left(\frac{1.25}{\delta}\right) }{\epsilon^2}$, and $OPT$ is the optimal value of $\left(\frac{\lambda_{max}(f(X))}{\lambda_{min}(f(X))}\right)^2$.
\end{theorem}
In this case, the optimal bagging strategy is independent of the labels. Hence, one just needs to add noise to the bag-labels, and not add noise for a private clustering of the labels. LLP outputs the mean of $k$ labels, hence the sensitivity of the output is $\frac{2R}{k}$. We add $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ noise to the label value to ensure $(\epsilon,\delta)$ label-DP, leading to an additional error of $\frac{\alpha^2m}{k^2}$ over the corresponding non-private bagging mechanism.



\begin{table}[]
\small{
    \centering
    \begin{tabular}{rrrrrr}
\toprule
 $k$ & Bagging Method & $\|\hth-\tth\|_2^2$ \\
\midrule
\textit{LLP} &\textit{Bag Loss} \\
\midrule
\multirow[c]{3}{*}{10} & Instance $k$-means & $0.0082 \pm 0.002$ \\
& Label $k$-means & $0.0458 \pm 0.012$ \\
& Random & $0.0099 \pm 0.002$ \\
\cline{2-3}
\multirow[c]{3}{*}{50} & Instance $k$-means & $0.0392 \pm 0.008$ \\
& Label $k$-means & $0.0629 \pm 0.008$ \\
& Random & $0.0423 \pm 0.009$ \\
\midrule

\textit{MIR} & \textit{Instance Loss}\\
\midrule
\multirow[c]{3}{*}{10} & Instance $k$-means & $0.0088 \pm 0.002$ \\
 & Label $k$-means & $0.0072 \pm 0.002$ \\
 & Random & $0.0085 \pm 0.002$ \\
\cline{2-3}
\multirow[c]{3}{*}{50} & Instance $k$-means & $0.0388 \pm 0.006$ \\
 & Label $k$-means & $0.0404 \pm 0.007$ \\
 & Random & $0.0419 \pm 0.006$ \\

\midrule
\textit{MIR} & \textit{Aggregate Loss}\\
\midrule
\multirow[c]{3}{*}{10} & Instance $k$-means & $0.0102 \pm 0.002$ \\
 & Label $k$-means & $0.0453 \pm 0.008$ \\
 & Random & $0.0221 \pm 0.004$ \\
\cline{2-3}
\multirow[c]{3}{*}{50} & Instance $k$-means & $0.0437 \pm 0.008$ \\
 & Label $k$-means & $0.0601 \pm 0.008$ \\
 & Random & $0.0619 \pm 0.012$ \\
\bottomrule
\end{tabular}
    \caption{Non-Private Bagging}
    \label{tab:non_private_llp_bag_inst_mir}
}
\end{table}

\section{Experiments}\label{experiments}

We conduct experiments on both real-world and synthetically generated data. 

\paragraph{Synthetic Data} We generate data of the form $(X \in \mathbb{R}^{n\times d}, Y \in \mathbb{R}^n)$, by first sampling a random ground truth model $\theta^*$ from the standard $d$-dimensional Gaussian distribution, sampling each of the rows of $X$ i.i.d. from the standard $d$-dimensional Gaussian distribution, and then setting $Y = X\theta^* + \eps$ where each coordinate of $\eps$ is i.i.d. drawn from $N(0, \sigma^2)$ where $\sigma$ is 0.5. We set $n$ to be $50{,}000$ and $d$ as $32$. We also vary $k$, and use $k = 10, 50$. 

We implement 3 bagging mechanisms on each of Instance-MIR, Aggregate-MIR, and Bag-LLP, namely (1) Instance $k$-means, (2) Label $k$-means, and (3) Random bagging. 
In Table \ref{tab:non_private_llp_bag_inst_mir}, we present the mean and standard deviation of the error, calculated over $15$ runs for each experiment. As expected, for Bag-LLP, instance $k$-means performs better than random bagging, which in turn performs better than label $k$-means. For Aggregate-MIR, instance $k$-means consistently performs the best, which is expected, while random bagging overall performs slightly better than label $k$-means. However, for Instance-MIR, all the 3 mechanisms show similar performance. We compute statistical significance of our results using the paired $T$-value test in Appendix \ref{app:paired-t-value}.



\begin{table}[]
\small{
    \centering
    \begin{tabular}{rrrrrr}
\toprule
 $k$ & Bagging Method & $\epsilon$ & $\|\hth-\tth\|_2^2$ \\
\midrule
\multirow[c]{9}{*}{10} & \multirow[c]{3}{*}{Instance $k$-means} & 0.5 & $0.0621 \pm 0.009$ \\
& & 1.0 & $0.0537 \pm 0.009$ \\
& & 2.0 & $0.0390 \pm 0.008$ \\
\cline{2-4}
& \multirow[c]{3}{*}{Label $k$-means} & 0.5 & $0.0505 \pm 0.005$ \\
& & 1.0 & $0.0362 \pm 0.006$ \\
& & 2.0 & $0.0189 \pm 0.004$ \\
\midrule
\multirow[c]{9}{*}{50} & \multirow[c]{3}{*}{Instance $k$-means} & 0.5 & $0.0656 \pm 0.012$ \\
& & 1.0 & $0.0595 \pm 0.012$ \\
& & 2.0 & $0.0521 \pm 0.009$ \\
\cline{2-4}
& \multirow[c]{3}{*}{Label $k$-means} & 0.5 & $0.0559 \pm 0.008$ \\
& & 1.0 & $0.0480 \pm 0.005$ \\
& & 2.0 & $0.0431 \pm 0.006$ \\
\bottomrule
\end{tabular}
    \caption{Private Bagging, Instance-MIR}
    \label{tab:private_llp_bag_inst_mir}
}
\end{table}

We also consider the private version of Instance-MIR in Table \ref{tab:private_llp_bag_inst_mir}. We set $\delta = 10^{-5}$, and vary $\epsilon$. For each mechanism, we see that accuracy drops with a decrease in $\epsilon$. However, the drop is sharper for label $k$-means, which is expected, since unlike instance $k$-means, it is label-dependent, incurring an extra utility error. We also note that the drop in accuracy is sharper for a smaller bag size; this is again expected since the error due to privacy scales with $\frac{1}{k}$.

We also consider non-isotropic distributions. We generate $X$ i.i.d. from $\mathcal{N}(0, \Sigma)$, where $\Sigma$ is determined by sampling $d$ independent values $\{\lambda_1, \cdots, \lambda_d\}$ from a uniform distribution $U(0.1, 10)$ to be the eigenvalues of $\Sigma$, which is a diagonal matrix. We also consider the case where the columns of $\Sigma$ are non-independent. We sample each entry of a Cholesky matrix $M$ of size $d\times d$ from $\mathcal{N}(0,1)$. We then compute the covariance matrix $M^TM$ and apply a linear transformation to feature vectors $x$ sampled from $\mathcal{N}(0, I)$ using $M$. The resulting set of vectors is non-isotropic with correlated features. Here, we also implement Scaled Instance $k$-means, which scales the dataset $X$ as $\Sigma^{-\frac{1}{2}}X$ to be isotropic, and then finds an optimal $k$-means clustering on the scaled dataset.  We demonstrate results in the Bag-LLP setup in Table \ref{tab:non_iso_llp_bag}. 


The complete tables (Table \ref{tab:app_bag_llp}, Table \ref{tab:app_agg_mir}, Table \ref{tab:app_instance_mir}) are deferred to the Appendix \ref{appendix:experiments}, where we also vary $\sigma$. 

\paragraph{Real-world Data} In Table \ref{tab:white_wine}, we conduct experiments using the Wine Quality Regression dataset from UCI, focusing on the White wine subset, which contains 4898 samples. We evaluate performance using MSE on the test set after 10-fold cross-validation. The results for Instance-MIR align with our theoretical expectations, showing that label $k$-means has the lowest error. For Bag-LLP, the results are consistent with our bounds, as (Scaled) Instance $k$-means performs the best. We see that label $k$-means consistently performs better than Instance $k$-means for Aggregate-MIR. This is possibly because the distribution for real data does not follow the linear behavior that our results assume. We provide additional experiments on Wine Quality Datasets (Red and White Subsets) in Tables \ref{tab:red_wine} and \ref{tab:white_wine_app}.

The experimental code for the paper is available at \url{https://github.com/google-deepmind/agg_data_uai25}.

\begin{table}[]
\small{
    \centering
    \begin{tabular}{rrrrrr}
\toprule
 $k$ & Bagging Method & $\|\hth-\tth\|_2^2$ \\
\midrule
\textit{} &\textit{Independent} \\
\midrule
\multirow[c]{4}{*}{10} 
& Scaled Instance $k$-means & $0.008552 \pm 0.00191$ \\
& Instance $k$-means & $0.009739 \pm 0.00201$ \\
& Random& $0.010518 \pm 0.00339$ \\
& Label $k$-means& $0.042496 \pm 0.00626$ \\

\cline{2-3}
\multirow[c]{4}{*}{50} 
 & Scaled Instance $k$-means & $0.038586 \pm 0.00784$ \\
 & Instance $k$-means & $0.036923 \pm 0.00536$ \\
 & Random& $0.039461 \pm 0.00760$ \\
& Label $k$-means& $0.059834 \pm 0.00598$ \\
\midrule

\textit{} &\textit{Non-independent}\\
\midrule
\multirow[c]{4}{*}{10} 
& Scaled Instance $k$-means & $0.024811 \pm 0.00498$ \\
 & Instance $k$-means & $0.032367 \pm 0.00835$ \\
 & Random& $0.024585 \pm 0.00755$ \\
 & Label $k$-means& $0.052438 \pm 0.00936$ \\
 
\cline{2-3}
\multirow[c]{4}{*}{50} 
& Scaled Instance $k$-means & $0.049910 \pm 0.00773$ \\
& Instance $k$-means & $0.051425 \pm 0.00895$ \\
 & Random& $0.048222 \pm 0.01074$ \\
 & Label $k$-means& $0.061918 \pm 0.00820$ \\

\bottomrule
\end{tabular}
    \caption{Non-Isotropic Distribution, Bag-LLP}
    \label{tab:non_iso_llp_bag}
}
\end{table}


\begin{table}[]
\small{
    \centering
\begin{tabular}{lrrr}
\toprule
Setting & $k$ & Bagging Method &  MSE\\
\midrule
\multirow[c]{8}{*}{AggMIR}  & \multirow[c]{4}{*}{10} & Instance $k$-means & $0.605 \pm 0.086$ \\
 &  & Label $k$-means & $0.190 \pm 0.023$ \\
 &  & Random & $0.778 \pm 0.131$ \\
 &  & Scaled Instance $k$-means & $0.840 \pm 0.143$ \\
\cline{2-4}
& \multirow[c]{4}{*}{40} & Instance $k$-means & $0.731 \pm 0.176$ \\
 &  & Label $k$-means & $0.198 \pm 0.072$ \\
 &  & Random & $1.112 \pm 0.514$ \\
 &  & Scaled Instance $k$-means & $0.941 \pm 0.152$ \\
\cline{1-4} \cline{2-4}
\multirow[c]{8}{*}{BagLLP}  & \multirow[c]{4}{*}{10} & Instance $k$-means & $0.098 \pm 0.008$ \\
 &  & Label $k$-means & $0.194 \pm 0.021$ \\
 &  & Random & $0.049 \pm 0.008$ \\
 &  & Scaled Instance $k$-means & $0.061 \pm 0.006$ \\
\cline{2-4}
 & \multirow[c]{4}{*}{40} & Instance $k$-means & $0.104 \pm 0.017$ \\
 &  & Label $k$-means & $0.162 \pm 0.057$ \\
 &  & Random & $0.126 \pm 0.042$ \\
 &  & Scaled Instance $k$-means & $0.083 \pm 0.021$ \\
\cline{1-4} \cline{2-4}
\multirow[c]{8}{*}{InstanceMIR}  & \multirow[c]{4}{*}{10} & Instance $k$-means & $0.718 \pm 0.108$ \\
 &  & Label $k$-means & $0.577 \pm 0.038$ \\
 &  & Random & $0.804 \pm 0.082$ \\
 &  & Scaled Instance $k$-means & $0.930 \pm 0.060$ \\
\cline{2-4}
 & \multirow[c]{4}{*}{40} & Instance $k$-means & $0.983 \pm 0.263$ \\
 &  & Label $k$-means & $0.602 \pm 0.033$ \\
 &  & Random & $0.807 \pm 0.317$ \\
 &  & Scaled Instance $k$-means & $0.961 \pm 0.190$ \\
\bottomrule
\end{tabular}
\caption{White Wine Quality}
    \label{tab:white_wine}
}
\end{table}


\section{Conclusion}\label{conclusion}
In this paper, we study for various loss functions in the MIR and LLP setups, the optimal way to partition the dataset into bags such that the utility for downstream tasks like linear regression is maximized. We derive upper bounds on error, and show that in each case, the optimal bagging strategy (approximately) reduces to finding an optimal $k$-means clustering of the feature vectors or the labels. We also show that our bagging mechanisms can be made to satisfy label-DP, incurring an additional utility error. We finally generalize our results to the setting of GLMs, and experimentally validate our theoretical results. 

There are several potential directions for future work. While we only considered linear models, it would be interesting to analyse optimal bagging strategies in non-linear models, such as neural networks. We believe that similar results should also hold for more complex models such as neural networks (the error bounds might be different, but we believe similar clustering objectives would be effective). However, the analysis is challenging and would require different techniques, and we leave this important direction for future work. In addition, one could also consider other popular loss functions for MIR and LLP used in literature. Furthermore, while our work only looked at upper bounds, having corresponding lower bounds would also be valuable.

\newpage 

\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Aggregating Data for Optimal Learning\\(Supplementary Material)}



\appendix





\section{Label-dependent Bagging (Continued)}\label{proofs}

\subsection{Instance-MIR} \label{MIR-event}

We denote the uniform distribution by $\Gamma$\sushant{$U$?}.
Let $\overline{y} = [\overline{y}_1, \dots, \overline{y}_m]$, where $\overline{y}_l = y_{\Gamma(B_l)}$. We define a random attribution matrix for MIR, $A$ $\in \{0, 1\}^{n \times n}$, as follows.
    \begin{align*}
        {A}_{(i,j)} =
        \begin{cases}
            1 & \text{if } i \in B_l \text{ and } \overline{y}_l = y_j\\
            0 & \text{otherwise}.
        \end{cases}
    \end{align*}


Note that $\E[A] = S = S^T$ is given by
\begin{align*}
    {S}_{(i,j)}  =
    \begin{cases}
        \frac{1}{|B_l|} & \text{if } i, j \in B_l \\
        0 & \text{otherwise}.
    \end{cases}
\end{align*}
The minimizer of \eqref{eq:instanceloss} is then given by
\begin{align*}\label{eq:MIR_event_theta}
    \hth =\argmin_{\theta} \frac{1}{n}\|Ay - X\theta\|_2^2 = (\bX^T\bX)^{-1}\bX^TA\by.
\end{align*}

We now give a proof sketch for Theorem \ref{thm:MIR-event}, providing an upper bound for the error of $\hth$ (some details are deferred to Appendix \ref{proofs}). All the expectations henceforth are over the randomness in $A$ unless otherwise stated.


\begin{proof}(of Theorem \ref{thm:MIR-event}) We begin with the following proposition, and use it to prove the main theorem.
\begin{proposition}\label{prop:MIR-event-error}
\begin{align*}
    \E\left[ ||\hth-\tth||_2^2 \right]
    &=
  \E\left[   || (\bX^T\bX)^{-1}\bX^T(A - I)\bX\tth ||_2^2 \right]
    + \sigma^2 \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2 \right].
    \end{align*} \label{thm:event-mir-loss-linear-regression}
\end{proposition}
\begin{proof} (of Proposition \ref{prop:MIR-event-error})
    By rearranging the terms,
\begin{align}
\hth - \tth &= (\bX^T\bX)^{-1}\bX^TA\by - \tth \nonumber\\ &= (\bX^T\bX)^{-1}\bX^TA\bX\tth -\tth + (\bX^T\bX)^{-1}\bX^TA \eps \nonumber\\\nonumber
&= (\bX^T\bX)^{-1}\bX^T(A - I)\bX\tth+ (\bX^T\bX)^{-1}\bX^TA \eps\,.
\end{align}
$\eps$ is independent of $A$ with
$\E[\eps] = 0$, $\E[\eps\eps^T]=\sigma^2I$ and $\E[A] = S$. Using this we get, 
\begin{align*}
    \E\left[ ||\hth-\tth||_2^2  \right] &=
  \E\left[ || (\bX^T\bX)^{-1}\bX^T(A - I)\bX\tth ||_2^2  \right]  
    + \E\left[ {\rm tr} ((\bX^T\bX)^{-1}\bX^T A\eps\eps^TA^T \bX (\bX^T\bX)^{-1})  \right]\,\nonumber \\
      &=
  \E\left[   || (\bX^T\bX)^{-1}\bX^T(A - I)\bX\tth ||_2^2  \right]  
    + \sigma^2 \E\left[ {\rm tr} ((\bX^T\bX)^{-1}\bX^T AA^T \bX (\bX^T\bX)^{-1})  \right]\,\nonumber \\
    &= \E\left[   || (\bX^T\bX)^{-1}\bX^T(A - I)\bX\tth ||_2^2  \right] 
    + \sigma^2 \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2  \right]\,\nonumber
\end{align*}
\end{proof}
We now upper bound the error in Proposition \ref{prop:MIR-event-error}. We simplify the first term.
\begin{align*}
    \E\left[||(\bX^T\bX)^{-1}\bX^T(A-I)\bX\tth||_2^2  \right]
    &\le
    \E\left[||(\bX^T\bX)^{-1}\bX^T||_{op}^2 ||(A-I)\bX\tth||_2^2  \right]\\
    &= ||(\bX^T\bX)^{-1}\bX^T||_{op}^2\E\left[||(A-I)\bX\tth||_2^2\right]
\end{align*}%
$||M||_{op}$ above denotes the operator norm of matrix $M$. We simplify the RHS above with the following proposition.
\begin{proposition}\label{event-MIR-k-means}
\begin{align*}
    \E \left[  ||(A - I)\bX\tth||_2^2 
\right] = \left(2||\Tilde{y}||_2^2 - 2\sum_{l=1}^m \frac{\left(\sum_{i\in B_l} \ty_i\right)^2}{|B_l|}
    \right)
    \end{align*}
\end{proposition}
\begin{proof}
\begin{align*}
&\E \left[  ||(A - I)\bX\tth||_2^2 
\right] \\&= \E \left[ ((A - I)\bX\tth)^T  (A - I)\bX\tth 
\right] \nonumber\\
&= \E \left[ {\tth}^TX^TA^T  A \bX\tth 
\right] - \E \left[ {\tth}^TX^T (A + A^T)\bX\tth 
\right] + ||X\tth||_2^2 \\
&= \E \left[ ||A \Tilde{y}||_2^2 
\right] - {\tth}^TX^T (S + S^T)X\tth + ||X\tth||_2^2 \\
&= \E \left[ ||A \bX\tth||_2^2 
\right] - 2 {\tth}^TX^TSX\tth + ||\Tilde{y}||_2^2
\end{align*}
Putting the following two lemmas together, we conclude Proposition \ref{event-MIR-k-means}.
\begin{lemma}
\label{lem:exp_A_tilde_y}
    $\E \left[ ||A X\tth||_2^2 \right] = ||\Tilde{y}||_2^2$.
\end{lemma}
\begin{proof}
    (of Lemma \ref{lem:exp_A_tilde_y})
  Let $B(i)$ be the bag containing $x_i$.  Note that $ A\bX\tth = \left[\ty_{\Gamma(B(1))}, \ldots, \ty_{\Gamma(B(n))}\right]^T$
\begin{align*}
    {\tth}^TX^TA^T  A \bX\tth &= \sum_{i=1}^{i=n} \ty^2_{\Gamma(B(i))}
\end{align*}
Then we have 
\begin{align*}
\E \left[\sum_{i=1}^{i=n} \ty^2_{\Gamma(B(i))}\right]
&=
\sum_{i=1}^{i=n} \left(\sum_{j \in B(i)} \frac{\left(\ty_j\right)^2}{|B(i)|}  \right)\\
&=
\sum_{l=1}^{l=m} |B_l| \left(\sum_{j \in B_l}\frac{\left(\ty_j\right)^2}{|B_l|}  \right)\\
&=
\sum_{i=1}^{n}{\left(\ty_i\right)^2} = ||\Tilde{y}||_2^2.
\end{align*}
\end{proof}
\begin{lemma}
\label{lem:clustering_term_mir}
   $ {\tth}^TX^TSX\tth =  \sum_{l=1}^m \frac{\left(\sum_{i\in B_l} \ty_i\right)^2}{|B_l|}$.
\end{lemma}
\begin{proof} (of Lemma \ref{lem:clustering_term_mir}).
Note that $S = M^TM$, where $M \in \mathbb{R}^{m \times n}$ is defined as:
\begin{align*}
        {M}_{(i,j)} =
        \begin{cases}
            1/\sqrt{|B_i|} & \text{if } x_j \in B_i \\
            0 & \text{otherwise}.
        \end{cases}
\end{align*}
Thus, ${\tth}^TX^TSX\tth = {\tth}^TX^TM^TMX\tth = ||M\Tilde{y}||_2^2$.
    \begin{align*}
        ||M\Tilde{y}||_2^2
        &= \sum_{l=1}^{m} \left(\sum_{x_i \in B_l}\frac{1}{\sqrt{|B_l|}} \Tilde{y_i}\right)^2\\
        &= \sum_{l=1}^{m} \frac{1}{|B_l|}\left(\sum_{x_i \in B_l} \Tilde{y_i}\right)^2
    \end{align*}
\end{proof}
\end{proof}
The following proposition analyses the second term in Proposition \ref{prop:MIR-event-error}, and together with Proposition \ref{event-MIR-k-means} concludes the proof of Theorem \ref{thm:MIR-event}.\sushant{can we make these clickable?}
\begin{proposition}
\label{lem:exp_var_term_mir_event}
\begin{align*}
     \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2 \right] \leq d ||(\bX^T\bX)^{-1}\bX^T||_{op}^2
\end{align*}
\end{proposition}
\begin{proof}(of Proposition \ref{lem:exp_var_term_mir_event}). 
We use the following inequality:
\[
    ||AB||_{F}^2 \le \min\left({||A||_{op}^2||B||_{F}^2, ||B||_{op}^2||A||_{F}^2}\right)\,.
\]
\begin{align*}
    \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2  \right] \le
    \min\left(\E\left[ ||(\bX^T\bX)^{-1}\bX^T||_{op}^2||A||_{F}^2  \right], \E\left[ ||(\bX^T\bX)^{-1}\bX^T||_{F}^2||A||_{op}^2  \right]\right)
\end{align*}
We assumed $\text{rank}(\bX) = d$, hence
$||(\bX^T\bX)^{-1}\bX^T||_{F}\le \sqrt{d} ||(\bX^T\bX)^{-1}\bX^T||_{op}\,.$
\begin{align*}
 \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2  \right] &\le \min\left(\E\left[ ||(\bX^T\bX)^{-1}\bX^T||_{op}^2||A||_{F}^2  \right], \E\left[ d||(\bX^T\bX)^{-1}\bX^T||_{op}^2||A||_{op}^2 \right]\right) \\
    &=  ||(\bX^T\bX)^{-1}\bX^T||_{op}^2 \min\left(\E\left[ ||A||_{F}^2  \right], d\E\left[||A||_{op}^2  \right]\right)
\end{align*}
We have $\E\left[ ||A||_{F}^2  \right] = n$ and $\E\left[||A||_{op}^2  \right] = 1$. Also, we are in the setting where $n > d$ to have a well defined regressor. Therefore, we obtain
\[
     \E\left[ ||(\bX^T\bX)^{-1}\bX^TA||_{F}^2  \right]\le d ||(\bX^T\bX)^{-1}\bX^T||_{op}^2 
\]
\end{proof}
\end{proof}














    












\subsection{Bag-LLP}\label{LLP-bag}
 We define a bagging matrix $S \in \mathbb{R}^{m \times n}$ that encodes the assignment of instances to bags.
 \begin{align}\label{eq:S_LLP}
    S_{(l,i)} =
    \begin{cases}
        \frac{1}{|B_l|} & \text{if } i \in B_l,\\
        0 & \text{otherwise}.
    \end{cases}
\end{align}

The minimizer of the bag-level loss in matrix form is
\begin{align*}
    \hth =\argmin_{\theta} \frac{1}{m}\|S \by - S\bX\theta\|_2^2.
\end{align*}



\begin{theorem*}[full version of Theorem \ref{thm:LLP-bag-UB1}]\label{Bag-LLP-full}

For $\hth$ as in \eqref{eq:bagloss}, for a given bagging $B$ with bagging matrix $S$, we have
\begin{align*}
  \E\left[ \|\hth-\tth\|_2^2  \right] \leq   \sigma^2 \left(\frac{\lambda_{max}((S\bX)^T S\bX)}{\lambda_{min}((S\bX)^T S\bX)}\right)^2 \left(\sum_{l = 1}^m \frac{1}{|B_l|}\right)
\end{align*}
For equal sized bags of size $k$, this simplifies to
\begin{align*}\label{eq:LLP-bag-UB1}
    \E\left[ \|\hth-\tth\|_2^2  \right] \leq 
    \sigma^2 \frac{m}{k} \left( 
    	\frac
    	{\lambda_{max} ((S\bX)^T S\bX)^{-1}}
    	{\lambda_{min} ((S\bX)^T S\bX)^{-1}} 
    \right)^2. 
\end{align*}
\end{theorem*}

\begin{proof}
We start by proving the following lemma

\begin{lemma}
\begin{align*}
    \E\left[ \|\hth-\tth\|_2^2  \right]
    =&
    \sigma^2 \|((S\bX)^T S\bX)^{-1} (S\bX)^T (S S^T)^{1/2}\|_F^2\,.
\end{align*}
\end{lemma}

\begin{proof}
    The minimizer of the bag-level loss in matrix form is
\begin{align*}
    \hth &=\argmin_{\theta} \frac{1}{m}\|S \by - S\bX\theta\|_2^2\\
    &= (\bX^T S^T S\bX)^{-1} \bX^T S^T S \by.
\end{align*}

    By rearranging the terms, we have
\begin{align*}
\hth - \tth &= ((S\bX)^T S\bX)^{-1} \bX^T S^T S\by - \tth\\
&= ((S\bX)^T S\bX)^{-1} \bX^T S^T S\bX\tth -\tth\\ &+ ((S\bX)^T S\bX)^{-1} \bX^T S^T S \eps\\
&= ((S\bX)^T S\bX)^{-1} \bX^T S^T S \eps 
\end{align*}

Since $\eps$ is independent of $\bX$, with
$\E[\eps] = 0$, and $\E[\eps\eps^T]=\sigma^2\Iden$, we have
\begin{align*}
\E\left[\|\hth - \tth\|_2^2  \right] = \sigma^2 \operatorname{tr}\left(((S\bX)^T S\bX)^{-1} (S\bX)^T S S^T  (S\bX) ((S\bX)^T S\bX)^{-1}\right)
\end{align*}

By definition, $S S^T = \text{Diag}(\{\frac{1}{|B_1|}, \frac{1}{|B_2|}, \dots, \frac{1}{|B_m|}\})$ and the expression simplifies to give:
\begin{equation*}
    \E\left[\|\hth - \tth\|_2^2 \right] = \sigma^2 \|((S\bX)^T S\bX)^{-1} (S\bX)^T (S S^T)^{1/2}\|_F^2 
\end{equation*}
\end{proof}

Now we upper bound the RHS.

\begin{align*}
\E\left[\|\hth - \tth\|_2^2 \right] &= \sigma^2 \|((S\bX)^T S\bX)^{-1}  (S\bX)^T (S S^T)^{1/2}\|_F^2 \\
&\leq \sigma^2 \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \| (SS^T)^{1/2}\|_F^2 \\
&= \sigma^2 \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \left(\sum_{l = 1}^m \frac{1}{|B_l|}\right) \\
&\leq \sigma^2 \|((S\bX)^T S\bX)^{-1}\|_{op}^2 \|(S\bX)^T\|_{op}^2 \left(\sum_{l = 1}^m \frac{1}{|B_l|}\right) \\
&\leq \sigma^2 \left(\frac{\lambda_{max}((S\bX)^T S\bX)}{\lambda_{min}((S\bX)^T S\bX)}\right)^2 \left(\sum_{l = 1}^m \frac{1}{|B_l|}\right)
\end{align*}
\end{proof}


\subsection{Aggregate-MIR}\label{MIR-agg}

We define a random attribution matrix $A$ $\in \{0, 1\}^{m \times n}$ as follows, to indicate the bag-label of each bag.
\begin{align*}
    A_{(l,i)} =
    \begin{cases}
        1 & \text{if } y_i = \Gamma{(B_l)},\\
        0 & \text{otherwise}.
    \end{cases}
\end{align*}

We denote $\E[A] = S$. This turns out to be the same $S$ as in \eqref{eq:S_LLP}, and represents the instances in each bag. The minimizer of the aggregate-level loss is
\begin{align*}
    \hth =\argmin_{\theta} \frac{1}{m}\|A \by - S\bX\theta\|_2^2.
\end{align*}

\begin{theorem*}[full version of Theorem \ref{thm:MIR-agg-UB1}]\label{Aggregate-MIR-full}
For $\hth$ in \eqref{eq:aggloss}, given a bagging $B$ with bagging matrix $S$,
\begin{align*}
    \E\left[ \|\hth-\tth\|_2^2 \right]
    \leq \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \left(\sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty^2_i}{|B_l|}\right) - \sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty_i}{|B_l|}\right)^2  + \sigma^2 n \right)
\end{align*}
For equal sized bags, this simplifies to
\begin{align}
    &\E\left[ \|\hth-\tth\|_2^2 \right]
    \leq \frac{1}{k} \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \left( \sum_{l=1}^m \sum_{i\in B_l} (\ty_i - \mu_l)^2  + \sigma^2nk \right), \nonumber
\end{align}
where $\mu_l = \frac{1}{k}\sum_{i\in B_l} \ty_i$ is the mean of $\ty$ over bag $l$.
\end{theorem*}

\begin{proof}
\begin{align*}
    \hth &=\argmin_{\theta} \frac{1}{m}\|A \by - S\bX\theta\|_2^2 \\
     &= (\bX^T S^T S\bX)^{-1} \bX^T S^T A \by.
\end{align*}


    By rearranging the terms, we have
\begin{align*}
\hth - \tth &= ((S\bX)^T S\bX)^{-1} \bX^T S^T A\by - \tth\\
&= ((S\bX)^T S\bX)^{-1} \bX^T S^T A\bX\tth -\tth + ((S\bX)^T S\bX)^{-1} \bX^T S^T A \eps
\end{align*}
$\eps$ is independent of $\bX$ with
$\E[\eps] = 0$ and $\E[\eps\eps^T]=\sigma^2\Iden$. Also, $\E[A] = S$, and $\eps,A$ are independent. Hence, 
\begin{align*}
\E\left[\|\hth - \tth\|_2^2 \right] &= \E\left[\|((S\bX)^T S\bX)^{-1} (S\bX)^T A\bX\tth - ((S\bX)^T S\bX)^{-1} (S\bX)^T S\bX\tth + ((S\bX)^T S\bX)^{-1} \bX^T S^T A \eps \|_2^2\right] \\
&\leq \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \E[\|(A\bX\tth - S\bX\tth) + A\eps\|_2^2] \\
&\leq \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \left( \E[\|A\bX\tth - S\bX\tth\|_2^2] + \E [\|A\eps\|_2^2] \right) \\
&\leq \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \left( \E[\|A\Tilde{y} - S\Tilde{y}\|_2^2] + \E [\|A\eps\|_2^2] \right)
\end{align*}
We now analyse $ \E[\|A\Tilde{y} - S\Tilde{y}\|_2^2]$ in the lemma below.
\begin{lemma}
\begin{align*}
    \E[\|A\Tilde{y} - S\Tilde{y}\|_2^2] = \sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty^2_i}{|B_l|}\right) - \sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty_i}{|B_l|}\right)^2
\end{align*}
\end{lemma}
\begin{proof}

\begin{align*}
    \E[\|A\Tilde{y} - S\Tilde{y}\|_2^2] &= \E [ (A\Tilde{y} - S\Tilde{y})^T(A\Tilde{y} - S\Tilde{y})] \\
    &= \E [ ||A\Tilde{y}||^2 + ||S\Tilde{y}||^2 - 2\Tilde{y}^TS^TA\Tilde{y} ] \\
    &= \E [ ||A\Tilde{y}||^2 ] + \E [ ||S\Tilde{y}||^2] - 2 \E [\Tilde{y}^TS^TA\Tilde{y} ] \\
    &= \E [ ||A\Tilde{y}||^2 ] + \E [ ||S\Tilde{y}||^2] - 2 \E [\Tilde{y}^TS^TS\Tilde{y} ] \\
    &= \E [ ||A\Tilde{y}||^2 ] + \E [ ||S\Tilde{y}||^2] - 2 \E [||S\Tilde{y}||^2 ] \\
    &= \E [ ||A\Tilde{y}||^2 ] - \E [ ||S\Tilde{y}||^2]  \\
    &= \E [ ||A\Tilde{y}||^2 ] -  ||S\Tilde{y}||^2 
\end{align*}
We now analyse $\E [ ||A\Tilde{y}||^2 ]$
\begin{align*}
    A\ty &= \left[\ty_{\Gamma(B_1)}, \ldots, \ty_{\Gamma(B_m)}\right]^T\\
    \implies \ty^TA^T  A \ty &= \sum_{l=1}^{l=m} \ty^2_{\Gamma(B_l)}
\end{align*}
Then we have 
\begin{align*}
    \E \left[ \ty^TA^T  A \ty \right] &= \E \left[\sum_{l=1}^{l=m} \ty^2_{\Gamma(B_l)}\right]\\
&=
\sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty^2_i}{|B_l|}\right)
\end{align*}
For equal size bags it simplifies to $\frac{||\ty||^2}{k}$. We now analyse Term 2 $||S\Tilde{y}||^2$
\begin{align*}
    S\ty &= \left[ \frac{\sum_{i\in B_1} \ty_i}{|B_1|}, \ldots, \frac{\sum_{i\in B_m} \ty_i}{|B_m|}\right]^T\\
    \implies \ty^TS^T  S \ty &= \sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty_i}{|B_l|}\right)^2
\end{align*}
For equal size bags this simplifies to $\sum_{l=1}^m \left(\frac{\sum_{i\in B_l} \ty_i}{k}\right)^2$.
\end{proof}

It is easy to see that $\E [\|A\eps\|_2^2] = n\sigma^2$. Combining this with the above lemma, we are done.

\end{proof}



 












\section{Missing Results and Proofs}\label{appendix:proofs}


In this section, we present some missing proofs from the paper, along with some additional results that were briefly mentioned in the main paper. 

\subsection{Additional results from Section \ref{results}}

Lemma \ref{l1} shows that finding the optimal $k$-means clustering of the (expected) labels $\ty$ exactly maximizes $\sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|}$. Lemma \ref{noisyclustering} shows that clustering over $y = \ty + \gamma$ as a proxy for clustering over $\ty$ leads to an additional utility error of $\left(1-\frac{1}{k}\right)\sigma^2n$. Lemma \ref{l3} shows that the $1d$ clustering problem above turns out to result in a bagging that just sorts the labels in order, and partitions contiguous segments into bags.





\begin{lemma}[$k$-means Equivalence]\label{l1}
Maximizing $\sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|}$ corresponds to finding the optimal $k$-means clustering over $\ty$.
\end{lemma}
\begin{proof}
The $k$-means objective for a bagging $B$ over $\ty$ is 
\begin{align*}\label{eq:k_means}
     \sum_{l=1}^m \sum_{i\in B_l} (\ty_i - \mu_l)^2\,,
\end{align*}
where $\mu_{l} = \frac{1}{|B_l|} \sum_{i\in B_l} \ty_i$ is the mean of the entries of $\tby$ in bag $l$. We expand on the objective below. 
\begin{align*}
    \sum_{l=1}^m \sum_{i\in B_l} \left(\ty_i - \mu_l\right)^2 &= \sum_{l=1}^m \sum_{i\in B_l} (\ty_i^2 + \mu_l^2 - 2\ty_i\mu_l)\\
    &= \sum_{l=1}^m \left(\sum_{i\in B_l} \ty_i^2 + \sum_{i\in B_l} \mu_l^2 - 2\sum_{i\in B_l}\ty_i\mu_l\right)\\
    &= \sum_{l=1}^m \left(\sum_{i\in B_l} \ty_i^2 + |B_l| \mu_l^2 - 2|B_l| \mu_l^2\right)\\
    &= \sum_{i=1}^n \ty_i^2 - \sum_{l=1}^m \left( |B_l| \mu_l^2 \right)\\
    &= ||\ty||_2^2 - \sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|}
\end{align*}
$||\ty||_2^2$ is constant, hence minimizing $\sum_{l=1}^m \sum_{i\in B_l} \left(\ty_i - \mu_l\right)^2$ is equivalent to maximizing $\sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} \ty_i\right)^2}{|B_\ell|}$.
\end{proof}
\begin{lemma}[Noisy Clustering]\label{noisyclustering}
Let $y_i = \ty_i + \gamma_i$, where $\gamma_i\sim\normal(0,\sigma^2)$ independently. Then, given a clustering $B$ over $y$,
\begin{align*}
    \expectation[k\text{-means}(B(y))] = \expectation[k\text{-means}(B(\ty))] + (n-m) \sigma^2
\end{align*}
where $k\text{-means}(B(y))$ denotes the $k$-means clustering objective of the bagging $B$ evaluated on $y$. For equal-sized bags of size $k$,
\begin{align*}
    \expectation[k\text{-means}(B(y))] = \expectation[k\text{-means}(B(\ty))] + n\left(1-\frac{1}{k}\right)\sigma^2.
\end{align*}
\end{lemma}
\begin{proof}
    \begin{align*}
    \expectation[k\text{-means}(B(y))] -  \expectation[k\text{-means}(B(\ty))] &=
    \expectation \left[ \sum_{l=1}^m \sum_{i\in B_l} (y_i - \mu_l)^2 \right] - \expectation\left[\sum_{l=1}^m \sum_{i\in B_l} (\ty_i - \tmu_l)^2 \right] \\ &=
       \expectation \left[ \sum_{l=1}^m \sum_{i\in B_l} (y_i - \mu_l)^2 - \sum_{l=1}^m \sum_{i\in B_l} (\ty_i - \tmu_l)^2 \right] \\ &= \expectation\left[\sum_{l=1}^m \sum_{i\in B_l} \left((y_i - \mu_l)^2 -  (\ty_i - \tmu_l)^2\right)\right] \\
        &= \expectation \left[ \sum_{l=1}^m \sum_{i\in B_l} \left( (y_i - \ty_i + \tmu_l - \mu_l)  (y_i - \mu_l + \ty_i - \tmu_l ) \right)\right] \\
        &= \expectation \left[\sum_{l=1}^m \sum_{i\in B_l} \left( \left(\gamma_i -  \frac{\sum_{j\in B_l} \gamma_j}{|B_l|}\right)  \left(2\ty_i - 2\tmu_l + \gamma_i -  \frac{\sum_{j\in B_l} \gamma_j}{|B_l|}\right) \right)\right]\\
       &= \sum_{l=1}^m \sum_{i\in B_l} \left( \expectation\left[ \gamma_i^2\right] +  \frac{\sum_{j\in B_l} \expectation\left[ \gamma_j^2\right]}{|B_l|^2} - 2\frac{ \expectation\left[ \gamma_i^2\right]}{|B_l|} \right)\\
       &= \sum_{l=1}^m \sum_{i\in B_l} \expectation\left[ \gamma_i^2\right] \left( 1 - \frac{1}{|B_l|} \right) \\
       &= \sigma^2 \sum_{l=1}^m \left(|B_l| - 1 \right) \\
       &= \sigma^2 \left(n - m \right)
    \end{align*}
\end{proof}

\begin{lemma}\label{l3}
Sort $\ty_i$ in non-increasing order as $\ty_{(1)}, \ldots, \ty_{(n)}$. There exists an optimal $k$-means clustering $B^*$ such that $\ty_{(i)}, \ty_{(j)} \in B^*_l \implies \ty_{(k)} \in B^*_l, \forall k \in \{i, i+1, \ldots, j\}$.
\end{lemma}
\begin{proof}
    Follows from Lemma 2.3 in \cite{javanmard2024priorboostadaptivealgorithmlearning}.
\end{proof}

\subsection{Additional results from Section \ref{instance_k-means}}\label{appendix:k_means}


\begin{lemma}[$k$-means Decomposition]\label{mir-k-means}
 Consider an orthonormal basis $z_1, \ldots, z_d$. Fix a clustering $S$. Then
\begin{equation*}
    \text{k-means}(S(X)) = \sum_{j=1}^d \text{k-means}(S(X_{z_j})),
\end{equation*}
where $\text{k-means}(S(X))$ is the $k$-means clustering objective of $S$ on $X$, and $X_z$ is the projection of $X$ along $z$.
\end{lemma}
\begin{proof} Let $X = \{X_1, \ldots, X_n\}$.
  \begin{align*}
  \text{k-means}(S(X)) &=
      \sum_{l=1}^m \sum_{X_i\in S_l} ||X_i - \mu_l||_2^2 \\ &= \sum_{l=1}^m \sum_{X_i\in S_l} \left( ||X_i||_2^2 + ||\mu_l||_2^2 - 2X_i^T \mu_l \right)\\
      &= \sum_{l=1}^m \sum_{X_i\in S_l} \sum_{j=1}^d \left( {X_{z_j}}_i^2 + \mu_{l_{z_j}}^2 - 2{X_{z_j}}_i \mu_{l_{z_j}} \right) \\
      &= \sum_{j=1}^d \sum_{l=1}^m \sum_{X_i\in S_l}  \left({X_{z_j}}_i - \mu_{l_{z_j}} \right)^2 \\
      &= \sum_{j=1}^d \text{k-means}(S(X_{z_j}))
  \end{align*}
\end{proof}


\begin{lemma}[$k$-means-Variance Equivalence]\label{llp-k-means}
Consider a direction $z$, and a centred dataset $X$. Given a bagging $S$ over $X$ with $m$ bags of equal size $k$,
\begin{align*}
    \text{Var}_z(SX) = \frac{1}{k} \left(\text{Var}(X_z) - \text{k-means}(S(X_z)) \right).
\end{align*}
\end{lemma}
\begin{proof}
    Say the points are $X_1, \ldots, X_n$, and the projections along $z$ are $x_1, \ldots, x_n$. Let $\mu = 0$ be the mean of $X$, and $\mu_l$ be the mean of $B_l$. The variance of $SX$ along $z$ is
\begin{align*}
    \text{Var}_z(SX) &= \sum_{l=1}^m ({\mu_l}_z - \mu_z)^2 \\
    &= \sum_{\ell=1}^m \left(\frac{\sum_{i\in B_\ell} x_i}{k}\right)^2\\
    &= \frac{1}{k} \sum_{\ell=1}^m \frac{\left(\sum_{i\in B_\ell} x_i\right)^2}{k}\\
    &= \frac{1}{k} \left( \sum_{i=1}^n x_i^2 - \sum_{\ell=1}^m \sum_{i\in B_\ell} (x_i - {\mu_l}_z)^2 \right)\\
    &= \frac{1}{k} \left(\text{Var}(X_z) - \text{k-means}(S(X_z)) \right) 
\end{align*}


\end{proof}


\subsection{Random Bagging, Aggregate-MIR}\label{appendix:random}





\begin{proposition*}[Full version of Proposition \ref{thm:kmeans-mir-random}]
For super-bags $B_l'$ as defined in Algorithm \ref{algo:random_bag_agg_mir} with arbitrary non-overlapping partitions $B_l^{(1)}$ and $B_l^{(2)}$, we have 
\begin{align*}
\sum_{l=1}^{r}\text{k-means-cluster}\left(\{\Tilde{y}_i\}_{i \in B_l'}\right) \geq \sum_{l=1}^{r} \left( \text{k-means-cluster}\left(\{\Tilde{y}_i\}_{i \in B_l^{(1)}}\right) + \text{k-means-cluster}\left(\{\Tilde{y}_i\}_{i \in B_l^{(2)}}\right) \right)
\end{align*}
where, $k$-means-cluster($C$) is the $k$-means clustering loss for cluster $C$. This expands to give the following:
\begin{equation*}
\sum_{l=1}^{r}\sum_{i \in B'_l} \left(\Tilde{y}_i - \mu_l'\right)^2 \geq \sum_{l=1}^{r}\left(\sum_{i \in B_l^{(1)}} \left(\Tilde{y}_i - \mu_l^{(1)}\right)^2 + \sum_{i \in B_l^{(2)}} \left(\Tilde{y}_i - \mu_l^{(2)}\right)^2 \right)
\end{equation*}
where, $\mu$ denotes the respective cluster means.
\end{proposition*}

\begin{proof}
    
We write the $k$-means loss for $B'_l$. Let $\mu_l' = \frac{1}{2k}\sum_{i \in B'_l}\Tilde{y}_i$.
\begin{align*}
    \sum_{i \in B'_l} \left(\Tilde{y}_i - \mu_l'\right)^2 &= \sum_{i \in B'_l} \left( \Tilde{y}_i^2 - 2\Tilde{y}_i\mu_l' + \mu_l'^2 \right) \\
    &= \left(\sum_{i \in B'_l} \Tilde{y}_i^2\right) - \frac{\left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2}{k} + \frac{\left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2}{2k}\\
    &= \left(\sum_{i \in B'_l} \Tilde{y}_i^2\right) + \left(\frac{1}{2k} - \frac{1}{k}\right)\left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2 \\
    &= \left(\sum_{i \in B'_l} \Tilde{y}_i^2\right) - \frac{1}{2k}\left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2
\end{align*}

Next, we write the $k$-means loss for $B_l^{(1)}$. Let $\mu_l^{(1)} = \frac{1}{k}\sum_{i \in B_l^{(1)}}\Tilde{y}_i$.
\begin{align*}
    \sum_{i \in B_l^{(1)}} \left(\Tilde{y}_i - \mu_l^{(1)}\right)^2
    &= \sum_{i \in B_l^{(1)}} \left( \Tilde{y}_i^2 - 2\Tilde{y}_i\mu_l^{(1)} + {\mu_l^{(1)}}^2 \right) \\
    &= \left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i^2\right) - \frac{2\left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i\right)^2}{k} + \frac{\left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i\right)^2}{k}\\
    &= \left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i^2\right) - \frac{1}{k} \left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i\right)^2 
\end{align*}

Similarly, for $B_l^{(2)}$, we get
\begin{align*}
    \sum_{i \in B_l^{(2)}} \left(\Tilde{y}_i - \mu_l^{(2)}\right)^2 =
    \left(\sum_{i \in B_l^{(2)}} \Tilde{y}_i^2\right) - \frac{1}{k} \left(\sum_{i \in B_l^{(2)}} \Tilde{y}_i\right)^2 
\end{align*}

We define $\Delta_l = \sum_{i \in B'_l} \left(\Tilde{y}_i - \mu_l'\right)^2 - \sum_{i \in B_l^{(1)}} \left(\Tilde{y}_i - \mu_l^{(1)}\right)^2 - \sum_{i \in B_l^{(2)}} \left(\Tilde{y}_i - \mu_l^{(2)}\right)^2$.
\begin{align*}
    \Delta_l &= \frac{-1}{2k}  \left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2 + \frac{1}{k}  \left[\left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i\right)^2 + \left(\sum_{j \in B_l^{(2)}} \Tilde{y}_j\right)^2 + 2\sum_{i \in B_l^{(1)}}\sum_{j \in B_l^{(2)}} \Tilde{y}_i \Tilde{y}_j - 2\sum_{i \in B_l^{(1)}}\sum_{j \in B_l^{(2)}} \Tilde{y}_i \Tilde{y}_j \right] \\
    &= \frac{-1}{2k}  \left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2 + \frac{1}{k}  \left[\left(\sum_{i \in B_l'} \Tilde{y}_i\right)^2 -2\sum_{i \in B_l^{(1)}}\sum_{j \in B_l^{(2)}} \Tilde{y}_i \Tilde{y}_j \right] \\
    &= \frac{1}{2k}  \left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2 + \frac{-2}{k}  \left(\sum_{i \in B_l^{(1)}}\sum_{j \in B_l^{(2)}} \Tilde{y}_i \Tilde{y}_j\right)\\
    &= \frac{1}{2k} \left[ \left(\sum_{i \in B'_l} \Tilde{y}_i\right)^2 -4 \left(\sum_{i \in B_l^{(1)}}\sum_{j \in B_l^{(2)}} \Tilde{y}_i \Tilde{y}_j\right)\right]\\
    &= \frac{1}{2k} \left[\left(\sum_{i \in B_l^{(1)}} \Tilde{y}_i\right) - \left(\sum_{j \in B_l^{(2)}} \Tilde{y}_j\right)\right]^2 \\
    &\geq 0
\end{align*}

For any super-bag $B_l'$ with $l \in [r]$, we have $\Delta_l \geq 0$. We can now sum over all super-bags to get the total difference in loss, $\Delta = \sum_{l=1}^{r}\Delta_l \geq 0$. This implies that the loss incurred by applying the $k$-means objective is at least as high when the instances are clustered into super-bags of size $2k$ as it is under our random bagging approach, which creates two non-overlapping bags of size $k$ from each super-bag.

\end{proof}

\section{Differential Privacy (Continued)}\label{appendix:privacy}













 
 

In this section, we quantify the additional loss in utility incurred due to label-DP guarantees, for each setting we consider. We give full versions of the theorems stated in Section \ref{privacy}, along with the proofs.

\subsection{Instance-MIR}




    \begin{theorem*}[full version of Theorem \ref{thm:instance-mir-loss-priv}]
There exists a bagging $B~\text{with }  |B_l| = k, \forall l \in [m]$, satisfying $(\epsilon,\delta)$ label-DP, such that for $\hth$ in \eqref{eq:instanceloss}, we have
\begin{align*}
    &\E\left[ ||\hth-\tth||_2^2 \right]
    \leq ||(\bX^T\bX)^{-1}\bX^T||_{op}^2\left(2\left( \kmeansopt + n\left(1-\frac{1}{k}\right)\alpha^2 \right)  +  d \left( \sigma^2 + \frac{\alpha^2}{k^2} \right) \right), \nonumber
\end{align*}
where $\alpha^2 = \frac{16R^2\log\left(\frac{1.25}{\delta/2}\right) }{\epsilon^2}$, and $OPT$ is the objective value of the optimal $k$-means clustering over $\ty$.
\end{theorem*}
\begin{proof}
The error due to privacy can be decomposed into two parts. 

We need to add noise to the bag-labels before releasing them. MIR outputs one label at random, hence the sensitivity of the output is $2R$. Due to privacy amplification via subsampling \citep{balle2018privacyamplificationsubsamplingtight, steinke2022compositiondifferentialprivacy}, and the fact that $\epsilon \ll n$ in our setting, we add $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ noise to the bag-label value to ensure $\left(\frac{\epsilon}{2}, \frac{\delta}{2}\right)$ label-DP, where $\alpha^2 = \frac{16R^2\log\left(\frac{1.25}{\delta/2}\right) }{\epsilon^2}$. Note that we assume addition of $\normal\left(0,\sigma^2 \right)$ noise to each $\ty_i$. Adding $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ to each bag-label is equivalent to adding $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ to each label $y_i$, hence leading to a total noise of $\normal\left(0,\sigma^2 + \frac{\alpha^2}{k^2} \right)$ to each $\ty_i$, leading to an additional error of $d\frac{\alpha^2}{k^2}$ over the initial $d\sigma^2$.

In addition, since the objective here is a label-dependent clustering, we must use a differentially private $k$-means algorithm, leading to additional loss in utility. Adding $\normal\left(0,\alpha^2 \right)$ noise to each label, and then finding an optimal clustering over the noisy labels, satisfies $\left(\frac{\epsilon}{2}, \frac{\delta}{2}\right)$ label-DP by postprocessing. If $OPT$ is the objective value of the optimal $k$-means clustering over $\ty$, this private clustering method will lead to an additional error of $n\left(1-\frac{1}{k}\right) \alpha^2$, due to Lemma \ref{noisyclustering}.

Now, we have two queries, each of which is $\left(\frac{\epsilon}{2}, \frac{\delta}{2}\right)$ label-DP, ensuring $(\epsilon, \delta)$ label-DP in total due to composition.
\end{proof}


\paragraph{Private clustering} Note that it is possible to further reduce the error $n\left(1-\frac{1}{k}\right) \alpha^2$ due to private clustering. Note that the above method for private clustering satisfies the more stringent notion of local-DP \citep{bebensee2019localdifferentialprivacytutorial}, while we only need to satisfy the standard notion of central-DP. Hence, while it is easy to analyse, we can potentially find a much more accurate private clustering mechanism, suitably modifying existing algorithms in the rich literature on differentially-private $k$-means clustering \citep{su2015differentiallyprivatekmeansclustering, Lu_2020}, for the special case of a single dimension. 


\subsection{Bag-LLP}







\begin{theorem*}[full version of Theorem \ref{thm:bag-llp-loss-priv}]
There exists a bagging $B~\text{with }  |B_l| = k, \forall l \in [m]$, satisfying $(\epsilon,\delta)$ label-DP, such that for $\hth$ in \eqref{eq:bagloss}, we have
\begin{align}
    &\E\left[ \|\hth-\tth\|_2^2 \right]
    \leq OPT \left(  \sigma^2 + \frac{\alpha^2}{k} \right) \frac{m}{k},\nonumber
\end{align}
where $\alpha^2 = \frac{4R^2\log\left(\frac{1.25}{\delta}\right) }{\epsilon^2}$, and $OPT$ is the optimal value of $\left(\frac{\lambda_{max}(f(X))}{\lambda_{min}(f(X))}\right)^2$.
\end{theorem*}
\begin{proof}
     In this case, the optimal bagging strategy is independent of the labels. Hence, we just need to add noise to the bag-labels before releasing them, and not add noise for a private clustering of the labels. Each bag-label here is the mean of $k$ labels, hence the sensitivity of the output is $\frac{2R}{k}$. We add $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ noise to the label value to ensure $(\epsilon,\delta)$ label-DP, where $\alpha^2 = \frac{4R^2\log\left(\frac{1.25}{\delta}\right) }{\epsilon^2}$. This is equivalent to adding $\normal\left(0,\frac{\alpha^2}{k} \right)$ noise to each of the $k$ labels, and then averaging them. Note that we assume addition of $\normal\left(0,\sigma^2 \right)$ noise to each $\ty_i$. Adding $\normal\left(0,\frac{\alpha^2}{k} \right)$ to each label $y_i$ leads to a total noise of $\normal\left(0,\sigma^2 + \frac{\alpha^2}{k} \right)$ to each $\ty_i$, leading to an additional error of $\frac{\alpha^2}{k} \frac{m}{k}$ over the initial $\sigma^2 \frac{m}{k}$.
    
    
\end{proof}




\subsection{Aggregate-MIR}

Theorem \ref{thm:MIR-agg-UB1} shows that, for $\hth$ in \eqref{eq:aggloss}, given a bagging $B$, with equal sized bags, we have
\begin{align}
    &\E\left[ \|\hth-\tth\|_2^2 \right]
    \leq \frac{1}{k} \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2 \nonumber \left( \sum_{l=1}^m \sum_{\ty_i\in B_l} (\ty_i - \mu_l)^2  + \sigma^2nk \right),
\end{align}

If we want a private bagging $B$, the error due to privacy can be decomposed into two parts. We need to add noise to the bag-labels before releasing them. As in the case of Instance-MIR, we add $\normal\left(0,\frac{\alpha^2}{k^2} \right)$ noise to the bag-label values to ensure $(\epsilon,\delta)$ label-DP, where $\alpha^2 = \frac{4R^2\log\left(\frac{1.25}{\delta}\right) }{\epsilon^2}$, leading to an additional error of $nk\frac{\alpha^2}{k^2}$ over the initial $nk\sigma^2$. 


Now, there are two terms that contribute to the clustering error, Term 1 $\left( \|((S\bX)^T S\bX)^{-1} (S\bX)^T\|_{op}^2\right)$, and Term 2 $\left(\sum_{l=1}^m \sum_{i\in B_l} (\ty_i - \mu_l)^2\right)$. Term 1 is involved in Bag-LLP, and minimizes the condition number of the bag-centroids. Term 2 is also involved in Instance-MIR, and minimizes a label-dependent $k$-means clustering objective. If we minimize Term 1, the optimal bagging strategy is independent of the labels. Hence, we just need to add noise to the bag-labels before releasing them, and not add noise for a private clustering of the labels. However, in this case, the value of Term 2 could be suboptimal. 

If we minimize Term 2, we must use a differentially private $k$-means algorithm, leading to additional loss in utility. Adding $\normal\left(0,\alpha^2 \right)$ noise to each label, and then finding an optimal clustering over the noisy labels, satisfies $(\epsilon,\delta)$ label-DP. As in the case of Instance-MIR, this private clustering method will lead to an additional error of $n\left(1-\frac{1}{k}\right) \alpha^2$. Note that since we now have two private queries, we would have to split the privacy budget amongst them. However, minimizing Term 2 might lead to a suboptimal value of Term 1.
















\section{Experiments (Continued)}\label{appendix:experiments}
We implement 4 bagging mechanisms on each of Instance-MIR, Aggregate-MIR, and Bag-LLP, namely (1) Instance $k$-means, (2) Label $k$-means, (3) Random bagging, and (4) Scaled Instance $k$-means. We also implement Label $k$-means super-bags (Algorithm \ref{algo:random_bag_agg_mir}) for Aggregate-MIR, and Bag-LLP. In addition, we also vary the value of $\sigma$. In the tables, we present the mean and standard deviation of the error, calculated over $15$ runs for each experiment. As expected, in most cases for Bag-LLP (Table \ref{tab:app_bag_llp}) and Aggregate-MIR (Table \ref{tab:app_agg_mir}), scaled instance $k$-means performs better than instance $k$-means, which in turn performs better than random bagging, which in turn performs better than label $k$-means. However, for Instance-MIR (Table \ref{tab:app_instance_mir}), all the mechanisms show similar performance, with label $k$-means showing better performance in many cases.
\input{supplement}
\subsection{Statistical Significance}
\label{app:paired-t-value}
Table \ref{tab:t-value-A} has statistical significance scores for the results in Table \ref{tab:non_private_llp_bag_inst_mir}. There is one row for each bag-size $\{10, 50\}$ and three settings of Bag-LLP, Instance-MIR, and Aggregate-MIR. The columns \textbf{I$k$M}, \textbf{L$k$M}, and \textbf{Rand} contain the mean errors of the bagging methods: Instance $k$-means, Label $k$-means, and Random. All methods are evaluated on 15 independent trial datasets for each row.  
In column \textbf{(L$k$M vs.\ I$k$M)-T} we present the paired $T$-value for Label $k$-means and Instance $k$-means. In column \textbf{S1 (90\%)} we check whether the magnitude of this $T$-value is greater than the critical-$T = 1.760$, indicating whether there is a significant difference in the means with $90\%$ confidence.  
In column \textbf{(Rand vs best)-T} we present the paired $T$-value for Random Bagging vs.\ the better (i.e., lower error) of Label $k$-means and Instance $k$-means. Column \textbf{S2 (90\%)} indicates whether there is a significant difference in the means of Random vs the better of Label $k$-means and Instance $k$-means, with $90\%$ confidence. Table \ref{tab:t-value-B} similarly has the confidence scores for the results in Table \ref{tab:private_llp_bag_inst_mir} (private bagging) of the paper.

\textbf{Takeaways}: We see from Table \ref{tab:t-value-A} that Instance-$k$-means has statistically significant better performance over Label $k$-means as well as Random for Aggregate-MIR and for Bag-LLP (bag-size 10). For Instance-MIR on the other hand there is no statistically significant difference between the methods.  
In Table \ref{tab:t-value-B} we see that Label $k$-means is statistically significantly better than Instance $k$-means for all settings. However, Label $k$-means is statistically significantly better than Random in only 2 settings, both with bag-size 10.  
Overall we see that in many settings our results provide statistically significant separation between the techniques.


\begin{table}[!ht] 
\begin{footnotesize}
    \centering
    \begin{tabular}{lllllllll}
    \hline
        Setup & $k$ & I$k$M & L$k$M & Rand & (L$k$M vs I$k$M)-T & S1 (90\%) & (Rand vs best)-T & S2 (90\%) \\ \hline
        Bag & 10 & $0.0082 \pm 0.002$ & $0.0458 \pm 0.012$ & $0.0099 \pm 0.002$ & 12.301 & Yes & 2.004 & Yes \\ 
        LLP & 50 & $0.0392 \pm 0.008$ & $0.0629 \pm 0.008$ & $0.0423 \pm 0.009$ & 7.062 & Yes & 1.261 & No \\ \hline
        Instance & 10 & $0.0088 \pm 0.002$ & $0.0072 \pm 0.002$ & $0.0085 \pm 0.002$ & -1.688 & No & 1.332 & No \\ 
        MIR & 50 & $0.0388 \pm 0.006$ & $0.0404 \pm 0.007$ & $0.0419 \pm 0.006$ & 0.643 & No & 1.172 & No \\ \hline
        Aggregate & 10 & $0.0102 \pm 0.002$ & $0.0453 \pm 0.008$ & $0.0221 \pm 0.004$ & 15.85 & Yes & 8.284 & Yes \\ 
        MIR & 50 & $0.0437 \pm 0.008$ & $0.0601 \pm 0.008$ & $0.0619 \pm 0.012$ & 5.339 & Yes & 4.505 & Yes \\ \hline
    \end{tabular}   
    \caption{Statistical Significance for Non-Private Bagging} \label{tab:t-value-A}
\end{footnotesize}
\end{table}

\begin{table}[!ht]
\begin{footnotesize}
    \centering
    \begin{tabular}{lllllllll}
    \hline
        $\epsilon$ & $k$ & I$k$M & L$k$M & Rand & (L$k$M vs I$k$M)-T & S1 (90\%) & (Rand vs best)-T & S2 (90\%) \\ \hline
        0.5 & 10 & $0.0619 \pm 0.012$ & $0.0505 \pm 0.005$ & $0.0553 \pm 0.008$ & -4.105 & Yes & 1.761 & Yes \\ 
        ~ & 50 & $0.0656 \pm 0.012$ & $0.0559 \pm 0.008$ & $0.0564 \pm 0.007$ & -2.297 & Yes & 0.208 & No \\ \hline
        1 & 10 & $0.0537 \pm 0.009$ & $0.0362 \pm 0.006$ & $0.0397 \pm 0.010$ & -5.513 & Yes & 1.189 & No \\ 
        ~ & 50 & $0.0595 \pm 0.012$ & $0.0480 \pm 0.005$ & $0.0447 \pm 0.005$ & -3.029 & Yes & -1.689 & No \\ \hline
        2 & 10 & $0.0390 \pm 0.008$ & $0.0189 \pm 0.004$ & $0.0216 \pm 0.005$ & -9.182 & Yes & 2.148 & Yes \\
        ~ & 50 & $0.0521 \pm 0.009$ & $0.0431 \pm 0.006$ & $0.0434 \pm 0.008$ & -3.513 & Yes & 0.1569 & No \\ \hline
    \end{tabular}
    \caption{Statistical Significance for Private Bagging} \label{tab:t-value-B}
\end{footnotesize}
\end{table}


\section{Generalized Linear Models}\label{GLM}

We now present the setup and terminology we use in this section, borrowed from \cite{javanmard2024priorboostadaptivealgorithmlearning}.
The instance-level labels $y_i$ are conditionally independent given $\bx_i$ in GLMs, and are drawn from a specific distribution within the exponential family. The corresponding log-likelihood function can be expressed as:
\begin{align*}\label{eq:GLM}
    \log p(y_i \mid \eta_i,\phi) = \frac{y_i\eta_i -b(\eta_i)}{a_i(\phi)} + c(y_i,\phi)\,,
\end{align*}
where $\eta_i$ is a location variable and $\phi$ is the scaling variable.
The functions $a_i$, $b$, and $c$ are provided.
We can take $a_i(\phi) = \phi/w_i$, where $w_i$ is a constant prior weight.
We analyse canonical GLMs, in which $\eta_i = \bx_i^T \tth$ for an unknown model~$\tth$. Some properties of GLMs are $\mu = \E[y|x] = b'(x^T\tth)$, and $\Var(y|x) = a(\phi)b''(x^T\tth)$. We consider $\mathcal{L}$ to be the negative log-likelihood, and can ignore the term $c(y_i,\phi)$ as it does not depend on $\theta$. Our objective is to find a bagging strategy which closes the gap between the true model $\tth$ and $\hth$. For GLMs, we achieve this by minimizing the norm of the gradient of the loss at $\tth$. We now state the following lemma, which will be used later on. 


\begin{lemma}[\cite{javanmard2024priorboostadaptivealgorithmlearning}]\label{lem:min_grad_strongly_convex}
Suppose that the loss $\mathcal{L}$ is strongly convex with parameter $\mu$ and $\hth = \arg\min_{\theta}\mathcal{L}(\theta)$. Then, for any model $\tth$, we have
\[
\|\hth - \tth\|_2 \le \frac{1}{\mu}\|\nabla\mathcal{L}(\tth)\|_2.
\]
In addition, if $\mathcal{L}$ has a Lipschitz continuous gradient with parameter $L$, we have
\[
 \frac{1}{L}\|\nabla\mathcal{L}(\tth)\|_2 \le \|\hth - \tth\|_2.
\]
\end{lemma}

\subsection{Instance-MIR}


Let $\hth$ be the minimizer of the instance-level loss, i.e.,
  \begin{align*}\label{eq:GLM_estimator}
      \hth &= \argmin_{\theta} \frac{1}{n} \sum_{l=1}^m \sum_{i \in B_l}\frac{\overline{y_l}\eta_i -b(\eta_i)}{a_i(\phi)}.
  \end{align*}
 We find the optimal $\hth$ by solving $\nabla\mathcal{L} (\hth) = \mathbf{0}$, and use Lemma~\ref{lem:min_grad_strongly_convex}, which states that $\|\hth - \tth\|_2$ is upper bounded by $\frac{1}{\mu}\|\nabla \mathcal{L}(\tth)\|_2$ for strongly convex functions. We now state the main result of this section below. We define the bagging matrices $A,S$ as in Section~\ref{MIR-event}.
 



\begin{theorem}[GLM Upper Bound, Instance-MIR]
Given a bagging denoted by $S$, we have
\begin{align*}
    \E\left[\|\nabla \mathcal{L}(\tth)\|_2^2 \right]
    &\leq \|X^TD^{-1}\|_{op}^2 \left(m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1) + \|(S - I)b'(X\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right), 
\end{align*}
  where, $D = \Diag(\{a_i(\phi)\})$.
\label{thm:glm-mir-event-loss}
\end{theorem}
\begin{proof}
    We begin by computing $\nabla\mathcal{L} (\theta)$ and expressing it in the matrix format:
    \begin{align*}
        \nabla\mathcal{L} (\theta) &= \frac{1}{n} \sum_{l = 1}^m \sum_{i \in B_l} \frac{\left(\overline{y_l} - b'(x_i^T\theta)\right)x_i}{a_i(\phi)} \\
        &= X^TD^{-1}(Ay - b'(X\theta)).
    \end{align*}
    We now expand the expected value below.
    \begin{align*}
        \E\left[\|\nabla\mathcal{L} (\theta)\|_2^2 \right] &= \E\left[\|X^TD^{-1}(Ay - b'(X\theta))\|_2^2 \right] \\
        &\leq \|X^TD^{-1}\|_{op}^2 \E\left[\|Ay - b'(X\theta)\|_2^2 \right] \\
        &= \|X^TD^{-1}\|_{op}^2 \E\left[(Ay - b'(X\theta))^T(Ay - b'(X\theta)) \right] \\
        &= \|X^TD^{-1}\|_{op}^2 \E\left[(Ay)^T(Ay) - b'(X\theta)^TAy - (Ay)^Tb'(X\theta) + b'(X\theta)^T b'(X\theta)\right] \\
        &= \|X^TD^{-1}\|_{op}^2 \left(\E\left[(Ay)^T(Ay)\right] - b'(X\theta)^TSy - (Sy)^Tb'(X\theta) + b'(X\theta)^T b'(X\theta) \right) \\
        &= \|X^TD^{-1}\|_{op}^2 \big(\E\left[(Ay)^T(Ay)\right] - b'(X\theta)^T Sb'(X\theta) - (Sb'(X\theta))^T b'(X\theta) + b'(X\theta)^T b'(X\theta)\\
        &+ (Sb'(X\theta))^T (Sb'(X\theta)) - (Sb'(X\theta))^T (Sb'(X\theta)) \big) \\
        &= \|X^TD^{-1}\|_{op}^2 \left(\E\left[\|Ay\|_2^2\right] + \|(S - I)b'(X\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right) \\
        &\leq \|X^TD^{-1}\|_{op}^2 \left(\E\left[\|A\|_{op}^2\|y\|_2^2\right] + \|(S - I)b'(X\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right) \\
        &\leq \|X^TD^{-1}\|_{op}^2 \left(m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1) + \|(S - I)b'(X\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right)
    \end{align*}
\end{proof}
Note that, since the term $\|X^TD^{-1}\|_{op}^2$ is constant and the first term $m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1)$ is independent of the bagging strategy, it can be disregarded. Thus, we focus on the remaining terms to derive a clustering objective. We expand the matrix notation and express these terms as a summation over instances. We define $\mu_l := \frac{1}{|B_l|}\sum_{i \in B_l}\mu_i$, where $\mu_i = \E[y_i | x_i] = b'(x_i^T\tth)$. We get the following:
\begin{align*}
    \min_{(B_1,\dots,B_m) \in \mathcal{B}} \quad \|(S - I)b'(X\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 &= \min_{(B_1,\dots,B_m) \in \mathcal{B}} \quad \sum_{l=1}^m \sum_{i \in B_l} (\mu_i - \mu_l)^2 - \sum_{l=1}^m |B_l|\mu_l^2.
\end{align*}
Minimizing the first term in the objective is similar to performing $1d$ $k$-means clustering.


\subsection{Aggregate-MIR}
Let $\hth$ be the minimizer of the aggregate-level loss, i.e.,
  \begin{align*}
      \hth &= \argmin_{\theta} \frac{1}{m} \sum_{l=1}^m \frac{\overline{y_l}\sum_{i \in B_l}\frac{\eta_i}{|B_l|} -b\left(\sum_{i \in B_l}\frac{\eta_i}{|B_l|}\right)}{a_l(\phi)}.
  \end{align*}
 The steps involved in the analysis here are similar to the instance-level loss function. We find the optimal $\hth$ by solving $\nabla\mathcal{L} (\hth) = \mathbf{0}$ and then minimize $\|\nabla \mathcal{L}(\tth)\|_2$ to approximate $\|\hth - \tth\|_2$. We now state the main result of this section below. We define the bagging matrices $A,S$ as in Section \ref{MIR-agg}.

\begin{theorem}[GLM Upper Bound, Aggregate-MIR]
Given a bagging denoted by $S$, we have
\begin{align}
    \E\left[\|\nabla \mathcal{L}(\tth)\|_2^2 \right]
    &\leq \|D^{-1}\|_{op}^2 \lambda_{max}(X^TX)\left(m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1) + \|Sb'(X\theta) - b'(SX\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right),
\end{align}
  where, $D = \text{Diag}(\{a_i(\phi)\})$.
\label{thm:glm-Aggregate-MIR}
\end{theorem}
\begin{proof}
    We begin by computing $\nabla\mathcal{L} (\theta)$ and expressing it in the matrix format:
    \begin{align*}
        \nabla\mathcal{L} (\theta) &= \frac{1}{m} \sum_{l = 1}^m  \frac{\left(\overline{y_l} - b'\left(\sum_{i \in B_l}\frac{x_i^T\theta}{|B_l|}\right)\right)\sum_{i \in B_l}\frac{x_i}{|B_l|}}{a_l(\phi)} \\
        &= (SX)^TD^{-1}(Ay - b'(SX\theta)).
    \end{align*}
   We now expand the expected value below.
    \begin{align*}
        \E\left[\|\nabla\mathcal{L} (\theta)\|_2^2 \right] &= \E\left[\|(SX)^TD^{-1}(Ay - b'(SX\theta))\|_2^2 \right] \\
        &\leq \|(SX)^TD^{-1}\|_{op}^2 \E\left[\|Ay - b'(SX\theta)\|_2^2 \right] \\
        &= \|(SX)^TD^{-1}\|_{op}^2 \E\left[(Ay - b'(SX\theta))^T(Ay - b'(SX\theta)) \right] \\
        &= \|(SX)^TD^{-1}\|_{op}^2 \E\left[(Ay)^T(Ay) - b'(SX\theta)^TAy - (Ay)^Tb'(SX\theta) + b'(SX\theta)^T b'(SX\theta)\right] \\
        &= \|(SX)^TD^{-1}\|_{op}^2 \left(\E\left[(Ay)^T(Ay)\right] - b'(SX\theta)^TSy - (Sy)^Tb'(SX\theta) + b'(SX\theta)^T b'(SX\theta) \right) \\
        &= \|(SX)^TD^{-1}\|_{op}^2 \big(\E\left[(Ay)^T(Ay)\right] - b'(SX\theta)^TSb'(X\theta) - (Sb'(X\theta))^Tb'(SX\theta) + b'(SX\theta)^T b'(SX\theta) + \\ 
        & \ \ \ \ (Sb'(X\theta))^T (Sb'(X\theta)) - (Sb'(X\theta))^T (Sb'(X\theta)) \big) \\
        &= \|(SX)^TD^{-1}\|_{op}^2 \left(\E\left[\|Ay\|_2^2| X\right] + \|Sb'(X\theta) - b'(SX\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right) \\
        &\leq \|(SX)^TD^{-1}\|_{op}^2 \left(\E\left[\|A\|_{op}^2\|y\|_2^2| X\right] + \|Sb'(X\theta) - b'(SX\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right) \\
        &\leq \|(SX)^TD^{-1}\|_{op}^2 \left(m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1) + \|Sb'(X\theta) - b'(SX\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right)\\
        &\leq \|D^{-1}\|_{op}^2 \lambda_{max}(X^TX)\left(m(\|b'(X\tth)\|_2^2 + \|Db''(X\tth)\|_1) + \|Sb'(X\theta) - b'(SX\theta)\|_2^2 - \|Sb'(X\theta)\|_2^2 \right)
    \end{align*}
\end{proof}
We now justify why the final objective in Theorem~\ref{thm:glm-Aggregate-MIR} leads to a clustering objective. The key term in this objective which depends on $S$ is $\|Sb'(X\theta) - b'(SX\theta)\|_2^2$. Our task is to determine the optimal bagging matrix $S$ that would minimize this term. To simplify this expression and develop an interpretable algorithm, we assume that the function $b'(\cdot)$ is monotonic\footnote{The monotonicity condition holds true for the majority of distributions belonging to the exponential family, including normal, Poisson, logistic, and inverse Gaussian.}.
Focusing on the case where $b'(\cdot)$ is an increasing function, we know that $b'(t_1) \geq b'(t_2)$ $\iff$ $t_1 \geq t_2$. Simplifying, we get that
\begin{align*}
    \Big\|Sb'(X\theta) - b'(SX\theta)\Big\|_2^2 &=
    \sum_{l=1}^m \left(\sum_{x \in B_l} \frac{b'(x^T\tth)}{|B_l|}  -  b'\left(\sum_{x \in B_l} \frac{x^T\tth}{|B_l|}\right)\right)^2
\end{align*}
Since $b'$ is an increasing function, the inequality $b'\left(\max_{x' \in B_l} x'^T\tth\right) \geq b'(x^T\tth)$ holds true for all $x \in B_l$ (and $\max_{x' \in B_l} x'^T\tth \geq x^T\tth$). Similarly, $b'\left(x^T\tth\right) \geq b'(\min_{x' \in B_l} x'^T\tth)$ holds true for all $x \in B_l$ (and $x^T\tth \geq \min_{x' \in B_l} x'^T\tth$). We now look at the first term:
\begin{align*}
    \sum_{x \in B_l} \frac{b'\left(\min_{x' \in B_l} x'^T\tth\right)}{|B_l|} &\leq \sum_{x \in B_l}  \frac{b'(x^T\tth)}{|B_l|} \leq  \sum_{x \in B_l} \frac{b'\left(\max_{x' \in B_l} x'^T\tth\right)}{|B_l|} \\
    b'\left(\min_{x' \in B_l} x'^T\tth\right) &\leq \sum_{x \in B_l}  \frac{b'\left(x^T\tth\right)}{|B_l|} \leq b'\left(\max_{x' \in B_l} x'^T\tth\right).
\end{align*}
We now bound the second term:
\begin{align*}
    b'\left( \sum_{x \in B_l} \frac{\min_{x' \in B_l} x'^T\tth}{|B_l|}\right) &\leq b'\left(\sum_{x \in B_l} \frac{x^T\tth}{|B_l|}\right) \leq  b'\left(\sum_{x \in B_l}\frac{ \max_{x' \in B_l} x'^T\tth}{|B_l|}\right) \\
    b'\left(\min_{x' \in B_l} x'^T\tth\right) &\leq b'\left(\sum_{x \in B_l} \frac{x^T\tth}{|B_l|}\right) \leq b'\left(\max_{x' \in B_l} x'^T\tth\right).
\end{align*}
It is easy to see that the difference $\|Sb'(X\theta) - b'(SX\theta)\|_2^2$ has an upper bound:
\begin{align}
    \sum_{l=1}^m \left(\sum_{x \in B_l} \frac{b'(x^T\tth)}{|B_l|}  -  b'\left(\sum_{x \in B_l} \frac{x^T\tth}{|B_l|}\right)\right)^2 &\leq
    \sum_{l=1}^m \left(b'\left(\max_{x' \in B_l} x'^T\tth\right) -  b'\left(\min_{x' \in B_l} x'^T\tth\right)\right)^2.
    \label{eq:glm_agg_mir_clustering}
\end{align}

If $n = mk$ and we need to construct equal-sized bags having $k$ instances each, then the minimization of the bound in Equation~\eqref{eq:glm_agg_mir_clustering} can be achieved by sorting $b'(x^T\tth)$ for all $x \in X$, and dividing the points into contiguous chunks of size $k$. This process resembles the $1d$ clustering objective with an equal-size constraint.
\end{document}
