\documentclass{uai2022}
\usepackage[american]{babel}

\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage[utf8]{inputenc} 
\usepackage[T1]{fontenc}    
\usepackage{hyperref}      
\usepackage{url}            
\usepackage{booktabs}       
\usepackage{amsfonts}       
\usepackage{nicefrac}       
\usepackage{microtype}     
\usepackage{xcolor}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{amsthm}
\usepackage{enumerate}
\usepackage{float}

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{\emph{Proposition}}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}

\newcommand{\D}{{\cal D}}
\newcommand{\iid}{{\em i.i.d.}}

\title{Asymptotic Optimality for Active Learning Processes \\
Supplementary Materials}
\begin{document}

\onecolumn
\maketitle
\tableofcontents
\newpage

\section{PROOFS}

\subsection{Proof of Theorem 1}
\label{proof1_sec}

\begin{theorem} (Hoeffding Inequality with IWERM)
Let $\{(\mathbf{x}_i, y_i)\}_{i=1}^{n_t}$ be $n_t$ instances that are sampled from the instrumental distribution $Q(\mathbf{x},y)$. Denote r.v. $\mathbf{S} = R(\theta) - {R}^w_{t}({\theta})$ that takes over $\theta$, and let $b = \sup \mathbf{S}$, $a = \inf \mathbf{S}$, $E[\mathbf{S}] = \eta$. $\forall \epsilon > 0$, we have
\begin{small}
\begin{equation}
\label{hoeff}
 \mathbb{P}\Big(\big|R(\theta) - {R}^w_{t}({\theta}) \big| \geq \epsilon \Big)  \leq 2e^{\frac{-2n_t(\epsilon-\eta)^2}{(b-a)^2}}.
\end{equation}
\end{small}
\label{thm-1}
\end{theorem}


\begin{proof}
Firstly, let $\mathbf{X}$ be a real-valued random variable with $E[\mathbf{X}] = \eta$. $\forall \lambda > 0$, Markov's inequality gives:
\begin{equation}
\label{markov}
\begin{aligned}
\mathbb{P}\left( \mathbf{X} \geq \epsilon \right) = \mathbb{P}\left( e^{\lambda\mathbf{X}} \geq e^{\lambda\epsilon} \right) \leq e^{-\lambda\epsilon}E[e^{\lambda\mathbf{X}}].
\end{aligned}
\end{equation}
By Hoeffding's lemma, if $a \leq \mathbf{X} \leq b$ almost surely, then
\begin{equation}
\label{hoefflemma}
\begin{aligned}
E[e^{\lambda\mathbf{X}}] \leq \exp\big(\lambda\eta + \tfrac{\lambda^2(b-a)^2}{8}\big).
\end{aligned}
\end{equation}
Consider $\mathbf{S}$ as the average of per-sample terms $S_i$, one for each $(\mathbf{x}_i,y_i)$, with $a_i \leq S_i \leq b_i$. Using (\ref{markov}) and (\ref{hoefflemma}), we have
\begin{equation}
\label{hoeff1}
\begin{aligned}
\mathbb{P}( \mathbf{S} \geq \epsilon ) = \mathbb{P}\Big( \sum \limits_{i=1}^{n_t}S_i \geq n_t\epsilon \Big) \leq \Big(\prod \limits_{i=1}^{n_t}E_Q[e^{\lambda S _i}]\Big)e^{-n_t\lambda\epsilon} \leq \Big(\prod \limits_{i=1}^{n_t}e^{(\lambda\eta + \frac{\lambda^2(b_i-a_i)^2}{8})}\Big)e^{-n_t\lambda\epsilon}.
\end{aligned}
\end{equation}
Minimizing over $\lambda \geq 0$,
\begin{equation}
\label{hoeff2}
\begin{aligned}
\mathbb{P}( \mathbf{S} \geq \epsilon ) \leq \min \limits_{\lambda \geq 0}\exp\Big(\frac{n_t\lambda^2(b-a)^2}{8} - n_t\lambda\epsilon + n_t\lambda\eta\Big)
= \exp\big(-\tfrac{2n_t(\epsilon-\eta)^2}{(b-a)^2}\big).
\end{aligned}
\end{equation}
Finally,
\begin{equation}
\label{hoeff3}
\begin{aligned}
\mathbb{P}\left( |\mathbf{S}| \geq \epsilon \right) \leq  2\exp\big(-\tfrac{2n_t(\epsilon-\eta)^2}{(b-a)^2}\big).
\end{aligned}
\end{equation}
\end{proof}

\paragraph{RQ1:} What is the difference between our \textbf{Theorem 1} and \textbf{Theorem 1} of \citet{beygelzimer2009importance}?

\emph{Ans1:} Both theorems aim to provide a safety guarantee -- consistency -- but \textbf{Theorem 1} of \citet{beygelzimer2009importance} only applies to stream-based AL, while ours applies to any AL, including stream-based AL and pool-based AL with $B \geq 1$. Specifically, \citet{beygelzimer2009importance} make use of the martingale property and apply Azuma's inequality to get the bound, while our paper uses Hoeffding's inequality to get the bound. The difference between the two inequalities is: Hoeffding proved this result for independent variables rather than martingale differences, and also observed that slight modifications of his argument establish the result for martingale differences. From the perspective of AL, the theorem of \citet{beygelzimer2009importance} can only be applied to stream-based AL, where the data samples come in order, or pool-based AL with batch size $1$. They calculate the conditional expectation $E[Z_t|Z_{t-1},\dots,Z_0]$, where $Z_t = \sum (U_t,\dots,U_0)$. In contrast, our theorem can be applied to any kind of AL sampling scheme, both stream-based AL and pool-based AL with $B \geq 1$.

\subsection{Proof of Proposition 1}

\begin{proposition} (\emph{Asymptotic Variance of Estimators})
Let ${R}_{t}^w({\theta}) = \tfrac{1}{\sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)} \sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)$ and $R(\theta) = E_{(\mathbf{x},y) \sim P}[l(f(\mathbf{x};\theta), y)] = \iint l(f(\mathbf{x}; \theta), y)P(\mathbf{x},y)d\mathbf{x} dy$, by employing ``Delta Method'', we have
\begin{equation}
\label{asymvar}
\begin{aligned}
\sqrt{n_t}({R}_{t}^w({\theta})- R(\theta)) \stackrel{n_t \rightarrow \infty}{\longrightarrow} \mathcal{N}(0, \sigma_Q^2),
\end{aligned}
\end{equation}
with $\sigma^2_Q = \iint \beta(\mathbf{x},y) [l(f(\mathbf{x};\theta),y)-R(\theta)]^2 P(\mathbf{x},y)d\mathbf{x}dy$.
\label{pro-1}
\end{proposition}

\begin{proof}
Take $l_i = l(f(\mathbf{x}_i;{\theta}),y_i)$, $l = \{l_1,...,l_i,...,l_{n_t}\}$, $\beta_i = \beta(\mathbf{x}_i,y_i)$, $\beta = \{\beta_1,...,\beta_i,...,\beta_{n_t}\}$, $r_t = \sum\limits_{i=1}^{n_t} \beta_i l_i$, ${R}_t = {R}_t^w({\theta}) = \frac{1}{n_t} r_t$, $R = R(\theta)$ and $B_{n_t} = \sum\limits_{i=1}^{n_t}\beta_i$, $\mathbf{l}_{n_t} = \frac{1}{n_t}\sum\limits_{i=1}^{n_t}l_i$.

Since the data samples are drawn from $Q$ distribution, we have
$E_Q[{R}_t]=R$ , $E_Q[r_t]=n_tR$, and $E_Q[B_{n_t}]=n_t$.
The random variables $\beta_1,...,\beta_{n_t}$ and $\beta_1l_1,...,\beta_{n_t}l_{n_t}$ are \iid; hence, by the CLT, we have
\begin{equation}
\sqrt{n_t}(\frac{1}{n_t}r_t-R) \stackrel{n_t \rightarrow \infty}{\longrightarrow} \mathcal{N}(0,Var(\beta l))
\end{equation}
\begin{equation}
\sqrt{n_t}(\frac{1}{n_t}B_{n_t}-1) \stackrel{n_t \rightarrow \infty}{\longrightarrow} \mathcal{N}(0,Var(\beta))
\end{equation}

Assuming $g(u,v)=\frac{u}{v}$, let $u = \frac{1}{n_t}r_t$, $v = \frac{1}{n_t}B_{n_t}$.
We then use multivariate delta method to get

\begin{equation}
\sqrt{n_t}(g(u,v) - g(E[u],E[v])) = \sqrt{n_t}(\frac{1}{B_{n_t}} r_t - \frac{R}{1}) \stackrel{n_t \rightarrow \infty}{\longrightarrow} \mathcal{N}(0,\nabla g^T\Sigma\nabla g)
\end{equation}
where $\nabla g = \nabla g(R,1) = (1,-R)^T$ is the gradient of $g(u,v)=\frac{u}{v}$ evaluated at $(E[u],E[v]) = (R,1)$, and $\Sigma$ is the covariance matrix of $(\beta l, \beta)$:

$\Sigma = \left( \begin{array}{cc} Var(\beta l) & Cov(\beta l, \beta) \\ Cov(\beta, \beta l) & Var(\beta) \end{array} \right)$

Then we calculate
\begin{equation}
\begin{aligned}
\nabla g(R,1)^T\Sigma\nabla g(R,1) & = Var(\beta l) - 2R Cov(\beta l,\beta) + R^2 Var(\beta)
\\ & = E[(\beta l)^2] - E[\beta l]^2 - 2R(E[\beta^2l] - R \cdot 1) + R^2(E[\beta^2] - E[\beta]^2)
\\ & = E[(\beta l)^2] - R^2  - 2RE[\beta^2l] + 2R^2 + R^2E[\beta^2] - R^2
\\ & = E[(\beta l)^2] - 2RE[\beta^2l] + R^2E[\beta^2]
\\ & = \iint \beta(\mathbf{x},y)^2 [l(f(\mathbf{x};\theta),y) - R(\theta)]^2 Q(\mathbf{x},y) d\mathbf{x}dy
\\ & = \iint \beta(\mathbf{x},y) [l(f(\mathbf{x};\theta),y)-R(\theta)]^2 P(\mathbf{x},y)d\mathbf{x}dy.
\end{aligned}
\end{equation}
\end{proof}

Note that in the proof of \textbf{Proposition 1}, we utilize original form ${R}_{t}^w({\theta}) = \tfrac{1}{\sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)} \sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)$. In our full paper, to facilitate the calculation, we use ${R}_{t}^w({\theta}) = \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)$, while $\lim_{n_t \rightarrow \infty} \beta(\mathbf{x},y) = 1$ and $E_Q[\beta(\mathbf{x},y)] = 1$.

\subsection{Proof of Proposition 2}

\begin{proposition} (\emph{Optimal Sampling Distribution}) The optimal instrumental sampling distribution that minimizes $\sigma^2_Q$ is
\begin{equation}
\label{eqn:Qopt}
Q^{opt}_{t}(\mathbf{x},y) \propto \big|l(f(\mathbf{x};\theta),y) - R(\theta)\big|P(\mathbf{x},y).
\end{equation}
\label{pro-2}
\end{proposition}

\begin{proof}
We minimize the variance estimate $\sigma^2_Q$ in terms of $Q$ under the constraint $\iint Q(\mathbf{x},y)d\mathbf{x}dy = 1$ using Lagrange multiplier $\tau$.
\begin{align}
\mathcal{L}[Q,\tau] & = \sigma^2_Q + \tau(\iint Q(\mathbf{x},y)d\mathbf{x}dy - 1) \\
& = \iint \frac{\Lambda(\mathbf{x},y)}{Q(\mathbf{x},y)} + \tau(Q(\mathbf{x},y) - 1) d\mathbf{x}dy,
\end{align}
where $\Lambda(\mathbf{x},y) = P(\mathbf{x},y)^2[l(f(\mathbf{x};\theta),y) - R(\theta)]^2$.

We define $G(Q(\mathbf{x},y);\mathbf{x},y) = \frac{\Lambda(\mathbf{x},y)}{Q(\mathbf{x},y)} + \tau(Q(\mathbf{x},y) - 1)$. The optimal point for the constrained problem satisfies the Euler-Lagrange equation:
\begin{align}
\frac{\partial G}{\partial Q(\mathbf{x},y)} = -\frac{\Lambda(\mathbf{x},y)}{Q(\mathbf{x},y)^2} + \tau = 0.
\end{align}
A solution w.r.t.\ the normalization constraint is:
\begin{equation}
\label{sol}
\begin{aligned}
Q^* = \frac{\sqrt{\Lambda(\mathbf{x},y)}}{\iint\sqrt{\Lambda(\mathbf{x},y)}d\mathbf{x}dy}.
\end{aligned}
\end{equation}

Since $Q$ is a sampling distribution, we dismiss the negative solution. Substituting $\Lambda$ into (\ref{sol}), we have
\begin{align}
Q^*(\mathbf{x},y) \propto \big|l(f(\mathbf{x};\theta),y) - R(\theta)\big|P(\mathbf{x},y).
\end{align}
\end{proof}

These proofs follow those of \citet{sawade2010active}.

\section{ADDITIONAL RELATED WORK}

\label{relate_sec}
This section supplements the Related Work section of the full paper and mainly discusses the difference between \citet{farquhar2021statistical} and our work. Both \citet{farquhar2021statistical} and our work focus on the bias problems resulting from the AL processes. \citet{farquhar2021statistical} construct an unbiased estimator of the empirical risk $R_{\text{Labelled}}$ by $R_{\text{Pool}}$ with weighted loss. They optimize the intended objective, rather than minimizing the train--test gap. Different from \citet{farquhar2021statistical}, we construct an (asymptotically) unbiased estimator of the expectation of loss $E_{\mathcal{X}\times \mathcal{Y}\sim P}[\text{loss}]$ by IWERM ($\beta R_{\text{Labelled}}$). The assumptions of \citet{farquhar2021statistical} for constructing an unbiased estimator during AL processes are 1) data that are sampled uniformly from $\D_{\text{Pool}}$ are unbiased (with probability $\tfrac{1}{N}$ if $|\D_{\text{Pool}}| = N$) and 2) the selection probability must be non-zero on all of the training data. In our work, we construct the (asymptotically) unbiased estimator during the AL processes under the assumption that data in $\D_{\text{Pool}}$ are sampled \iid{} from the underlying distribution $P(\mathbf{x},y)$ and each data point ($\mathbf{x}_i,y_i$) is sampled with probability $P(\mathbf{x}_i,y_i)$. \citet{farquhar2021statistical} construct the acquisition proposal distribution from the perspective of the risk estimation itself. In our work, we construct the acquisition proposal distribution from the existing AL strategies. To sum up, \citet{farquhar2021statistical} aim to ``remove the bias'' during AL training processes, while our work aims to model the difference between the underlying distribution of the whole data space and the sampling distribution generated by AL strategies.



\section{SUPPLEMENT OF METHODOLOGY}

\subsection{Independence Assumption} 
In the main paper, we discussed the independence assumption in Section 1. The whole AL process is changing constantly with the labeled set and basic model updating in each stage, and thus it is not enough to just collect data ``actively'' and treat the model fitting part the same as passive learning. For passive learning, one key assumption is that the training set comprises \iid{} samples from the unknown true data distribution $P(\mathbf{x},y)$, $\D_n \stackrel{i.i.d.}{\sim} P$.



If we select data samples sequentially by some fixed heuristics in AL (e.g., uncertainty-based strategies), the labeled training set is {\bf not} drawn \iid~from $P$. In AL sampling processes, data sampled in different stages are not independent of each other, since the sampling strategy at stage $t$ depends on stage $t-1$. Would mixing them into $\D_l$ violate the independence needed to prove decent statistical bounds? The answer is \textbf{No}. From the aspect of the sample size tending to infinity, although AL processes are not independent, after observing enough data, for both stream-based and pool-based AL, the data (including both labeled and unlabeled) are sampled \iid{} from the underlying data distribution (which is consistent with Section 3 in \citep{beygelzimer2009importance}). Therefore, we can still obtain the statistical bounds with the independence assumption as the sample size tends to infinity. From the aspect of each stage in AL sampling processes, although the current data distribution of the labeled set is non-\iid, the estimator provided by IWERM is still unbiased; therefore, we can still obtain statistical bounds for our learned hypothesis consistent with the statistical bounds under the independence assumption.

\subsection{Discussions of ``non-informativeness''}
\label{non-info_sec}

We analyse the representation of ${\beta}_t$ when the sample size tends to infinity.
Suppose that $R_{t}^w(\theta)$ is an unbiased estimator of $R(\theta)$, which is based on a sufficiently strong classifier (e.g., using CNN as a basic classifier).
If an infinite number of  samples are observed, the basic classifier will have a very certain prediction given $\mathbf{x}_i$.
Thus, entropy-based uncertainty sampling will converge to ``non-informativeness'' as the sample size tends to infinity, since all predictions are certain.

We explain why some AL sampling strategies can converge to ``non-informativeness'' based on the assumptions in AL sampling processes (see Section 3.1 after Lemma 1) from 2 aspects, using entropy-based uncertainty sampling as an example. We review the assumptions here: In this paper, we assume that AL would not query non-existing or out-of-distribution (OOD) data samples and would not query wrong/noisy labels from oracles/experts, that is, $P(\mathbf{x}, y) > 0$ and $Q(\mathbf{x}, y) > 0$. Additionally, we can also obtain another vital piece of information from these assumptions: $P(y = y_{\text{true}}|\mathbf{x}_i) = 1$ for all labeled samples. Firstly, after querying enough samples, any $\mathbf{x}_i$ actually appears in the labeled training set, and thus we know its hard label and are very certain about it, i.e., $P(y = y_{\text{true}}|\mathbf{x}_i) = 1$, thus the confidence is $1$. Secondly, \citet{fernandezdecossio2015maximum} show that the practice of using sample averages as surrogates of probability expectations is reliable provided the sample size is large. Equation (1) in \citep{fernandezdecossio2015maximum} shows that the entropy of model parameters will converge to a certain value as the sample size increases. That is, after observing enough data, any given $\mathbf{x}_i$ will not change the basic model, and thus any $\mathbf{x}_i$ is meaningless for improving the basic model, which is consistent with our proposed ``non-informativeness'' assumption.


Besides uncertainty-based AL methods like entropy-based uncertainty sampling (\textbf{US}), some representative/diversity based AL strategies also converge to ``non-informativeness''. For instance, \citet{wu2006sampling} provided a diversity-based method, which encourages the selection of unlabeled samples that are far from the labeled set and removes the redundancy within the selected samples. The redundancy of samples is measured by the angles between the samples:
\begin{equation}
\begin{aligned}
\text{diversity}(\mathbf{x}_i) = 1 - \max\nolimits_{\mathbf{x}_j \in \D_l}\frac{K(\mathbf{x}_i, \mathbf{x}_j)}{\sqrt{K(\mathbf{x}_i, \mathbf{x}_i)K(\mathbf{x}_j, \mathbf{x}_j)}}
\end{aligned}
\end{equation}
where $K$ is a Mercer kernel operator and $\D_l$ refers to the labeled set. When the size of $\D_l$ tends to infinity, $\text{diversity}(\mathbf{x}_i)$ converges to a constant. Moreover, some combined AL strategies also converge to ``non-informativeness''. For instance, combining uncertainty-based and representative/diversity based methods with weighted sum optimization, if each of the components converges to a constant, the combined strategies will converge to ``non-informativeness''.

Next we show an example that the acquisition function of existing AL methodology would not converge to ``non-informativeness''.
\citet{wu2006sampling} further provided a representativeness-based sampling scheme, which indicates that the examples with high representativeness will add more information to the training set. The representativeness of an instance can be evaluated on how many instances are similar to it. Given unlabeled data pool $\D_u$, $|\D_u|=n$, the representativeness score is defined as the average similarity of all other data in $\D_u$:
\begin{equation}
\begin{aligned}
\text{representativeness}(\mathbf{x}_i) = \frac{\sum_{ i\neq j}K(\mathbf{x}_i, \mathbf{x}_j)}{n-1}.
\end{aligned}
\end{equation}
The output of this acquisition function would not be constant as the sample size increases, since it just depends on unlabeled data pool.

It is not easy to provide a very clear and recognizable paradigm for deciding whether an AL sampling scheme converges to ``non-informativeness'' or not. We should observe its acquisition function to determine whether it converges to ``non-informativeness'' or not. In general, most non-agnostic AL sampling schemes whose selection depends solely on the basic learned models would converge to ``non-informativeness'', since they aim to detect the disagreement of predictions of given learned models or the uncertainty of the output label; after observing enough data and obtaining well-trained basic learned models, the discrepancy among unlabeled data samples can no longer be assessed. In contrast, most agnostic AL sampling methods like \citep{sener2017active}, whose selection depends solely on the information extracted from the unlabeled pool, would not converge to ``non-informativeness'', since the unlabeled data pool is constantly changing, and the information extracted from the unlabeled data pool might also change in each stage of AL processes.

\subsection{Discussions of P distribution}

How to estimate the ${P}_t$ distribution in practice is one key point in experimental settings. In classical ML experiments, the feature is fixed, thus we employ the fixed feature provided by the data set. In deep learning tasks, we employ the penultimate layer of the neural network as the feature, therefore the feature changes dynamically as the basic classifier is updated. In classical ML tasks, we model $P_t(\mathbf{x},y)=P_t(\mathbf{x}|y)P_t(y)$ using a class-conditional generative model and a prior distribution of class labels. In deep learning tasks, we model $P_t(\mathbf{x},y)=P_t(y|\mathbf{x})P_t(\mathbf{x})$ using the classifier's posterior and the input data distribution. The classifier posterior $P_t(y|\mathbf{x})$ is learned from the labeled data, while the input data distribution $P_t(\mathbf{x})$ is estimated from feature $\mathbf{x}$.

\section{ADDITIONAL EXPERIMENTS}

\subsection{Dataset Description}
\label{dataset_sec}
See Table~\ref{d} for the details of datasets applied in our experiments, including the number of classes, the number of feature dimension, the size of initial labeled pool, the size of initial unlabeled data pool, the size of testing set and the imbalance ratio of each dataset.

\begin{table}[tb]
\scriptsize
\centering
\caption{Datasets used in the experiments. The Imbalance Ratio (IR) is the ratio of the number of samples in the majority class to that of the minority class.}
\label{d}
{\begin{tabular}{@{}l|p{1.7cm}<\centering p{1.7cm}<\centering p{1.7cm}<\centering p{1.7cm}<\centering p{1.7cm}<\centering p{1.7cm}<\centering p{1.7cm}<\centering}
\hline
Dataset & $\#$ of classes & $\#$ of feature dimension & $\#$ of initial labelled set & $\#$  of unlabeled pool & $\#$  of test set & $\#$ of Maximum Budget & Imbalance Ratio \\
\hline
\emph{EX8a} & 2 & 2 & 20 & 325 & 518 & 325 & 1.0 \\
\emph{GCloudub} & 2 & 2 & 20 & 380 & 600 & 380 & 2.0 \\
\emph{R15} & 15 & 2 & 40 & 200 & 360 & 200 & 1.0 \\
\emph{D31} & 31 & 2 & 80 & 1,120 & 1,800 & 1,120 & 1.0 \\
\hline
\emph{Clean1} & 2 & 168 & 20 & 170 & 285 & 170 & 1.3 \\
\emph{Tic-tac-toe} & 2 & 9 & 20 & 363 & 575 & 363 & 6.8 \\
\emph{Splice} & 2 & 61 & 20 & 380 & 600 & 380 & 1.1 \\
\emph{Vehicle} & 4 & 18 & 20 & 318 & 508 & 318 & 1.1 \\
\hline
\end{tabular}}
\end{table}

\subsection{Baselines}
\label{baseline_sec}
This section gives detailed descriptions of the baseline AL models.
\begin{itemize}
\item \textbf{US} \citep{lewis1994heterogeneous}: This method is introduced in full paper.
\item \textbf{QBC}: \textbf{Query-by-Committee (QBC)} uses a committee of models $\mathcal{C} = \{\theta^{(1)},..., \theta^{(C)}\}$ (constructed by ensemble methods or various basic classifiers), which are trained on $\D_l$ to predict the labels of $\D_u$, and the ones with largest disagreement are selected for labeling by an oracle \cite{seung1992query, settles2009active}. The disagreement level could be measured by Voting Entropy (VE) or KL divergence. The optimization function is:
    \begin{align}
    x^*_{VE} = \arg\max_x - \sum_i \frac{V(y_i)}{C} \log \frac{V(y_i)}{C},
    \end{align}
    where $V(y_i)$ is the number of votes that label $y_i$ receives from the committee members' predictions and $C$ is the committee size.
\item \textbf{EER}: \textbf{Expected Error Reduction (EER)} maximizes the decrease of loss by adding new data samples \cite{roy2001toward, settles2009active}. The optimization function is:
    \begin{align}
    x^*_{EER}=\arg\min_x \sum_i p_{\theta}(y_i|x) (- \sum_{u=1}^{U} \sum_j  p_{\theta^+}(y_j|x^{(u)}) \log p_{\theta^+}(y_j|x^{(u)})),
    \end{align}
    where $\theta^+$ refers to the newly trained model after adding new data tuple.
\item \textbf{BMDR}: \textbf{Batch-mode Discriminative and Representative AL (BMDR)} \cite{wang2015querying} queries a batch of informative and representative examples by minimizing the empirical risk bound of AL.
    \begin{align}
    \min \limits_{\D_q,f} \sum \nolimits_{\{\mathbf{x},y\} \in \D_l}l(f,\mathbf{x},y) + \sum \nolimits_{\mathbf{x} \in \D_q}l(f,\mathbf{x},\hat{y}) + \lambda ||f||^2 + \beta MMD(\D,\D_l \cup \D_q),
    \end{align}
    where $\D = \D_l \cup \D_u$. MMD is the maximum mean discrepancy, a distance on the space of probability measures which has found numerous applications in machine learning and nonparametric testing.
\item \textbf{US-D}: It is \textbf{US} with Dropout regularization.
\item \textbf{UPAL} \citep{ganti2012upal}: works by minimizing the unbiased estimator of the risk of a hypothesis in a given hypothesis space. In this work, they calculate the importance weight of data $(\mathbf{x}_i, y_i)$ as $\tfrac{Q^t_i}{p^t_i}$, where $Q_i^t \in \{0, 1\}$, represents whether this data sample are queried or not, and $p^t_i$ is sampling distribution, calculated by $p^t_i = p^t_{\text{min}} + (1 - \tfrac{1}{n p^t_{\text{min}}})\tfrac{H(Pr[+1|\mathbf{x}_i,h_{\mathcal{A},t-1}])}{\sum_j H(Pr[+1|\mathbf{x}_j,h_{\mathcal{A},t-1}])}$. $H(\cdot)$ is entropy and $p^t_{\text{min}} = \tfrac{1}{n_t}$.
\item \textbf{SWAL} \citep{imberg2020optimal}: this work shows that optimal predictive performance is achieved by over-sampling influential instances and high-leverage data points, and that uncertain instances are not necessarily informative ones. \textbf{SWAL} computes sampling probabilities $p_{t,i} \in (0,1)$ for each data point, updates the sampling weights ($w$) by $w_{t,i} = w_{t-1,i}+(\tfrac{1}{p_{t,i} - w_{t-1,i}})$, and updates the model parameter by:
    \begin{equation}
    \hat{\theta}_t = \arg\min_{\theta} \sum_i w_{t,i}l_i(f(\mathbf{x}_i;\theta),y_i).
    \end{equation}
    We employed $3$ variants in this paper:
    \begin{enumerate}[1. ]
    \item \textbf{SWAL-cora} calculates the sampling probabilities by: $p_{t,i} \propto \sqrt{h_{ii}(\theta)}$,
    where $h_{ii}(\theta) = \text{Var}_{\theta}(Y^*_i|\mathbf{x}_i)x_i^T \mathbf{H}^{-1} x_i$,
    $\mathbf{H} = \mathbf{H}(\theta) \propto \mathbf{X}^ T \mathbf{V} \mathbf{X}$
    and $\mathbf{V} = \mathbf{V}(\theta)$ be the diagonal matrix of $\text{Var}_{\theta}(Y^*_i|\mathbf{x}_i)$.
    \item \textbf{SWAL-corb} calculates the sampling probabilities by: $p_{t,i} \propto ||\sqrt{\text{Var}_{\theta}(Y^*_i)}\mathbf{V}\mathbf{X}\mathbf{H}^{-1}x_i ||$.
    \item \textbf{SWAL-prop} calculates the sampling probabilities by: $p_{t,i} \propto \sqrt{E_{\theta}[l_i(f(\mathbf{x}_i;\theta),Y^*_i)^2]}$.
    \end{enumerate}

\end{itemize}
\textbf{US}, \textbf{US-D}, \textbf{QBC}, \textbf{EER} and \textbf{BMDR} all converge to ``non-informativeness'' as the sample size increases.

\subsection{Implementation Details}
\label{imple_sec}
\begin{itemize}
\item In classical ML tasks, for the basic classifier in various AL methods and our framework, we employed Support Vector Machine (SVM) with probability measure\footnote{\url{https://scikit-learn.org/0.24/modules/generated/sklearn.svm.SVC.html}}. Note that some experiments are missing from the presented experimental results, e.g., EER on the \emph{D31} dataset could not be completed, since the basic classifier would encounter the ``All samples with positive weights have the same label.'' error when facing some subsets. We did not report these results since the 10 repeated trials could not be completed. There are 3 requirements for choosing the basic learned model: 1) the basic learned model is asymptotically unbiased and consistent as sample size increases in passive learning tasks; 2) the basic learned model could output the predicted class probabilities; 3) the basic learned model could change sample weights during training.
\item For the basic parameter settings of basic AL models, we followed the settings provided by ALiPy project \cite{TLHalipy}\footnote{\url{https://github.com/NUAA-AL/ALiPy}}. The experiments are based on sklearn.
\item In \textbf{QBC}, we employed ``Bagging meta-estimator'' strategy to achieve ``committee of classifiers'' and the basic classifier in Bagging is the default setting in sklearn library, that is, Decision Tree classifier.
\item We utilize Gaussian Naive Bayes\footnote{\url{https://scikit-learn.org/0.24/modules/naive_bayes.html}} to estimate $P_t$ in AL with classical ML tasks.
\item We utilize predicted class probabilities ($P_t(y|\mathbf{x})$) provided by basic classifiers and Kernel Density Estimator\footnote{\url{https://scikit-learn.org/0.24/modules/generated/sklearn.neighbors.KernelDensity.html}} (KDE) to calculate ($P_t(\mathbf{x})$) to estimate $P_t$ in AL with deep learning tasks.
\item When splitting datasets of classical ML tasks, we have more data samples in the testing ($60\%$) sets than the training sets ($40\%$), since we want to observe the generalization of the basic classifiers generated by various AL methods.
%\item There is no extra hyperparameter for our framework to fine-tune, except for the batch size settings.
\item We randomly select the initial data pool $\mathcal{D}^0_l$ from the training set and the remaining un-selected training set becomes our unlabeled data pool.
\item In classical ML tasks, we fixed the random seed ($4666$) when splitting the initial labeled/training/testing sets to ensure that considering the $10$ repeated experiments, we have the same data splitting for running each AL method on each dataset.
\item To avoid bias problems, we have avoided any specific dataset tuning or pre-processing.
\item In our practical implementation, we multiply the importance weight by a coefficient: $\beta_t(\mathbf{x}_i,y_i)=\tfrac{|\D_u + \D_l|}{|\D_u|}\tfrac{P_t(\mathbf{x}_i,y_i)}{Q_t(\mathbf{x}_i,y_i)}$, where $|\D_u + \D_l|$ refers to the full dataset size and $|\D_u|$ refers to the size of unlabeled data pool at stage $t$. This is because during estimation at each stage, $P_t$ is estimated and normalized over the whole dataset. For $Q_t$ -- specifically when calculating the importance weight of training data -- it is estimated and normalized over the labelled set. So when we calculate the importance weight $\beta_t$ for training data, we have an extra normalization coefficient $\tfrac{|\D_u + \D_l|}{|\D_u|}$. To eliminate the impact of the coefficient, we should multiply by this coefficient when calculating the importance weight for re-training the basic classifier. Note that it only works for pool-based AL.
\end{itemize}

\subsection{Experimental Results}

\subsubsection{Sensitivity analysis}
We use GaussianNB for $P$, which has 2 parameters: priors and variance smoothing. The priors refer to the prior probabilities of the classes. The variance smoothing refers to the portion of the largest variance of all features that is added to variances for calculation stability. See \url{https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html} for more details.
In the paper, we use the default settings (variance smoothing: $10^{-9}$, prior: None).
We conduct new experiments varying the hyperparameters using \textbf{US} on \emph{EX8a} under $B=10$ (see Table \ref{SA}).
Our model is not sensitive to the hyperparameter settings.
\begin{table}[htb]
\centering
\caption{Sensitivity Analysis.}
\label{SA}
\begin{tabular}{l|c}
\hline
setting & AUBC (acc) \\
\hline
variance smoothing $10^{-9}$ & $0.839 (0.013)$ \\
variance smoothing $10^{-7}$ & $0.849 (0.016)$ \\
variance smoothing $10^{-5}$ & $0.846 (0.015)$ \\
variance smoothing $10^{-3}$ & $0.844 (0.011)$ \\
variance smoothing $10^{-2}$ & $0.843 (0.013)$ \\
variance smoothing $10^{-1}$ & $0.847 (0.022)$ \\
\hline
prior None &  $0.839 (0.013)$ \\
prior Uniform & $0.842 (0.018)$ \\
prior Class Ratio & $0.843 (0.010)$ \\
\hline
\end{tabular}
\end{table}

\subsubsection{Additional Experiments on Classical ML tasks}
We present the accuracy vs.~Budget curves with batch size settings $B = 1$, $B = 5$ and $B = 20$ (see Figures~\ref{acc-figure-1},~\ref{acc-figure-5} and \ref{acc-figure-20}), the AUC vs.~Budget curves with batch size setting $B = 1$, $B = 5$, $B = 10$ and $B = 20$ (see Figures~\ref{auc-figure-1},~\ref{auc-figure-5},~\ref{auc-figure-10} and \ref{auc-figure-20}) and the $\text{F}_1$ vs.~Budget curves with batch size settings $B = 1$, $B = 5$, $B = 10$ and $B = 20$ (see Figures~\ref{f1-figure-1},~\ref{f1-figure-5},~\ref{f1-figure-10} and \ref{f1-figure-20}).

We can observe that, with different batch size settings (i.e., $B \in \{1, 5, 10, 20\}$), our approach still substantially improves the performance of the baseline AL model and maintains the advantage over the unbiased AL baselines. In general, the performance with $B \in \{1, 5, 20\}$ shows similar trends to the performance with batch size setting $B = 10$, which confirms the stability of our proposed method across different batch size settings.

For different evaluation metrics, i.e., compare AUBC (acc) with AUBC (AUC) and AUBC ($\text{F}_1$), despite the difference in concrete AUBC values, the shape of the curves, the trend and the timing of model convergence are similar. That is, different evaluations also provide consistent results.

\begin{figure*} [tb]
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/ex8a_imputation_acc-10-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/gcloudub_imputation_acc-10-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/r15_imputation_acc-10-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/d31_imputation_acc-10-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/clean1_imputation_acc-10-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/splice_imputation_acc-10-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/tictactoe_imputation_acc-10-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{3.8cm}
\centering
\includegraphics[width=3.8cm]{img/vehicle_imputation_acc-10-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{Accuracy-budget curves for classical ML tasks with $B = 10$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (acc) over 10 trials are shown in parentheses in the legend.}
\label{acc-figure}

\end{figure*}

\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_auc-10-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_auc-10-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_auc-10-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_auc-10-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_auc-10-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_auc-10-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_auc-10-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_auc-10-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{AUC-budget curves for classical ML tasks with $B = 10$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (auc) over 10 trials are shown in parentheses in the legend.
}
\label{auc-figure-10}
\end{figure*}

\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_f1-10-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_f1-10-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_f1-10-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_f1-10-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_f1-10-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_f1-10-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_f1-10-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_f1-10-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{$\text{F}_1$-budget curves for classical ML tasks with $B = 10$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (f1) over 10 trials are shown in parentheses in the legend.
}
\label{f1-figure-10}
\end{figure*}




\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_acc-20-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_acc-20-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_acc-20-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_acc-20-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_acc-20-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_acc-20-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_acc-20-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_acc-20-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{Accuracy-budget curves for classical ML tasks with $B = 20$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (acc) over 10 trials are shown in parentheses in the legend.
}
\label{acc-figure-20}
\end{figure*}


\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_auc-20-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_auc-20-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_auc-20-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_auc-20-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_auc-20-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_auc-20-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_auc-20-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_auc-20-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{AUC-budget curves for classical ML tasks with $B = 20$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (auc) over 10 trials are shown in parentheses in the legend.
}
\label{auc-figure-20}
\end{figure*}

\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_f1-20-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_f1-20-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_f1-20-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_f1-20-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_f1-20-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_f1-20-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_f1-20-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_f1-20-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{$\text{F}_1$-budget curves for classical ML tasks with $B = 20$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (f1) over 10 trials are shown in parentheses in the legend.
}
\label{f1-figure-20}
\end{figure*}




\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_acc-5-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_acc-5-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_acc-5-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_acc-5-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_acc-5-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_acc-5-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_acc-5-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_acc-5-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{Accuracy-budget curves for classical ML tasks with $B = 5$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (acc) over 10 trials are shown in parentheses in the legend.
}
\label{acc-figure-5}
\end{figure*}


\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_auc-5-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_auc-5-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_auc-5-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_auc-5-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_auc-5-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_auc-5-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_auc-5-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_auc-5-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{AUC-budget curves for classical ML tasks with $B = 5$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (auc) over 10 trials are shown in parentheses in the legend.
}
\label{auc-figure-5}
\end{figure*}

\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_f1-5-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_f1-5-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_f1-5-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_f1-5-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_f1-5-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_f1-5-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_f1-5-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_f1-5-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{$\text{F}_1$-budget curves for classical ML tasks with $B = 5$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (f1) over 10 trials are shown in parentheses in the legend.
}
\label{f1-figure-5}
\end{figure*}




\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_acc-1-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_acc-1-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_acc-1-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_acc-1-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_acc-1-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_acc-1-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_acc-1-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_acc-1-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{Accuracy-budget curves for classical ML tasks with $B = 1$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (acc) over 10 trials are shown in parentheses in the legend.
}
\label{acc-figure-1}
\end{figure*}


\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_auc-1-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_auc-1-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_auc-1-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_auc-1-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_auc-1-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_auc-1-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_auc-1-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_auc-1-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{AUC-budget curves for classical ML tasks with $B = 1$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (auc) over 10 trials are shown in parentheses in the legend.
}
\label{auc-figure-1}
\end{figure*}

\begin{figure*} [htb]
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/ex8a_imputation_f1-1-w}
\footnotesize(a) EX8a
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/gcloudub_imputation_f1-1-w}
\footnotesize(b) GCloud-ub
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/r15_imputation_f1-1-w}
\footnotesize(c) R15
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/d31_imputation_f1-1-w}
\footnotesize(d) D31
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/clean1_imputation_f1-1-w}
\footnotesize(e) Clean1
\end{minipage}
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/splice_imputation_f1-1-w}
\footnotesize(f) Splice
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/tictactoe_imputation_f1-1-w}
\footnotesize(g) Tic-tac-toe
\end{minipage}
\centering
\begin{minipage}{4.1cm}
\centering
\includegraphics[width=4.1cm]{img/vehicle_imputation_f1-1-w}
\footnotesize(h) Vehicle
\end{minipage}
\caption{$\text{F}_1$-budget curves for classical ML tasks with $B = 1$. The solid lines represent our proposed method and the dashed lines represent the corresponding baseline AL strategy. The mean and standard deviation of the AUBC (f1) over 10 trials are shown in parentheses in the legend.
}
\label{f1-figure-1}
\end{figure*}

\bibliography{zhan_217}

\end{document}
