\documentclass[accepted]{uai2022} 
\usepackage[american]{babel}

\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
\usepackage{booktabs} 
\usepackage{tikz} 

\usepackage[utf8]{inputenc} 
\usepackage[T1]{fontenc}    
\usepackage{hyperref}       
\usepackage{url}            
\usepackage{booktabs}   
\usepackage{amsfonts}  
\usepackage{nicefrac}    
\usepackage{microtype}  
\usepackage{xcolor}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{graphicx}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{placeins}
\usepackage{textcase}
\usepackage{subfig}
\usepackage{float}

\newcommand{\swap}[3][-]{#3#1#2} 

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{\emph{Proposition}}
\newtheorem{proof}{Proof}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}

\newcommand{\D}{{\cal D}}
\newcommand{\iid}{{\em i.i.d.}}
\newcommand{\empR}{R^{\mathrm{emp}}}

\title{Asymptotic Optimality for Active Learning Processes}

\author[1]{\href{mailto:<xyzhan2-c@my.cityu.edu.hk>?Subject=Asymptotic Optimality for Active Learning Processes}{Xueying Zhan}{}}
\author[2]{Yaowei Wang}
\author[1]{Antoni~B.~Chan}

\affil[1]{
    Department of Computer Science\\
    City University of Hong Kong\\
    Hong Kong SAR, China
}
\affil[2]{
    Department of Computing\\
    The Hong Kong Polytechnic University\\
    Hong Kong SAR, China
}  
  \begin{document}
\maketitle

\begin{abstract}
Active Learning (AL) aims to optimize basic learned model(s) iteratively by selecting and annotating unlabeled data samples that are deemed to best maximise the model performance with minimal required data. However, the learned model is prone to overfitting due to the biased distribution (\emph{sampling bias} and \emph{dataset shift}) formed by non-uniform sampling used in AL. Considering AL as an iterative sequential optimization process, we first provide a perspective on AL in terms of statistical properties, i.e., asymptotic unbiasedness, consistency and asymptotic efficiency, with respect to basic estimators when the sample size (size of labeled set) becomes large, and in the limit as sample size tends to infinity. We then discuss how biases affect AL. Finally, we propose a flexible AL framework that aims to mitigate the impact of bias in AL by minimizing generalization error and importance-weighted training loss simultaneously.
\end{abstract}


\section{Introduction}
\label{intro}
The main goal of AL is to iteratively optimize a basic learning model with a finite set of data samples $\D_n = \{(\mathbf{x}_i, y_i)\}_{i=1}^n$, where each data sample $(\mathbf{x}_i,y_i)$ is sequentially selected from the unlabeled data pool and annotated.
AL iterates between \textbf{data collection} and \textbf{model fitting} by repeatedly querying the labels of new data samples. Thus, selection of both the basic learning model and selection rule are of vital importance. From the perspective of data collection, AL has two branches: 1) pool-based AL, which selects new data sample(s) from a large unlabeled data pool for annotation, and 2) stream-based AL, which receives one data sample at a time and determines whether or not to label the instance \citep{cheng2013feedback}.

Many existing AL works focus on how to design acquisition functions based on  \textbf{fixed heuristics} for data collection. For instance, uncertainty-based sampling strategies aim to select unlabeled data samples with the lowest confidence (the largest uncertainty) of being classified correctly  by the basic model \citep{lewis1994heterogeneous}. Most uncertainty-based methods belong to non-agnostic AL sampling strategies, that is, when making selections, the active learners rely more on the decision boundary estimated by the currently-trained basic model \citep{pereira2019empirical}.
%
In contrast, agnostic AL approaches make no assumption related to the decision boundary learned by the basic classifier, ignoring the information provided by the basic classifier (e.g., label information) and only utilizing the information directly from the unlabeled data pool \citep{pereira2019empirical}.
For instance, many representativeness-based methods, which select subsets that are most representative of the unlabeled data pool, are agnostic AL.
%
Combined strategies \citep{shen2004multi, ebert2012ralf, li2013adaptive, ash2019deep} integrate the advantages of aforementioned sampling strategies, and are more adaptable to various data topologies \citep{munro2020human}.

However, the whole AL process is changing constantly with the labeled set and basic model updating in each stage, and thus it is not enough to just collect data ``actively'' and treat the model fitting stage in the same manner as passive learning. For passive learning, one key assumption is that the training set comprises \iid~(independent and identically distributed) samples from the unknown true data distribution $P(\mathbf{x},y)$, $\D_n \stackrel{i.i.d.}{\sim} P$. If we select data samples sequentially by some fixed heuristics in AL (e.g., uncertainty-based strategies), the labeled training set is {\bf not} drawn \iid~from $P$. That is, the labeled training set employed in AL is biased, due to the recycled use of past samples at each stage and the lack of independence between data samples \citep{abayes2010Fred, portier2018asymptotic, farquhar2021statistical}. In this paper, we denote the bias resulting from the fixed heuristics during data collection  as ``\emph{sampling bias}'', which is inevitable during the whole AL processes.

Given a training set with \emph{sampling bias}, an unbiased and consistent estimator of a basic model in passive learning might no longer be unbiased (with respect to the original dataset), even asymptotically, in AL \citep{sugiyama2009pool, farquhar2021statistical}. For instance, {\em ordinary least squares} %(OLS)
becomes biased due to the sampling bias problems \citep{sugiyama2009pool}. The combination of a biased data collection procedure and biased model estimator will result in a vicious cycle, where biased samples create biased models, then create even more biased samples. \citet{dasgupta2008hierarchical} provided a detailed explanation of this phenomenon: many AL heuristics start by utilizing a small initial labeled set to estimate a rough decision boundary, and then querying points that are increasingly closer to their current estimate of the boundary. During AL training, samples are queried based on increasingly confident assessments of their informativeness, e.g., the largest uncertainty, and the labeled set will diverge farther away from the true underlying data distribution. Moreover, if the basic learned model itself is biased, the rate of divergence will be accelerated.

In this paper, we explore the relationship between data collection and model fitting stages in AL, and discuss crucial factors for designing AL approaches that help reduce the negative effects of sampling bias. We then propose a flexible AL framework that can be applied on top of existing AL sampling schemes, through minimizing the combination of generalization error and re-weighted training loss in each stage. In our work, we utilize existing AL sampling schemes to generate sampling distribution. The sampling distribution is then used to compute the {\em importance weight} that represents the discrepancy between the underlying data distribution and the AL sampling distribution -- that is -- modeling the \emph{dataset shift}.
The importance weight is designed for minimizing the generalization error, and is used to re-weight the training loss for model learning in each stage. The re-weighted training loss is an asymptotically unbiased and consistent estimator of the true risk. Furthermore, this risk estimator could achieve asymptotic efficiency by optimizing its hyper-parameters.

\section{Related Work}
\label{relate}

One challenging and common problem in the research fields of learning from insufficient data (e.g.,  AL, few-shot learning, semi-supervised learning) is the overfitting of the learned models  due to the biased distribution formed by limited training data. 
Bias appears in both stages of AL: {\em sampling bias}, which is attributed to the AL heuristics in data selection stage, and {\em dataset shift}, which is caused by the sampling bias and influences the model fitting stage. 

{\em Sampling bias} is a bias in which samples are collected in a way that some samples of the intended population have a lower or higher sampling probability than others. If {\em sampling bias} is not accounted for, experiment results (e.g., performance of the  basic model in AL) might be erroneously attributed to the phenomenon or the model selection under study, rather than the method of sampling. On the one hand, sampling bias has a negative impact on AL as discussed in Section~\ref{intro}. \citet{schutze2006performance} observed a ``missed cluster effect'' of AL, where some important clusters in the feature space are not represented in the AL sample set, and thus this sample set is not sufficient for estimating  a basic model that is consistent with one learned from the true distribution. Furthermore, AL sampling will ignore these clusters in the data, and never query points from there, resulting in a local minimum. On the other hand, {\em sampling bias} sometimes can be helpful in AL.
\citet{mussmann2018uncertainty} proved that the uncertainty sampling updates are preconditioned SGD steps on the population $0/1$ loss, and move in descent directions for parameters that are not approximate stationary points.  \citet{chang2017active} confirmed that the proper bias can be beneficial to generalization performance. They  proposed ``active bias''  that emphasizes uncertain points and found that it increases the model performance, compared with using a fully labeled set.

{\em Dataset shift} (aka {\em covariate shift}, {\em dataset drifting}) refers to the discrepancy between the data distributions of the training and testing sets (or true underlying data distribution).
It causes a principal problem during the model fitting step in AL, since some regions with large density in the unlabeled data pool may not be well represented by the labeled data.
To reduce the impact of {\em dataset shift}, the labeled training set could be re-sampled with respect to an appropriate distribution, so as to minimize the statistical risk of the classifier built on the re-sampled data \citep{zadrozny2004learning, richards2011active}.
Both \emph{sampling bias} and \emph{dataset shift} lead to error/bias in learning the optimal hypothesis. In this paper, we distinguish them to help explain why AL processes must be biased: the AL sampling strategy itself brings \emph{sampling bias}, and this \emph{sampling bias} creates the \emph{dataset shift}.

To reduce bias problems in AL, \citet{sener2017active} provided a representativeness-based model that utilizes a core-set approach (i.e., $k$-center) for pool-based AL, i.e., choosing a set of points such that a model learned over the selected subset is competitive for the remaining data points. A rigorous bound between the average loss over the given subset and the remaining data points is derived by decomposing an upper bound of the AL loss.
Inspired by \citet{sener2017active}, we decompose the upper bound of the AL loss as a combination of training error and generalization error, but unlike \citet{sener2017active}, we focus
%more 
on modeling the discrepancy between the true underlying data distribution and the AL sampling distribution.

\citet{ganti2012upal} and \citet{imberg2020optimal} proposed unbiased pool-based AL sampling schemes with the idea of ``subroutine rejection-threshold'' from \citet{beygelzimer2009importance} and the Horvitz--Thompson unbiased estimator \citep{horvitz1952generalization}. \citet{ganti2012upal} formally proved that \emph{importance-weighted risk is an unbiased estimator of the true risk}. \citet{imberg2020optimal} derived asymptotic Taylor expansions for the expected generalisation error and mean squared error of the predictions, and consequently presented sampling schemes that optimise the performance of AL approaches.
\citet{beygelzimer2009importance} proposed an importance-weighted AL sampling scheme based on the learner called a \emph{subroutine rejection-threshold}, which efficiently corrects the sampling bias. However, due to the nature of the selected learner, their work is only applicable to stream-based AL. \citet{farquhar2021statistical} constructed an unbiased estimator of the empirical risk of the labeled set via  the risk of unlabeled data pool with weighted loss. The aim is to remove the bias in AL, that is, they minimize the difference between the two risks, not the train-test data gap ({\em dataset shift}). In contrast, our work models the divergence between the underlying distribution of the whole data space and the AL sampling distribution, and we minimize a more general ``train-test'' gap -- the gap between labeled data distribution and the true underlying data distribution.

\section{Methodology}
In this section, we first discuss crucial factors for designing AL methods that reduce the aforementioned bias problems. Then we propose our AL framework that can be applied on top of existing AL strategies.

\subsection{AL Loss}

Assume we have an AL strategy $\mathcal{A}$ for a $K$-class classification task with feature space $\mathcal{X}$, label space $\mathcal{Y} = \{1,\dots,K\}$, classifier $f$ and a loss function $l(f(\mathbf{x};\theta),y):\mathcal{X} \times \mathcal{Y} \rightarrow \mathbb{R}$, parameterized over the hypothesis $\theta$.
In general, passive learning aims to minimize the risk:
\begin{equation}
\label{risk1}
\begin{aligned}
R(\theta) & = E_{(\mathbf{x},y) \sim P}[l(f(\mathbf{x};\theta), y)] \\&= \iint l(f(\mathbf{x}; \theta), y)P(\mathbf{x},y)d\mathbf{x} dy.
\end{aligned}
\end{equation} 
$P$ is unknown in most practical situations, but we can obtain sample data $\D_n$, and thus approximate it with an empirical distribution $P_{\delta}(\mathbf{x},y) = \frac{1}{n}\sum\nolimits_{i=1}^{n}\delta(\mathbf{x}=\mathbf{x}_i, y=y_i)$, where $\delta(\cdot)$ is a Dirac mass centered at $(\mathbf{x}_i,y_i)$ \citep{zhang2017mixup}. Then, the \textbf{empirical risk} is formulated as 
\begin{equation}
\begin{aligned}
R^{\text{emp}}(\theta) & = \iint l(f(\mathbf{x}; \theta), y)P_{\delta}(\mathbf{x},y)d\mathbf{x} dy 
\\ & = \frac{1}{n}\sum\nolimits_{i=1}^{n}l(f(\mathbf{x}_i; \theta), y_i).   
\end{aligned}
\end{equation}

Generally, in statistical machine learning, there is an assumption that the empirical risk will converge to the true risk as the number of samples increases, i.e., the empirical risk is an asymptotically unbiased estimate of the true risk, as follows:
\begin{lemma}
\label{l1}
The empirical risk $\empR(\theta)$ is an \textbf{asymptotically unbiased} estimate of $R(\theta)$:
\begin{equation}
\lim \nolimits_{n \rightarrow \infty} E[\empR({\theta}) - R(\theta)] = 0.
\end{equation}
\end{lemma}

In AL, consider a large unlabeled data pool $\D_u$ that is sampled \iid~from $P$, where the label of each data is unobserved, and an initial labeled data pool $\D_0=\{(\mathbf{x}_i,y_i)\}_{i=1}^{n_0}$ that is also sampled \iid~from $P$. Strategy $\mathcal{A}$ sequentially selects an unlabeled subset from $\D_u$ and queries their labels from an oracle for building the training set. At stage $t$, we collect and label new data samples $\D_t^{new}$ from $\D_u$, where $|\D_t^{new}| = n_t - n_{t-1} \leq B$ and $B$ is the batch size.  We then update the training set, obtaining $\D_t=\{(\mathbf{x}_i,y_i)\}_{i=1}^{n_t}$.

In most existing AL works, following multiple stages of a myopic approach, where each stage is solved independently, the goal of AL for a single stage is written as:
\begin{enumerate}[nosep,leftmargin=1em,labelwidth=*,align=left]
\item Select and label new samples: $\D_t^{\text{new}} = \mathop{\arg\max}_{\mathbf{x} \in \D_u}^B a(\mathbf{x}; \mathcal{A}|\D_{t-1})$,
where $a(\mathbf{x}; \mathcal{A})$ is the acquisition function of AL strategy $\mathcal{A}$ (and parameters within) \citep{gal2017deep}, and update $\D_t = \D_t^{new}\cup\D_{t-1}$.

\item Calculate the empirical risk at stage $t$, denoted as $\empR_t(\theta)$ \citep{sener2017active}:
\begin{equation}
\label{goal}
\empR_t(\theta) = \frac{1}{n_t}\sum\nolimits_{i=1}^{n_t}l(f(\mathbf{x}_i; \theta), y_i),
\end{equation}
and minimize it to obtain the estimated optimal hypothesis at stage $t$:
$\hat{\theta}_t = \arg\min_\theta R^{\text{emp}}_t(\theta)$.
\end{enumerate}

However, as mentioned in Section~\ref{relate}, an AL sampling heuristic will deliberately select a subset of unlabeled samples for labeling. Thus the selected data set will be distributed differently from the true distribution $P(\mathbf{x},y)$. We denote the data distribution {\em induced} by the AL algorithm as an {\em instrumental} distribution $Q(\mathbf{x},y)$. Therefore, the risk estimation during AL is actually biased since the labeled set is sampled from $Q$ instead of $P$. In this paper, we assume that \emph{AL will not query non-existent or out-of-distribution (OOD) data samples, and oracles/experts will not produce wrong/noisy labels, that is, $P(\mathbf{x},y)>0$ and $Q(\mathbf{x},y)>0$.}

Importance Weight Empirical Risk Minimization (IWERM) is widely adopted to remove bias in AL \citep{shimodaira2000improving, sugiyama2007covariate, cortes2010learning, vogel2020weighted}. It is originally designed to solve \emph{dataset shift} \citep{shimodaira2000improving, sugiyama2007covariate, sugiyama2009pool, sawade2010active, vogel2020weighted}. We denote {\em importance weight} as $\beta(\mathbf{x},y) = \tfrac{P(\mathbf{x},y)}{Q(\mathbf{x},y)}$.
After reweighting by $\beta$, the \textbf{weighted empirical risk at stage $t$ under the $Q$ distribution} (denoted as $R^w_{t}(\theta)$) is re-estimated:
\begin{equation}
\begin{aligned}
\label{risk3}
R^w_{t}({\theta})
 = \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i).
\end{aligned}
\end{equation}
Reweighing the empirical risk under the $Q$ distribution forms an \emph{unbiased} estimator of the true risk:
\begin{equation}
\label{unb}
\begin{aligned}
\nonumber
& E_{(\mathbf{X},\mathbf{y}) \sim Q}\big[\tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)\big]
\\
&= \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}E_{(\mathbf{X},\mathbf{y}) \sim Q}\big[\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)\big]
\\
&= \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}E_{(\mathbf{x}_i,y_i) \sim Q}\big[\beta(\mathbf{x}_i,y_i)l(f(\mathbf{x}_i;{\theta}), y_i)\big]
\\
&= \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}\iint Q(\mathbf{x}_i,y_i) \tfrac{P(\mathbf{x}_i,y_i)}{Q(\mathbf{x}_i,y_i)}l(f(\mathbf{x}_i;{\theta}), y_i) d\mathbf{x}_i dy_i
\\
&= \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t}\iint P(\mathbf{x}_i,y_i) l(f(\mathbf{x}_i;{\theta}), y_i) d\mathbf{x}_i dy_i
\\
&= \tfrac{1}{n_t} \sum \nolimits_{i=1}^{n_t} R(\theta) = R(\theta),
\end{aligned}
\end{equation}
where $\mathbf{X} = \{\mathbf{x}_1,\cdots,\mathbf{x}_{n_t}\}$ and $\mathbf{y} = \{y_1,\cdots,y_{n_t}\}$.


\subsection{Upper Bound of AL Loss}
From \eqref{goal} and \eqref{risk3}, we observe that AL will perform well if we have: 
(i) larger labeled set, i.e., more budget; 
(ii) consistent and unbiased estimator of the empirical risk;
(iii) considering \eqref{risk3}, correctly modeling the discrepancy between $P$ and $Q$, which also accelerates the convergence rate of AL. Based on these considerations, to design a flexible AL approach, inspired by \citet{sener2017active}, we consider an upper bound of the risk $R(\theta)$ using the triangle inequality:
\begin{equation}
\label{goal2}
\begin{aligned}
R(\theta)  \leq \underbrace{\big|R(\theta) - {R}^w_{t}({\theta}) \big|}_{\text{1st term}} + \underbrace{\big|{R}^w_{t}({\theta})\big|}_{\text{2nd term}}.
\end{aligned}
\end{equation}
The 1st term is the generalization error of the AL training process,
while the 2nd term is the training loss, as in \eqref{risk3}. In practice, the size of labeled set is finite and hence the loss function is bounded. Hoeffding's Inequality can quantify how these factors (i.e., $n_t$, $\beta$) affect the convergence of the 1st term, yielding the following theorem (see proof in supplementary materials).

\begin{theorem} (Hoeffding Inequality with IWERM)
Let $\{(\mathbf{x}_i, y_i)\}_{i=1}^{n_t}$ be $n_t$ instances that are sampled from the instrumental distribution $Q(\mathbf{x},y)$. Denote r.v. $\mathbf{S} = R(\theta) - {R}^w_{t}({\theta})$ that takes over $\theta$, and let $b = \sup \mathbf{S}$, $a = \inf \mathbf{S}$, $E[\mathbf{S}] = \eta$. $\forall \epsilon > 0$, we have
\begin{equation}
\label{hoeff}
 \mathbb{P}\Big(\big|R(\theta) - {R}^w_{t}({\theta}) \big| \geq \epsilon \Big)  \leq 2e^{\frac{-2n_t(\epsilon-\eta)^2}{(b-a)^2}}.
\end{equation}
\label{thm-1}
\end{theorem}

When $n_t \rightarrow \infty$, $2\exp(\tfrac{-2n_t(\epsilon-\eta)^2}{(b-a)^2}) \rightarrow 0$, and thus the risk estimator ${R}_{t}^w({\theta})$ is also {\em consistent}.
With the ideal $\beta$, we have $\eta = 0$ since $R_{t}^w(\theta)$ is an unbiased estimate of $R(\theta)$. Additionally, for the vanilla risk case $\empR_t(\theta)$ (when $\beta = 1$ in $R_t^w(\theta)$), which is the case with most AL methods,  under an AL scenario with a finite labeled set, $P$ is not generally equal to $Q$, and thus $\eta \neq 0$ during the AL process. Thus,
$\empR_t(\theta)$ is \textbf{not} an unbiased and consistent estimate of the risk $R(\theta)$ \citep{shimodaira2000improving}.
This shows the superiority of IWERM.

For the 2nd term in \eqref{goal2}, the empirical risk of the selected samples is weighted appropriately to compensate for the discrepancy between the instrumental and true distributions, which leads to a consistent and asymptotically unbiased estimate of the risk \citep{sawade2010active}. Previous IWERM  works  \citep{sugiyama2007covariate, sugiyama2009pool, sawade2010active, vogel2020weighted} assume that the source data distribution for training is different from the target data distribution for testing, but the difference only comes from the input distributions $P(\mathbf{x})$ and $Q(\mathbf{x})$, while the posterior distributions $P(y|\mathbf{x})$ and $Q(y|\mathbf{x})$ are assumed to be identical. In our work, we \textbf{relax this assumption} and consider that the full %complete
joint distributions $P(\mathbf{x},y)$ and $Q(\mathbf{x},y)$ are different.

\subsection{Importance Weight Estimation}

The estimation of IWERM is both unbiased and consistent using ideal importance weight $\beta(\mathbf{x},y)$. However, the ideal $\beta$ is not achievable in practice, and thus we approximate $\beta(\mathbf{x},y)$ by ${\beta}_{t}(\mathbf{x},y)$ in every stage $t$. The estimator based on ${\beta}_{t}(\mathbf{x},y)$ is still consistent and asymptotically unbiased under the following conditions:
\begin{equation}
\label{beta_approx}
{\beta}_{t}(\mathbf{x},y) = \tfrac{{P}_{t}(\mathbf{x},y)}{{Q}_{t}(\mathbf{x},y)} \rightarrow \tfrac{P(\mathbf{x},y)}{Q(\mathbf{x},y)} = \beta(\mathbf{x},y), ~\text{as}~ n_t \rightarrow \infty.
\end{equation}
\eqref{beta_approx} holds
by properly selecting the formulation of $P_t$ and $Q_t$, which will be introduced in the next sections. Specifically, 
if  $\lim\nolimits_{n_t\rightarrow\infty}\beta_t(\mathbf{x},y) = 1$ holds, then $P(\mathbf{x},y) = Q(\mathbf{x},y)$ as sample size tends to infinity. Thus, the estimator $R^w_{t}(\theta)$ is asymptotically unbiased and consistent  for AL sampling strategies that converge to ``non-informativeness'', which is defined as follows:
\begin{definition}
(Non-informativeness) An acquisition function $a(\cdot;\theta)$ is said to be ``non-informative'' if, as the sample size tends to infinity, its output converges to a constant (denoted as $c_a$) for arbitrary input $\mathbf{x}$:
\begin{equation}
\lim\limits_{n_t \rightarrow \infty} a(\mathbf{x};\theta) = c_a, \forall \mathbf{x} \in \mathcal{X}.
\end{equation}
\end{definition}

We will explain the reason in Section~\ref{beta_section}.


\subsubsection{$P$ Distribution}
Since $P(\mathbf{x},y)$ is unknown in practice, in Bayesian inference, a family of probability distributions ${P}_t(\mathbf{x}, y|\phi)$ is specified to approximate $P(\mathbf{x},y)$ \citep{box2011bayesian, tran2017selection}, where $\phi$ is not known in advance and needs to be estimated from observed data samples. At stage $t$, given labeled set $\D_{t} = \{(\mathbf{x}_i,y_i)\}^{n_t}_{i=1}$ and a prior distribution $p(\phi)$, the posterior distribution of the parameter $\phi$ is estimated as
\begin{equation}
\label{phiestimate}
{P}_{t}(\phi|\D_{t})=\frac{p(\phi)\prod^{n_t}_{i=1}{P}_{t}(\mathbf{x}_i,y_i|\phi)}{\int p(\phi)\prod^{n_t}_{i=1}{P}_{t}(\mathbf{x}_i,y_i|\phi) d\phi}.
\end{equation}
The predictive distribution is
\begin{equation}
\label{pestimate}
{P}_{t}(\mathbf{x},y|\D_{t}) = \int {P}_{t}(\mathbf{x},y|\phi){P}_{t}(\phi|\D_{t})d\phi.
\end{equation}


\subsubsection{$Q$ Distribution}
\label{qdist}
Inspired by \citet{abayes2010Fred}, rather than selecting a particular datum to query, we model the query as a draw of a sample from the distribution $Q$. We define the $Q$ distribution at stage $t$ as follows:
\begin{equation}
\label{qfunc}
\begin{aligned}
Q_t(\mathbf{x},y) = Q_t(\mathbf{x},y;\theta)=\tfrac{q_t(\mathbf{x};\theta)P_t(\mathbf{x},y)}{\iint q_t(\mathbf{x};\theta)P_t(\mathbf{x},y)d\mathbf{x} dy}.
\end{aligned}
\end{equation}
$q_{t}(\mathbf{x};\theta)$ is an AL querying density function, where $q_{t}(\mathbf{x};\theta) > 0, \forall \mathbf{x} \in \mathcal{X}$ and $\int q_{t}(\mathbf{x};\theta)d\mathbf{x} = 1$.
Note that $Q_t$ is relative to the underlying distribution, i.e., specifies the relative over-sampling or under-sampling w.r.t. $P$.
Choosing $q_t$ to be constant is equivalent to selecting instances at random from the data pool (uniform sampling).
In contrast, choosing $q_t$ to be narrow will focus AL on a particular region, and in the limit, setting $q_t$ to a delta function will select a particular sample without reference to the underlying distribution $P$ \citep{abayes2010Fred}.

In pool-based AL, we select instances based on maximizing the acquisition function $a(\mathbf{x};\mathcal{A})$: $\mathbf{x}^* = \arg\max_{\mathbf{x} \in \mathcal{D}_u} a(\mathbf{x};\mathcal{A})$, and thus the acquisition function should be converted into a querying density. 
For example, entropy-based uncertainty methods will select data samples with the largest entropy across all classes, and the corresponding acquisition function is: $a(\mathbf{x}) = -\sum \nolimits_{k=1}^{K} \bar{p}(y=k|\mathbf{x};\theta)\log \bar{p}(y=k|\mathbf{x};\theta)$, where $\bar{p}$ is the predicted class probability given $\mathbf{x}$.

In our work, we convert the acquisition function to querying density function $q_t$
by applying the \emph{softmax} function
\begin{equation}
\label{softmaxfunc}
q_t(\mathbf{x}_i;\theta) = \frac{\exp(\alpha_t a(\mathbf{x}_i;\mathcal{A}_t)) }{\sum\nolimits_j \exp(\alpha_t a(\mathbf{x}_j;\mathcal{A}_t) )},
\end{equation}
where $\alpha_t$ is a temperature hyperparameter ($\alpha_t > 0$), and $\mathcal{A}_t$ is the AL strategy at stage $t$.
Note that the softmax does not change the ranking of the unlabeled samples for AL sampling.
We select the temperature hyperparameter $\alpha_t$ to preserve the asymptotic efficiency of the whole AL processes (see Proposition~\ref{pro-2} in Section~\ref{para_section}). 

Finally, we approximate $Q(\mathbf{x},y)$ by ${Q}_{t}(\mathbf{x},y;\theta,\phi)$, w.r.t. $q_t$ and ${P}_t(\mathbf{x},y|\phi)$, as follows:
\begin{equation}
\label{qfunc1}
{Q}_{t}(\mathbf{x},y;\theta,\phi)=\frac{q_{t}(\mathbf{x};\theta){P}_{t}(\mathbf{x},y|\phi)}{\iint q_{t}(\mathbf{x};\theta){P_{t}}(\mathbf{x},y|\phi)d\mathbf{x} dy}.
\end{equation}


\subsubsection{Importance Weight $\beta$}
\label{beta_section}
Next we represent the approximation of $\beta(\mathbf{x},y)$ as ${\beta}_{t}(\mathbf{x},y;\theta,\phi)$ at stage $t$ as
\begin{equation}
\begin{aligned}
\label{beta}
{\beta}_{t}(\mathbf{x},y;\theta,\phi) = \tfrac{{P}_{t}(\mathbf{x},y|\phi)}{{Q}_{t}(\mathbf{x},y;\theta,\phi)}
=
\tfrac{\iint q_{t}(\mathbf{x};\theta){P}_{t}(\mathbf{x},y|\phi)d\mathbf{x} dy}{q_{t}(\mathbf{x};\theta)}.
\end{aligned}
\end{equation}

We analyse the representation of ${\beta}_t$ when the sample size tends to infinity.
Suppose that $R_{t}^w(\theta)$ is an unbiased estimator of $R(\theta)$, which is based on a sufficiently strong classifier (e.g., using a CNN as the basic classifier).
If an infinite number of  samples are observed, the basic classifier will have a very certain prediction given $\mathbf{x}_i$.
Thus, AL sampling strategies like entropy-based uncertainty sampling will converge to ``non-informativeness'' as the sample size tends to infinity, since all predictions are certain.
Note that the numerator in \eqref{beta} is a constant w.r.t. $(\mathbf{x},y)$. However, the numerator is still required for numerical stability, as setting it to 1 will yield large $\beta_t$ values that make the loss numerically unstable.
Consider Assumption~\ref{a2} below, in which case 
the acquisition function will output a constant for any $\mathbf{x}$ as the sample size increases.
\begin{assumption}
\label{a2}
The existing AL strategy $\mathcal{A}$ adopted in the AL querying density function converges to ``non-informativeness'' as the sample size increases.
\end{assumption}
Under Assumption~\ref{a2},  $\lim\limits_{n_t \rightarrow \infty}q_t(\mathbf{x}_i;\theta) = \tfrac{\exp(\alpha_t c_a)}{|\D_u|\exp(\alpha_t c_a)} = \tfrac{1}{|\D_u|}$ in \eqref{softmaxfunc}, where $|\D_u|$ is the size of the unlabeled pool\footnote{See more discussions and examples of ``non-informativeness'' AL sampling strategies in supplementary materials.}.
Then, \eqref{beta} becomes
\begin{equation}
\lim_{n_t\rightarrow \infty} {\beta}_{t}(\mathbf{x},y;\theta,\phi)=\tfrac{\iint (1/|\D_u|) {P}_{t}(\mathbf{x},y|\phi)d\mathbf{x} dy}{1/|\D_u|} = 1,
\end{equation}
where $\iint {P}_{t}(\mathbf{x},y|\phi)d\mathbf{x} dy = 1$ since ${P}_{t}(\mathbf{x},y|\phi)$ is a probability density function. Thus, the whole process is asymptotically unbiased and consistent.

Note that \eqref{beta_approx} converges point-by-point based on our assumption of ``non-informativeness'' and our designed $P_t$ and $Q_t$. The reasons are as follows. First, in the ideal case, as sample size tends to infinity, enough data is observed and thus the underlying data distribution $P$ is known. Thus, the optimal sampling distribution should be the data distribution itself, i.e., $P_i = Q_i$, and thus $\beta_i = P_i/Q_i = 1$. 
Second, based on \eqref{softmaxfunc} and our ``non-informativeness'' requirement, $Q_t$ would also be the same as $P_t$ as sample size tends to infinity, since $q_t$ converges to a constant, and thus $\beta_i = P_i/Q_i = 1$. 
Regarding the convergence of $\beta_t$ in \eqref{beta}, we can regard the numerator $\iint q_{t}(\mathbf{x};\theta){P}_{t}(\mathbf{x},y|\phi)d\mathbf{x}dy$ as a normalization constant.  The remaining part is $\tfrac{1}{q_t(\mathbf{x};\theta)}$, where $q_t$ is the softmax function (see \eqref{softmaxfunc}); this remaining part tends to $0$ if and only if $q_t \rightarrow +\infty$. However, this condition will never be satisfied according to Section 4.2 of \citet{guo2017calibration}. Thus, $\beta_t$ will converge to a finite value.

We further explain why some AL methods can converge to  ``non-informativeness'' based on the assumptions in AL sampling processes from two aspects, using entropy-based uncertainty sampling as an example. We assume that AL would not query non-existing or out-of-distribution (OOD) data samples and would not query wrong/noisy labels from oracles/experts, that is, $P(\mathbf{x}, y) > 0$ and $Q(\mathbf{x}, y) > 0$. Additionally, we can obtain another vital piece of information from these assumptions: $P(y = y_{\text{true}}|\mathbf{x}_i) = 1$ for all labeled samples. Firstly, after querying  enough samples, any $\mathbf{x}_i$ actually appears in the labeled training set, and thus we know the hard label and are very certain about it, i.e., $P(y = y_{\text{true}}|\mathbf{x}_i) = 1$, and thus the confidence is $1$. Secondly, \citet{fernandezdecossio2015maximum} show that the practice of using sample averages as surrogates of probability expectations is reliable provided the sample size is large. Equation (1) in \citet{fernandezdecossio2015maximum} shows that the entropy of model parameters will converge to a certain value as sample size increases. That is, after observing enough data, any given $\mathbf{x}_i$ will not change the basic model, and thus any  $\mathbf{x}_i$ is meaningless for improving the basic model,  which is consistent with our proposed ``non-informativeness'' assumption.
Further discussion is provided in the Appendix.

\subsubsection{Parameter Estimation of $\alpha$}
\label{para_section}
When calculating the ${Q}_t$ distribution, the temperature scaling parameter $\alpha_t$ in \eqref{softmaxfunc} needs to be estimated. We propose a method for estimating $\alpha_t$ by considering the asymptotic efficiency of ${R}_{t}^w({\theta})$, i.e., the asymptotic variance of the estimator (see proof of Proposition~\ref{pro-1} in the Appendix).
\begin{proposition} (\emph{Asymptotic Variance of Estimators})
Let ${R}_{t}^w({\theta})$ be defined in \eqref{risk3} and $R(\theta)$ be defined in \eqref{risk1}, by employing the ``Delta Method'', we have
\begin{equation}
\label{asymvar}
\begin{aligned}
\sqrt{n_t}({R}_{t}^w({\theta})- R(\theta)) \stackrel{n_t \rightarrow \infty}{\longrightarrow} \mathcal{N}(0, \sigma_Q^2),
\end{aligned}
\end{equation}
with $\sigma^2_Q = \iint \beta(\mathbf{x},y) [l(f(\mathbf{x};\theta),y)-R(\theta)]^2 P(\mathbf{x},y)d\mathbf{x}dy$.
\label{pro-1}
\end{proposition}

We next consider selecting the parameters $\alpha_t$ so as to minimize the variance of the estimator (see proof of Proposition~\ref{pro-2} in Appendix).

\begin{proposition} (\emph{Optimal Sampling Distribution}) The optimal instrumental sampling distribution that minimizes $\sigma^2_Q$ is
\begin{equation}
\label{eqn:Qopt}
Q^{opt}_{t}(\mathbf{x},y) \propto \big|l(f(\mathbf{x};\theta),y) - R(\theta)\big|P(\mathbf{x},y).
\end{equation}
\label{pro-2}
\end{proposition}

In practical use, we employ $P_t$ to approximate $P$, and thus, $Q^{opt}_{t} \approx \big|l(f(\mathbf{x};\theta),y) - R(\theta)\big|{P}_{t}(\mathbf{x},y)$. For each sample $(\mathbf{x}_i, y_i)$, define the shorthand ${Q}^i(\alpha_t)={Q}_{t}(\mathbf{x}_i,y_i; \theta,\phi,\alpha_t)$, ${P}^i={P}_{t}(\mathbf{x}_i,y_i|\phi)$, $l_i=l(f(\mathbf{x}_i;\theta),y_i)$, and $R=R(\theta)$.
Based on \eqref{eqn:Qopt}, to obtain the optimal sampling distribution, we set $\frac{{Q}^i(\alpha_t)}{|l_i - R| {P}^i} = c_o$
for some constant $c_o$.
Equivalently, taking the logarithm, $\log {Q}^i(\alpha_t) - \log \big(|l_i-R|{P}^i\big) = \log c_o$.
Thus, $(\alpha_t,c_o)$ can be estimated by minimizing the squared error of the log-constant term, summed over all labeled samples. At stage $t$, we have
\begin{equation}
\label{getpara}
\alpha_t^*, c_o^* = \mathop{\arg\min}\limits_{\alpha_t, c_o} \sum \nolimits_{i} \Big( \log {Q}^i(\alpha_t) - \log\big(|l_i - R|{P}^i\big) - \log c_o \Big)^2.
\end{equation}
Note that $R(\theta)$ is generally expected to be a very small value close to zero for a well-trained model \citep{sener2017active}. In our experiments, we set $R=10^{-3}$.
There is no closed-form solution, and instead we use a numerical optimization toolbox\footnote{E.g., minimize function in Scipy library.} to solve for $\alpha_t,c_o$ in each stage $t$.

\subsection{Proposed AL Framework}
\label{proposed_al_frame}
In summary, we propose a flexible AL framework on top of existing AL strategies based on IWERM. The proposed framework gives an asymptotically unbiased and consistent estimate of the true risk if Assumption~\ref{a2} holds, which can be satisfied by proper selection of the AL strategy and the basic model. Additionally, the hyperparameter $\alpha_t$ is selected to minimize the variance of the risk estimator, and thus our framework is also asymptotically efficient.
The whole framework is described in Algorithm~\ref{alg-1}.

\begin{algorithm}[tb]
\caption{The proposed AL Framework.}
\label{alg-1}
\begin{algorithmic}[1]
\REQUIRE Initial labeled set $\D_0$, unlabeled data pool $\D_u$, prior information of $p(\phi)$, AL method $\mathcal{A}$, initial importance weight $\beta_0 = \{1,1,...\}$, batch size $B$, oracle $\mathcal{O}$.
\STATE Stage $0$: Estimate initial model $\hat{\theta}_0  = \arg\min_\theta  R_{0}(\theta)$  with $\D_0$ and $\beta_0$. Estimate $\hat{\phi}_{0}$ from $\D_{0}$.
Estimate $q_{0}(\mathbf{x};\hat{\theta}_{0})$. Calculate ${Q}_{0}(\mathbf{x},y;\hat{\theta}_{0},\hat{\phi}_{0})$.
\FOR{stage $t$ in $1,...,T$}
\STATE {\em Update labeled set}: Obtain $B$ data samples from $\mathcal{D}_u$ with ${Q}_{t-1}$, and query labels from $\mathcal{O}$. Add the new samples $\D_t^{\text{new}}$ to $\D_{t-1}$ to obtain $\D_t$.
\STATE \emph{Update unlabeled set}: update $\D_u$ by removing $\D_t^{\text{new}}$.
\STATE \emph{Estimate ${P}_t$:} Estimate $\hat{\phi}_t$ with $\D_{t}$ by \eqref{phiestimate}. Calculate ${P}_{t}(\mathbf{x},y)$ by \eqref{pestimate}.
\STATE \emph{Estimate ${Q}_t$:} Update $\alpha_t $ by \eqref{getpara}. Calculate $q_{t}(\mathbf{x};\hat{\theta}_{t-1})$ by \eqref{softmaxfunc} and ${Q}_{t}(\mathbf{x},y;\hat{\theta}_{t-1},\hat{\phi}_t)$ by \eqref{qfunc1}.
\STATE \emph{Importance weight:} Calculate ${\beta}_t(\mathbf{x},y)$ by \eqref{beta}.
\STATE \emph{Re-train basic model(s):}  $\hat{\theta}_t = \arg\min_{\theta} R_{t}^w(\theta)$ from \eqref{risk3}.
\ENDFOR
\end{algorithmic}
\end{algorithm}

\section{Experiment}
\label{experiment}
To validate the effectiveness of our proposed AL framework, we compare the performance of existing AL strategies (as baseline methods) with that of the same strategies incorporated into our unbiased AL framework (as the basic AL acquisition functions). We also compare our model with other de-biased/less biased AL sampling schemes.

\subsection{Experimental Settings}
\subsubsection{Datasets}
We consider $8$ datasets for classical ML tasks: $4$ real-life UCI datasets \citep{Dua:2019}, including \emph{Clean1}, \emph{Splice}, \emph{Tic-tac-toe}, and \emph{Vehicle}; $4$ synthetic datasets, including \emph{EX8a} \citep{andrew2008stan}, \emph{Gaussian Cloud Unbalance} (\emph{GCloudub}) \citep{NIPS2017_8ca8da41}, \emph{R15} and \emph{D31} \citep{veenman2002maximum}.
The datasets can be categorized into: synthetic data (\emph{EX8a}, \emph{GCloudub}, \emph{R15} and \emph{D31}); real-life data (\emph{Clean1}, \emph{Splice}, \emph{Tic-tac-toe} and \emph{Vehicle}); binary-class classification tasks (\emph{EX8a}, \emph{GCloudub}, \emph{Clean1}, \emph{Splice} and \emph{Tic-tac-toe}); multi-class classification tasks (\emph{R15}, \emph{D31}, and \emph{Vehicle}); imbalanced data cases (\emph{GCloudub} and \emph{Tic-tac-toe}).

\subsubsection{Baselines} 
We compare our model with $4$ typical AL strategies \citep{settles2009active}, including entropy-based Uncertainty Sampling (\textbf{US}) \citep{lewis1994heterogeneous}, Query-by-Committee (\textbf{QBC}) \citep{seung1992query}, Expected Error Reduction (\textbf{EER}) \citep{roy2001toward} and Batch-mode Discriminative and Representative AL (\textbf{BMDR}) \citep{wang2015querying}.
\textbf{US} finds unlabeled data samples with largest entropy of predicted probabilities. \textbf{QBC} minimizes the version space (set of hypotheses that are consistent with labeled set). \textbf{EER} selects data points with minimal expected future risk. \textbf{BMDR} queries a batch of informative and representative examples by minimizing the empirical risk bound of AL. We also utilize these four AL strategies as basic AL methods in our framework by using \eqref{softmaxfunc}. We change the output of these AL methods (the data samples to query, ranked by the corresponding acquisition function) to the querying density (normalizing the actual output of the acquisition function for unlabeled data samples).

We also compare our proposed method with $2$ unbiased AL sampling methods, which are based on importance sampling/weighting techniques: 
Unbiased Pool-based AL (\textbf{UPAL}) \citep{ganti2012upal} and Sampling-Weighted AL (\textbf{SWAL}) \citep{imberg2020optimal}, which has $3$ variants: \textbf{SWAL-cora} (Corollary 1 (a) in \citet{imberg2020optimal}), \textbf{SWAL-corb} (Corollary 1 (b) in \citet{imberg2020optimal}) and \textbf{SWAL-prop} (Proposition 1 in \citet{imberg2020optimal}).
The implementations of \textbf{US}, \textbf{QBC}, \textbf{EER} and \textbf{BMDR} are from ALiPy \citep{TLHalipy}.
\textbf{UPAL} and \textbf{SWAL} are re-implemented with reference to the released code\footnote{\url{https://github.com/imbhe/OSiUAL}}.

\subsubsection{Implementation Details}
We repeated each experiment $10$ times with randomly split training and testing sets, and reported the average testing performance. We employed the same basic classifier for the AL baselines and our methods under each dataset. To evaluate average performance, we compute the area under the performance-budget curve (AUBC) \citep{zhan2021comparative}, by evaluating the AL method for different fixed budgets (e.g., Accuracy vs.~Budget in Figure~\ref{acc-figure1}). The area under the curve is calculated by the trapezoid method, with higher values reflecting better performance of AL under varying budgets.
%
More details about the experimental design are in the supplemental materials, including how $P$ is modeled (Section A3.3 and Section A4.3 in supplemental materials), the description of datasets, baselines and more implementation details (Section A4.1-4.3 in supplemental materials).

\subsection{Experimental Results}

Figure~\ref{acc-figure1} presents the accuracy-budget curves with batch size $10$, with the AUBC values reported in the legend. 
Note that in these experiments, the size of $\D_u$ is set as the upper bound of the AL budget. That is, the basic AL models, \textbf{US}, \textbf{QBC}, \textbf{EER} and \textbf{BMDR} converge to the same accuracy at the end of the AL process,  since their basic classifier will be trained on the whole training set with uniform importance weight (the vanilla risk case).

We next analyze the experimental results w.r.t. different dataset properties.
More experimental results with different batch size settings ($B \in\{ 1, 5, 20\}$) and for various evaluation metrics (AUBC-AUC and AUBC-$\text{F}_1$) are presented in the supplementary materials (see Section A4.4).

\begin{figure}[!hbt]
\centering
\subfloat[EX8a]{\includegraphics[width=0.5\linewidth]{img/ex8a_acc10_basic.pdf}}
\subfloat[GCloudub]{\includegraphics[width=0.5\linewidth]{img/gcloudub_acc10_basic.pdf}}
\\
\subfloat[R15]{\includegraphics[width=0.5\linewidth]{img/r15_acc10_basic.pdf}}
\subfloat[D31]{\includegraphics[width=0.5\linewidth]{img/d31_acc10_basic.pdf}}
\\ 
\subfloat[Clean1]{\includegraphics[width=0.5\linewidth]{img/clean1_acc10_basic.pdf}}
\subfloat[Splice]{\includegraphics[width=0.5\linewidth]{img/splice_acc10_basic.pdf}}
\\
\subfloat[Tic-tac-toe]{\includegraphics[width=0.5\linewidth]{img/tictactoe_acc10_basic.pdf}}
\subfloat[Vehicle]{\includegraphics[width=0.5\linewidth]{img/vehicle_acc10_basic.pdf}}
\caption{Accuracy-budget curves for classical ML tasks with $B = 10$, including the comparison between our framework and the basic AL methods (i.e., \textbf{US}, \textbf{QBC}, \textbf{EER} and \textbf{BMDR}). The solid lines represent our methods and dashed lines represent the corresponding baseline AL methods.}
\label{acc-figure1}
\end{figure}

\begin{figure}[!hbt]
\centering
\subfloat[EX8a]{\includegraphics[width=0.5\linewidth]{img/ex8a_acc10_unbiased.pdf}}
\subfloat[GCloudub]{\includegraphics[width=0.5\linewidth]{img/gcloudub_acc10_unbiased.pdf}}
\\
\subfloat[R15]{\includegraphics[width=0.5\linewidth]{img/r15_acc10_unbiased.pdf}}
\subfloat[D31]{\includegraphics[width=0.5\linewidth]{img/d31_acc10_unbiased.pdf}}
\\ 
\subfloat[Clean1]{\includegraphics[width=0.5\linewidth]{img/clean1_acc10_unbiased.pdf}}
\subfloat[Splice]{\includegraphics[width=0.5\linewidth]{img/splice_acc10_unbiased.pdf}}
\\
\subfloat[Tic-tac-toe]{\includegraphics[width=0.5\linewidth]{img/tictactoe_acc10_unbiased.pdf}}
\subfloat[Vehicle]{\includegraphics[width=0.5\linewidth]{img/vehicle_acc10_unbiased.pdf}}
\caption{Accuracy-budget curves for classical ML tasks with $B = 10$, including the comparison between our framework (we select \textbf{BMDR} as basic AL for comparison) and unbiased AL baselines, i.e., \textbf{SWAL} and \textbf{UPAL}.}
\label{acc-figure2}
\end{figure}


\begin{table*}[!htb]
\centering
\caption{Comparison of our model (\textbf{BMDR}-based) against unbiased AL baselines.
The table shows the mean and standard deviation of the AUBC (acc) values, and the highest AUBC (acc) values are in \textbf{bold}. A paired t-test was conducted between our method and the others,
and *, **, *** indicate statistically significant differences at $p<0.05$, $p<0.01$, and $p<0.001$, respectively.
This experiment uses  $10$ trials and $B = 10$.
}
\label{pvalue}
{\begin{tabular}{c|llll|c}
\toprule
Dataset & \textbf{SWAL-cora} & \textbf{SWAL-corb} & \textbf{SWAL-prop} & \textbf{UPAL} & \textbf{BMDR-ours}\\
\hline
\emph{EX8a} & $0.825 \pm 0.014$** & $0.819 \pm 0.018$** & $0.832 \pm 0.008$** & $0.841 \pm 0.016$ & \textbf{0.849 $\pm$ 0.014} \\
%\hline
\emph{GCloudub} & $0.945\pm 0.006$* & $0.943\pm 0.010$* & $0.946\pm 0.009$ & $0.946\pm 0.008$* & \textbf{0.949 $\pm$ 0.007} \\
%\hline
\emph{R15} & 0.749 $\pm$ 0.053*** & 0.733 $\pm$ 0.036*** & 0.889 $\pm$ 0.023*** & 0.881 $\pm$ 0.035*** &\textbf{0.979 $\pm$ 0.006} \\
%\hline
\emph{D31} & 0.908 $\pm$ 0.013***  & 0.908 $\pm$ 0.012*** & 0.940 $\pm$ 0.005*** & 0.933 $\pm$ 0.007*** &\textbf{0.968 $\pm$ 0.004}\\
\hline
\emph{Clean1} & 0.795 $\pm$ 0.017* & 0.803 $\pm$ 0.019** & 0.785 $\pm$ 0.034* & 0.799 $\pm$ 0.025* & \textbf{0.815 $\pm$ 0.022}\\
%\hline
\emph{Splice} & 0.785 $\pm$ 0.014* & 0.786 $\pm$ 0.013* & 0.788 $\pm$ 0.014 & 0.784 $\pm$ 0.013* & \textbf{0.795 $\pm$ 0.013}\\
%\hline
\emph{Tic-tac-toe} & 0.763 $\pm$ 0.016 & 0.765 $\pm$ 0.017 & 0.762 $\pm$ 0.016 & 0.765 $\pm$ 0.018 & \textbf{0.768 $\pm$ 0.021} \\
%\hline
\emph{Vehicle} & 0.679 $\pm$ 0.012** & 0.681 $\pm$ 0.010* & 0.671 $\pm$ 0.017* & 0.686 $\pm$ 0.015 & \textbf{0.692 $\pm$ 0.010} \\
\bottomrule
\end{tabular}}
\end{table*}


\subsubsection{Comparisons with Basic AL Methods}

The purpose of this experiment is to observe whether our proposed approach can enhance existing AL models. Compared with basic AL sampling strategies, our approach significantly improves the performance of the basic AL methods by achieving faster convergence. In particular, on \emph{R15} (see Fig.~\ref{acc-figure1}c), our approaches converge after querying $10$ samples, while the basic AL methods converge after $130$ samples. On \emph{D31} (Fig.~\ref{acc-figure1}d), the improvements are more significant -- our approaches converge after $40$ samples, while the basic AL methods gradually converge after $600$ to $900$ samples. Both \emph{R15} and \emph{D31} have clear data/cluster distributions, but the tasks are more difficult because there are more classes (\emph{R15} has $15$ classes and \emph{D31} has $31$ classes). The basic AL methods more easily fall into local optima and make wrong judgments of the decision boundary, while our methods avoid this problem by correctly modeling the discrepancy between the underlying data distribution and the current sampling distribution, and thus achieve better performance.

On \emph{GCloudub} (Fig.~\ref{acc-figure1}b), our approaches converge after $80$ samples, while for the basic AL methods, \textbf{US}, \textbf{EER}, and \textbf{BMDR}
converge after 100, 260, and 200 samples, respectively. Besides faster convergence, our method also provides more stable performance due to de-biasing the \emph{dataset shift}. These basic AL sampling strategies do not always perform well on various data types, e.g., \textbf{US} and \textbf{EER} even show a performance drop on \emph{Splice} (Fig.~\ref{acc-figure1}f). This is caused by \emph{sampling bias}, the incorrect judgment of decision boundaries, as mentioned in Section~\ref{intro}, while our method reduces the effect of the \emph{sampling bias} by correctly modeling the discrepancy between the sampling distribution and the underlying data distribution -- the \emph{dataset shift}. Our method improves the AUBC (acc) performance of \textbf{US} from $0.711$ to $0.793$ and of \textbf{EER} from $0.716$ to $0.794$.

On datasets with class imbalance, the improvement from our approach is more substantial, e.g., on \emph{GCloudub} with imbalance ratio (IR) $2.0$, and on \emph{Tic-tac-toe} with IR $6.8$. For instance, based on AUBC (acc), we improve \textbf{BMDR} from $0.923$ to $0.949$ on \emph{GCloudub}, improve \textbf{US} from $0.714$ to $0.772$ and improve \textbf{EER} from $0.716$ to $0.766$ on \emph{Tic-tac-toe}. The better performance on imbalanced data is likely because of the importance-weighting, which reduces bias caused by under-sampling the larger class.

\subsubsection{Comparison with Unbiased AL Methods}

Comparing our approach with other unbiased AL sampling strategies (3 variants of \textbf{SWAL}, and \textbf{UPAL}), we observe that all these methods reduce the \emph{sampling bias} problems during the AL process. Our method achieves the best performance on all of the $8$ datasets, as shown in Fig.~\ref{acc-figure2}. We further examine the differences in performance between our method (we choose \textbf{BMDR}) and unbiased AL baselines (\textbf{SWAL} and \textbf{UPAL}) by using a paired t-test on $10$ repeated trials on each dataset. The test results are shown in Table~\ref{pvalue}. Our method outperforms the baselines at a statistically significant level ($p < 0.05$) on $24$ out of $32$ ($75\%$) of the experiments, while performing similarly ($p > 0.05$) on $8$ out of $32$ ($25\%$). The t-test results indicate that, compared with the baseline unbiased AL models, our model can achieve better or similar results under various task scenarios.


Our model performs particularly well on \emph{R15} and \emph{D31}, while \textbf{SWAL-cora} and \textbf{SWAL-corb} have a $10\%$ performance drop at the end of the accuracy curves on \emph{R15}. Different from \textbf{SWAL-prop}, which determines the probabilistic sampling scheme by label uncertainty alone, \textbf{SWAL-cora\&corb} compute the sampling probabilities by the location of data points in the feature space and account for additional information captured by the Hessian of the total loss and the gradients of the individual losses and predictions. However, on \emph{R15}, the location information even hinders the judgement since there are some clusters that are close to each other on \emph{R15} and hard to classify. Similar trends are observed in \emph{D31}. There are no significant differences between our model and the baseline unbiased AL models on \emph{Tic-tac-toe} -- we think this is because the result (0.76$\sim$0.77) is already close to the optimal performance that AL can achieve. Since this dataset is fairly imbalanced, AL needs more data to determine the actual decision boundary and overstep the local optimum.

In summary, our approach provides competitive experimental results for various data topologies, and our proposed method effectively improves the basic AL models' performance, achieving more stable and faster convergence than the baseline unbiased/de-biased AL methods.

\section{Conclusion}
In this paper, we discuss how the bias problems (i.e., \emph{sampling bias}, \emph{dataset shift}) arise from the sample selection and model fitting steps during the AL processes. We then explore crucial statistical properties (i.e., asymptotic unbiasedness, asymptotic efficiency and consistency) for designing AL approaches that reduce the negative effects of bias problems.  Based on these considerations, we propose a flexible AL framework that operates on top of existing AL sampling schemes. It provides an asymptotically unbiased, efficient and consistent estimate of the true risk by utilizing \emph{sampling bias} and properly modeling \emph{dataset shift}. The experimental results show that the proposed framework improves the generalization of various basic AL models and also maintains a certain advantage on various data topologies, compared with other unbiased/de-biased AL sampling schemes.

\begin{acknowledgements} 
This work was supported by a grant from the Research Grants Council of the Hong Kong Special Administrative Region, China (Project No. CityU 11215820). 

The authors would like to thank Dr.~Xinhong Chen and Dr.~Hui Lan from City University of Hong Kong for their useful discussions and feedback.
\end{acknowledgements}

\bibliography{zhan_217}

\end{document}
