%!TEX root = ./main.tex

\section{Experimental Results}\label{Experiments}
\label{experimental_setup}
% \zixin{Adding class-imbalanced and noisy setting}
\subsection{Experimental Setup} 
We evaluate the performance of \algname %against several state-of-the-art baselines 
on four active learning benchmarks in image classification: MNIST \citep{lecun1998gradient}, FashionMNIST \citep{xiao2017fashion}, CIFAR10 \citep{krizhevsky2009learning}, and SVHN \citep{netzer2011reading}. To facilitate a thorough comparison against the baselines, we evaluate all methods across several acquisition-stage budgets $B$: $\{500, 700, 900, 1000\}$ for MNIST and FashionMNIST with pretraining budget $k = 200$, and $\{5000, 7000, 9000, 10000\}$ for CIFAR10 and SVHN with $k = 2500$. We focus on the accuracy on the validation set as the key performance metric, with the validation set size fixed at 1000 for all datasets. %We fix the validation size to be 1000 across all datasets. Lastly, 
We run each experiment ten times and report the average and standard error across runs.

%Depending on the type of dataset, we consider different network architectures for classifiers. 
We consider two %classifier structures
network architectures: for MNIST and FashionMNIST, we use a neural network structure similar to LeNet \citep{lecun1998gradient}, as suggested by \citet{beck2021effective}, and for CIFAR10 and SVHN, we employ ResNet-18 \citep{he2016deep}. We defer the details of the utility model architecture and the choice of classifiers to Appendix~\ref{UtilityModelArchitecture}. %in Appendix \ref{UtilityModelArchitecture}.

We fit all classifiers using cross-entropy loss with the Adam optimizer until training accuracy exceeds $99\%$, with a maximum of $100$ epochs and a learning rate of $0.001$. No learning rate schedulers or data augmentations are used.\footnote{Baselines use implementations from the open-source AL toolkit DISTIL \citep{decileteam_2023_distil}. All models are trained in PyTorch \citep{paszke2017automatic}.}


\begin{figure*}[t!]
\centering
% \rotatebox[origin=c]{90}{\quad \quad \scriptsize Cumulative loss}
    \begin{subfigure}{.33\textwidth}
        \centering
\includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on MNIST with Pretraining Budget 200.png}
    \caption{\footnotesize MNIST}
    \label{Pretraining Budget Variation}
     \end{subfigure}%\hfil
    \begin{subfigure}{.33\textwidth}
        \centering
        \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on FashionMNIST with Pretraining Budget 200.png}
        % {\quad \quad \tiny Query cost}
        \caption{FashionMNIST}
        %\vspace{-2mm}
    \end{subfigure}%\hfil
    \begin{subfigure}{.33\textwidth}
        \centering
        \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on CIFAR10 with Pretraining Budget 2500}
        % {\quad \quad \tiny Query cost}
        \caption{CIFAR10}
        %\vspace{-2mm}
    \end{subfigure}%\hfil
    \\
    \begin{subfigure}{.33\textwidth}
        \centering
        \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on SVHN with Pretraining Budget 2500.png}
        % {\quad \quad \tiny Query cost}
        \caption{SVHN}
        %\vspace{-1mm}
    \end{subfigure}
    \begin{subfigure}{.33\textwidth}
    \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Pretraining BUdget Ablation CIFAR10.png}
        \caption{ Ablation on $k$
        % \footnotesize Active Learning validation performance with Stage 1 Labeling Budget 5000 for CIFAR10 across various choices of Pretraining Budget
        }
        %\vspace{-1mm}
    \label{CIFAR10 Pretraining Budget Variation}
    \end{subfigure}
    \begin{subfigure}{.33\textwidth}
\includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=.9\textwidth]{./fig/performance_vs_lambda_OT.png}
    %\vspace{-2mm}
    \caption{Ablation on $\lambda_{\text{OT}}$
    % Different choices of $\lambda_{OT}$ for Seed Set size 2500 for Dataset CIFAR10
    }
    %\vspace{-3mm}
    \label{lambda_OT hyperparameter}
    \qquad
    \end{subfigure}
    \caption{Experimental results. \textbf{(a-d)} Active learning validation performance. \textbf{(e)} Active Learning validation performance with the acquisition stage budget $B=5000$ for CIFAR10 across various choices of pretraining budget $k$. \textbf{(f)} Different choices of $\lambda_{\text{OT}}$ for pretraining set size $k=2500$ on CIFAR10. Results are given in \%.} %\si{I will give zixin the source code for generating figures. Feel free to adjust the titles, labels, fonts etc.}}
    \label{Accuracy Validation Performance}
    % \caption{\footnotesize Active Learning validation performance with Stage 1 Labeling Budget 5000 for CIFAR10 across various choices of Pretraining Budget}
%     \label{Pretraining Budget Variation}
\end{figure*}


\subsection{Baselines}
While numerous AL methods have been proposed for %are designed for %task-specific settings including but not limited to 
specific tasks such as object detection \citep{yuan2021multiple}, semantic segmentation \citep{kim2021task}, and instance segmentation \citep{chaplot2021seal}, %However, 
these algorithms rely on heuristic acquisition functions and are not suited to the single-round setting considered in this work. Hence, we mainly consider the state-of-the-art learning-based AL baselines designed for the single-round AL setting: %Indeed, there are many 
 %we focus on the singe-round setting.
%studying one round active learning which is rarely established before. Therefore, we evaluate baselines including:

\textbf{DULO} \citep{wang2023one}: A learning-based approach tailored to the one-round AL setting, which selects a subset of $B$ instances from $\Unlabeled_{0}$ that maximizes a learned utility function. %This utility function predicts the performance metric of the target model when trained on the chosen subset post-labeling, analogous to our method. 
DULO relies on a regression-based surrogate utility function, and employs a stochastic block-wise greedy selection strategy for batch acquisition. In contrast, our algorithm utilizes a RankNet with a multi-task training loss for training the acquisition function, and uses the greedy-margin subroutine for data acquisition.

%However, it is worth noting that in this approach, the utility function is formulated as a regression problem and the subset selection involves a blockwise stochastic greedy with run time $O(|\Unlabeled|)$ with ours \citep{wang2023one}. The detailed discussion of the distinction is in Appendix \ref{distinction}.

\textbf{LLAL} \citep{yoo2019learning}: A learning-based approach that estimates the errors of the predictions (loss) made by the classifier and selects the $B$ unlabeled instances with the top predicted losses.\footnote{\citet{yoo2019learning} design the loss prediction module using middle layers of ResNet18. For FashionMNIST and MNIST, we extract middle layers of \citet{beck2021effective}'s neural networks.}

We also include a \textsc{random} strategy as a baseline, which selects $B$ samples uniformly at random. % from $\Unlabeled_{0}$. %This baseline is commonly used as a comparison to passive learning.
For additional comparisons against a collection of non-task-aware, heuristic-based AL baselines, please refer to the supplemental results in Appendix~\ref{rest_baselines}.
%\vspace{-1mm}
\subsection{Main Results}
As shown in Figure~\ref{Accuracy Validation Performance}, \algname outperforms most of the baselines across multiple architectures and various labeling budgets for the acquisition stage. For easy datasets like FashionMNIST and MNIST, \algname learns a good shared representation for effective utility value interpolation and easily beats all the baselines across the different labeling budgets, which suggests that \algname is a good choice regardless of the labeling budget. Even though MNIST and FashionMNIST are easy to learn, the limited labeled pool prevents LLAL \citep{yoo2019learning} from learning a good loss prediction module or \textit{uncertainty} estimate, and thus \algname achieves a substantial gain over it. In contrast, \algname uses interpolation techniques to augment utility samples within a limited labeled pool and generalizes to predictions over a longer history of labeled data, leading to a learning-based acquisition function amenable to the growing labeled pool. For more challenging datasets, such as CIFAR10 and SVHN, where the model lacks good architecture priors due to a limited labeled pool, \algname outperforms DULO \citep{wang2023one} by a larger margin than on the easy datasets. We conjecture that this is because ranking is generally easier for a model to learn than regression, especially on complex datasets (Figure~\ref{fig:overview}).

\subsection{Ablation Study}
\label{ablation}

We perform an ablation study on the size of the pretraining set, the design choices of each submodule (bilevel training, OT distance, and RankNet), and the hyperparameter for the OT distance loss (Definition~\ref{total_loss}). We use CIFAR10 as an example dataset, and defer our results on the remaining datasets to Appendix~\ref{Full_Supplement_Experiment}.


\paragraph{Size of pretraining budget $k$}
Naturally, we want to examine the effect of the size of the pretraining set to determine how the scale of the initial labeled pool impacts overall single-round selection performance. Figure~\ref{CIFAR10 Pretraining Budget Variation} shows that, across different seed set sizes for the pretraining stage, \algname outperforms all other baselines.

\paragraph{Bi-level training, OT Distance and RankNet}
\label{Three Design Choices}
Next, we study the intertwined effects of three design choices. Table~\ref{BilevelTraining1} shows the combined efficacy of bilevel training, OT distance, and RankNet, offering insights into the synergy of these three foundational modules. The cross mark for RankNet denotes a regression-based acquisition function whose loss is the MSE between the predicted and the true utility values. Notably, the regression-based acquisition function without bi-level training and OT distance performs similarly to random selection, which corroborates our intuition that one should rank, rather than regress, the validation accuracy on labeled samples.

\begin{table}[h!]
\centering
\small
\caption{Ablation study on three submodules with pretraining set $k=3500$ and acquisition budget $B=5000$. The last row %with each block written as - is 
corresponds to the random baseline.}

\label{BilevelTraining1}
\scalebox{0.95}{
\begin{tabular}{lccc}
    \toprule
    Bilevel & Optimal Transport & RankNet & Accuracy\\
    \midrule
     $\checkmark$ & $\checkmark$ & $\checkmark$ & $\mathbf{77.3 \pm 0.2}$ \\
    $\checkmark$ & $\checkmark$ & $\times$ & $76.1 \pm 0.3$\\
    $\checkmark$ & $\times$ & $\checkmark$ & $76.2 \pm 0.4$\\
    $\checkmark$ & $\times$ & $\times$ & $70.5 \pm 0.3$ \\
    $\times$ & $\checkmark$ & $\checkmark$ & $75.5 \pm 0.3$\\
     $\times$ & $\checkmark$ & $\times$ & $75.5 \pm 0.3$ \\
    $\times$ & $\times$ & $\checkmark$ & $76.0 \pm 0.8$ \\
    $\times$ & $\times $ & $\times$ & $74.6 \pm 0.7$ \\
    - & - & - & $74.7 \pm 0.3$ \\
    \bottomrule
\end{tabular}}
\end{table}


\paragraph{Hyperparameter Tuning for OT distance}
By definition, $\mathcal{L}_{\text{Total}} = \mathcal{L}_{\text{Rank}_{12}} + \lambda_{\text{OT}} \cdot \mathcal{L}_{\text{OT}}$ (Definition~\ref{total_loss}). One can change the scale of $\lambda_{\text{OT}}$ for utility model training in the pretraining stage. We study the effect of the hyperparameter $\lambda_{\text{OT}}$ on the final model performance on the validation set. We highlight the importance of incorporating the OT distance into the loss structure, which makes $\hat{u}$ insensitive to the scale of $\lambda_{\text{OT}}$. When $\lambda_{\text{OT}} > 0$, the overall validation accuracy is higher than when $\lambda_{\text{OT}} = 0$. The choice of $\lambda_{\text{OT}}$ is specific to the dataset and batch setting, and we present one setting of $\lambda_{\text{OT}}$ with varied labeling budgets for the acquisition stage in Figure~\ref{lambda_OT hyperparameter}. We also provide additional results on more fine-grained orders of magnitude of $\lambda_{\text{OT}}$ in Appendix~\ref{HyperparameterTuning_OTDistance}.



\begin{figure*}[!h]
\centering
\begin{subfigure}[t]{.33\textwidth}
        \centering
    \includegraphics[width=\textwidth]{./fig/Rebuttal/Varying_Validation_Size_For_CIFAR10_Subset_500.png}
    \caption{Utility value vs. validation set size }\label{fig:valsetsize}
    \end{subfigure}
    % \qquad
    \begin{subfigure}[t]{.33\textwidth}
        \centering
        \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on CIFAR10 with Pretraining Budget 2500 noisy Main Paper.png}
        % {\quad \quad \tiny Query cost}
        \caption{Noisy oracle}\label{fig:noisy}
    \end{subfigure}
    % \qquad
    \begin{subfigure}[t]{.33\textwidth}
    \includegraphics[clip,trim=0cm 0cm 0cm 0cm,width=\textwidth]{./fig/Table1/Performance on CIFAR10 with Pretraining Budget 2500 imbalance Main Paper.png}
        \caption{Class Imbalance
        }\label{fig:imbalance}
    \end{subfigure}
    \caption{Robustness analysis. }
\end{figure*}


\subsection{Robustness Analysis}
\paragraph{Validation set size vs. validation accuracy}
One potential concern of surrogate model training is the consistency and robustness of \textit{utility} across different subset sizes. In real-world applications, it is crucial to adapt to scenarios with varying data availability, ranging from scarce data, resulting in small validation sets \citep{hacohen2022active}, to situations with ample labeled examples, leading to larger validation sets \citep{citovsky2021batch}.  %instances where significant labeled examples are accessible \citep{citovsky2021batch}(and large validation set). 
To account for the variations in validation set size, we conduct experiments to measure validation set accuracy across various sizes. Using CIFAR10 as a benchmark dataset, we evaluated 100 randomly collected utility samples, 
%with utility value measured by the different validation size across 200, 400, 600, 800, 1000. 
assessing utility values across validation set sizes of 200, 400, 600, 800, and 1000. As illustrated in Figure~\ref{fig:valsetsize}, the average validation set accuracy remains consistent regardless of the validation set size, with the standard error decreasing as the size of the validation set increases. We have also conducted a sensitivity analysis of validation accuracy w.r.t.\ the validation set size for MNIST, FashionMNIST, and SVHN in Appendix~\ref{validationsize}.
% \begin{figure}[!h]
% \label{validationsize}
% \includegraphics[width=.4\textwidth]{UAI2024/fig/Rebuttal/Varying_Validation_Size_For_CIFAR10_Subset_500.png}
% \caption{Effect of validation set size on measuring validation accuracy(utility). }\label{fig:valsetsize}
% \end{figure}


\paragraph{Noisy oracles} The quality of labels provided by an oracle can vary depending on the expertise of human annotators. For example, labels for medical images annotated by experts are likely to be more accurate than crowd-sourced labels from non-experts. To examine the robustness of \algname, we investigate the impact of a noisy oracle, which non-adversarially generates erroneous labels for certain classes. We randomly change the ground-truth labels for $20\%$ of the data to reflect incorrect labeling. Figure~\ref{fig:noisy} shows that \algname outperforms the other three baselines by a large margin, even though all four methods are greatly affected by noisy labels, with performance dropping by $30\%$.
\paragraph{Class-Imbalance}
The issue of class imbalance, where some classes are underrepresented compared to others, can significantly affect the performance of active learning algorithms. To investigate the robustness of \algname in such scenarios, we follow class-imbalanced settings similar to \citet{killamsetty2021glister} and artificially generate class imbalance for the
above dataset by removing $20\%$ of the instances from $30\%$
of the total classes available.
% we artificially created class imbalance by flipping the ground truth labels of $20\%$ of the training data to different classes.
%We artificially generate class-imbalancedness by where $20\%$ of groundtruth labels of training data are flipped to other remaining labels. 
Figure \ref{fig:imbalance} illustrates that \algname exhibits a much greater advantage over other baselines.



% \begin{table}[!h]
% \centering
% % \zixin{MNIST Dataset}
% \caption{Seed Set size 200 for Dataset MNIST}
% \label{BilevelTraining2}
% \begin{tabular}{@{}l*{4}{p{2cm}}@{}}
%     \toprule
%     Stage 1 Budget & non-bilevel + OT + RankNet & bilevel + non-OT + RankNet & bilevel + OT + Non-RankNet & bilevel + OT + RankNet \\
%     \cmidrule(r){2-5} 
%     500 & 94.2(0.001) & 94.4(0.001) & [Your Data] & \textbf{96.1(0.001)}  \\
%     700 & 94.9(0.001) & 94.6(0.002) & [Your Data] & \textbf{96.4(0.001)} \\
%     1000 & 94.6(0.001) & 95.2(0.003) & TBD & \textbf{97.8(0.001)}  \\
%     \midrule
%     % You can continue with more rows here.
% \end{tabular}
% \end{table}


        % Utility Model (Non-Bilevel) & TBD & TBD & 80.1(0.003)\\
        % BADGE & 74.6(0.005) & \underline{77.6(0.006)} & 79.6(0.004) \\
        % CoreSet & \underline{75.2(0.003)} & 75.6(0.002)& 79.5(0.004)  \\
        % GLISTER & 75(0.003) & 77(0.005)&\textbf{81.5(0.007)}  \\
        % Margin & \textbf{75.7(0.002)} & 77.2(0.005) & 79.9(0.004)\\
        % random & 74.7(0.005) & 75.8(0.004) &79.2(0.003) \\
    %     \bottomrule
    % \end{tabular}
% \end{table}

% \begin{table}[]
% \label{BilevelTraining2}
%     \centering
%     \caption{Seed Set size 2500}
%     \begin{tabular}{@{}lccc@{}}
%         \toprule
%         & \multicolumn{3}{c}{SVHN} \\
%         \cmidrule(r){2-4} 
%         Labeling Budget & 5000 & 7000 &10000 \\
%         \midrule
%         \textbf{Utility Model (Bilevel)} & \textbf{88.1(0.002)} &  \textbf{89.1(0.002)} &\textbf{90.2(0.001)} \\
%         Utility Model (Non-Bilevel) & TBD & TBD & 80.1(0.003)\\
%         BADGE & 74.6(0.005) & \underline{77.6(0.006)} & 79.6(0.004) \\
%         CoreSet & \underline{75.2(0.003)} & 75.6(0.002)& 79.5(0.004)  \\
%         GLISTER & 75(0.003) & 77(0.005)&\textbf{81.5(0.007)}  \\
%         Margin & \textbf{75.7(0.002)} & 77.2(0.005) & 79.9(0.004)\\
%         random & 74.7(0.005) & 75.8(0.004) &79.2(0.003) \\
%         \bottomrule
%     \end{tabular}
% \end{table}

% \begin{table}[]
%     \centering
%     \caption{Varying Labeled data for Updating Stage 1 with Stage 2 Labeling Budget 5000 for Dataset CIFAR100}
%     \begin{tabular}{@{}lcccccc@{}}
%         \toprule
%         & \multicolumn{4}{c}{Stage 1} \\
%         \cmidrule(r){1-5} %\cmidrule(lr){7-8}
%         Labeled Data for Stage 1 &1500 & 3500 & 4500 & 5500 \\
%         \midrule
%         \textbf{Utility Model} & \textbf{32(0.004)} & TBD & TBD & TBD \\
%         BADGE & 25.7(0.008) & 82.4(0.003) & 80.6(0.004) & 44.4(0.001) \\
%         CoreSet & \underline{28(0.005)} & 79.1(0.003) & 79.9(0.003) & 44.9(0.008) \\
%         GLISTER & 23.6(0.005) & 81.4(0.003) & 80.8(0.003) & CHNAGE \\
%         Margin & 26.9(0.004) & \underline{82.5(0.003)} & \underline{82.7(0.002)} & 44.6(0.006) \\
%         random & 26.7(0.004) & 81(0.002) & 79.1(0.003) & 43.2(0.005) \\
%         \bottomrule
%     \end{tabular}
% \end{table}

% \begin{table}[t]
% \caption{Few Rounds Comparisons}
% \label{FewRoundsComparison}
% \begin{center}
% \begin{tabular}{lcr}
% \\ \hline \\
% \multicolumn{1}{c}{\bf Labeling Budget}  &\multicolumn{1}{c}{\bf 500} &\multicolumn{1}{c}{\bf 1000}
% \\ \hline \\
% Dendrite    &.89     &Input terminal \\
% Axon        &.72     &Output terminal \\
% Soma        &.60     &Cell body (contains cell nucleus) \\
% \end{tabular}
% \end{center}
% \end{table}


% End to End evaluation
% MNIST (mnistnet) on 50-150 with 20 as batch size 
% Utility Samples 30 Epochs 30 for 3 rounds
% Labeling budget 350
% https://arxiv.org/pdf/2106.15324.pdf
% FashionMNIST (mnistnet)
% % USPS (resnet18)
% See Appendix B
% (Resnet18 pretrained features)
% SVHN (200-300 data points per batch for 30 rounds)
% Utility Samples (30) for epochs 30 for mid point 3 and 4
% Utility Samples (20) for epochs 20 for mid point 5
% (resnet18)
% CIFAR10 (200-300 data points per batch for 30 rounds)
% (10 utility samples for pretraining 3500 and one round budget 10000)
% (resnet18)
% CIFAR100 
% IMDB (move to appendix)
% tiny-Imagenet (possible)

% \begin
% Fixed same stage1 evaluate on stage2 with model update
% Fixed same stage1 evaluate on stage2 without model update
% For Glister Fix $T = L$






% \subsubsection{Comparison to Learning-based Active Learning Baselines}
% \begin{itemize}
%     \item Glister
%     \item Badge
%     \item Coreset
%     \item Random
%     \item Margin
%     \item Oracle: Accuracy
% \end{itemize}



%{Image Classification Datasets}

