% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{float}
\usepackage{bm}
\usepackage{graphicx}
\usepackage{tabularx}
\usepackage{wrapfig}
\usepackage{enumitem}

% math defns
% 	vector notation
\newcommand{\vct}[1]{\boldsymbol{#1}}
%   matrices
\newcommand{\mtx}[1]{\boldsymbol{#1}}

\newcommand{\vx}{\vct{x}}
\newcommand{\vz}{\vct{z}}
\newcommand{\vzero}{\vct{0}}


\newcommand{\mG}{\mtx{G}}
\newcommand{\mZ}{\mtx{Z}}

\newcommand{\E}{\operatorname{\mathbb{E}}}
\newcommand{\R}{\mathbb{R}}


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{nadagouda_566}


\title{Active Metric Learning and Classification using Similarity Queries\\(Supplementary Material)}

\author{%
  Namrata Nadagouda \qquad Austin Xu \qquad Mark A. Davenport
  ~\\
  School of Electrical and Computer Engineering\\
  Georgia Institute of Technology\\
  Atlanta, Georgia, USA 
}

\begin{document}

\onecolumn 
\maketitle

\appendix

\section{Plackett-Luce Model Details}
The Plackett-Luce model is derived from an assumption, Luce's choice axiom \citep{luce1959individual}, also known as the Independence of Irrelevant Alternatives (IIA), which states that the presence of other items in a choice set does not change the relative probabilities of choosing items in the set. This is a reasonable assumption in our setting. This model belongs to a family of discrete choice models which are commonly used to describe situations where a selection is made from a set of options. Such scenarios are encountered widely in the fields of economics \citep{train2009discrete}, for example, to explain the choice made by a company on whether or not to launch a product into the market, in psychology \citep{tversky1981framing} to interpret the choices made by humans in everyday situations and, more recently, in computer science \citep{rosenfeld2019predicting} to model choices made by a user in online platforms. 


%Metric learning: experimental setup, more results
\section{Metric Learning} 

In this section, we provide precise experimental details and highlight additional metric learning experimental results for both DML and non-parametric embedding learning via MDS.

\subsection{Deep metric learning}

\paragraph{Neural network architectures and learning rates.}
For the DML experiments, we utilize the following network architectures and learning rates for the three datasets. We utilize networks consisting only of fully connected layers with ReLU nonlinearities inserted between all layers.
\begin{itemize}[leftmargin=*]
    \item \textbf{Mahalanobis Metric Dataset: } Fully connected layers of sizes 32, 48, and 10, respectively. Learning rate: $0.0001$
    \item \textbf{Food73 Dataset: } Fully connected layers of sizes 12, 12, and 12, respectively. Learning rate: $0.0005$
    \item \textbf{Graduate Admissions Dataset: } Fully connected layers of sizes 16, 12, and 10, respectively. Learning rate: $0.0001$
\end{itemize}
We utilize the same learning rate for re-training models across all methods (random, Info-NN, Batch-Euclidean/Centroid).

\paragraph{Experiment parameters.}
In all experiments, we utilized a value of $\mu = 0.00001$ for the probability model and utilized $20$ initialization triplets. Batch sizes of $10$ (synthetic), $30$ (food and graduate admissions) are used. Furthermore, for Info-NN experiments, we utilize the following values for hyperparameters $\sigma^2$ (distance distribution variance), $n_s$ (number of samples used to compute mutual information), $B$ and $B^\prime$ (number of top most informative queries selected per batch):
\begin{itemize}[leftmargin=*]
    \item \textbf{Synthetic Mahalanobis Metric Dataset: } $\sigma^2 = 1$, $n_s = 100$, $B^\prime = 10 = B$
    \item \textbf{Food73 Dataset: } $\sigma^2 = 6.5$, $n_s = 1{,}000$, $B^\prime = 5$
    \item \textbf{Graduate Admissions Dataset: } $\sigma^2 = 10$, $n_s = 1{,}330 ~(= 10N)$, $B^\prime = 5$
\end{itemize}
As reported in the main paper, we used batch sizes of $10, 30,$ and $30$ for the Mahalanobis, food, and admissions datasets respectively. These batch sizes are the sizes of the NN queries collected. For any method using triplets, the batch size is doubled, resulting in batch sizes of $20, 60,$ and $60$, respectively. This is done so we can compare both on a per-query and per-triplet basis. To set such parameters, a coarse grid search was performed to find the best performing parameters. 

We compared our method against two baselines found in \citet{kumari2020batch}. These baselines follow the same general approach of weighting informativeness (measured using entropy) and diversity (measured using various metrics such as the Euclidean distance of all permutations of the triplet or the centroid of the three points selected in the triplet) for an \textit{overcomplete} batch size. We utilize an overcompleteness factor of $3$, which indicates that for a batch of $B$ triplets, the $3B$ most informative triplets are identified. The informativeness of each of the $3B$ triplets is then weighted by its diversity, and the top $B$ triplets are then presented to the oracle. From studies performed by \citet{kumari2020batch}, anything above a factor of $2$ exhibits roughly the same performance.

\paragraph{Additional embedding visualizations.}
Models used to generate all embedding visualizations, including those shown in the main paper, used the same number of triplets. We present an additional visualization of the Food73 dataset embedding learned with the Batch-Centroid and Batch-Euclidean methods in Fig.~\ref{fig:food_viz_batch}. In comparison to the embedding learned with Info-NN (Fig. 3 in main paper), the embedding learned with Batch-Centroid after the same number of triplets does a poorer job of grouping together vegetables. 

We also present a visualization of the embedding learned via Batch-Euclidean on the Graduate Admissions dataset in Fig.~\ref{fig:adm_vis_euclid}. Comparing embeddings learned with Info-NN and Batch-Centroid (Fig. 5 in main paper) and Batch-Euclidean, it is clear that Info-NN selects queries that more closely group highly ranked candidates together. However, none of the methods visualized are able to completely cluster candidate tiers distinctly; for all three methods, admitted students (fellowship and non-fellowship) are intermingled with candidates in the first and second rejection tiers. 

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\begin{subfigure}{\textwidth}
    \centering
    \includegraphics[scale=0.25]{figures/supp/batch_centroid_vis.png}
\end{subfigure}
\vskip 0pt
\begin{subfigure}{\textwidth}
    \centering
    \includegraphics[scale=0.25]{figures/supp/batch_euclidean_vis.png}
\end{subfigure}
\caption{Visualization of food embedding learned using queries selected with Batch-Centroid (top) and Batch-Euclidean (bottom) generated using t-SNE \citep{maaten2008visualizing}.}
\label{fig:food_viz_batch}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.4\textwidth]{figures/supp/adm_euclidean_vis.png}
    \caption{Visualization of admissions embedding learned using queries selected with Batch-Euclidean generated using t-SNE \citep{maaten2008visualizing}.}
    \label{fig:adm_vis_euclid}
\end{figure}

\paragraph{Additional results on Graduate Admissions dataset.}
Results for additional values of $K$ for Recall@$K$ and TopFraction@$K$ are presented in Fig.~\ref{fig:dml_recall23} and Fig.~\ref{fig:dml_frac23}, respectively. On both a per-triplet and per-query basis, Info-NN performs the best for all values of $K$. We note that for Recall@$K$ for larger values of $K$, all methods perform roughly the same and perform well. This is because the dataset contains a large number of tier 4 rejections, which every method is able to successfully group together, inflating the Recall@$K$ value. Thus, we believe that the TopFraction@$K$ results do a better job of illustrating how the method does in selecting queries that group admitted or more highly ranked candidates together. 

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_recall2.png}
    \label{fig:dml_adm_recall2}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_recall3.png}
    \label{fig:dml_adm_recall3}
\end{subfigure}
\hspace*{\fill}
\vskip 0pt
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_recall2_queries.png}
    \label{fig:dml_adm_recall2_queries}
\end{subfigure}
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_recall3_queries.png}
    \label{fig:dml_adm_recall3_queries}
\end{subfigure}
\hspace*{\fill}
\caption{\small Per-triplet (top) and per-query (bottom) comparison for Info-NN against other methods. Recall@$2$ (left) and Recall@$3$ (right).} 
\label{fig:dml_recall23}
\end{figure}

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_topk2nn.png}
    \label{fig:dml_adm_frac2}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_topk3nn.png}
    \label{fig:dml_adm_frac3}
\end{subfigure}
\hspace*{\fill}
\vskip 0pt
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_topk2nn_queries.png}
    \label{fig:dml_adm_frac2_queries}
\end{subfigure}
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/adm_info_rand_euclid_cent_topk3nn_queries.png}
    \label{fig:dml_adm_frac3_queries}
\end{subfigure}
\hspace*{\fill}
\caption{\small Per-triplet (top) and per-query (bottom) comparison for Info-NN against other methods. TopFraction@$25$ (left) and TopFraction@$30$ (right).} \vspace{-5mm}
\label{fig:dml_frac23}
\end{figure}


\subsection{MDS Embedding Learning}

We perform a set of experiments which utilize MDS to learn representations of the items. In particular, we use this opportunity to compare the performance of NN queries against a more complex ranking query \citep{canal2020active}. When comparing against ranking queries, it is important to note that \textbf{we expect both actively selected and randomly selected ranking queries to outperform a nearest neighbor query of the same size on a per-query basis}, as there is a discrepancy in the amount of information each query contains. All experiments were performed on a 2019 MacBook Pro, 2.6 GHz 6-Core Intel i7, 16 GB RAM.

\paragraph{Data generation.}
In each simulation, the ground truth embedding consists of points drawn independently from a multivariate Normal distribution with mean $\vzero$ and covariance matrix $\mtx{I}$. We utilize a deterministic oracle, which orders the items based on their true distances from the selected reference object and generate a new initialization embedding with entries drawn uniformly at random from $[0,1]$ for every trial.

\paragraph{Experiment parameters.} 
For both the Info-NN vs. Random-NN and Info-NN vs. Ranking experiments, we utilize a diminishing $\mu$ parameter. For each active learning iteration $k \in \{1, \ldots, K\}$, we set $\mu = D_{\text{max}}(0.99)^k$, where $D_{\text{max}}$ is the maximum pairwise distance in the current estimate of the embedding. As presented in \citet{tamuz2011adaptively}, the $\mu$ parameter can be thought of as a margin. With a diminishing $\mu$, we are enforcing a stricter margin in the earlier stages of learning, when our estimate of the embedding is poor. As the number of active learning cycles increases, our estimate of the embedding should improve, thus lessening the need for a larger margin. Multiple other options for $\mu$ were considered, such as setting $\mu$ to a constant or the maximum of all pairwise distances, but we found that the diminishing $\mu$ worked well for the MDS synthetic embedding learning experiments. We utilized a step size of $\alpha = 0.5$ for probabilistic MDS. This parameter was not finely tuned. We observed similar performance as long as $\alpha$ is reasonably small ($\alpha < 1$). 

\paragraph{Probabilistic multidimensional scaling.}
To fit an embedding using nearest neighbor or ranking queries, we first decompose the query response into a set of paired comparisons and store these paired comparisons in $\mathcal{S}$. A nearest neighbor query of size $M$ can be decomposed into $M-1$ paired comparisons and, similarly, a ranking query of size $M$ can be decomposed into $\frac{M(M-1)}{2}$ paired comparisons. Thus, the active embedding technique framework is general enough to accommodate both query types. We then utilize a version of the probabilistic multidimensional scaling (MDS) approach presented in \citet{tamuz2011adaptively}. Starting with some input embedding $\mZ$, we perform a fixed number of gradient descent iterations with a fixed step size $\alpha$ (not necessarily to convergence) on the empirical log-loss   
\[
\ell_{\mathcal{S}}(\mZ) = \frac1{|\mathcal{S}|}\sum\limits_{i=1}^{|\mathcal{S}|} \log\frac{1}{p_{Q_i}},
\] where for $Q_i = r_i \cup \{t_i^{(1)}, t_i^{(2)}\}\in \mathcal{S}$
\[
p_{Q_i}(Y_i = 1 ~|~ D_{Q_i}) = \frac{(D_{i1}^2 + \mu)^{-1}}{(D_{i1}^2 + \mu)^{-1} + (D_{i2}^2 + \mu)^{-1}},
\] 
with $D_{Q_i} \coloneqq \{ D_{i1}, D_{i2} \}$, $D_{i1} \coloneqq D_{r_i,t_i^{(1)}}$, and $D_{i2} \coloneqq D_{r_i,t_i^{(2)}}$. That is, we perform updates of the form $\mZ = \mZ - \alpha\nabla \ell_{\mathcal{S}}(\mZ)$. 

Our active embedding strategy, utilizing probabilistic MDS, is as follows: Starting with an initial embedding $\mZ_0$, we initialize our algorithm by running probabilistic MDS on $\mZ_0$ with $K_0$ randomly drawn queries to obtain $\mZ_1$.
At each iteration $k > 0$, we alternate between the following: 
\vspace{-1mm}
\begin{enumerate}
    \item Fix each column in $\mZ_{k}$ as the reference data point, run Info-NN to find the query that maximizes mutual information with respect to the reference, and choose the query with the maximum mutual information over all $N$ reference data points.
    \item Solicit a response from the oracle for the chosen query, append the paired comparison decomposition to $\mathcal{S}$, and apply probabilistic MDS to $\mZ_{k}$ with the updated $\mathcal{S}$ to obtain $\mZ_{k+1}$. 
\end{enumerate}
\vspace{-1mm}
The full procedure can be found in Alg.~\ref{alg:active_emb}.


\begin{algorithm}[t]
\caption{Info-NN-M: Active Embedding Technique}
\label{alg:active_emb}
\begin{algorithmic}
\REQUIRE Embedding $\mZ_{\text{init}} \in \R^{D \times N}$, query length $M$, number of active learning cycles $K$, burn-in period $K_0$, number of samples $n_s$, number of MDS iterations $K_{\text{MDS}}$, MDS step size $\alpha$
\STATE $\mathcal{S} \leftarrow \{\}$ 
\FOR{$k = 1,\ldots,K_0$}
\STATE $Q_k \leftarrow $ query of size $M$ drawn uniformly at random
\STATE $y_k \leftarrow $ oracle response to $Q_k$
\STATE $\mathcal{S} \leftarrow \mathcal{S} ~\cup (y_k, Q_k)$ 
\ENDFOR
\STATE $\mZ_0 \leftarrow \text{probabilisticMDS}(\mZ_{\text{init}}, \mathcal{S}, K_{\text{MDS}}, \alpha)$
\FOR{$k = 1,\ldots,K$}
\STATE $(I, Q) \leftarrow \{\}$ (Store highest MI value and corresponding query for all references)
\FOR{$j = 1,\ldots,N$}
    \STATE $Q_j \leftarrow$ Set of all queries of size $M$ for which to compute MI with $j$ as reference item
    \STATE $I_j \leftarrow \text{Info-NN-distances}(\mZ_{k-1}, Q_j, n_s)$ (Compute MI for each query)
    \STATE $(I, Q) \leftarrow (I, Q) \cup (\max I_j, \arg\max I_j)$ (Store query in $Q_j$ with highest MI)
\ENDFOR
\STATE $Q_{j^{\star}} \leftarrow $ Query in $(I, Q)$ with highest corresponding value in $I$
\STATE $y_{j^{\star}} \leftarrow $ Oracle response to $Q_{j^{\star}}$
\STATE $\mathcal{S} \leftarrow \mathcal{S} ~\cup (y_{j^{\star}}, Q_{j^{\star}})$
\STATE $\mZ_k \leftarrow \text{probabilisticMDS}(\mZ_{k-1}, \mathcal{S}, K_{\text{MDS}}, \alpha)$
\ENDFOR\\
%\algorithmicoutput ~$\mZ_k$
\end{algorithmic}
\end{algorithm}

\paragraph{Evaluation metrics.}
To quantify the performance of our approach, we examine how well our recovered embedding preserves the rank ordering of the items. To do so, we use the Kendall's Tau rank correlation coefficient \citep{kendall1938new}. To capture the holistic quality of the learned embedding, we set each object as the reference object, rank all other items based on distance to the reference object, and compute the Kendall's Tau between that ranking and the ranking induced by the ground-truth embedding with the same reference object. We then define the \emph{aggregate Kendall's Tau} as the mean of all of these Kendall's Tau coefficients. In our simulations we consider multiple trials and we report the median aggregate Kendall's Tau and the 25\% and 75\% quantiles. 

For the following experiments, \emph{Info-Ranking-M} means the active selection method in \citet{canal2020active} was used to select ranking queries each with a set $T_i$ of size $M$.

\paragraph{Info-NN vs. Random-NN.} 
In the first simulation, we quantify the improvement in using the adaptive algorithm over randomly selected nearest neighbor queries. In particular, we fix $N = 20$, $D = 2$ or $D = 5$, use $K_0 = 20$ initial random queries, and examine the performance for queries of sizes $M = 2, 3, 4$, and $5$. % For a fixed query size, we perform 20 trials and report the median and 25\% and 75\% quantiles of the aggregate Kendall's Tau. For each trial, we use a new embedding. 

As shown in Fig.~\ref{fig:top1randadapt}, for all query sizes the learned embedding is significantly better when queries are selected actively rather than at random. Notably, Info-NN-$3$ queries exceed the performance of randomly selected size 4 and 5 queries despite being smaller. Randomly selected nearest neighbor queries of sizes $3, 4,$ and $5$ all performed similarly, indicating that randomly selected queries contain redundant information that cannot be overcome solely by increasing the query size. %This matches our intuition about \eqref{eq:MI_2} in that our algorithm avoids selecting redundant queries. 

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\begin{subfigure}{0.43\textwidth}
    \includegraphics[width=\textwidth]{figures/probmds/length456_NN_D2_neurips.png}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.43\textwidth}
    \includegraphics[width=\textwidth]{figures/probmds/length456_NN_D5_neurips.png}
\end{subfigure}
\hspace*{\fill}
\caption{Comparison of actively selected nearest neighbor queries and randomly selected nearest neighbor queries for $D = 2$ (left) and $D = 5$ (right). Info-NN outperforms randomly selected queries in all cases, even outperforming randomly selected queries of larger size in some cases. Gradient step parameters: $500$ iterations, step size = $0.5$.} \vspace{-5mm}
\label{fig:top1randadapt}
\end{figure}


\paragraph{Info-NN vs. Ranking.}
In the second simulation, we compare the performance of actively selected nearest neighbor queries against ranking queries \citep{canal2020active}. We observe that nearest neighbor queries perform competitively to ranking queries, as illustrated in Fig.~\ref{fig:top1ranking}. Again, we fix $N=20$, $D = 2$, utilize $K_0 = 20$ initial random queries, and examine the performance of Info-NN queries of sizes $3, 4,$ and $5$ and ranking queries of sizes $3$ and $4$. % We report the median and 25\% and 75\% quantiles of the aggregate Kendall's Tau over 20 trials in Fig.~\ref{fig:top1ranking}. Again, we generate a new  embedding for every trial.

We observe that the nearest neighbor query exhibits similar performance to randomly selected ranking queries, despite the ranking queries containing twice as many paired comparisons as a nearest neighbor query. Info-NN-$3$ queries are able to match randomly selected ranking queries of the same size, while Info-NN-$4$ queries exceed the performance of randomly selected ranking queries of size $3$, while almost matching the performance of actively selected size $3$ ranking queries. Employing Info-NN can nearly compensate for the difference in information between nearest neighbor and ranking queries, highlighting an advantage in the trade-off between complexity and ``information density'' (the number of triplets contained in one query).

\begin{figure}[t]
    \centering
    \includegraphics[width=0.43\textwidth]{figures/probmds/length456_ranking_NN_D2_neurips.png}
    \caption{Comparison of actively selected nearest neighbor queries and actively selected and randomly selected ranking queries. Info-NN is competitive with a randomly selected ranking query of the same size. Gradient step parameters: $500$ iterations, step size = $0.5$.}
    \label{fig:top1ranking}
\end{figure}

\subsection{Active Selection Computational Comparison}
While our mutual information computation strategy is similar, utilizing NN queries results in computational advantages when compared to the ranking query used in \citet{canal2020active}. To compare the time discrepancy between computing mutual information for ranking and nearest neighbor queries, we perform $10$ iterations of our embedding technique, and record the amount of time it takes to compute the mutual information for each object as the reference object. We then report the average and standard deviation of the times taken. We use the same parameters for each active learning algorithm, such as number of queries to consider and number of distance samples generated. In Table \ref{tab:rank_nn_timing}, we report the average amount of time it takes to compute the mutual information for a given reference object for differently sized queries in actively selecting nearest neighbor queries using Alg.~\ref{alg:active_emb} and the method presented in \citet{canal2020active}. The drastic discrepancy in timing between the two methods is due primarily to the fact that the nearest neighbor mutual information computation does not require computation for all possible permutations of the set of $M$ items, whereas the ranking query does.

\begin{table}[t]
    \centering
    \caption{Timing results, in seconds, for computing mutual information for nearest neighbor and ranking queries. Experiments performed on 2019 MacBook Pro, 2.6 GHz 6-Core Intel i7, 16 GB RAM.} 
    \vspace{2mm}
    \label{tab:rank_nn_timing}
    \resizebox{0.5\textwidth}{!}{
    \begin{tabular}{l c c c}
        \toprule
        & $M = 2$ & $M = 3$ & $M = 4$\\
        \midrule
        NN & $0.0265 \pm 0.0036$ & $0.1509 \pm 0.0044$ & $0.6634 \pm 0.0812$ \\
        Ranking & $ 0.6605\pm0.0583 $ & $ 8.5394\pm0.3400 $ & $175.0046\pm93.2602 $\\
        \bottomrule
    \end{tabular}}
\end{table}


%Classification: experimental setup, more results
\section{Classification}

\subsection{Algorithms}

A description of the active classification framework and the complete Info-NN query strategy utilized to select samples for labeling is given below.

\begin{algorithm}
\caption{Active Learning for Classification}
\label{alg:al_classification}
\begin{algorithmic} 
\REQUIRE Dataset $\mathcal{X} = \{\vx_i\}_{i=1}^N$, batch size $B$, number of classes $C$, number of samples $n_s$
\STATE $\mathcal{L}_0 \leftarrow \{(\vx_i, y_i)\}_{i=1}^j$ initial (balanced) labeled dataset
\STATE $\mathcal{U}_0 \leftarrow \{\vx_i\}_{i=j+1}^N$
\STATE $M_0 \leftarrow \text{Model trained on } \mathcal{L}_0$
\FOR{$k = 1,\ldots, K$}
\STATE $\mathcal{B}_k \leftarrow \text{Info-NN-$M$} (M_{k-1}, \mathcal{L}_{k-1}, \mathcal{U}_{k-1}, B, C, n_s)$
\STATE $\mathcal{L}_k \leftarrow \mathcal{L}_{k-1} \cup \{(\vx_i,y_i) : \vx_i \in \mathcal{B}_k\}$
\STATE $\mathcal{U}_k \leftarrow \mathcal{U}_{k-1} \backslash \mathcal{B}_k$
\STATE $M_k \leftarrow \text{Model trained on } \mathcal{L}_k$
\ENDFOR\\
%\algorithmicoutput ~$M_K$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{Info-NN-$M$}
\label{alg: info_nn_cluster}
\begin{algorithmic} 
\REQUIRE Model $\mathcal{M}$, labeled set $\mathcal{L}$, unlabeled set $\mathcal{U}$, batch size $B$, number of classes $C$, number of samples $n_s$
\STATE $\mZ_{\mathcal{L}}$ = Compute Embedding ($\mathcal{L}$)
\STATE $\mZ_{\mathcal{U}}$ = Compute Embedding ($\mathcal{U}$)
\STATE $Q \leftarrow \{\}$ (Set of candidate queries)
\FOR{$u \in \mZ_{\mathcal{U}}$}
\STATE $\text{NN}_u \leftarrow$ Top $M$ nearest neighbors 
\STATE $Q_u \leftarrow u \cup \text{NN}_u$
\STATE $Q \leftarrow Q \cup Q_u$
\ENDFOR
\STATE $I \leftarrow $Info-NN-distances$(\mZ_{\mathcal{U}}, Q, n_s)$
\STATE $G(\mathcal{U}) \leftarrow$ K-Means Clustering ($\mathcal{U}, \mathcal{L}$)
\STATE $\mathcal{B} \leftarrow$ unlabeled samples corresponding to top values of $I$ from every cluster\\
\end{algorithmic}
\end{algorithm}

\subsection{Experimental Details}

\paragraph{Computational infrastructure.}

The experiments were performed on a combination of three desktop machines with the following configurations:
\begin{enumerate}[leftmargin=*]
    \item A $3.80$\,GHz $16$-Core Intel i7-9800X CPU and an Nvidia Quadro RTX $5000$ GPU
    \item A $2.10$GHz $20$-core Intel Xeon Gold $6230$ CPU and four Nvidia Quadro RTX $6000$ GPUs
\end{enumerate}

\paragraph{Datasets.}

Below are the details of the real-world datasets used in the classification experiments.
\begin{itemize}[leftmargin=*]
    \item MNIST \citep{lecun1998gradient} is a dataset of black and white images of handwritten digits belonging to $10$ classes and consists of $60{,}000$ training samples and $10{,}000$ test samples.
    \item CIFAR-10 \citep{krizhevsky2009learning} is a dataset consisting of colour images belonging to $10$ classes with $50{,}000$ training samples and $10{,}000$ test samples.
    \item SVHN \citep{netzer2011reading} consists of digits (10 classes) from natural scene RGB images with $73{,}257$ training samples and we use $10{,}000$ samples for testing the accuracy of the learned models.
\end{itemize}
 

\paragraph{Baselines.}

The details of the baseline active labeling methods used are as follows.
\begin{itemize}[leftmargin=*]
    \item BatchBALD: Samples are selected according to the algorithm described in \citet{kirsch2019batchbald}. The algorithm uses Monte-Carlo (MC) sampling to compute joint probabilities of the different labeling configurations in a batch of samples, which is very memory intensive. This requires the pool of unlabeled data to be sub-sampled in order for the computations to be feasible. The number of MC samples for the computations and the size of the pool set were determined by the memory associated with the GPUs. We use $10^3$ MC samples and the sizes of the pool set used were $20{,}000$ for MNIST and $5{,}000$ for both CIFAR-$10$ and SVHN. We would like to note here that we did not perform extensive experimentation to determine an optimal configuration of the number of MC samples and size of the pool set but decided on a configuration based on the settings that did not result in running out of GPU memory.
    \item K-Center: Optimal samples that achieve the desired coverage, based on the distances in the embedding space learned by the network, are selected. This method is based on the algorithm described in \citet{sener2017active}.
    \item MaxEntropy: The top unlabeled samples with the maximum entropy, computed based on the class probabilities predicted by the model, are chosen.
    \item Random: A batch of samples is drawn at random from the pool for labeling.
\end{itemize}

\paragraph{Models and training methodology.}

In all the experiments, the models are trained from scratch at every active learning cycle. The performance reported is measured on a holdout test set comprising $10{,}000$ samples in all the experiments.

\textbf{MNIST:} For experiments on the MNIST dataset, we use a model similar to the one used in \citet{kirsch2019batchbald}. Specifically, we use a CNN consisting of two convolutional blocks followed by two fully connected layers. The two convolutional blocks consist of $32$ and $64$ filters of kernel size $5$, each followed by layers of dropout, max-pooling and relu units. The two fully connected layers, of size $128$ and $10$ respectively, also have a dropout unit between them. We use a probability of $0.5$ for all dropout units. 

The data inputs to the model are normalized and batch sizes of $64$ and $128$ are used while training and testing respectively. We use the Adam optimizer with a learning rate of 0.001. Since the size of the labeled set used in these experiments is small compared to the entire dataset, we use early stopping to ensure that the model does not overfit to the training data. We use a validation set of size 100 consisting of 10 samples from every class selected at random and we stop training after 10 consecutive epochs of increasing validation loss. 

\textbf{CIFAR-10 and SVHN:} For both the datasets, we use a ResNet-18 \citep{he2016deep} to conduct the experiments. While training, the data inputs are normalized along with augmentation techniques consisting of random cropping with an output size of $32$ and a padding of $4$ and random horizontal flipping. The model is trained for $250$ epochs using the Adam optimizer with a learning rate of $0.001$ in combination with the cosine annealing scheduler. A batch size of $128$ is used for both training and testing.

\paragraph{Mutual information estimation for Info-NN.} The parameter $\mu$ is set equal to the maximum value of the inter-sample distances in the embedding space. The variance for the normal distribution of distances is set as the variance of all the distances in the embedding space and $100$ samples from the distributions are used for inference. These values were found to work well in all the experiments and an extensive and systematic search for these hyperparameters was not performed.

\subsection{Additional Results}

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/classification/mnist_info_nn_2_versions.png}
    \label{fig:mnist_2_versions}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/classification/mnist_diff_query_lengths_2.png}
    \label{fig:mnist_diff_query_lengths}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/classification/cifar10_diff_query_lengths_2.png}
    \label{fig:cifar10_diff_query_lengths}
\end{subfigure}
\caption{Active classification experiments: Comparison of the performances of Info-NN with and without clustering using a batch size of 3 on the MNIST dataset (left). Performance comparison between Info-NN queries of different lengths on MNIST (center) and CIFAR-10 (right) datasets.}
\label{fig:classification_more_results}
 \end{figure}

\begin{figure}[t]
\captionsetup[sub]{justification=centering}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/classification/entropy_visualization_all_classes.png}
\end{subfigure}
\hspace*{\fill} % separation between the subfigures
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/classification/info_nn_visualization_all_classes.png}
\end{subfigure}
\hspace*{\fill}
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/classification/coreset_visualization_all_classes.png}
\end{subfigure}
\caption{Visualization of samples selected on MNIST with MaxEntropy (left), Info-NN-$3$ (center) and K-Center (right) querying strategies, generated using UMAP \citep{mcinnes2018umap}. Each of the blobs corresponds to one of the 10 classes and the samples selected are indicated by black crosses.}
\label{fig:samples_selected}
\end{figure}

\paragraph{Performance plots.}

We compare the performance of Info-NN with and without clustering (i.e., the top $B$ samples are selected solely based on informativeness) on MNIST. The results are illustrated in Fig.~\ref{fig:classification_more_results} where we can observe the improved performance realized by Info-NN when combined with clustering.

Also, we conducted experiments on MNIST and CIFAR-$10$ datasets to determine the optimal query length for Info-NN. In Fig.~\ref{fig:classification_more_results}, we can observe that queries of length $3$ resulted in the best performance on MNIST, significantly outperforming queries of longer lengths. On CIFAR-$10$, while all of them seem to exhibit a similar performance, queries of length $3$ outperform the others consistently. Thus, we use queries of length $3$ in all the experiments with supervised classification.


\paragraph{Visualizations.}

The samples selected by different active methods are illustrated in Fig.~\ref{fig:samples_selected}. We can observe that MaxEntropy tends to select redundant informative samples, as indicated by clusters of black crosses, whereas K-Center selects samples to ensure diversity, as indicated by the more distributed placement of the selected samples. In the case of Info-NN, we see a combination of clustered and distributed samples, suggesting that it selects samples that are both informative and diverse.



\bibliography{nadagouda_566}

\end{document}
