%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[table,xcdraw]{xcolor}
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{multicol}
% \usepackage{graphicx}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{float}
\usepackage{pifont}


\usepackage{kbordermatrix}
\usepackage{tikz}
\usepackage{lipsum} % Provides sample text


% \usepackage[table,xcdraw]{xcolor}
\usepackage{hyperref}
\usepackage{bbm}
\usepackage{dsfont}
\usepackage{subcaption}

\usepackage{multirow}

\usepackage{amssymb}
\usepackage{amsthm}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{property}[theorem]{Property}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\usepackage[ruled,vlined]{algorithm2e}
\input{math_commands}


\newcommand{\maxime}[1]{\textit{\color{blue}{[Max: #1]}}}
\newcommand{\ola}[1]{\textit{\color{red}{[Ola: #1]}}}
\newcommand{\aud}[1]{\textit{\color{magenta}{[Audrey: #1]}}}

\newcommand{\Cesa}{{\fontfamily{qcr}\selectfont{Cesa}}}
\newcommand{\Margin}{{\fontfamily{qcr}\selectfont{Margin}}}
\newcommand{\Gappeltron}{{\fontfamily{qcr}\selectfont{Gappletron}}}
\newcommand{\ALPS}{{\fontfamily{qcr}\selectfont{ALPS}}}
\newcommand{\Neural}{{\fontfamily{qcr}\selectfont{Neural}}}

\newcommand{\CBP}{{\fontfamily{qcr}\selectfont{CBP}}}
\newcommand{\CBPsidestar}{{\fontfamily{qcr}\selectfont{CBPside$^\star$}}}
\newcommand{\NeuralCBP}{{\fontfamily{qcr}\selectfont{NeuralCBP}}}
\newcommand{\INeural}{{\fontfamily{qcr}\selectfont{INeural}}}
\newcommand{\INeuraltuned}{{\fontfamily{qcr}\selectfont{INeural (less exploration)}}}
\newcommand{\INeuraloff}{{\fontfamily{qcr}\selectfont{INeural (official)}}}
\newcommand{\Neuronal}{{\fontfamily{qcr}\selectfont{Neuronal}}}
\newcommand{\Neuronaltuned}{{\fontfamily{qcr}\selectfont{Neuronal (less exploration)}}}
\newcommand{\Neuronaloff}{{\fontfamily{qcr}\selectfont{Neuronal (official)}}}

\newcommand{\EENets}{{\fontfamily{qcr}\selectfont{EENets}}}


\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
            \node[shape=circle,draw,inner sep=0.5pt] (char) {#1};}}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Neural Active Learning Meets the Partial Monitoring Framework}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<maxime.heuillet.1@ulaval.ca>?Subject=Your UAI 2024 paper}{Maxime Heuillet}{}}
\author[1,3]{Ola Ahmad}
\author[1,2]{\href{mailto:<audrey.durand@ift.ulaval.ca>?Subject=Your UAI 2024 paper}{Audrey Durand}}
% Add affiliations after the authors
\affil[1]{%
    Université Laval, Canada
}
\affil[2]{%
    Canada-CIFAR AI Chair, Mila, Canada
}
\affil[3]{%
    Thales Research and Technology (cortAIx), Canada\\
  }
  
\begin{document}
\maketitle

\begin{abstract}
We focus on the online-based active learning (OAL) setting where an agent operates over a stream of observations and trades off between the costly acquisition of information (labelled observations) and the cost of prediction errors.
We propose a novel foundation for OAL tasks based on partial monitoring, a theoretical framework specialized in online learning from partially informative actions. 
We show that previously studied binary and multi-class OAL tasks are instances of partial monitoring.
We expand the real-world potential of OAL by introducing a new class of cost-sensitive OAL tasks.
We propose \NeuralCBP{}, the first PM strategy that accounts for predictive uncertainty with deep neural networks.
Our extensive empirical evaluation on open-source datasets shows that \NeuralCBP{} has competitive performance against state-of-the-art baselines on multiple binary, multi-class and cost-sensitive OAL tasks. 
\end{abstract}

\section{Introduction}\label{sec:intro}


%Obtaining labelled  observations to train a machine learning agent is generally costly. 
%\textit{Active learning} is a specific technique where the agent interactively queries an expert to acquire labelled observations.
%Since the agent selects which observations to label, the number of labelled observations is often lower than in supervised learning, which results in lower costs.
%In contrast, supervised learning necessitates substantial amounts of labelled data, poses a significant cost in real-world applications. 

In active learning~\citep{cohn1994improving}, an agent decides to query an expert to obtain labels on selected observations. 
This active acquisition of labels efficiently reduces the number of labelled observations needed to learn a task.
Active learning therefore appears as a cost-effective solution for modern machine learning, which often relies on large volumes of labelled observations \citep{kusne2020fly}.

%\ola{stream-based is part of online learning, non. Can it be done offline?}
In this work, we focus on the \textit{online-based active learning} (OAL) setting for binary and multi-class classification tasks~\citep{beygelzimer2009importance}.
The agent operates over a (possibly infinite) stream of observations.
For each observation, the agent predicts the class and either decides to reveal its prediction or to query an expert to obtain the label.
The OAL setting we consider differs from the \textit{batch setting} where the agent gathers fixed-size batches of observations to label~\citep{saran2023streaming, kareem2020understanding}.
In both the OAL and the batch-based settings, all decisions are irrevocable and associated with costs.
The goal is to minimize the cumulative cost over the stream of decisions, by trading off between the cost of obtaining new labels (\textit{labeling complexity}) and the cost of prediction errors (\textit{generalization performance}). 


%This trade-off is known as the exploration-exploitation trade-off \citep{lattimore2020bandit}.
%If the agent chooses not to request the label of a given observation, it must then predict the class to which the observation belongs. \aud{I think the agent always has to predict, whether it requests the label or not. The nuance is in the loss incurred for a bad prediction (none if label was requested, positive if label not requested). The previous sentence says otherwise.} 
%\ola{I find this paragraph empty from references} 
%\maxime{What do you think of that: we refer here to bandits and partial monitoring as fields that deeply study the exploration-exploitation trade-off and that this paper aims to leverage this literature?}

%In this work, we focus on online active learning problems for classification tasks. 
% This problem echoes with multiple real world use-cases such as online transfer learning \cite{} and online domain adaptation \cite{}. 
%The online active learning problem is studied in the binary setting, 
%Guarantees of \Margin{} and \Cesa{} have been proposed assuming a linear model is used, but heuristics with more expressive models can be obtained. 
%\ola{why, what is the limitation of linear assumption?}
%For instance, it is possible to use \ALPS{} \citep{desalvo2021online} with a multi-layers perceptron (MLP) and enjoy guarantees on the labelling complexity and on the generalization performance.
%\aud{per class or irrespective of the predicted class?}. 
%\aud{On explique précédemment les principes derrière \Margin{} et \Cesa{}, mais rien sur \ALPS{} et \Gappeltron{}. Il faudrait donner le minimum d'info pour comprendre leur limitation.}

In the context of OAL for binary classification, the \Margin{} strategy~\citep{sculley2007practical} queries the expert when the prediction uncertainty is greater than a user-specified threshold.
In contrast, with \Cesa{}~\citep{cesa2006sampling}, labelled observations are acquired proportionally to the global prediction error rate of the strategy. Both \Margin{} and \Cesa{} are specifically analyzed for the class of linear separators and are designed for binary tasks.
More recent studies focused on multi-class OAL tasks.
The \Gappeltron{}~\citep{van2021beyond} leverages graph feedback, making it inherently multi-class. However, similarly to \Cesa{} and \Margin{}, \Gappeltron{} is specifically analyzed for linear separators.

Modern applications of machine learning involve high-dimensional observations that require learning complex representations.
As a result, \Neural{}~\citep{wang2021neural} and \ALPS{}~\citep{desalvo2021online} proposed multi-class OAL strategies based on deep neural networks. 
\Neural{} and \ALPS{} have been outperformed by \INeural{}~\citep{ban2022improved}, an improved and more practical version of the \Neural{} strategy~\citep{wang2021neural}. 
The current state-of-the-art, \Neuronal{}~\citep{ban2024neural}, addresses scalability limitations of \INeural{},
opening the door to using sophisticated neural architectures, such as convolutional neural networks. 
 %(contexts have to be one-hot-encoded) which opens
%\aud{What about \ALPS{}? Il est juste outperformed par tout le reste? Si c'est ça, il faudrait le mentionner.}
%\ola{modern are now the transformer networks. I would replace 'modern' by 'more complex'}
%\aud{La phrase suivante concerne la définition du problème, pas les stratégies; déplacer dans une section où l'on décrit le problème.}

%\aud{Unclear if the \Neuronal{} contribution (over \Neural{}/\INeural{}) is to address a technical limitation or to show that guarantees can hold even with CNNs.}
%\ola{todo: improve this sentence and focus on how NN helps finding better guarantees.}
%The decision to ask for a label in \Neural{}, \INeural{} and \Neuronal{} depends on the absolute difference between top-2 predictions, assuming the predictions are optimistic in the face of uncertainty.  
%\cite{saran2023streaming} proposes an approach to study streaming active learning in the batch setting, they do not compare against I-Neural although it is cited in the related works
%Batch Active Learning at Scale \citet{}
%Understanding the Effects of Batching in Online Active Learning \cite{kareem2020understanding}

%assume that the cost of prediction errors is the same across all classes. This assumption on the cost structure is at the core of the algorithmic dynamics of existing approaches, making it difficult to derive cost-sensitive variations directly from them. 
%\ola{Should not be the costs of labelling observations depends on the prediction error, which may and vary from one class (or instance) to another?.}

In critical real-world applications, the costs of prediction errors from one class to another may vary significantly.
Cost-sensitive OAL is studied for regression tasks~\citep{cai2023active}, but classification tasks remain an open problem.
Existing OAL strategies all assume \textit{uniform costs}, i.e. prediction error and labeling costs are the same across all classes.
This assumption is the core of the algorithmic design of existing approaches, making it challenging to extend them to cost-sensitive tasks.
Motivated by the fact that cost-sensitive learning has fostered the adoption of supervised learning in real-world scenarios, such as learning from imbalanced data~\citep{elkan2001foundations}, we address the following questions:
\emph{\textbf{1) How to frame cost-sensitive OAL classification tasks? and, 
2) Can we design a practical cost-sensitive OAL agent?}}

%\aud{Si on manque d'espace, on peut supprimer ce qui suit puisque ça n'aide pas particulièrement à comprendre les contributions et que ça répète ce qui arrive en section 2.}
%In a PM game, a learning agent processes a stream of observations. For each observation, the agent selects an action. The agent then incurs an instant (unknown) cost and receives a feedback signal that is partially informative about the game. The actions have different levels of cost and informativeness. This has the potential to capture a wide diversity of applications. For example, in OAL, requesting a label can be seen as an informative action, while using the prediction is not.

% on predetermined features
\vspace{-0.5em}
\paragraph{Contributions.}

We tackle these questions from a novel perspective based on Partial Monitoring (PM)~\citep{piccolboni2001discrete, bartok2014partial}, a theoretical framework for online learning problems with partially informative actions. 
\circled{1}
\underline{Connecting ideas in separate fields}: We hypothesize and validate that we can establish a novel, non-trivial, connection between the field of active learning and the PM framework.
\circled{2}
\underline{Methodological}: We show that existing binary and multi-class OAL tasks reduce to instances of partial monitoring, and that this framework enables the formulation of novel cost-sensitive OAL tasks.
\circled{3}
\underline{Algorithmic}: We propose \NeuralCBP{} (Neural Confidence Bound Partial Monitoring), a partial monitoring (PM) strategy able to learn from neural networks. Existing PM strategies are limited to linear~\citep{heuillet2024randomized} and logistic~\citep{bartok2012CBPside} models, which constitutes a bottleneck towards the adoption of PM in practice. \NeuralCBP{} presents algorithmic dynamics that differ from existing OAL strategies, which can be of independent interest to the OAL community.
\circled{4}
\underline{Empirical}: Our empirical evaluation shows that \NeuralCBP{} competes with the current state-of-the-art in multiple binary and multi-class settings, and across various neural architectures.
\circled{5} \underline{Advocacy}: 
Documented applied studies of PM are limited to synthetic experiments~\citep{singla2014contextual, kirschner2023linear, heuillet2024randomized}. Furthermore, PM is a field traditionally more supported by theoretical advances. 
Our work shows that PM is a promising framework that can be effectively applied in practical settings such as OAL. 
\circled{6} \underline{Reproducibility}: To support adoption, our study is fully reproducible with open-source code and implementation details (see Appendix~\ref{app:label_efficient}).



% We tackle these questions from a novel perspective based on Partial Monitoring (PM)~\citep{piccolboni2001discrete, bartok2014partial}, a theoretical framework for online learning problems with partially informative actions. 
% 1) We show that the PM framework can be leveraged to frame binary, multi-class, and cost-sensitive OAL classification tasks.
% 2) Currently, PM strategies are confined to linear~\citep{heuillet2024randomized} and logistic~\citep{bartok2012CBPside} model assumptions, restricting them to a narrow spectrum of applications. 
% Acknowledging that state-of-the-art OAL strategies leverage deep neural networks, we propose \NeuralCBP{}, the first PM strategy that accounts for predictive uncertainty with deep neural networks.
% 3) We show that \NeuralCBP{} competes and even outperforms the state-of-the-art on binary, multi-class, and cost-sensitive OAL tasks. We also investigate the robustness of \NeuralCBP{} and baselines across various neural architectures.
% 4) Our work contributes to illustrating the practical potential of the PM framework, a field traditionally supported by theoretical results.

\vspace{-1em}
\section{Background}
\label{sec:background}

A PM~\citep{bartok2014partial} game is played between a learning agent and the environment over multiple rounds.
The agent has a finite set of $N$ actions. The environment has a finite set of $M$ outcomes.
The game is defined by a cost matrix $\textbf{C} \in [0,1]^{N \times M} $ and a feedback matrix $ \textbf{H} \in \Sigma^{N \times M}$. 
The symbol space $\Sigma$ is arbitrary and is not necessarily numeric (i.e. could be symbols). Without loss of generality, we assume that feedback symbols associated with one action are distinct from those induced by the other actions. 
We note $\textbf{c}_i$ the $i$-th row of the matrix $\textbf{C}$. The same notation applies to matrix $\textbf{H}$. A summary table of the important notations is reported in Table \ref{tab:notations} in the Appendix.

%\paragraph{Notation} For actions and outcomes, we refer interchangeably to indices and elements in sets $A$ or $Y$: action $a$ refers both to $a$-th action and to the $a$-th element in $A$. 

\subsection{Dynamics of a game}

%\aud{Confirmer; ça voudrait dire que l'environnement peut sélectionner les contextes de manière à toujours choisir le contexte le plus difficile à résoudre? Ça se peut, mais il faut vérifier que c'est quand même correct.} \maxime{Oui, Bartok dit que les contextes peuvent meme apparaitre de facon adversarialle si on veut.}

Matrices $\textbf{C}$ and $\textbf{H}$ are revealed to the agent before the game begins. The horizon of rounds $T$ is unknown to the agent. At each round $t \in \{1, 2, \dots, T \}$, the environment samples an observation $x_t \in \mathcal X$. We make no assumption regarding the sampling process of the observations.
The environment then samples an outcome $y_t \in \{1, 2, \dots, M\}$ from an \textit{outcome distribution} that depends on $x_t$, and that is denoted $p(x_t) \in \Delta_M \subset \mathbb R^{M \times 1}$, 
where $\Delta_M$ is the $M$-dimensional probability simplex.
We assume outcomes are sampled i.i.d.\ with respect to the outcome distribution. 

% \ola{$h_t$ is unnecessary} \maxime{It is used in section 4.2}
The agent observes $x_t$ and selects an action $i_{t} \in \{ 1, 2, \dots, N \} $. 
Then, the agent incurs a cost $\textbf{C}[i_t,y_t]$ and receives a feedback symbol $h_t = \textbf{H}[i_t,y_t]$, where $[i,y]$ denotes the element at row $i$ and column $y$. 
Therefore, costs and feedback symbols are deterministic elements of matrices $\textbf{C}$ and $\textbf{H}$ respectively.
We emphasize that the agent only observes the feedback symbol $h_t$, with neither the outcome nor the cost being revealed.

The goal is to minimize the cost incurred in each round.
This is achieved by selecting the action $i^\star_t$ that minimizes the expected cost for $x_{t}$, and is defined such that
$i^\star_t = \argmin_{1 \leq i \leq N} \textbf{c}_{i} p(x_t)$. 
%\ola{question, what if we replace $i$ by another index $k$ to refer to indices of a matrix instead of indices of the actions $i$ that depend on $t$. Ignore it if you dont agree.}
%\aud{$\textbf{c}_i$ undefined}
%\ola{I suggest replace $P(Y|X=x_t)$ by the same notation above.}
The performance of the agent is measured by the cumulative regret (to minimize) w.r.t. the optimal action strategy:
\vspace{-0.5em}
\begin{equation}
    R(T) = \sum_{t=1}^{T}  ( \textbf{c}_{i_t}  - \textbf{c}_{i^\star_t} )  p(x_t),
\label{eq:contextual_regret}
\vspace{-0.5em}
\end{equation}
The regret in Eq.~\ref{eq:contextual_regret} scales sub-linearly with $T$ if the agent identifies the optimal action and commits to it over time. This requires balancing \textit{exploration} (playing informative actions) and \textit{exploitation} (minimizing per-round regret). 
%\aud{J'ai ajouté une phrase sur le compromis exploration-exploitation pour faciliter les explications en section 4.2 (Explore-Exploit Networks).}

\vspace{-1em}
\subsection{Structure of a game}

We now introduce two definitions to characterize the cost $\textbf{C}$ and feedback $\textbf{H}$ matrices of any PM game.

\begin{definition}[Cell decomposition, \citet{bartokICML2012}]
    The \textit{cell} $\mathcal O_i $ is defined as the subset of the probability simplex $\Delta_M$ on which action $i$ is optimal. Formally,
    $ \mathcal O_i =  \{ p \in \Delta_M : \forall j \in \{1,\dots,N\},  (\textbf{c}_i-\textbf{c}_j) p  \leq 0 \}. $
    \label{def:cell}
\end{definition}

\vspace{-0.25em}
%\aud{J'ai changé $i'$ pour $j$ histoire de conserver la même mécanique que la définition 2.1.} \maxime{problématique: $j$ est déjà utilisé dans la définition, c'est pour cela qu'il faut utiliser $i'$} \aud{J'ai changé pour $k$.}

Based on the above definition, action $i$ is: (i) \textit{dominated} if $\mathcal O_i = \emptyset$ (i.e. there is no outcome distribution s.t. the action is optimal); (ii) \textit{degenerate} if it is not dominated and there exists an action $k$ such that $\mathcal O_i \subsetneq \mathcal O_{k}$ (i.e. actions $i$ and $k$ are duplicates, both are jointly optimal under some outcome distribution); and (iii) \textit{Pareto-optimal} otherwise. The set of Pareto-optimal actions is denoted $\mathcal P$. 

%\aud{Undefined}
% (columns)\ola{"columns" is not necessary because $h_i$ is a one row vector.}
For an action $i$, let $\sigma_i$ denote the number of unique feedback symbols on $\textbf{h}_i$. Let $\Sigma_i=\{s_1, ..., s_{\sigma_i}\}$ denote the enumeration of symbols sorted by order of appearance in $\textbf{h}_i$. Let $\pi_i(x_t) \in \Delta_{ \sigma_i } \subset \mathbb R^{\sigma_i\times 1}$ denote the probability distribution of receiving each feedback symbol of action $i$ given $x_t$.

\begin{definition}[Signal matrix, \citet{bartokICML2012}]
    Given action $i$, the elements in the \textit{signal matrix} $S_i \in \{0,1\}^{ \sigma_i \times M}$ are defined as $S_i[u,v] = \mathds{1}_{ \{ \textbf{H}[i,v] = s_{u}  \} }$. 
\end{definition}
%\aud{J'ai changé $a$ pour $i$ histoire de conserver la même mécanique que la définition 2.1.}

\begin{property}
The outcome distribution is connected to the feedback symbols distribution of action $i$ through the signal matrix $S_i$ with the relation $\pi_i(x_t) = S_i p(x_t)$. 
\label{prop:link}
\end{property}
\vspace{-0.5em}
%\aud{J'ai changé $a$ pour $i$ histoire de conserver la même mécanique que la définition 2.1.}

\section{Stream-based active learning as a partial monitoring game}
\label{sec:alpm}

OAL problems have been studied under varied feedback models, such as bandit~\citep{erez2024real, pmlr-v30-Daniely13} and full information feedback~\citep{sakaue2024online}. In this work, we propose a connection between OAL problems and the PM feedback model~\citep{bartok2014partial}. In particular, we leverage specific PM instances, known as \textit{label-efficient} games, to capture OAL problems.

The original label-efficient game \citep{helmbold1997some} is characterized by $N=3$ actions (predict class 1, predict class 2, and query the expert), $M = 2$ outcomes (the ground-truth classes), and the following cost and feedback matrices: 
\begin{align}
    \textbf{C}=\kbordermatrix{ & \text{class } 1 & \text{class } 2\\
        \text{pred. class } 1 & 0 & 1\\
        \text{pred. class } 2 & 1 & 0 \\
                \text{expert} & 1 & 1}, \quad
    \textbf{H}=\kbordermatrix{
        & \text{class } 1 & \text{class } 2\\
        & \Diamond & \Diamond \\
        & \wedge & \wedge\\
        & \bot & \odot} \notag .
\end{align}
For reproducibility, in Appendix \ref{app:label_efficient}, we instantiate all the definitions presented above using the label-efficient game as an example.
Several OAL studies on binary classification correspond to instances of the original label-efficient game~\citep{cohn1994improving, balcan2007margin, beygelzimer2009importance}. 

Using the game theoretical definitions presented above and developed in \cite{bartokICML2012}, we now introduce a generalization of this game to multi-class classification with possibly non-uniform costs and multiple experts. 

\paragraph{Generalized label-efficient game} The OAL classification task with $M$ classes and $E$ experts can be cast as a PM game with $N = M+E$ actions and $M$ outcomes.
Without loss of generality, we assume that the actions $\{M+1, \dots, M+E\}$ correspond to requesting a label from the $E\geq1$ experts.
All actions $i>M$ (i.e. actions associated with an expert) are \textit{dominated} (see Def.~\ref{def:cell}) and admit $ \sigma_i=M$ distinct symbols. 
The other actions $\{1, \dots, M\}$ lead to a single feedback symbol, i.e. $\sigma_i=1$. 
%\aud{J'utiliserais plutôt $\sigma_i$.}
%\aud{Est-ce que tu veux dire que pour toutes les actions $i>M$, alors le nombre de symboles observables avec chaque action correspond à $ \sigma_i=M$? Clarifier}

The original label-efficient game corresponds to the single-expert ($E=1$) binary task ($M=2$) with a uniform cost matrix. In this work, we focus on single-expert multi-class ($M \geq 2$) games with a potentially non-uniform cost matrix. The multi-expert setting ($E>1$) suggests that experts reveal the outcome (ground-truth label) with different stochasticity levels. Capturing this would require a different PM setting where feedback is subject to noise, as studied by \citet{kirschner2020information,kirschner2023linear}.
% We employ the standard PM setting where losses and feedbacks are deterministic, while considering $E>1$ would require a different setting where feedbacks are subject to noise, as studied in \cite{kirschner2020information,kirschner2023linear}. 

\paragraph{Connecting regret minimization and OAL}
The cost matrix captures both the cost of querying an expert and the cost of prediction errors. The goal of PM agents is to minimize the regret (see Eq.~\ref{eq:contextual_regret}), which corresponds to simultaneously minimizing 
the cost associated with label queries (label complexity) and the cost of prediction errors (generalization performance).
This goal aligns with the objectives of established OAL methodologies \citep{desalvo2021online,wang2021neural,ban2022improved,ban2024neural}.
%Nevertheless, existing PM strategies are limited to linear and logistic regression.
%We now introduce \NeuralCBP{}, the first neural PM strategy.
%\ola{todo, I would finish with something as above.}
%\ola{attention: there is a reference without author name, shown as 'anonymous'.} 
%\ola{your claim means any PM approach is able to solve OAL through label-efficient game. This is risky because it reduces the impact of NeuralCBP.}

%\ola{todo: L is similar to the sota, how we interpret cost-sensitivity?} \maxime{pas sûr de comprendre cette question}
%
%, and subsequently modify it to accommodate the multi-class online active learning scenario.\ola{todo: verify}
%\begin{definition}[Generalized label-efficient game]
%    An online active learning problem with $M$ classes and one expert ($E=1$), can be cast as a partial monitoring game with $N = M+E$ actions, and $M$ outcomes.
%    Without loss of generality, we assume the actions $E$ correspond to requesting a label from an expert. 
%    The action associated with the expert is dominated (see def. \ref{def:cell}) admits $\sigma_1=M$. 
%    The other actions are such that $\sigma_i=1, \forall i \in \{2, \dots, N\}$. 
%    The cost-sensitivity structure is represented by the matrix $\textbf{L}$.
%\end{definition}
%

%The theoretical guarantee of a partial monitoring approach is based on the regret upper bound. In contrast, existing OAL admit two distinct guarantees: an upper bound on the generalization performance and an upper bound on the label complexity. 


%While the generalized label-efficient game appears to be a straightforward extension of the initial partial monitoring framework, it is constrained to linear problems and is not suitable for addressing multi-class classification tasks involving neural networks.
%\ola{todo:verify. I added this because some dots are missing.}
%\paragraph{Extension of Label Efficient games}
%An online active learning problem with $M$ classes and $1$ expert corresponds to a partial %monitoring game with $N = M+1$ actions and $M$ outcomes. 
%Without loss of generality, we assume action $1$ corresponds to asking the expert for a label. 
%The loss matrix is designed with the underlying property that action 1 is dominated.
%In addition, the feedback matrix admits $\sigma_1=M$ for the expert verification action, while $\sigma_i=1$ for the other actions.
%approaches  partial monitoring perofrmanc
%existing empirical works, the regret metric reports jointly the label complexity and the predictive performance of the approach on the deployment stream.

\section{The \NeuralCBP{} approach}

%Unlike existing partial monitoring approaches whose exploration strategy is based on \textit{upper confidence bound}, \NeuralCBP{} brings a novel perspective because its exploration strategy is based on \textit{successive elimination}, as we will see in the next paragraph. 
%\subsection{Elements common to all \CBP{} approaches}
%\label{sec:common_CBP}

We now introduce \NeuralCBP{}, a partial monitoring strategy able to learn from neural networks. 
While the emphasis of this study is on OAL classification tasks, \NeuralCBP{} is a general PM strategy that can be applied to the broader diversity of PM games. 
Algorithm~\ref{alg:NeuralCBP} displays the pseudo-code of \NeuralCBP{}.

%\aud{Citer papier CBP de base ici (pas CBPSide), tout en gardant la citation CBPSide ensuite}
The proposed \NeuralCBP{} builds upon \CBP{} (Confidence Bound Partial Monitoring) methods~\citep{bartokICML2012}, which currently have limited practical potential due to linear~\citep{heuillet2024randomized} and logistic~\citep{bartok2012CBPside} model assumptions.
For an observation $x_t$, the expected cost difference between two actions $i$ and $j$ is 
\begin{align}
    \delta_{i,j}(x_t) = (\textbf{c}_i - \textbf{c}_j) p(x_t),
\label{eq:expected_loss}
\end{align} where $p(x_t)$ is unknown by definition of the PM game. Action $j$ is better than action $i$ when $\delta_{i,j}(x_t)>0$. 

\begin{definition}[Neighbors, \citet{bartokICML2012}]
Two Pareto-optimal actions $i$ and $j$ are \textit{neighbors} if $\mathcal O_i \cap \mathcal O_j$ is a ($M-2$)-dimensional polytope. The set of all neighbor pairs is denoted $\mathcal N$.  
\label{def:neighbor}
\end{definition}
Two actions are neighbors when their cells share a common boundary, i.e.\ both actions are jointly optimal for the outcome distributions lying on that boundary. Therefore, given observation $x_t$, one only needs to compute $\delta_{i,j}(x_t)$ for neighbor pairs in $\mathcal N$ at round $t$, rather than for all the action pairs $\{i,j\}$ in the game~\citep{bartokICML2012}.
% Given observation $x_t$ at round $t$, \NeuralCBP{} computes $\delta_{i,j}(x_t)$ for all the neighbor pairs in $\mathcal N$, rather than computing it for all the action pairs in the game.


% Pseudocode 2
\begin{algorithm}
\SetKwInOut{Input}{input}
\Input{ $\mathcal P, \mathcal N  $ }
\DontPrintSemicolon
\caption{NeuralCBP}
\label{alg:NeuralCBP}

Initialize $\theta_1, \theta_2$ \;

$G_{a,t} = \lambda \mathbf{1}_{m+\sigma m}, \forall a \in \{1, \dots, N\} $\;
    
\For{$t > N$} {

    Initialize $\mathcal U(t) \gets \{\}$ \;
    
    Receive observation $x_t$ \;

    Get $\hat \pi(x_t)$ based on $f_1(x_t,\theta_1)$ \;
    Get $w(x_t)$ based on $f_2(x_t,\theta_2)$ \;
    \For{each action-pair $\{i,j\} \in \mathcal N$} {

         $\hat \delta_{i,j}(t) = \sum_{a \in V_{i,j} } v_{ija}^\top \hat \pi_a(x_t)$ \;

         $z_{i,j}(t) \gets  \sum_{a \in V_{i,j}} \| v_{ija} \|_{\infty} w_{a}(x_t)  $  \;
        
        \uIf{$| \hat \delta_{i,j}(t) | \geq$ $z_{i,j}(t)$   }{

            Add $\{i,j\}$ to $\mathcal U(t)$  \; }

    }     
    Compute $D(t)$ based on $\mathcal U(t)$ \;
    
    Obtain $\mathcal P(t)$ and $\mathcal N(t)$ based on $D(t)$ \;
    
    $\mathcal N^{+}(t) \gets \bigcup_{ \{i,j\} \in \mathcal N(t) }  N^{+}_{i,j} $ \;
    
    $\mathcal V(t) \gets \bigcup_{ \{i,j\} \in \mathcal N(t) } V_{i,j} $ \;
    
    Compute $\mathcal R(x_t)$ \;

    $\mathcal S(t) \gets \mathcal P(t) \cup \mathcal N^{+}(t) \cup ( \mathcal V(t)  \cap \mathcal R(x_t)  ) $ \;
    
    Play $ a_t = \argmax_{ a \in \mathcal S(t) } W_{a}  w_a(x_t) $ \; 
    
    Observe feedback $h_t$ \;

    Update $\theta_1, \theta_2$ with Algorithm \ref{alg:gradient_descent} (Appendix \ref{app:implementation})\;

    Update $G_{a_t,t}^{-1}$ (see \citet{sherman_morrison})  \; %\aud{missing initialization if that is an update}
    
    }
\end{algorithm}

\subsection{Outcome and feedback distributions}

Recall that the agent does not observe the outcomes. 
Consequently, the agent cannot directly estimate the outcome distribution $p(x_t)$.
As a result, estimating the expected cost difference $\delta_{i,j}(x_t)$  using Eq.~\ref{eq:expected_loss} is not feasible in practice.
This motivates additional definitions to estimate the expected loss difference in practice.
\begin{definition}[Observer set, \citet{bartokICML2012}]
The set $V_{i,j}$ includes all actions that verify the relation
$(\textbf{c}_i - \textbf{c}_j)^\top \in  \oplus_{a \in V_{i,j}} \text{Im} (S_a^\top)$, where $\oplus$ corresponds to the direct sum.
\label{def:observer_set}
\end{definition}
\begin{definition}[Observer vectors, \citet{bartokICML2012}]
Given action $a \in V_{i,j}$, the observer vector $v_{ija} \in \mathbb R^{\sigma_a}$ is selected to satisfy the relation $(\textbf{c}_i - \textbf{c}_j)^\top = \displaystyle \sum_{a \in V_{i,j} } S_a^\top v_{ija}$.
\label{def:observer_vector}
\vspace{-1em}
\end{definition}
The set $V_{i,j}$ contains actions that induce informative feedback symbols about $\textbf{c}_i-\textbf{c}_j$. It is defined such that $\textbf{c}_i - \textbf{c}_j$ can be expressed as a linear combination of the signal matrix images of actions in $V_{i,j}$, with the observer vectors being the coefficients of the combination.

Combining Definitions \ref{def:observer_set} and \ref{def:observer_vector} with Property~\ref{prop:link} allows expressing $\delta_{i,j}(x_t)$ as a function of the feedback distributions $\pi_a(x_t)$  of all actions $a \in V_{i,j}$:% instead of the outcome distribution $p(x_t)$:
\begin{align}
    \delta_{i,j}(x_t)  %= \text{Eq \ref{eq:expected_loss} }
    =  \sum_{a \in V_{i,j} } v_{ija}^\top \pi_a(x_t). \label{eq:feedback_dist} 
\end{align}
Consequently, one can compute the estimate $\hat \delta_{i,j}(x_t)$ using the feedback distribution estimates $\hat \pi_a(x_t)$ associated with the actions in $V_{i,j}$. Similarly, the uncertainty in the loss difference estimate $\hat \delta_{i,j}(x_t)$ is:  
\begin{equation}
\label{eq:real_CI}
z_{i,j}(x_t) = \sum_{ a\in V_{i,j} } \| v_{ija} \|_{\infty} w_a(x_t),
\vspace{-0.5em}
\end{equation}
where $w_a(x_t)$ is the uncertainty on $\hat \pi_a(x_t)$ \citep{lienert2013exploiting}. 
Methods to compute $\hat \pi_a(x_t)$ and $w_a(x_t)$ depend on the setting considered, e.g., without side-observation~\citep{bartokICML2012}, or with linear~\citep{heuillet2024randomized} or logistic~\citep{bartok2014partial} side-information. We now present a method for the neural setting.
%\ola{to respect the chronicle time of references, I put logistic (2014), then linear (2024)}

\subsection{Inference with neural networks}
\label{sec:neural_inference}

%To estimate the feedback distribution $\hat \pi_a(x_t)$ and quantify uncertainty $w_a(x_t)$ over such estimate, we leverage advances from the bandit literature~\citep{lattimore2020bandit}. 
%This is motivated as bandit feedback is a specific case of partial monitoring feedback \citep{bartok2014partial} where all the actions in the game induce informative feedback.
%Neural bandit strategies use statistical deviation bounds to quantify the uncertainty on the estimates of action values~\cite{zhou2020neural, kassraie2022neural, xu2022neural}.
%In practice, such statistical bounds are conservative, inducing too much exploration and hampering empirical performance.
%Fortunately, uncertainty can be quantified from a generalization error perspective, as opposed to a statistical one, which is notably effective~\cite{ban2021ee}.

%\paragraph{EENets in partial monitoring}


%We draw inspiration from advances in the bandit literature to estimate the feedback distribution and quantify uncertainty over this estimate~\citep{lattimore2020bandit}.
The strategy \INeural{}~\citep{ban2022improved} frames the OAL classification task under bandit feedback, where all actions are self-informative. 
As a result, \INeural{} leverages the Explore-Exploit Networks (referred to as \EENets{}) initially introduced for bandit feedback~\citep{ban2021ee}.
The current state of the art in OAL, \Neuronal{}~\citep{ban2024neural}, is a follow-up strategy based on \EENets{} that showcases the limitations of a bandit feedback structure in practice and highlights the importance of finding an adequate and general feedback structure for the diversity of OAL classification tasks.
As a response, we extend \EENets{} to address the exploration-exploitation trade-off in the general PM setting.
This extension is non-trivial because the PM feedback/cost structure requires exploration techniques that go beyond techniques used in bandit feedback.
Further technical differences between \NeuralCBP{} and other \EENets{} strategies are discussed in Appendix~\ref{app:technical_differences}.

\EENets{} comprise an \textit{exploitation network} (denoted $f_1$) to estimate action values and an \textit{exploration network} (denoted $f_2$) to quantify the uncertainty on the predictions of $f_1$. 

\paragraph{Exploitation network}

In the PM setting, the exploitation network $f_1$ predicts the feedback distributions required in Eq.~\ref{eq:feedback_dist}.
To instantiate efficiently the exploitation network $f_1$ in PM, we need to distinguish informative from non-informative actions.

\begin{definition}[Set of informative actions]
    The set of informative actions, $\mathcal{I} = \{ i : i \in \{1, \dots, N\}, \text{ and } \sigma_i \geq 2 \}$, comprises all actions that induce at least two distinct feedback symbols.
\label{def:info_actions}
\end{definition}
%Non-informative actions are included in the complimentary set $\bar{\mathcal{I} }$. 
%\aud{Voir si la défintiion du complimentary set $\bar{\mathcal{I} }$ est nécessaire. On pourrait utiliser $a\not\in\mathcal I$ pour référer aux action non-informatives.}

\begin{definition}[Valid feedback symbols]
    The set of valid feedback symbols is noted $\Sigma_{\mathcal I} = \bigcup_{ i \in \mathcal I } \Sigma_i$. 
    %\aud{Es-tu certain que ce n'est pas plutôt une union des ensembles $\Sigma_a, \forall a \in \mathcal I$? Parce que sinon $|\Sigma_{\mathcal I}| = |\mathcal I|$.} \maxime{Absolument, c'est une notation d'union, j'ai corrigé, merci!} 
    The dimension of $\Sigma_{\mathcal I}$ is $\sigma = \displaystyle\sum_{i\in \mathcal I} \sigma_i$, which represents the total number of unique symbols induced by the actions in $\mathcal I$. 
    %\aud{Confusion entre la notation $s$ et la la notation $s_1$, $s_2$, etc utilisée pour les symboles (voir fin section 2).}
\end{definition}

%\aud{Parfois on utilise $a$ pour dénoter une action générique, parfois on utiliser $i$. Dans la définition prédécente, on utilise les deux. Dans le reste de la section 4.2, on utilise $a$. Dans la section 4.3, on retombe sur $i$. Il faudrait uniformiser.}
%\ola{todo: there is an issue with this def. due to the confusion between using a set and expressing it as a scalar c.}
%\ola{I suggest to add example after the definition 4.5 for clarity. You can rely on the example in sec. 3}
%\aud{Si $\mathcal I$ contient deux actions qui permettent toutes les deux d'observer les symboles $\wedge$ et $\odot$, alors on aurait $c = 4$ avec la définition; est-ce que c'est bien ce qui est recherché? Parce que le terme ``admissible feedback symbols'' laisse penser que l'on aurait pu vouloir obtenir $c = 2$, i.e. le nombre de symboles différent observables avec l'ensemble des actions dans $\mathcal I$. Il me semble que ça aurait été plus cohérent avec la Def. 4.6.}

%\aud{Do you mean that $\pi_a(x) = 1$, with $h$ being the unique symbol in $H_a$?}
% Estimating the  feedback distribution of uninformative actions therefore complicates the learning task and extends inference time, two issues of significant importance when using deep neural networks. 
Current \CBP{} strategies~\citep{bartok2012CBPside, lienert2013exploiting, heuillet2024randomized} estimate the feedback distribution for all the actions of a game.
However, 
Property \ref{prop:link} shows that for any uninformative action $i \notin \mathcal I$, the learned feedback distribution is always $\pi_i(x_t)=1$. In some PM games, most of the actions are uninformative, as is the case for \textit{generalized label-efficient} games (presented in Section \ref{sec:alpm}) where only expert actions are informative. 
Therefore, attributing learnable parameters to uninformative actions, as is done in current \CBP{} strategies \citep{heuillet2024randomized, bartok2012CBPside}, turns out to be inefficient. 
In contrast, \NeuralCBP{} attributes learnable parameters only to the informative actions in the game (see Definition \ref{def:info_actions}). 
Restricting learnable parameters to the subset of informative actions $\mathcal I$ is essential because \NeuralCBP{} relies on the Explore-Exploit networks (\EENets{}) that require a shared representation for the actions. Including non-informative actions would cause overfitting, unstable learning, and increased complexity.


% Property \ref{prop:link} shows that for any uninformative action $i \notin \mathcal I$, the (unknown) feedback distribution is always $\pi_i(x_t)=1$. 
% In some PM games, most of the actions are uninformative, as it is the case for \textit{generalized label-efficient} games (presented in Section \ref{sec:alpm}) where only expert actions are informative. 
% As a consequence, learning the feedback distribution of uninformative actions can result in the neural networks overfitting these uninformative feedback distributions. Moreover, estimating the  feedback distribution of uninformative actions unnecessarily complicates the learning task and extends inference time, two issues of significant importance when using deep neural networks.
% Therefore, unlike existing PM approaches, \NeuralCBP{} learns only the feedback distribution of informative actions.
%\aud{I would use $\theta_1$ instead of $\theta^1$ to avoid confusion with an exponent and be coherent with $f_1$. Otherwise you could use $f^{(1)}$ and $\theta^{(1)}$. Also $f(x, \theta)$ does not correspond to the prediction space, it denotes a prediction (which is in the prediction space).}
%\aud{What is $\hat w_a(x)$? It seems to be undefined.}
%\in [0,1]^c, \text{ where } c = \displaystyle\sum_{i\in \mathcal O} \sigma_i.
%\aud{I would add a textual explanation like you did with previous definitions to communicate the intuition, e.g. $c$ is the number of unique symbols observable with informative actions?}


%\ola{unclear. is it related to the signal matrix?} \maxime{The feedback distributions $\pi$ are related in theory to the feedback matrices (connection between signal and outcome distribution); in practice we have to estimate the feedback distribution as is.}
\begin{definition}[Set of informative feedback distributions]
    The set of informative feedback distributions, denoted
    $\Pi(x_t) = \{ \pi_i(x_t),  i \in \mathcal I \}$, contains the (unknown) feedback distribution vectors of each informative action.
\label{def:infi_dist}
\end{definition}

\begin{remark}
For practical purposes, remark that the set $\Pi(x_t)$ can be converted into a (flattened) $\sigma$-dimensional row vector. 
%\aud{Seulement vrai si $\sigma_a = s~\forall a\in \mathcal I$} \maxime{$s$ représente le nombre total de symboles associés aux actions informatives, on peut representer $\Pi(x_t)$ soit comme un set de feedback distributions, soit on peut flatten ce set et dans ce cas comme il y a $s$ symboles associés aux actions informatives, on soit que le vecteur flattened est de dimension $s$} \aud{Ok! La notation $s$ par contre peut porter à confusion avec la notation utilisée pour les symboles (fin section 2). Pourquoi pas utiliser $\sigma = \sum_{a\in\mathcal I} \sigma_a$?}
The conversion from a set to a flattened vector, and conversely from a flattened vector to a set, is possible because the cardinality $\sigma_i$ of each feedback distribution is known by definition of the PM game. 
%\aud{Donc $s = \max_{a\in\mathcal I} \sigma_a$ et du padding est utilisé quand $\sigma_a < s$?} \maxime{Non, pas de padding. $s = \displaystyle \sum_{a \in \mathcal I} \sigma_a$ i.e. le nombre total de feedback symbols associés aux actions informatives}
\label{rq:vector}
\end{remark}

The network $f_1$, therefore, learns the flattened vector associated with $\Pi(x_t)$ and predicts the desired estimates $\hat \pi_i(x_t)$ for actions $i\in \mathcal I$. Network $f_1$ can be instantiated as a fully connected multi-layer perceptron of depth $L$ and width $m$:
$$  f_1(x_t, \theta_1) = W_1^L \Psi( W^{L-1}_1 \Psi(  W^{L-2}_1   \dots  \Psi( W^1_1 x_t )  )  ), $$
where $W^1_1\in \mathbb R^{ m \times d }  $, $W^\ell_1\in \mathbb R^{ m \times m }, 1<\ell<L$, and $W^L_1\in \mathbb R^{ \sigma \times m }  $. 
%\aud{$W^C_1$?}
The notation $\Psi(x_t)= \max(0,x_t)$ refers to the ReLU activation function. We use a multi-layer perceptron as an example, but we will see in the experiments that $f_1$ can be instantiated with other neural architectures.

The network $f_1$ is trained by performing stochastic gradient descent with the mean squared error $\mathcal L_1$ between the predictions of $f_1$ and the observed feedback symbols, defined as 
\vspace{-0.5em}
$$\mathcal L_{1}(\theta_1) =  \displaystyle \sum_{\tau = 1, h_{\tau} \in \Sigma_{\mathcal I}}^{t-1}\frac{( f_1(x_{\tau}, \theta_1 ) - e( h_{\tau} ) )^2}{2},$$
where $e(\cdot)$ refers to a $\sigma$-dimensional one-hot encoding.
Note that $\theta_1$ is updated based on the history of valid feedback symbols ($h_t \in \Sigma_{\mathcal I}$) and their associated observations ($x_t$). 

%The labels of $f_1$ \aud{I don't know what ''label of a function'' means; do you mean ``outputs of $f_1$?} correspond to $c$-dimensional vectors \aud{$c$?}, where observed feedback symbols $\{h_{\tau}\}_{\tau=1}^{t-1}$ are one-hot-encoded.
%Note that no assumption is made regarding the architecture used for $f_1$. 
%\aud{Do you mean the set containing the (estimated) probability of obtaining symbol $h$ with action $a$ given observation $x$? Adjust the equation, it does not work. You cannot probability distributions to define the probabilities assigned to a single symbol. Seems like it should use $\hat \pi_a(h|x)$.}

% \aud{Pourquoi ``individual''? Cette terminologie vient de toi ou de \EENets{}?}
% \aud{No.} \maxime{why?} \aud{Certains des éléments de $\hat \pi(x)$ seront des distributions et d'autres seront un scalaire (de valeur 1). Ça ne fait pas de sens d'ajouter $\hat \pi_a(x)$ is $a \in \mathcal O$, sinon ajouter 1; les éléments de $\hat \pi(x)$ ne sont plus dans le même format sinon.}

Based on Remark~\ref{rq:vector}, the set of feedback distribution estimates over all the $N$ actions in the game is defined as
\begin{align*}
    \hat \pi(x_t) = \{ \hat \pi_i(x_t) \text{ if } i \in \mathcal I, [1] \text{ otherwise }, i \in \{1,\dots,N\} \},
\end{align*} 
where $[1]$ is the feedback distribution vector of uninformative actions and $|\hat \pi(x_t)|=N$. The $i$-th element of $\hat \pi(x_t)$ is the feedback distribution estimate of action $i$. In Sec.~\ref{sec:expl_explo}, we will describe how $\hat \pi(x_t)$ is used to trade off between exploration and exploitation.

%\aud{Pas certaine si c'est le meilleur endroit pour discuter de la distinction avec le principe appliqué aux bandits; peut-être en faire un paragraphe/sous-section dédié à résumer l'ensemble des distinctions avec les travaux antérieurs (peut-être dans la section 4.4 en changeant le nom pour l'élargir?)}

%Furthermore, the prediction space of $f_1$ in \NeuralCBP{} comprises the feedback distributions of a sub-set of informative actions. The feedback distribution of each actions can have varying dimension whereas the \EENets{} defined in \citet{ban2021ee} admit one single estimate per action. 
%\aud{I don't understand how that is different to what was being said in the beginning of the paragraph.}

%\begin{definition}[Adjusted feedback distributions]
%   Let $\pi(x)$ be the set of feedback distributions for all the informative actions in $\mathcal O$. $\pi'(x)$ is defined as
%    $$ \hat \pi'(x) = \left\{\ \forall i \in \underline N,  \hat \pi_i(x_t) \text{ if } i \in \mathcal O, 1 \text{ otherwise } \right\}. $$
%\end{definition}

%\maxime{Je n'ai pas encore trouve le meilleur moyen d'exprimer clairement la difference entre definition 3.7 et 3.8, la 3.7 correspond au actual output de f1, qui exclue les 1 pour simplifier la hypothesis class, alors que 3.8 est la version augmentee avec les 1 qui manquent car l'approche NeuralCBP a quand meme besoin de reposer sur ces valeurs, meme si elles ne sont pas apprises par un modele. Je n'ai pas encore trouve le moyen d'exprimer cet aspect proprement c'est pour cela que 3.8 est pour le moment pas incluse dans le texte.}

\paragraph{Exploration network}

The exploration network $f_2$ estimates the prediction error of network $f_1$. These estimates are used to quantify the uncertainty on the predictions of $f_1$, and they are used to compute the confidence formula defined in Eq.~\ref{eq:real_CI}.

\begin{definition}[End-to-end embedding,  \cite{ban2024neural}]
    Given the exploitation network $f_1$ and an observation $x_t$, the end-to-end embedding is defined as
    $$ \phi(x_t) = \left[  \Psi(W_1^1 x_t)^\top, \texttt{vec}(  \nabla_{W_1^L} f_1(x_t,\theta_1)^\top ) \right] \in \mathbb R^{m+\sigma m}, $$
    where the first element is the output vector of the first layer of $f_1$ and the second element is the flattened (represented by operator $\texttt{vec}$) partial derivative of $f_1$ with respect to the parameters of the last layer. 
    In practice, $\phi(x_t)$ is normalized by dividing all the elements by the $l_2$-norm of the vector. 
\label{def:e2e_embedding}
\end{definition}

%\aud{ This feels really ``programmatic''; can you rewrite in maths?} \maxime{The definition was defined as is in the paper...} \aud{Ok :( Dans ce cas il faut au moins donner la notation nécessaire dans la définition (e.g. $K$, $\sigma$, $\operatorname{vec}$}
%\aud{Normalization does not explicitly appear in the equation; do you mean that it is normalized following that definition? What does it mean to normalize a tuple of two components like this?}

To produce uncertainty estimates, the network $f_2$ learns the function $\Pi(x_t)-f_1(x_t,\theta_1)$. 
The network $f_2$ is instantiated as a multi-layer perceptron of depth $L$ and width $m$, which receives the end-to-end embedding $\phi(x_t)$:
$$  f_2(x_t, \theta_2) = W_2^L \Psi( W^{L-1}_2 \Psi(  W^{L-2}_2  \dots  \Psi( W^1_2 \phi(x_t) )  )  ), $$
where $W^1_2\in \mathbb R^{ m \times (m+ \sigma m) }  $, $W^\ell_2\in \mathbb R^{ m \times m }, 1<\ell<L$, and $W^L_2\in \mathbb R^{ \sigma \times m }  $.
The weights $\theta_2$ of network $f_2$ are updated with stochastic gradient descent using the loss 
$$ \mathcal L_2 ( \theta_2 ) = \displaystyle \sum_{\tau=1, h_{\tau} \in \Sigma_{\mathcal I} }^{t-1}  \frac{( f_2(x_{\tau},\theta_2)-( e(h_{\tau})-f_1(x_{\tau},\theta_1) ) )^2}{2}. $$

\begin{remark}
    The network $f_2$ is also based on the flattened vector representation of $\Pi(x_t)$. Therefore, the predictions of $f_2$ are $\sigma$-dimensional vectors. Since the number of symbols $\sigma_i$ is known for any action by definition of the PM game, we can convert the flattened prediction vector of $f_2$ into a set of vectors.
    \label{rq:vector2}
\end{remark}
% associated to each action in $\mathcal I$. 

%\aud{Jusqu'ici, on utilise toujours $x_t$ pour décrire les quantités. À partir d'ici, la notation change pour un contexte $x$ sans indice de temps. Il faudrait uniformiser (incluant la section 4.3).}
Given Remark~\ref{rq:vector2}, let
% the set of uncertainty estimates over the $N$ actions in the game, noted $w(x)$, is
$w(x_t) = \{ \max( \hat w_i(x_t) ) \text{ if } i \in \mathcal I,  0 \text{ otherwise }, i \in \{1,\dots,N\} \} $ denote the set of uncertainty estimates over all actions,
where $| w(x_t) |=N $ and the notation $w_i(x_t)$ refers to the $i$-th element of $w(x_t)$. In other words, the uncertainty of an informative action corresponds to the maximum uncertainty value predicted by $f_2$ over the $\sigma_i$ symbols induced by action $i$, denoted $\max( \hat w_i(x_t) )$. This can be thought of as the worst-case uncertainty for the informative action $i$. For uninformative actions, the uncertainty is $0$ following the heuristic that $\pi_i(x_t) = [1]$ for $i \not\in \mathcal I$. 

%\aud{Cette équation ne correspond pas à l'incertitude maximale sur les symboles induits par l'action $a$, elle correspond à la probabilité maximale estimée associée à un symbole induit par l'action $a$; valider.})

\subsection{Exploration and exploitation}
\label{sec:expl_explo}

By leveraging the PM-extended \EENets{} mechanism, \NeuralCBP{} can compute $\hat \delta_{i,j}(x_t)$ for all neighbor action pairs $\{i,j\} \in \mathcal N$ using the feedback distributions predicted by network $f_1$ (see Eq.~\ref{eq:feedback_dist}). It can also compute uncertainty estimates $z_{i,j}(x_t)$ on $\hat \delta_{i,j}(x_t)$ using the uncertainties predicted by network $f_2$ (see Eq.~\ref{eq:real_CI}).

Following the \CBP{} methodology, 
% Recall that \NeuralCBP{} computes $\hat \delta_{i,j}(x)$ for all pairs in the set of neighbor pairs $\mathcal N$ using Equation \ref{eq:feedback_dist}. Then, 
\NeuralCBP{} then separates low uncertainty from high uncertainty estimates of $\hat \delta_{i,j}(x_t)$ by using a \textit{successive elimination} \citep{even2002pac} criterion $ | \hat \delta_{i,j}(x_t) | \geq z_{i,j}(x_t)$ for each action pair  $\{i,j\} \in \mathcal N$.
At round $t$, the pairs that satisfy the criterion are gathered in the set of confident pairs, denoted  $\mathcal U(t)$. 
\NeuralCBP{} leverages $\mathcal U(t)$ to compute a sub-space of the probability simplex $\Delta_M$, defined as $D(t) = \{ p \in \Delta_M : \forall \{i,j\} \in \mathcal U(t), \operatorname{sign}( \hat \delta_{i,j}(x_t) ) (\textbf{c}_i-\textbf{c}_j) p > 0 \}$. The set $D(t)$ thus contains all likely outcome distributions given the confident estimates of loss differences.
The true (unknown) outcome distribution $p(x_t)$ is included with high confidence in the sub-space $D(t)$. %Indeed, $D(t)$ is characterized by constraints based on the signs of the low uncertainty estimates in $\mathcal U(t)$.
%\aud{Je ne comprends pas cette dernière phrase. Est-ce qu'elle tente de résumer en mots la définition math de $D(t)$ qui précède?} \maxime{Oui! $D(t)$ est une espace de probabilité qui est obtenu à partir de contraintes basées sur des estimations qui ont bcp de confiance, donc il y a de grande chance que la vraie outcome distribution (qu'on ne peut pas estimer en PM) soit inclue dans $D(t)$. } \aud{J'ai proposé une phrase qui me semble parler un peu plus que la description textuelle de la contrainte de signe dans la définition. Confirmer que ça conserve bien la signification.}

\NeuralCBP{} then considers the set of \textit{likely Pareto-optimal actions} $\mathcal P(t) \subseteq \mathcal P$ containing all Pareto-optimal actions $i \in \mathcal P$ such that their cell $\mathcal O_i$ intersects with the sub-space $D(t)$.
Similarly, it considers the set of \textit{likely neighbors} $\mathcal N(t) \subseteq \mathcal N$ containing all neighbor action pairs $\{i, j\} \in \mathcal N$ such that their common cell $\mathcal O_i \cap \mathcal O_j$ intersects with $D(t)$.
When $\mathcal P(t)$ contains only one action, $\mathcal N(t)$ is automatically empty, and therefore \NeuralCBP{} exploits.
When $\mathcal P(t)$ contains more than one action, it explores. 

%In contrast, the sets $\mathcal P$ and $\mathcal N$ correspond to Pareto-optimal actions and neighbor pairs without any knowledge of the outcome distribution.

\paragraph{The selected action at round $t$.}

Let $X_{i,t} = \{ \phi(x_{\tau}) \}_{\tau = 1, i_{\tau} = i }^{t-1}$ denote the history up to time $t$ (exclusively) of the observation embeddings under which action $i$ was selected, and let $G_{i,t}= \lambda I_{m+\sigma m} + X_{i,t} X_{i,t}^\top$ denote the associated Gram matrix.
Let the notation $\| x \|^2_{S} = x^\top S x$ denote the norm of vector $x$ weighted by some matrix $S$.
% \aud{$l_2$-norm?}

\begin{definition}[Underplayed actions, \cite{heuillet2024randomized}]
The set of underplayed actions, $\mathcal R(x_t) = \{  i \in  \{1, \dots, N\} \text{ s.t. } 1/ \| \phi(x_t) \|^2_{G_{i,t}^{-1}} < \eta_i f(t) \}$, contains actions that have been played less than some play rate function $f(t)$ weighted by a scalar $\eta_i > 0$. 
The quantity $1/ \| \phi(x_t) \|^2_{G_{i,t}^{-1}}$ is a pseudo-count of the number of times action $i$ was selected, weighted by the similarity between the current observation $x_t$ and the observations at previous selections of action $i$.
\label{def:rarely_sampled_actions_contextual}
\end{definition}


\begin{definition}[Neighborhood action set, \citet{bartokICML2012}]
    The neighborhood action set of a neighbor pair $\{i,j\}$ is defined as $ N^+_{i,j} = \{ k \in \{1, \dots, N\}, \mathcal O_i \cap \mathcal O_j \subseteq \mathcal O_k\}.$ Note that $N^+_{i,j}$ naturally contains $i$ and $j$. If $N^+_{i,j}$ contains another action $k$, then $\mathcal O_k = \mathcal O_i$ or $\mathcal O_k = \mathcal O_j$ or $\mathcal O_k = \mathcal O_i \cap \mathcal O_j$.
\end{definition}

\NeuralCBP{} computes the \textit{likely neighbor} action set $\mathcal N^+(t) = \bigcup_{ \{i,j\} \in \mathcal N(t) } N^+_{i,j} $ based on the remaining action pairs (likely neighbors) in $\mathcal N(t)$. Similarly, the set of \textit{likely observer actions} is defined as $\mathcal V(t) = \bigcup_{ \{i,j\} \in \mathcal N(t) } V_{i,j}$.

The final set of actions considered by \NeuralCBP{} at round $t$, denoted $\mathcal S(t)$, contains all potentially optimal actions (i.e. $\mathcal P(t) \cup \mathcal N^+(t)$) and all underplayed likely observer actions (i.e. $\mathcal V(t) \cap \mathcal R(x_t)$). From $\mathcal S(t)$, \NeuralCBP{} selects the action with the greatest uncertainty weighted by $W_{a} = \max_{\{i,j\} \in \mathcal N} \| v_{ija} \|_{\infty}$, i.e. $a_t = \argmax_{a \in \mathcal S(t)} W_a w_a(x_t)$.
%\ola{I did not understand, do you mean play the action w/ max uncertainty? if the action is predict a class, this leads to large prediction error non?} 


\section{Experiments}
\label{sec:experiments}

We compare the empirical performance of \NeuralCBP{} to state-of-the-art baselines on a set of binary, multi-class, and cost-sensitive OAL classification tasks. 
To evaluate the robustness across neural architectures, we conduct experiments with a multi-layer perceptron (MLP) and the convolutional architecture LeNet \cite{lecun1998gradient}. To our knowledge, our experiments are the first to evaluate OAL with a convolutional architecture.
For reproducibility, we open-source the code base of \NeuralCBP{} and the baselines. We also open-source the code base of PM-based OAL game environments. The code base is available on GitHub: \url{https://github.com/MaxHeuillet/neuralCBPside}.

%\subsection{Experiment set-up}
%\ola{the test set has a fixed size for each dataset right? it is important to note it since you use varying volumes of observation during training.}
\paragraph{Datasets} For binary OAL tasks, we evaluate on \textbf{Adult} \citep{asuncion2007uci}, \textbf{MagicTelescope} \citep{asuncion2007uci}, and the \textbf{modified MNIST} \citep{mnist} (odds vs. even numbers) datasets.
For multiclass OAL tasks, we consider \textbf{covertype} and \textbf{shuttle} from the UCI repository \citep{asuncion2007uci}, \textbf{MNIST} \citep{mnist}, \textbf{Fashion} \citep{fashion}, and \textbf{CIFAR10} \citep{cifar}. For each dataset, we put aside  $15\%$ of the observations to create a separate fixed size \textit{test set}, intended exclusively for evaluation of the generalization performance. 
We sample from the remaining observations a \textit{deployment stream} that has a finite horizon of $T=10$k rounds. 
The OAL strategies acquire labelled data from the deployment stream. We run each experiment $25$ times with different dataset splits for each run.

\vspace{-0.5em}
\paragraph{Baselines} We compare \NeuralCBP{} to six baselines. 
In the binary setting, we adapt the strategies \Cesa{} and \Margin{}, originally proposed for linear classifiers, to function with MLPs.
We evaluate the multi-class state-of-the-art strategies \INeural{} and \Neuronal{}. Both \INeural{} and \Neuronal{} rely on a hyper-parameter $\gamma$ that influences the amount of exploration. We consider respectively two instances of each strategy: one with the hyper-parameter configuration specified in their official publications ($\gamma=6$ for \Neuronal{} and \INeural{}) and one that we chose to induce less exploration ($\gamma=3$ for \Neuronal{} and \INeural{}). 
We further discuss implementation details and hyper-parameters in Appendix \ref{app:hyperparameters}.

\vspace{-0.5em}
\paragraph{Performance metrics} 

To characterize the performance on the deployment stream, we measure the \textit{final cumulative regret} achieved at the end of the stream (Eq.~\ref{eq:contextual_regret}), which has to be minimized. 
We also report the \textit{win count}, i.e. the number of times a strategy achieves the lowest final regret at the end of the horizon across the $25$ runs of each experiment. Lastly, we perform \textit{one-sided Welch's t-tests} to assess whether \NeuralCBP{}'s final regret distribution is significantly lower than that of the baselines.

To evaluate the generalization performance and account for possible data imbalance in the considered datasets, we measure the \textit{weighted f1-score} on the test sets. We measure the weighted f1-score while considering different volumes of expert-labelled observations acquired on the deployment stream (10, 25, 50, 100, 150, 250, 300, 400, 500, 750, 1000, 2500, 5000, 7500, 9000). Fixing the number of labelled observations allows for a fair comparison between the strategies. 

Lastly, we report the \textit{average number of expert verifications} consumed by each strategy. It is important to note that, unlike previous experimental protocols \citep{ban2024neural, ban2022improved, desalvo2021online}, we do not set a maximum expert query budget. This choice aims to illustrate how each strategy effectively adapts its label complexity to the learning task. 
This choice reflects the realistic scenario where the optimal expert budget is unknown prior to deployment (as it largely depends on the dataset and type of architecture).

%of the generalization performance based on the same volume of labelled observations for all strategies.
%We considered more small volume thresholds to characterize the ability to learn efficiently from small volumes of labelled observations. 
%These specific volume thresholds allow

\begin{figure}
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_binary_MLP_b.pdf}
        \caption{Final regret (lower is better)}
        \label{fig:Q1_regret}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_adult_b.pdf}
        \caption{Adult}
        \label{fig:Q1_adult}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_MagicTelescope_b.pdf}
        \caption{Magic Telescope}
        \label{fig:Q1_magic}
    \end{subfigure}
    \par\bigskip % adds some space between the rows
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_MNISTbinary_b.pdf}
        \caption{MNIST Binary}
        \label{fig:Q1_mnistbinary}
    \end{subfigure}
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/binary_legend_cr.png}
        %\caption{Legend}
        \label{fig:legend}
        % \vspace{-2em}
    \end{subfigure}
    \caption{Performance on binary OAL with MLP.}
    \label{fig:Q1}
\vspace{-1.5em}
\end{figure}


\vspace{-0.5em}
\subsection{Binary case}
\label{sec:Q1_results}
%The regret of \Neuronal{} \texttt{(less exploration)} is not significantly different regret than \NeuralCBP{} on MNISTbinary (p-value=0.088) and Adult (p-value-0.624).

%For example, \Neuronal{} \texttt{(less exploration)}'s regret is equivalent to \NeuralCBP{}'s on MNISTbinary (p-value=0.088) and Adult (p-value-0.624). Similar equivalence is observed with \Cesa{} (p-value = 0.67) and \Margin{} (p-value = 0.52) on the adult dataset.

Figure~\ref{fig:Q1_regret} reports the final regrets achieved on the deployment stream. 
Numerical details are reported in Table~\ref{tab:Q1} of Appendix~\ref{app:results}.
\NeuralCBP{} achieves the best final regret on the MNISTbinary and Adult datasets. On the MagicTelescope dataset, \NeuralCBP{} achieves a final regret comparable to \Neuronal{}, \Cesa{} and \Margin{}, suggesting all these strategies are close to the optimal solution on this dataset.
We observe that the final regret of all the strategies is subject to high variance (see Fig.~\ref{fig:Q1_regret}) caused by variations in the task difficulty over the $25$ dataset splits. As a result, it is not sufficient to assess the performance solely based on the average final regret metric. The win count reveals that over 25 trials, \NeuralCBP{} outperforms the baselines with 10 wins on MNISTbinary, 14 wins on MagicTelescope, and 10 wins on Adult. 


%\aud{Est-ce que tu rapportes les courbes de regret cumulatif (au fil des rounds) en Appendix? Ce serait bien pour confirmer qu'il y a bien un apprentissage.}



%\ola{with respect to the limited budget available. Right?}
Figures \ref{fig:Q1_adult}, \ref{fig:Q1_magic}, and \ref{fig:Q1_mnistbinary} display the weighted f1-score performance on the test sets for different volumes of expert queries.
For each strategy, the f1-score curve stops at different expert-query volumes, which illustrates the \textit{label complexity} of each approach. 
\NeuralCBP{} exhibits a lower label query complexity (see Figures \ref{fig:Q1_adult} and \ref{fig:Q1_magic}) and achieves an f1-score performance that is comparable to the one achieved by the other baselines. 

%\aud{Pourquoi les courbes s'arrêtent à des rounds différents? Je croyais que le stream faisait toujours $T=10k$.} \maxime{Je dois updater les x-axis des figures, ce n'est pas des rounds de temps mais des expert-queries volumes, chaque approche a une "label complexity" différente ce qui provoque les courbes de s'arreter plus ou moins tôt.}



\begin{figure}
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_nonbinary_MLP_b.pdf}
        \caption{Final regret}
        \label{fig:Q2_regret}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_FASHION_b.pdf}
        \caption{Fashion ($M=10$)}
        \label{fig:Q2_fashion}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_MNIST_b.pdf}
        \caption{MNIST ($M=10$)}
        \label{fig:Q2_mnist}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_covertype_b.pdf}
        \caption{Covertype ($M=7$)}
        \label{fig:Q2_covertype}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_shuttle_b.pdf}
        \caption{Shuttle ($M=7$)}
        \label{fig:Q2_shuttle}
    \end{subfigure}
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \vspace{0.5em}
        \includegraphics[width=\textwidth]{figures_CR/nonbinary_legend_cr.png}
        %\caption{Legend}
        \label{fig:Q2_legend}
        \vspace{-2em}
    \end{subfigure}
    \caption{Performance on multi-class OAL with MLP.}
    \label{fig:Q2}
\vspace{-1.5em}
\end{figure}



\vspace{-1em}
\subsection{Multi-class case}
\label{sec:Q2_results}
Figure \ref{fig:Q2_regret} reports the final regret on multi-class tasks. Numerical values are reported in Table~\ref{tab:Q2} (Appendix~\ref{app:results}).
\NeuralCBP{} consistently achieves the lowest final regret on the four datasets considered. The improvement in final regret performance is statistically significant for MNIST, Fashion and shuttle datasets (all p-values$<0.01$), and \NeuralCBP{} achieves identical performance to \Neuronal{} (less exploration) on covertype. The improvement ranges from $15\%$ to $40\%$ over the second best baseline (\Neuronal{} with less exploration) on MNIST, Fashion, and shuttle datasets.
%On the covertype dataset, \NeuralCBP{} improves final regret performance by $51\%$ over the second best baseline (\Neuronal{} official).

Figures \ref{fig:Q2_fashion}, \ref{fig:Q2_mnist}, \ref{fig:Q2_covertype} and \ref{fig:Q2_shuttle} show that given the same volume of expert queries, \NeuralCBP{} achieves a comparable or better f1-score performance on the test sets. Other strategies may achieve an f1-score performance increase at the expense of significantly more expert queries. However, the generalization improvement obtained from these additional labelled observations is not reflected in terms of final regret performance. 
%\aud{Slightly better = ? E.g., less than 5\%?}


\begin{figure}
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_nonbinary_LeNet_b.pdf}
        \caption{Final regret}
        \label{fig:Q3_regret}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_CIFAR10_b.pdf}
        \caption{Cifar10 ($M=10$)}
        \label{fig:Q3_cifar10}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_FASHION_b.pdf}
        \caption{Fashion ($M=10$)}
        \label{fig:Q3_fashion}
    \end{subfigure}
    \par\bigskip % adds some space between the rows
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_MNIST_b.pdf}
        \caption{MNIST ($M=10$)}
        \label{fig:Q3_MNIST}
    \end{subfigure}
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/nonbinary_LeNet_legend_cr.png}
        %\caption{Legend}
        %\label{fig:Q3_legend}
        % \vspace{-2em}
    \end{subfigure}
    \caption{Performance on multi-class OAL with LeNet.}
    \label{fig:Q3}
\end{figure}

% \ola{todo: remove non used methods from the legend.}

\begin{figure}
    \centering
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/cost_action_count_adult.pdf}
        \caption{Adult}
        \label{fig:Q4_adult}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/cost_action_count_MagicTelescope.pdf}
        \caption{MagicTelescope}
        \label{fig:Q4_magic}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/cost_action_count_MNISTbinary.pdf}
        \caption{MNIST binary}
        \label{fig:Q4_MNIST}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/cost_action_legend.png}
        % \caption{MNIST binary}
        \label{fig:Q4_legend}
    \end{subfigure}
    \hfill
    \caption{Distribution of the number of expert queries (Expl), true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN) for \NeuralCBP{} playing Label-efficient with uniform cost vs high cost on FP. Intrinsic goal of FP-sensitive: Minimize the count of FP.}
    \label{fig:Q4}
\end{figure}


\vspace{-0.75em}
\paragraph{Robustness across different neural architectures.}
\label{sec:Q3_results}
We now evaluate \Neuronal{} and \NeuralCBP{} on a set of multi-class tasks using the convolutional architecture LeNet \citep{lecun1998gradient}. 
\INeural{} is omitted because it requires one-hot encodings of observations in the action space, which is not scalable when operating over multi-dimensional tensor observations. See numeric details in Table \ref{tab:Q3}.

Figure \ref{fig:Q3_regret} shows that \NeuralCBP{} achieves the best final regret performance.
Furthermore, \NeuralCBP{} achieves an f1-score performance comparable to \Neuronal{} for equivalent volumes of expert queries. This is expected as the underlying networks $f_1$, $f_2$ and embeddings are the same for both approaches. 
Although \NeuralCBP{} and \Neuronal{} curves overlap along the f1-score axis (y-axis), it is worth noting that \NeuralCBP{} has a smaller label complexity (x-axis). \Neuronal{} (official) and \Neuronal{} (less exploration) consume more expert queries, which is not translated into an improved final regret performance.

% Specifically, on CIFAR10, \NeuralCBP{} attains the highest f1-score with 500 expert labels while \Neuronal{} (official) fails to achieve the same performance with over 5000 expert queries (10x more labels). 
% On MNIST, \NeuralCBP{} achieves the best f1-score with less than 500 expert labels while \Neuronal{} (official) fails to achieve the same performance even with twice the number of expert queries.

% \paragraph{Sensitivity to hyper-parameters.}

% \vspace{-0.5em}
\subsection{Specifying a cost structure}

In this experiment, we investigate the impact of cost-sensitivity on the action selection strategy of \NeuralCBP{}. To our knowledge, \NeuralCBP{} is the only applicable strategy in cost-sensitive OAL tasks.
We consider the previous binary OAL tasks conducted on Adult, MagicTelescope, and the modified MNIST datasets. 

Recall from Section \ref{sec:alpm} that the original label efficient game corresponds to a binary classification task where the costs of prediction errors and expert queries are equal to $1$ across the two classes. We refer to this formulation as the \textbf{uniform costs} case. We also consider a cost-sensitive variation where the cost of false negatives (FN) is twice as low as the cost of false positives (FP). We refer to this cost-sensitive variation as the \textbf{FP-sensitive costs} case. In the FP-sensitive case, the intrinsic goal is to minimize the amount of false positives. For example, this could refer to a learning system constrained to minimize incorrect positive detection in medical screenings (HIV, cancer, etc).

Figure \ref{fig:Q4} illustrates the influence of the cost structure on \NeuralCBP{}.
We measure the count distribution (mean, median, 1st and 3rd quartiles) of expert queries, true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN), over the $25$ runs.
In Adult and MagicTelescope datasets (Figures \ref{fig:Q4_adult} and \ref{fig:Q4_magic}), the third quartile of the FP count is approximately $3$ times smaller under the FP-sensitive cost structure. On MNIST binary (Figure \ref{fig:Q4_MNIST}), the mean FP count is $336\pm 112.24$ (1-std) in the uniform case; this value drops to $136\pm30$ (1-std) in the FP-sensitive case. These numeric results show that \NeuralCBP{} successfully accounts for the specified FP-sensitive cost structure.

%\aud{It could be stronger to provide actual number, e.g. 3$\times$ lower, 50\% lower,...} 


% \begin{itemize}
%     \item margin vs. successive elimination based
%     \item benefit of successive elimination based exploration
%     \item Compared to I-Neural and Neuronal, our method is hyper-parameter free
%     \item The decision to explore in our method is different from I-Neural and Neuronal (they compare the absolute difference between the 2 best predictions, which is a generalization of Cesa-Bianchi). Our method is not based on this principle.
%     \item Our guarantees are different because we express the performance of the agent in terms of regret, while online active learning typically express the performance with two different bounds, one on the generalization performance and one on the label complexity. 
% \end{itemize}


\section{Discussion }
\label{app:technical_differences}

%\EENets{} have been introduced for neural bandits in \cite{ban2021ee}. Then, \EENets{} have been leveraged in OAL with the \INeural{} strategy \cite{ban2022improved}. Further improvements on the scalability of \EENets{} have been proposed in \Neuronal{} \cite{ban2024neural}. 

\paragraph{Partial monitoring feedback.} \NeuralCBP{} is a strategy designed for the partial monitoring setting. To account for partial monitoring games, \NeuralCBP{} operates a distinction between incurred costs (not learned, specified in the cost matrix $\bf{C}$), observable feedbacks (not learned, specified in the feedback matrix $\bf{H}$) and feedback distribution (learned components noted $\hat \pi_a(x_t)$ for each action $a$). The proposed PM-based formulation makes it possible to specify a cost structure and possibly the presence of multiple experts. 

\paragraph{Distinct exploration principles.} In existing \EENets{} strategies, the predictions of the network $f_2$ are added to the predictions of $f_1$ to compute an \textit{upper confidence bound} on the predictions. Then, the magnitude of the difference between the top two predictions drives the exploration. In contrast, the predictions of network $f_2$ in \NeuralCBP{} contribute to a \textit{successive elimination} criterion (defined in Section \ref{sec:expl_explo}) that checks whether the upper and lower confidence bounds of two different actions overlap or not. 

\paragraph{Sensitivity to hyper-parameters} 
In \Neuronal{} and \INeural{} strategies, the decision of querying the expert is based on the difference between the top two class predictions. If the difference is smaller than a slack term (obtained from the theory), the strategy asks for an expert label. The slack term provided by the theory is not usually computed in practice \cite{ban2024neural}. Therefore, the user must select a proxy $\gamma$ of the slack term. In our experiments, we have evaluated the official ($\gamma=6$) instances of \Neuronal{} and \INeural{}, as well as instances that induce less exploration ($\gamma=3$). We observe from Figures \ref{fig:Q1}, \ref{fig:Q2}, and \ref{fig:Q3} that the final regret performance of \Neuronal{} and \INeural{} on a specific dataset is sensitive to the choice of $\gamma$. One benefit of \NeuralCBP{} over \Neuronal{} and \INeural{} is that the exploration is driven by a successive elimination criterion that is hyper-parameter free. This is relevant in online learning where deployment data is typically unknown in advance, making it difficult to tune hyper-parameters.



% \vspace{-0.5em}
\section{Conclusion}

Our work demonstrates the potential of the partial monitoring framework in practice, a field traditionally supported by theoretical research. 
We leverage the PM framework to formulate OAL tasks and propose \NeuralCBP{}, a PM strategy able to learn efficiently from neural networks. 
While the emphasis of this paper is on OAL, \NeuralCBP{} is a general PM approach that can be applied to the broader diversity of partial monitoring games.
Lastly, we demonstrate the empirical performance of \NeuralCBP{} on a set of binary, multi-class and cost-sensitive OAL tasks, and highlight technical and empirical benefits over existing OAL strategies. 

\paragraph{Limitations} A limitation of \NeuralCBP{} is that it does not scale well with a large number of classes. Combinatorial PM strategies could address this limitation \citep{lin2014combinatorial}.
Furthermore, \NeuralCBP{} does not capture the multi-expert case. 
The multi-expert case is studied in \citet{dekel2012selective, pmlr-v162-kumar22a} without the PM framework but a PM perspective based on \citet{kirschner2020information, kirschner2023linear} could be an avenue of future research.

\begin{contributions}
Maxime Heuillet: conceptualization,  methodology, empirical investigation, visualizations, implementation, writing (original draft, editing), funding acquisition. 
Ola Ahmad: conceptualization, writing (review, editing), supervision. 
Audrey Durand: conceptualization, writing (review, editing), supervision, funding acquisition.
\end{contributions}

\begin{acknowledgements} 
This work was funded through a Mitacs Accelerate grant. We thank Alliance Canada and Calcul Quebec for access to computational resources and staff expertise consultation. We would like to thank Dr. Yikun Ban for answering our technical questions about \INeural{} and \Neuronal{}. We also acknowledge the library pmlib of Tanguy Urvoy that was helpful to implement \NeuralCBP{} and PM game environments.
\end{acknowledgements}

% References
\bibliography{Bib}

\newpage

\onecolumn

\title{Neural Active Learning Meets the Partial Monitoring Framework\\(Supplementary Material)}
\maketitle

\appendix

\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|l|c|}
\hline
\textbf{Notation}                           & \textbf{Definition}          & \textbf{Observable by the agent?} \\ \hline
$N$                                & Number of actions  & $\checkmark$ \\ \hline
$M$                                & Number of outcomes & $\checkmark$  \\ \hline
$E$                                & Number of experts & $\checkmark$  \\ \hline
$\Sigma$                 & Feedback space (space of symbols) & $\checkmark$  \\ \hline
$\bf{C} \in [0,1]^{N \times M}$     & Cost matrix  & $\checkmark$ \\ \hline
$\bf{c}_i$          & Row $i$ in matrix $\bf{C}$ (associated with action $i$) & $\checkmark$ \\ \hline
$\bf{H} \in \Sigma^{N \times M}$      & Feedback matrix  & $\checkmark$ \\ \hline
$\bf{h}_i$          & Row $i$ in matrix $\bf{H}$ (associated with action $i$) & $\checkmark$ \\ \hline
$\sigma_i$  & Number of unique feedback symbols induced by action $i$ (i.e. on row $i$ of $H$) & $\checkmark$  \\ \hline
$\Delta_M$              & Probability simplex of dimension $M$ (i.e. over the outcome space) &  $\checkmark$ \\ \hline
$\Delta_{\sigma_i} $     & Probability simplex of dimension $\sigma_i$ (i.e. over the symbol space induced by action $i$) & $\checkmark$  \\ \hline
$T$          & Total number of rounds in a game (horizon) & \ding{55} \\ \hline
$i_t$          & Action played by the agent at round $t$ & $\checkmark$ \\ \hline
$y_t$          & Outcome  at round $t$ & \ding{55} \\ \hline
$h_t$          & Feedback symbol at round $t$ & $\checkmark$ \\ \hline
$\bf{H}[i_t,y_t]$          & Element in matrix $\bf{H}$ at row $i_t$ and column $y_t$ (i.e. feedback received at round $t$) & $\checkmark$ \\ \hline
$\bf{C}[i_t,y_t]$          & Element in matrix $\bf{C}$ at row $i_t$ and column $y_t$ (i.e. loss incurred at round $t$) & \ding{55} \\ \hline
$\mathcal X$   & Observation space   & \ding{55} \\ \hline
$\mathcal I$   & Set of informative actions &  $\checkmark$ \\ \hline
$\Sigma_{\mathcal I}$   & Valid feedback symbols & $\checkmark$ \\  \hline
$\Pi(x_t)$   & Set of informative feedback distributions & \ding{55} \\  \hline
$x_t \in \mathcal X$   & Observation received at time $t$ & $\checkmark$  \\ \hline
$X_{i,t}$   & History of end-to-end embeddings for action $i$ up to time $t$  & $\checkmark$  \\ \hline
$G_{i,t} $   & Gram matrix for action $i$ up to time $t$ & $\checkmark$ \\ \hline
$p(x_t)\in \Delta_M$   & Outcome distribution & \ding{55} \\ \hline
$\mathcal{O}_i \subseteq \Delta_M$  & Cell of action $i$  & $\checkmark$ \\ \hline
$\Sigma_i$  & Enumeration of symbols sorted by order of appearance in $h_i$ & $\checkmark$ \\ \hline
$S_i \in \{0,1\}^{\sigma_i \times M}$          & Signal matrix of action $i$ & $\checkmark$  \\ \hline
$\pi_i(x_t) \in \Delta_{\sigma_i}$  & Distribution for the unique feedback symbols induced by action $i$ & \ding{55} \\ \hline
$\delta_{i,j}(x_t)$          & Expected loss difference between action $i$ and $j$ & \ding{55} \\ \hline
$\mathcal P$          & Set of Pareto optimal actions (i.e. set of actions) & $\checkmark$ \\ \hline
$\mathcal N$          & Set of neighbor action pairs (i.e. set of pairs of actions) & $\checkmark$ \\ \hline
$\mathcal U(t)$          & Set of confident action pairs (i.e. set of pairs of actions) & $\checkmark$ \\ \hline
$V_{i,j}$          & Observer set for pair $i,j$ (i.e. set of actions ) & $\checkmark$ \\ \hline
$v_{ija}$          & Observer vector associated with $V_{i,j}$ (index $a$ indicates to which action in $V_{i,j}$ it is associated to) & $\checkmark$ \\ \hline
$z_{i,j}(t)$          & Confidence for a pair $\{i,j\}$ at round $t$ & $\checkmark$ \\ \hline
$D(t) \subseteq \Delta_M$        & Sub-space of the simplex based on constraints in $\mathcal U(t)$, it includes $p^\star$ with high confidence & $\checkmark$      \\ \hline
$N^+_{i,j}$          & Neighbor action set for pair $i,j$ (set of actions) & $\checkmark$ \\ \hline
$\mathcal P(t)$          & Plausible subset of $\mathcal P$ given $D(t)$ (set of actions)  & $\checkmark$ \\ \hline
$\mathcal N(t)$          & Plausible subset of $\mathcal N$ given $D(t)$ (set of pairs of actions) & $\checkmark$ \\ \hline
$\mathcal R(x_t)$          & Set of underplayed actions at time $t$ (set of actions) & $\checkmark$ \\ \hline
$e(\cdot)$          & One hot encoding & $\checkmark$ \\ \hline
$\mathcal S(t)$          & Final set of actions considered by CBP (set of actions) & $\checkmark$ \\ \hline
$W_a = \max_{\{i,j\} \in \mathcal N} \| v_{ija} \|_{\infty} $   & Weight of an action & $\checkmark$  \\ \hline
\end{tabular} }
\caption{List of notations}
\label{tab:notations}
\end{table}

\section{Analysis of the label efficient game}
\label{app:label_efficient}

The original label-efficient game \citep{helmbold1997some} is defined by the following cost and feedback matrices: 

\begin{align}
    \bf{C}=\kbordermatrix{ & \text{class } A & \text{class } B\\
        \text{pred. class A} & 0 & 1\\
        \text{pred. class B} & 1 & 0 \\
                \text{expert} & 1 & 1},
    \bf{H}=\kbordermatrix{
        & \text{class } A & \text{class } B\\
        & \Diamond & \Diamond \\
        & \wedge & \wedge\\
        & \bot & \odot} \notag .
\end{align}

The game includes a set of $N=3$ possible actions and $M=2$ possible outcomes (class A, and class B). 
For actions $1$ and $2$, there is $\sigma_1 = \sigma_2 = 1$ unique feedback symbol. For action $3$, there is $\sigma_3=2$ feedback symbols, and the enumeration is $\{\bot, \odot \}$. Therefore, the set of informative actions is $\mathcal I = \{ 3\}$. 

\paragraph{Signal Matrices:} The dimension of the signal matrices are such that $S_1 \in \{0,1\}^{1\times2}$ and $S_2 \in \{0,1\}^{1\times2}$ and $S_3 \in \{0,1\}^{2\times2}$. The matrices verify:  \\
$$ S_1 =  \begin{bmatrix} 1 & 1   \end{bmatrix}, \quad S_2 =  \begin{bmatrix} 1 & 1   \end{bmatrix}, \quad S_3 = \begin{bmatrix} 1 & 0 \\ 0 & 1  \end{bmatrix} $$

The outcome distribution is noted $p^\star = [p_A, p_B]^\top$.

\paragraph{Cells:} Each action can be associated to a sub-space of the probability simplex noted \textit{cell} (see Definition \ref{def:cell}): 
\begin{itemize}
    \item For action 1, we have:  $\mathcal O_1 = \{ p \in  \Delta_M, \forall j \in \{1,\dots, N\}, (\textbf{c}_1-\textbf{c}_j) p \leq 0 \}$. This probability space corresponds to the following constraints: 
    $$   \begin{bmatrix} \textbf{c}_1-\textbf{c}_1 \\ \textbf{c}_1-\textbf{c}_2 \\ \textbf{c}_1-\textbf{c}_3 \end{bmatrix} p    =   \begin{bmatrix} 0 & 0 \\ -1 & 1 \\ -1 & 0 \end{bmatrix} p    \leq 0 $$
    The first constraint $   (\textbf{c}_1-\textbf{c}_1) p    \leq 0$ is always verified. The second constraint $   (\textbf{c}_1-\textbf{c}_2) p  \leq 0$ implies $-p_A+p_B \leq 0 \iff p_B \leq p_A$. The third constraint $  (\textbf{c}_1-\textbf{c}_3) p    \leq 0$ implies $-p_A \leq 0 \iff p_A \geq 0$.
    \item For action 2, we have:  
    $\mathcal O_2 = \{ p \in  \Delta_M, \forall j \in \{1,\dots, N\}, (\textbf{c}_2-\textbf{c}_j) p \leq 0 \}$. This probability space corresponds to the following constraints: 
    $$  \begin{bmatrix} \textbf{c}_2-\textbf{c}_1 \\ \textbf{c}_2-\textbf{c}_2 \\ \textbf{c}_2-\textbf{c}_3  \end{bmatrix} p = \begin{bmatrix} 1 & -1 \\ 0 & 0 \\ 0 & -1 \end{bmatrix} p    \leq 0 $$
    The second constraint $ (\textbf{c}_2-\textbf{c}_2) p \leq 0$ is always satisfied. The first constraint $ (\textbf{c}_2-\textbf{c}_1) p \leq 0$ implies $p_A-p_B \leq 0 \iff p_A \leq p_B$. The third constraint $ (\textbf{c}_2-\textbf{c}_3) p \leq 0$ implies $-p_B \leq 0 \iff p_B \geq 0$.
    \item For action 3, we have:  $\mathcal O_3 = \{ p \in  \Delta_M, \forall j \in \{1,\dots, N\}, (\textbf{c}_3-\textbf{c}_j) p \leq 0 \}$. 
    This probability space corresponds to the following constraints: 
    $$   \begin{bmatrix} \textbf{c}_3-\textbf{c}_1 \\ \textbf{c}_3-\textbf{c}_2 \\ \textbf{c}_3-\textbf{c}_3 \end{bmatrix}  p  = \begin{bmatrix} 1 & 0 \\ 0 & 1 \\ 0 & 0 \end{bmatrix}  p \leq 0 $$
    The third constraint $ (\textbf{c}_3-\textbf{c}_3) p \leq 0$ is always verified. The first constraint $ (\textbf{c}_3-\textbf{c}_1) p  \leq 0$ implies $ p_A \leq 0$ and the second constraint $   (\textbf{c}_3-\textbf{c}_2) p    \leq 0$ implies $ p_B \leq 0$. 
    There exists no probability vector in $\Delta_M$ satisfying these three constraints at the same time. 
\end{itemize}

\paragraph{Pareto optimal actions:} From the analysis of the cells, we have $\mathcal O_{3} = \emptyset$. Therefore, action 3 is dominated, according to Definition \ref{def:cell}. The remaining actions 1 and 2 are Pareto optimal because their respective cells are not included in one another, i.e. $\mathcal P = \{1,2\}$.

\paragraph{Neighbor actions:} In this paragraph, we will determine whether action 1 and 2 are a neighbor pair.

$$ \mathcal O_1 \cap \mathcal O_2 = \begin{cases} p_B \leq p_A \\  p_A \geq 0 \\  p_A \leq p_B \\ p_B \geq 0 \end{cases} $$
The only point in this vector space is $\begin{bmatrix} 0.5 & 0.5   \end{bmatrix}^\top$. Therefore, $\dim(\mathcal O_1 \cap \mathcal O_2) = 0 = M-2$ and the pair $\{1,2\}$ is a neighbor pair, i.e. $\mathcal N = \{ \{1,2\} \}$.

\paragraph{Neighbor action set:} This set is defined as $N^+_{ij} = \{ k \in \{ 1, \dots, N\}, \mathcal O_i \cap \mathcal O_j \subset \mathcal O_k \}$. This yields: $N^+_{1,2} = N^+_{2,1} = \{1,2\}$ because the cell of action $3$ is empty.

\paragraph{Informative action set} Action $3$ is the only informative action because $\sigma_3=2>1$, whereas $\sigma_1=\sigma_2=1$.

\paragraph{Observer set:} We have $V_{1,2} = V_{2,1} = \{ 3 \}$, because action $3$ is the only informative action.

\paragraph{Observer vector:} For the pair $\{1,2\}$, we have to find $v_{ija}, a \in V_{ij}$ such that $\textbf{c}_i^\top - \textbf{c}_j^\top = \sum_{a \in V_{ij} } S_a^\top v_{ija}$, according to Definition \ref{def:observer_vector}. Since $V_{1,2} = \{3\}$, choosing $v_{123}^\top = \begin{bmatrix} -1 & 1 \end{bmatrix}$ verifies the relation: 

\begin{equation*}
    \textbf{c}_1^\top - \textbf{c}_2^\top =   \begin{bmatrix} -1 \\ 1 \end{bmatrix} =    \begin{bmatrix} 1 & 0 \\ 0 & 1  \end{bmatrix} \begin{bmatrix} -1 \\ 1 \end{bmatrix}    
\end{equation*}

\section{Implementation details}
\label{app:implementation}

The pseudo-code in Algorithm \ref{alg:gradient_descent} details how the \EENets{} is updated.

\begin{algorithm}
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}
\DontPrintSemicolon
\caption{Update EENet with gradient descent}
\label{alg:gradient_descent}

\Input{$\theta_1, \theta_2$  }

Epoch number $K_1$ and $K_2$, learning rate $\mu_1$ and $\mu_2$ \;

Initialize $\theta_1^{(0)} = \theta_1$ \;

\For{$k \in \{ 1, \dots , K_1 \} $} { 
        $\theta_1^{(k)} = \theta_1^{(k-1)} - \mu_1 \nabla_{ \theta_1^{(k-1)} } \mathcal L_1( \theta_1^{(k-1)} ) $ \;
}

$\tilde \theta_1 = \theta^{(K_1)}_1$ \;

Initialize $\theta_2^{(0)} = \theta_2$ \;

\For{$k \in \{ 1, \dots , K_2 \} $} { 
        $\theta^{(k)}_2 = \theta^{(k-1)}_2 - \mu_2 \nabla_{ \theta^{(k-1)}_2 } \mathcal L_2( \theta_2^{(k-1)} ) $ \;
}

$\tilde \theta_2 = \theta_2^{(K_2)}$ \;


\Output{$\tilde \theta_1, \tilde \theta_2$}

\end{algorithm}


\section{Experiment details}

\label{app:hyperparameters}

The neural components of the strategies refer to two networks $f_1$ and $f_2$ for \NeuralCBP{}, \Neuronal{} and \INeural{}; $f_1$ is trained using the MSE loss functions $\mathcal L_1$ and $f_2$ with $\mathcal L_2$. 
For strategies \Cesa{} and \Margin{}, the neural component corresponds to one network $f_1$,  which is trained using the MSE loss function $\mathcal L_1$. 
In the experiments with a MLP, $f_1$ is a MLP architecture of width $m=100$ and depth $L=2$, and $f_2$ is a MLP of width $m=100$ and depth $L=2$. In the experiments with a LeNet, $f_1$ is a LeNet architecture \cite{lecun1998gradient}, and $f_2$ is a MLP of width $m=100$ and depth $L=2$.

\paragraph{Update protocol for the neural components of the strategies.}

At the beginning of the game, each strategy plays each action once. Then, we perform updates at every round for $N \leq t \leq 50$. To save compute, we update every $50$ rounds for $50 < t \leq 1000$, and every $500$ rounds for $t > 1000$. An equivalent update protocol has been used in related neural online learning literature \cite{xu2022neural}. This update protocol is implemented for all the strategies considered in the experiments.

\paragraph{End-to-end embedding down sampling.} Both \Neuronal{} and \NeuralCBP{} use the \textit{end-to-end embedding} (see Definition \ref{def:e2e_embedding}). Due to the dimension of a flattened gradient, the embedding received as input to $f_2$ requires a down-sampling. Similarly to \cite{ban2024neural}, we use a \textit{block-reduction averaging operator}. When $f_1$ is based on a MLP architecture, we set the reduction parameter to $51$ following \cite{ban2024neural}. When $f_1$ is based on a LeNet architecture, we set the reduction parameter to $51$ for MNIST and FASHION datasets. For CIFAR10, we increase the block averaging to $153=51\times 3$ to account for the three color channels (RGB) of CIFAR10 observations.

\paragraph{\NeuralCBP{}.} 
To speed up compute, the inversion and updates of the Gram matrix are performed on GPU, using the Sherman--Morrison update \citep{sherman_morrison}.
We set $f(t) = \alpha^{1/3} t^{2/3} \log(t)^{1/3}$, $\eta_a = W_a^{2/3}$ and $\alpha=1.01$ according to previous literature \cite{heuillet2024randomized}.
This combination of parameters is justified by the theoretical analysis of \CBP{} and is not meant to be tuned further. 

To update $f_1$ and $f_2$, we use the Adam optimizer, with the learning rate set to the default value $\mu_1 = \mu_2 = 0.001$ (both for MLP and LeNet architectures). Following \cite{ban2024neural}, we set the batch size to $64$ and the number of epochs to $40$. We performed a grid search for the learning rate over $\{0.0001, 0.001 \}$ and found that the value $0.001$ performs best.


\paragraph{\Neuronal{}.}
The strategy \Neuronal{} admits a hyper-parameter $\gamma$ that influences the amount of exploration. For \Neuronal{} (official), we set $\gamma=6$, as reported in \citet{ban2024neural}. We also consider the instance $\gamma=3$, denoted \Neuronal{} (less exploration), which explores less.

Following \cite{ban2024neural},  we set the batch size to $64$ and the number of epochs to $K_1=K_2=40$. 
For $f_1$ and $f_2$, we use the Adam optimizer, with the value for the learning rate set at $\mu_1=\mu_2=0.001$ for both the MLP and LeNet architectures.

We performed a grid search and report empirical findings for the learning rate over $\{0.0001, 0.001\}$. We observed from Tables \ref{tab:Q1}, \ref{tab:Q1_app}, \ref{tab:Q2}, \ref{tab:Q2_app}, \ref{tab:Q3} and \ref{tab:Q3_app} that a learning rate $\mu_1=\mu_2=0.001$ performs best on most datasets for both instances of \Neuronal{} (official and $\gamma=3$).

\paragraph{\INeural{}.}
The exploration parameter is set to $\gamma=6$ for \INeural{} (official) and to $\gamma = 3$ for \INeural{} (less exploration).
Note that the networks $f_1$ and $f_2$ of \INeural{} have a different input dimension, as they require input observations to be one-hot-encoded over the action space. 

For $f_1$ and $f_2$, we use the Adam optimizer, with the default value for the learning rate set at $\mu_1=\mu_2=0.001$. The batch size is set to $64$ and the number of epochs to $40$. Since this approach is known to be outperformed by \Neuronal{}, we use the set of optimal hyper-parameters reported in \cite{ban2024neural}.

\paragraph{\Cesa{}.}
The exploration strategy of the approach \Cesa{} is hyper-parameter free. 
For the network $f_1$, we use the Adam optimizer, with the default value for the learning rate set at $\mu_1=0.001$. The batch size is set to $64$ and the number of epochs to $K_1=40$.

\paragraph{\Margin{}.}
The exploration parameter of the \Margin{} approach is set to $1$.
For network $f_1$, we use the Adam optimizer, with the default value for the learning rate set at $\mu_1=0.001$. The batch size is set to $64$ and the number of epochs to $K_1=40$.

\subsection{Numerical results}
\label{app:results}

In this Appendix, we report the empirical performance of \Neuronal{} with a learning rate set to $\mu_1=\mu_2=0.0001$. We also report numeric details for all the figures reported in the main body and appendix.
\bigbreak

\begin{multicols}{2}



\begin{figure}[H]
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_binary_MLP_a.pdf}
        \caption{Final regret (lower is better)}
        \label{fig:Q1_regret_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_adult_a.pdf}
        \caption{Adult}
        \label{fig:Q1_adult_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_MagicTelescope_a.pdf}
        \caption{Magic Telescope}
        \label{fig:Q1_magic_app}
    \end{subfigure}
    \par\bigskip % adds some space between the rows
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case1_MLP_MNISTbinary_b.pdf}
        \caption{MNIST Binary}
        \label{fig:Q1_mnistbinary_app}
    \end{subfigure}
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/binary_legend_cr.png}
        %\caption{Legend}
        \label{fig:legend_app}
        % \vspace{-2em}
    \end{subfigure}
    \caption{Performance on binary OAL with MLP. \Neuronal{} with $\mu_1=\mu_2=0.0001$.}
    \label{fig:Q1_app}
\end{figure}

\begin{figure}[H]
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_nonbinary_MLP_a.pdf}
        \caption{Final regret}
        \label{fig:Q2_regret_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_FASHION_a.pdf}
        \caption{Fashion ($M=10$)}
        \label{fig:Q2_fashion_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_MNIST_a.pdf}
        \caption{MNIST ($M=10$)}
        \label{fig:Q2_mnist_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_covertype_a.pdf}
        \caption{Covertype ($M=7$)}
        \label{fig:Q2_covertype_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_game_case_seven_MLP_shuttle_a.pdf}
        \caption{Shuttle ($M=7$)}
        \label{fig:Q2_shuttle_app}
    \end{subfigure}
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \vspace{0.5em}
        \includegraphics[width=\textwidth]{figures_CR/nonbinary_legend_cr.png}
        %\caption{Legend}
        \label{fig:Q2_legend_app}
        \vspace{-2em}
    \end{subfigure}
    \caption{Performance on multi-class OAL with MLP. \Neuronal{} with $\mu_1=\mu_2=0.0001$. }
    \label{fig:Q2_app}
% \vspace{-1.5em}
\end{figure}

\end{multicols}

\begin{figure}[H]
    \centering
    \begin{subfigure}[b]{0.44\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/results_nonbinary_LeNet_a.pdf}
        \caption{Final regret}
        \label{fig:Q3_regret_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_CIFAR10_a.pdf}
        \caption{Cifar10 ($M=10$)}
        \label{fig:Q3_cifar10_app}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_FASHION_a.pdf}
        \caption{Fashion ($M=10$)}
        \label{fig:Q3_fashion_app}
    \end{subfigure}
    \par\bigskip % adds some space between the rows
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/f1_case2_LeNet_MNIST_a.pdf}
        \caption{MNIST ($M=10$)}
        \label{fig:Q3_MNIST_app}
    \end{subfigure}
    \begin{subfigure}[b]{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures_CR/nonbinary_LeNet_legend_cr.png}
        %\caption{Legend}
        %\label{fig:Q3_legend}
        % \vspace{-2em}
    \end{subfigure}
    \caption{Performance on multi-class OAL with LeNet. \Neuronal{} with $\mu_1=\mu_2=0.0001$. }
    \label{fig:Q3_app}
\end{figure}

\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset        & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neural-CBP                  & 1351.92     & 1.0     & 10.0      & 638.32           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & IneurAL (official)          & 1701.72     & 0.0     & 0.0       & 1311.44          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & IneurAL (less exploration)  & 1701.28     & 0.103   & 2.0       & 624.96           & 0.756                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neuronal (official)         & 1566.16     & 0.0     & 0.0       & 1363.88          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neuronal (less exploration) & 1646.16     & 0.172   & 13.0      & 778.4            & 0.01                  \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Cesa                        & 2627.52     & 0.0     & 0.0       & 303.28           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Margin                      & 3169.6      & 0.0     & 0.0       & 100.28           & 0.0                   \\ \hline
MagicTelescope & Neural-CBP                  & 3370.36     & 1.0     & 14.0      & 496.68           & 1.0                   \\ \hline
MagicTelescope & IneurAL (official)          & 7343.04     & 0.0     & 0.0       & 6452.8           & 0.0                   \\ \hline
MagicTelescope & IneurAL (less exploration)  & 4473.2      & 0.0     & 0.0       & 3138.28          & 0.0                   \\ \hline
MagicTelescope & Neuronal (official)         & 3716.8      & 0.207   & 0.0       & 1604.84          & 0.0                   \\ \hline
MagicTelescope & Neuronal (less exploration) & 3287.0      & 0.769   & 7.0       & 444.64           & 0.613                 \\ \hline
MagicTelescope & Cesa                        & 3245.64     & 0.623   & 1.0       & 328.76           & 0.033                 \\ \hline
MagicTelescope & Margin                      & 3574.12     & 0.483   & 3.0       & 58.04            & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neural-CBP                  & 2968.88     & 1.0     & 10.0      & 127.64           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & IneurAL (official)          & 8243.12     & 0.0     & 0.0       & 7610.2           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & IneurAL (less exploration)  & 3909.48     & 0.027   & 0.0       & 1136.24          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neuronal (official)         & 3222.24     & 0.551   & 1.0       & 951.72           & 0.025                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neuronal (less exploration) & 3035.92     & 0.884   & 5.0       & 89.64            & 0.379                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Cesa                        & 3180.0      & 0.643   & 3.0       & 320.12           & 0.001                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Margin                      & 3166.56     & 0.682   & 6.0       & 19.92            & 0.0                   \\ \hline
\end{tabular}%
}
\caption{
Supplement for Section \ref{sec:Q1_results} presented in the main paper (see Figure \ref{fig:Q1}). \Neuronal{} with $\mu_1=\mu_2=0.001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries.
}
\label{tab:Q1}
\end{table}

\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset        & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neural-CBP                  & 1351.92     & 1.0     & 15.0      & 638.32           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & IneurAL (official)          & 1701.72     & 0.0     & 0.0       & 1311.44          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & IneurAL (less exploration)  & 1701.28     & 0.103   & 8.0       & 624.96           & 0.756                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neuronal (official)         & 1730.08     & 0.0     & 0.0       & 1558.76          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Neuronal (less exploration) & 1440.48     & 0.005   & 2.0       & 987.84           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Cesa                        & 2627.52     & 0.0     & 0.0       & 303.28           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST binary   & Margin                      & 3169.6      & 0.0     & 0.0       & 100.28           & 0.0                   \\ \hline
MagicTelescope & Neural-CBP                  & 3370.36     & 1.0     & 16.0      & 496.68           & 1.0                   \\ \hline
MagicTelescope & IneurAL (official)          & 7343.04     & 0.0     & 0.0       & 6452.8           & 0.0                   \\ \hline
MagicTelescope & IneurAL (less exploration)  & 4473.2      & 0.0     & 0.0       & 3138.28          & 0.0                   \\ \hline
MagicTelescope & Neuronal (official)         & 4226.96     & 0.002   & 0.0       & 2555.96          & 0.0                   \\ \hline
MagicTelescope & Neuronal (less exploration) & 3372.2      & 0.994   & 4.0       & 556.52           & 0.631                 \\ \hline
MagicTelescope & Cesa                        & 3245.64     & 0.623   & 3.0       & 328.76           & 0.033                 \\ \hline
MagicTelescope & Margin                      & 3574.12     & 0.483   & 2.0       & 58.04            & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neural-CBP                  & 2968.88     & 1.0     & 10.0      & 127.64           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & IneurAL (official)          & 8243.12     & 0.0     & 0.0       & 7610.2           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & IneurAL (less exploration)  & 3909.48     & 0.027   & 2.0       & 1136.24          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neuronal (official)         & 4819.44     & 0.001   & 1.0       & 3395.52          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Neuronal (less exploration) & 2523.64     & 0.155   & 3.0       & 305.96           & 0.032                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Cesa                        & 3180.0      & 0.643   & 3.0       & 320.12           & 0.001                 \\ \hline
\rowcolor[HTML]{C0C0C0} 
adult          & Margin                      & 3166.56     & 0.682   & 6.0       & 19.92            & 0.0                   \\ \hline
\end{tabular}%
}
\caption{
Supplement for Figure \ref{fig:Q1_app}. \Neuronal{} with $\mu_1=\mu_2=0.0001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries.
}
\label{tab:Q1_app}
\end{table}


% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset   & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neural-CBP                  & 1811.96     & 1.0     & 25.0      & 1305.24          & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & IneurAL (official)          & 4016.84     & 0.0     & 0.0       & 3870.12          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & IneurAL (less exploration)  & 2371.36     & 0.0     & 0.0       & 1818.56          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neuronal (official)         & 3275.0      & 0.0     & 0.0       & 3224.84          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neuronal (less exploration) & 2213.16     & 0.0     & 0.0       & 2030.44          & 0.0                   \\ \hline
Fashion   & Neural-CBP                  & 2898.24     & 1.0     & 25.0      & 1523.92          & 1.0                   \\ \hline
Fashion   & IneurAL (official)          & 4882.48     & 0.0     & 0.0       & 4382.88          & 0.0                   \\ \hline
Fashion   & IneurAL (less exploration)  & 3437.76     & 0.0     & 0.0       & 2210.2           & 0.0                   \\ \hline
Fashion   & Neuronal (official)         & 4466.76     & 0.0     & 0.0       & 4193.28          & 0.0                   \\ \hline
Fashion   & Neuronal (less exploration) & 3417.44     & 0.0     & 0.0       & 2777.44          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neural-CBP                  & 5060.24     & 1.0     & 12.0      & 446.96           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & IneurAL (official)          & 9129.68     & 0.0     & 0.0       & 8235.72          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & IneurAL (less exploration)  & 7707.72     & 0.0     & 0.0       & 4862.12          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neuronal (official)         & 5970.96     & 0.0     & 0.0       & 4802.48          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neuronal (less exploration) & 5060.92     & 0.997   & 13.0      & 1330.4           & 0.001                 \\ \hline
shuttle   & Neural-CBP                  & 907.56      & 1.0     & 14.0      & 190.08           & 1.0                   \\ \hline
shuttle   & IneurAL (official)          & 7989.88     & 0.0     & 0.0       & 7901.0           & 0.0                   \\ \hline
shuttle   & IneurAL (less exploration)  & 2810.08     & 0.0     & 1.0       & 2292.48          & 0.0                   \\ \hline
shuttle   & Neuronal (official)         & 1547.08     & 0.004   & 7.0       & 183.48           & 0.902                 \\ \hline
shuttle   & Neuronal (less exploration) & 1524.0      & 0.003   & 4.0       & 157.16           & 0.437                 \\ \hline
\end{tabular}%
}
\caption{Numeric values in support of Section \ref{sec:Q2_results} presented in the main paper (see Figure \ref{fig:Q2}). \Neuronal{} with $\mu_1=\mu_2=0.001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries. }
\label{tab:Q2}
\end{table}

% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset   & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neural-CBP                  & 1811.96     & 1.0     & 25.0      & 1305.24          & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & IneurAL (official)          & 4016.84     & 0.0     & 0.0       & 3870.12          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & IneurAL (less exploration)  & 2371.36     & 0.0     & 0.0       & 1818.56          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neuronal (official)         & 4123.8      & 0.0     & 0.0       & 4093.68          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST     & Neuronal (less exploration) & 2722.72     & 0.0     & 0.0       & 2605.16          & 0.0                   \\ \hline
Fashion   & Neural-CBP                  & 2898.24     & 1.0     & 25.0      & 1523.92          & 1.0                   \\ \hline
Fashion   & IneurAL (official)          & 4882.48     & 0.0     & 0.0       & 4382.88          & 0.0                   \\ \hline
Fashion   & IneurAL (less exploration)  & 3437.76     & 0.0     & 0.0       & 2210.2           & 0.0                   \\ \hline
Fashion   & Neuronal (official)         & 4881.16     & 0.0     & 0.0       & 4709.56          & 0.0                   \\ \hline
Fashion   & Neuronal (less exploration) & 3584.28     & 0.0     & 0.0       & 3089.64          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neural-CBP                  & 5060.24     & 1.0     & 20.0      & 446.96           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & IneurAL (official)          & 9129.68     & 0.0     & 1.0       & 8235.72          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & IneurAL (less exploration)  & 7707.72     & 0.0     & 0.0       & 4862.12          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neuronal (official)         & 7831.48     & 0.0     & 0.0       & 7287.32          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
covertype & Neuronal (less exploration) & 5659.96     & 0.0     & 4.0       & 3431.8           & 0.0                   \\ \hline
shuttle   & Neural-CBP                  & 907.56      & 1.0     & 20.0      & 190.08           & 1.0                   \\ \hline
shuttle   & IneurAL (official)          & 7989.88     & 0.0     & 0.0       & 7901.0           & 0.0                   \\ \hline
shuttle   & IneurAL (less exploration)  & 2810.08     & 0.0     & 0.0       & 2292.48          & 0.0                   \\ \hline
shuttle   & Neuronal (official)         & 1661.08     & 0.0     & 1.0       & 1277.04          & 0.0                   \\ \hline
shuttle   & Neuronal (less exploration) & 1666.0      & 0.0     & 4.0       & 404.56           & 0.011                 \\ \hline
\end{tabular}%
}
\caption{Numeric values in support of Figure \ref{fig:Q2_app}. \Neuronal{} with $\mu_1=\mu_2=0.0001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries. }
\label{tab:Q2_app}
\end{table}


\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neural-CBP                  & 954.458     & 1.0     & 17.0      & 439.75           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neuronal (official)         & 1342.24     & 0.0     & 0.0       & 1284.16          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neuronal (less exploration) & 992.6       & 0.015   & 7.0       & 817.92           & 0.0                   \\ \hline
Fashion & Neural-CBP                  & 2749.72     & 1.0     & 20.0      & 606.52           & 1.0                   \\ \hline
Fashion & Neuronal (official)         & 3576.76     & 0.0     & 0.0       & 3027.96          & 0.0                   \\ \hline
Fashion & Neuronal (less exploration) & 2818.8      & 0.072   & 6.0       & 1698.16          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neural-CBP                  & 6865.92     & 1.0     & 25.0      & 2515.04          & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neuronal (official)         & 8535.76     & 0.0     & 0.0       & 7519.36          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neuronal (less exploration) & 7478.12     & 0.0     & 0.0       & 5114.48          & 0.0                   \\ \hline
\end{tabular}%
}
\caption{Numeric values in support of Section \ref{sec:Q3_results} presented in the main paper (see Figure \ref{fig:Q3}). \Neuronal{} with $\mu_1=\mu_2=0.001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries. }
\label{tab:Q3}
\end{table}


\begin{table}[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Dataset & Approach                    & Mean regret & p-value & win count & Mean Exploration & p-value (exploration) \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neural-CBP                  & 954.458     & 1.0     & 24.0      & 439.75           & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neuronal (official)         & 2009.04     & 0.0     & 0.0       & 1975.8           & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
MNIST   & Neuronal (less exploration) & 1274.24     & 0.0     & 0.0       & 1154.96          & 0.0                   \\ \hline
Fashion & Neural-CBP                  & 2749.72     & 1.0     & 24.0      & 606.52           & 1.0                   \\ \hline
Fashion & Neuronal (official)         & 4182.96     & 0.0     & 0.0       & 3919.52          & 0.0                   \\ \hline
Fashion & Neuronal (less exploration) & 3095.28     & 0.0     & 1.0       & 2352.64          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neural-CBP                  & 6865.92     & 1.0     & 25.0      & 2515.04          & 1.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neuronal (official)         & 9410.12     & 0.0     & 0.0       & 9130.44          & 0.0                   \\ \hline
\rowcolor[HTML]{C0C0C0} 
CIFAR10 & Neuronal (less exploration) & 8483.68     & 0.0     & 0.0       & 6674.24          & 0.0                   \\ \hline
\end{tabular}%
}
\caption{Numeric values in support of Figure \ref{fig:Q3_app}. \Neuronal{} with $\mu_1=\mu_2=0.0001$.
Mean regret: average regret at the last step ($T = 10$k). 
P-value: Welch's t-test on the distribution of regrets at the last step, with \NeuralCBP{} as reference (p-value $>0.05$ means no statistical difference). 
Win count: number of times a given strategy achieved the lowest final regret (ties included). 
Mean exploration: average number of expert-verified observations. 
P-value (exploration): Welch's t-test on the distribution of number of expert queries. }
\label{tab:Q3_app}
\end{table}


\end{document}
