%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % NOTE (JR): change preprint command for revised versions!
%\documentclass[a4paper,12pt]{article}    %TA fuer Korrekturelesen
% after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage[space]{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{dirtytalk}
\usepackage{algorithm2e}
\usepackage{hyperref}
\usepackage{bm}
\usepackage{float}

% Define Commands:
% Blackboard-bold digit one, set in the bbold font (U encoding) via \text.
% Declared once; the earlier duplicate declaration was redundant.
\DeclareRobustCommand{\bbone}{\text{\usefont{U}{bbold}{m}{n}1}}
% Expectation and probability symbols as math operators.
\DeclareMathOperator{\EX}{\mathbb{E}}
\DeclareMathOperator{\Prob}{\mathbb{P}}
% Starred declarations: sub-/superscripts are placed as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% \DeclareMathOperator* already typesets its argument as an operator name,
% so the inner \operatorname wrapper is dropped.
\DeclareMathOperator*{\Var}{Var}

% Calligraphic D.
\newcommand*{\Dcal}{\mathcal{D}}
% Upright "PPP" for use in math mode.
\newcommand*{\ppp}{\mathrm{PPP}}

%envs

% Theorem-like environments. Each one has its own independent counter.
% Environment names are case-sensitive and must match their use in the body.

% Results
\newtheorem{Theorem}{Theorem}
\newtheorem{Proposition}{Proposition}
\newtheorem{Corollary}{Corollary}
\newtheorem{lemma}{Lemma}

% Definitions and hypotheses
\newtheorem{definition}{Definition}
\newtheorem{Hypothesis}{Hypothesis}

% Numbered proof environment
\newtheorem{Proof}{Proof}


%% Some suggested packages, as needed:
% \usepackage{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{plainnat}
%     \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams



% If you use BibTeX in apalike style, activate the following line:
\bibliographystyle{apalike}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>: the two mandatory arguments
% in reverse order, joined by the optional separator (default: "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{%Bayesian(,) PLS / Bayesian(,) PL(ea)S(e)!\\ 
Approximately Bayes-Optimal Pseudo-Label Selection}% (PSL) in Self-Training }

%% Approximately streichen?

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<rodemann@stat.uni-muenchen.de>?Subject=Your paper on Bayes-Optimal Pseudo-Label Selection}{Julian Rodemann}{}}
\author[1,2,3]{Jann Goschenhofer}
\author[1,2,4]{Emilio Dorigatti}
\author[1,2]{Thomas Nagler}
\author[1]{Thomas Augustin}

% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    Ludwig-Maximilians-Universität (LMU)\\
    Munich, Germany
}
 \affil[2]{%
     Munich Center for Machine Learning (MCML)\\
     Munich, Germany
 }
  \affil[3]{%
       Fraunhofer Institute for Integrated Circuits (IIS)\\
     Erlangen, Germany
 }
   \affil[4]{%
     Institute of Computational Biology\\
     Helmholtz-Zentrum\\
     Neuherberg, Germany
 }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }  
\begin{document}
\maketitle
% \begin{abstract}
% %\addtolength\baselineskip{4ex} %TA fuer Korrekturelesen
%   Semi-supervised learning by self-training heavily relies on pseudo-label selection (PLS). 
%   This selection often depends on the initial model fit on labeled data. Early overfitting might thus be propagated to the final model by selecting instances with overconfident but erroneous predictions, often referred to as confirmation bias. 
%   This paper introduces BPLS, a Bayesian framework for PLS that aims to mitigate this issue. 
%   At its core lies a criterion
%   for selecting instances to label: an analytical approximation of the posterior predictive of pseudo-samples. 
%   We derive this selection criterion by proving Bayes-optimality of the posterior predictive of pseudo-samples.
%   %Moreover, BPLS brings along the benefit of allowing for the incorporation of prior information not only in predicting but also in selecting pseudo-labels.
%   %What is more, BPLS is flexible enough to incorporate prior knowledge. %not only in predicting but also in selecting pseudo-labeled data. %What is more, BPLS involves no hyperparameters that require tuning. 
%   We further overcome computational hurdles by approximating the criterion analytically. 
%   Its relation to the marginal likelihood allows us to come up with an approximation based on Laplace's method and the Gaussian integral.
%   %For the sake of computational feasibility, we \mbox{approximate} the PPP by varying degrees of accuracy. %, with the pseudo-label likelihood corresponding to the roughest approximation in case of parametric models. 
% We empirically assess BPLS on simulated and real-world data. When faced with high-dimensional data prone to overfitting, BPLS outperforms traditional PLS methods.\footnote{\textbf{Open Science:} Implementation and code to reproduce results are available at \url{https://github.com/rodemann/Bayesian-pls}}%, while achieving similar performance on data with lower dimensions.\  
% \end{abstract}

%\addtolength\baselineskip{4ex} %TA fuer Korrekturelesen
% \addtolength\parskip{8ex} %TA fuer Korrekturelesen

\begin{abstract}
Semi-supervised learning by self-training heavily relies on pseudo-label selection (PLS). 
  This selection often depends on the initial model fit on labeled data. Early overfitting might thus be propagated to the final model by selecting instances with overconfident but erroneous predictions, often referred to as confirmation bias. 
  This paper introduces BPLS, a Bayesian framework for PLS that aims to mitigate this issue. 
  At its core lies a criterion
  for selecting instances to label: an analytical approximation of the posterior predictive of pseudo-samples. 
  We derive this selection criterion by proving Bayes-optimality of the posterior predictive of pseudo-samples.
  We further overcome computational hurdles by approximating the criterion analytically. 
  Its relation to the marginal likelihood allows us to come up with an approximation based on Laplace's method and the Gaussian integral. 
We empirically assess BPLS on simulated and real-world data. When faced with high-dimensional data prone to overfitting, BPLS outperforms traditional PLS methods.\footnote{\textbf{Open Science:} Implementation and code to reproduce results are
available at\, \url{https://github.com/rodemann/Bayesian-pls}.}
\end{abstract}



\section{INTRODUCTION}
\label{sec:intro}




%\subsection{Motivation}
Labeled data are scarce in many learning settings. This can be due to a variety of reasons such as restrictions on time, knowledge, or financial resources. Unlabeled data, however, are often much more accessible. This has given rise to the paradigm of semi-supervised learning (SSL), where information from unlabeled data is integrated into model training to improve predictions in a supervised learning framework. 
Within SSL, an intuitive and widely used approach is referred to as self-training or pseudo-labeling \cite{shi2018transductive, lee2013pseudo, mcclosky2006effective}. The idea is to fit an initial model to labeled data and iteratively assign pseudo-labels to unlabeled data according to the model's predictions. 
The latter requires a criterion (sometimes called confidence measure) for pseudo-label selection (PLS), that is, the selection of instances to be pseudo-labeled and added to the training data. %\footnote{Note that actually pseudo-labeled instances from the set of unlabeled data are selected rather than pseudo-labels itself.Nevertheless, the notion of pseudo-label selection (PLS) became prevalent in the literature. Thus, we stick with PLS.}
% More recent SSL methods based on consistency regularization outperform self-training procedures \cite{Berthelot2019, Sohn2020, Zhang2021c}.
% However, relying on specific data augmentation procedures, these are not applicable to all data modalities and domains, whereas self-training is \cite{amini2022self}. 
% This is due to the fact that it does not require domain-specific data augmentations. 
% What is more, \cite{rizve2020defense} show that self-training can in fact compete with consistency regularization if the model is well-calibrated.

By design, self-training strongly relies on the initial model fit and the way instances are selected to be pseudo-labeled. Everything hinges upon the interplay \mbox{between} the selection criterion and the initial model's generalization performance. %If the underlying data generating process can be well grasped based on the limited number of initially labeled data, self-training will also generalize well, no matter which pseudo-labels are selected. %In such an edge case, however, pseudo-labeled data hardly adds any information, rendering self-training redundant. 
%On the other hand, 
If the initial model generalizes poorly, initial misconceptions can propagate throughout the process, only making things worse. High-dimensional data prone to overfitting are particularly sensitive to such confirmation bias \cite{arazo2020pseudo}. Usually, self-training's sweet spot lies somewhere else: When the labeled data allow the model to learn sufficiently well while still leaving some room for improvement.  
Generally, the poorer the initial generalization, the harder it is to select sensible pseudo-labels to improve generalization, i.e., the more crucial the role of the selection criterion. %While regularization methods like LASSO \cite{tibshirani1996regression} can improve the generalization of the initial model, they would also restrict the parameter space during PLS, reducing self-training's learning capacity. 
Note that SSL is applied to data with high shares (typically over $80\%$ \cite{Sohn2020, arazo2020pseudo}) of unlabeled data, where initial overfitting is more likely than final overfitting.

\subsection{Motivation}

Accordingly, we strive for a selection criterion that is robust with respect to the initial model fit, i.e., its learned parameters. At the same time, it should still exploit the information in the labeled data. Such a measure calls for disentangling the uncertainty contributions of the data and the model's parameters. This is in line with recent work on uncertainty quantification (UQ) that suggests decomposing epistemic uncertainty into approximation uncertainty driven by (a lack of) data and modeling uncertainty driven by (primarily parametric) assumptions \cite{hullermeier2021aleatoric}. Bayesian inference offers a sound and consistent framework for this distinction. Its rationale of technically modeling not only data but also parameters as random variables has proven to offer much insight into UQ for machine learning \cite{hullermeier2021aleatoric} and deep learning \cite{abdar2021review, malinin2018predictive}. 

We exploit the Bayesian framework for pinpointing uncertainty with regard to data and parameters in PLS. Our approach of Bayesian pseudo-label selection (BPLS) enables us to choose pseudo-labels that are likely given the observed labeled data but not necessarily likely given the estimated parameters of the fitted model. %With an uninformative prior on the parameters we can even go further and select pseudo-labels that are likely given the data no matter what the parameters are. 
What is more, BPLS allows for including prior information not only for predicting but also for selecting pseudo-labels. Notably, BPLS is flexible enough to be applied to any kind of predictive model whose likelihood and Fisher information are accessible, including non-Bayesian models. BPLS entails a Bayes-optimal selection criterion, the \textit{pseudo posterior predictive} (PPP). Its intuition is straightforward yet effective: By averaging over all parameter values, PPP is more robust towards the initial fit compared to the
predictive distribution based on a single optimal parameter vector.
Our approximate version of the PPP is simple and computationally cheap to evaluate: $ \ell (\hat \theta) - \frac{1}{2} \log \lvert \mathcal{I}(\hat \theta) \rvert $ with $\ell(\hat \theta)$ being the log-likelihood and $\mathcal{I}(\hat \theta)$ the Fisher information matrix at the fitted parameter vector $\hat \theta$. As an approximation of the joint PPP, it does not require an \textit{i.i.d.} assumption, rendering it applicable to a wide range of applied learning setups. 



\subsection{Main Contributions}
%In particular, our main contributions are: 

\textbf{(1)} We derive PPP by formalizing PLS as a decision problem and show\footnote{Proofs of all theorems in this paper can be found in the supplementary material.} that PPP corresponds to the Bayes criterion, rendering selecting instances with regard to it Bayes-optimal, see Sections~\ref{sec:case-for-margl} and~\ref{sec:bayes-opt}.

\textbf{(2)} Since our selection criterion includes a possibly intractable integral, we provide analytical approximations, exploiting Laplace's method and the Gaussian integral, both for uninformative and informative priors. Using varying levels of accuracy, we balance the trade-off between computational feasibility and precision, see Section~\ref{sec:approx}. 

\textbf{(3)} We provide empirical evidence\footnote{
Implementations of the proposed methods as well as reproducible scripts for the experiments are provided in the public repository named \textbf{Bayesian-pls} (\say{\textit{Bayesian, please!}}), see abstract. %: \url{https://anonymous.4open.science/r/Bayesian-pls}
} for BPLS' superiority over traditionally predominant PLS methods in case of semi-supervised generalized additive models (GAMs), generalized linear models (GLMs), and Bayesian neural networks (BNNs) faced with high-dimensional data prone to overfitting, see Section~\ref{sec:results}.
%What is more, we apply semi-supervised logistic regression with BPLS to simulated and real-world data. Results suggest ... \textcolor{red}{TODO}.  




% \subsection{Outline}
% The rest of this paper is structured as follows. Section~\ref{sec:background} discusses related work and provides background on and fixes some notation for SSL and self-training. In Section~\ref{sec:bpls} we introduce BPLS by first making the case for selecting pseudo-labeled instances with regard to their \textit{pseudo posterior predictive} (PPP) in Section~\ref{sec:case-for-margl}. This is followed by a Proof of its Bayes-optimality in~\ref{sec:bayes-opt}, before providing approximations of PML in~\ref{sec:approx} and describing the algorithmic procedure in~\ref{sec:algo}. Section~\ref{sec:results} presents experimental results for semi-supervised logistic regression. Section~\ref{sec:discussion} concludes.  


%%%%%%%%%%
% Related work
%%%%%%%%





%Chapter 3 here: OSL mit likelihood
%https://en.cs.uni-paderborn.de/fileadmin/informatik/fg/is/Publications/ECML2015-SL.pdf

%# superset learning opt und pess:
%https://hal.archives-ouvertes.fr/hal-02417287/document








\section{BAYES-OPTIMAL PLS}
\label{sec:bpls}
%\section{Pseudo Marginal Likelihood}

Most semi-supervised methods deal with classification or clustering tasks \cite{van2020survey, chapelle2006semi}. Loosely leaning on \cite{triguero2015self}, we formalize SSL as follows. Consider labeled data \begin{equation}
    \mathcal{D}=\left\{\left(x_{i}, y_{i}\right)\right\}_{i=1}^{n} \in \left(\mathcal{X} \times \mathcal{Y}\right)^{n}
\end{equation} and unlabeled data 
\begin{equation}  \mathcal{U}=\left\{\left(x_{i}, \mathcal{Y}\right)\right\}_{i=n+1}^{m} \in \left(\mathcal{X} \times 2^\mathcal{Y}\right)^{m-n}
\end{equation} from the same data-generating process. $\mathcal{X}$ is the feature space and $\mathcal{Y}$ the categorical target space, whereby unlabeled data are notationally equated with observing the full space $\mathcal{Y}$. The aim of SSL is to learn a predictive classification function $f$ such that $f( x) = \hat y \in \mathcal{Y}$ utilizing both $\mathcal{D}$ and $\mathcal{U}$.  %As usual, consider a loss function $L: \mathcal{Y} \times \mathcal{Y} \rightarrow \mathbb{R}$ and a corresponding theoretical risk function $\int L(y,y) d \mathbb{P}$, whose empirical analogue $\frac{1}{n} \sum_i^n L(\hat y(x_i),y_i)$ is to be minimized. 
% The objective of SSL can be twofold \cite{triguero2015self}. On the one hand, one simply aims at labeling $\mathcal{U}$ (transductive learning). On the other hand, and more commonly, both $\mathcal{D}$ and $\mathcal{U}$ can be used to predict unseen test data (inductive learning) in a more accurate way than only relying on $\mathcal{D}$ as in classical supervised learning. 
As is customary in self-training, we start by fitting a model with unknown parameter vector $\theta \in \Theta$, where $\Theta$ is compact with $\dim(\Theta) = q$, on labeled data $
    \mathcal{D}=\left\{\left(x_{i}, y_{i}\right)\right\}_{i=1}^{n} $.
Our goal is -- as usual -- to learn the conditional distribution $p( y \mid x)$ through $\theta$ from observing features ${x} = (x_1, \dots, x_n) \in \mathcal{X}^n$, and responses ${y} = (y_1, \dots, y_n) \in \mathcal{Y}^n$ in $\mathcal{D}$. %\footnote{Notably, this holds for both discriminative and generative models. The latter first model the joint distribution but eventually still draw inference regarding $y \mid x$.} 
Adopting the Bayesian perspective, we can state a prior function over $\theta$ as $\pi(\theta)$. %The prior can represent information on $\theta$ but may also be uninformative. 

Within existing frameworks for self-training (see Section~\ref{sec:background}) in SSL, one could deploy such a Bayesian setting for \textit{predicting} unknown labels of $\mathcal{U}=\left\{\left(x_{i}, \mathcal{Y}\right)\right\}_{i=n+1}^{m}$ as well as for the final predictions on unseen test data. However, we aim at a Bayesian framework for \textit{selecting} pseudo-labels. This is beneficial for two reasons. First and foremost, considering the Bayesian posterior predictive distribution in PLS will turn out to be more robust towards the initial fit on $\mathcal{D}$ than classical selection criteria. Second, the Bayesian engine brings along the usual benefit of allowing us to explicitly account for prior knowledge when selecting instances to be labeled. Notably, our framework of Bayesian pseudo-label \textit{selection} is unrelated to how pseudo-labels are \textit{predicted}. %That is, the model to predict pseudo-labels can but does not need to be Bayesian. Our implementation of BPLS is applicable to any learner whose likelihood is accessible. %Further, even learners with intractable likelihood might use BPLS since it generally allows for likelihood-free inference along the lines of \cite{gutmann2016bayesian, pmlr-v97-greenberg19a, zhang2022unifying}.
  


\subsection{The Case for the Posterior Predictive in PLS}
\label{sec:case-for-margl}

For any model with parameters $\theta \in \Theta$, the likelihood function for observed features ${x}$ and labels ${y}$ is commonly defined as 
$
    \mathcal {L}_{ y \mid  x}(\theta) = f_{\theta }( y\mid  x),
$ where $f_{\theta }(\cdot)$ is from a parameterized family of probability density functions. 
%Loosely speaking, the idea behind the likelihood function relies on one ingeniously simple trick: One transforms the density $f_{\theta }({y}\mid {x})$ to the likelihood $\mathcal {L}_{{y} \mid {x}}(\theta)$ by thinking of $\theta$, $M$ as functional arguments (i.e., parameters of a family of functions) instead of parameters and of ${y}\mid {x}$ as parameters instead of variables. 
%Albeit sometimes treated as, note that the likelihood function is not a density function, since it maps from $\Theta \times \mathcal{M}$ to $\mathbb{R}$, with $\mathcal{M}$ as the space of model classes, and not from $\mathcal{Y}$ to $\mathbb{R}$. Nevertheless, fixing $\theta$ and $M$, it is possible to interpret a likelihood function (and thus the posterior predictive, see below) as the probability of ${y}$ given an observed ${x}$, having $\mathcal {L}_{ y \mid  x}(\theta) = f_{\theta }({y}\mid {x})$ in mind.
In the Bayesian universe, parameters $\theta$ are more than just functional arguments \cite{murphy2012machine}. They are random quantities themselves, allowing us to condition on them: $\mathcal{L}_{ y \mid  x}(\theta) = p({y}\mid {x}, \theta )$. Recall that we have specified a prior $\pi(\theta)$ on the parameters beforehand. After observing data, it can be updated to a posterior following Bayes' Theorem 
    $p(\theta \mid {y}, {x}) = p({y} \mid {x}, \theta ) \, \pi(\theta) / p({y} \mid {x}),$
where the denominator is the marginal likelihood 
\begin{equation}
p({y} \mid {x}) = \int_{\Theta} p({y} \mid {x}, \theta ) \, \pi(\theta )\,d\theta,
\end{equation} 
or, more colloquially, \say{Bayesian evidence} \cite{lotfi2022bayesian, barber2012bayesian}. For previously unseen data $(\Tilde{y}, \Tilde{x})$, the posterior predictive distribution is defined as

\begin{equation}
\label{eq:pp}
    p(\Tilde{y} \mid \Tilde{x}, {y}, {x})=\int_{\Theta} p(\Tilde{y} \mid \Tilde{x}, \theta) \, p(\theta \mid {y}, {x})\,d\theta.
\end{equation}

The posterior predictive closely resembles the marginal likelihood in case we include $(\Tilde{y}, \Tilde{x})$ in the data -- a fact that we will exploit for our approximations in Section~\ref{sec:approx}. Both marginalize the likelihood over $\theta$. The difference is the weight: The marginal likelihood integrates out $\theta$ with regard to the prior, while the posterior predictive integrates out $\theta$ with regard to the posterior. They thus give us the probability that we would generate data with a model if we randomly sample from a prior or a posterior, respectively, over its parameters.
Accordingly, both can be considered PLS criteria that are robust towards the initial fit: They average over all possible $\theta$-values instead of relying on one estimated $\hat \theta$ from the trained model. 
% Since the marginal likelihood integrates with regard to the prior, it is often also referred to as prior predictive of $(\Tilde{y}, \Tilde{x})$ in case $(\Tilde{y}, \Tilde{x})$ is considered solely, i.e., not included in the observed data: $
% p(\Tilde{y} \mid \Tilde{x})=\int_{\Theta} p(\Tilde{y} \mid \Tilde{x}, \theta) \pi(\theta )\,d\theta.
% $
Computational issues aside, the posterior predictive of pseudo-labeled data thus encapsulates a perfectly natural selection criterion for self-training: It selects pseudo-labels that are most likely conditioned on the true observed $\mathcal{D}$, the assumed model and all plausible parameters from the prior or posterior, respectively.

%\footnote{Strictly speaking, the estimation procedure also needs to be specified beforehand, that is, the way the model's parameters are to be estimated. However, this does not touch our line of reasoning here, since all ways of PLS require this specification for predicting pseudo-labels. In what follows, we will only consider (methods equivalent to) maximum likelihood estimation w.l.o.g..} 

%This is in stark contrast to the likelihood. The latter would imply conditioning on $\mathcal{D}$, the assumed model $M$ and one particular parameter vector, namely the learned $\theta = \hat \theta$. We argue marginal likelihood and posterior predictive are also more robust with respect to $\theta = \hat \theta$ than the predictive distribution of $(\Tilde{y}, \Tilde{x})$ in a non-Bayesian setup. The latter simply lacks methods, namely priors on $\theta$, to weaken the influence of the data on the predictive distribution. Instead, the single best estimate of $\theta$ would be artlessly plugged into the distribution of the predictions.



Both the data and the estimated parameters (as functions of the data) will change throughout the process of self-training. We argue that conditioning the choice of unlabeled instances solely on the estimated parameters in early iterations over-emphasizes the influence of the initial model. This optimistic reliance can be harmful in case of small $n$ and high $q$, where overfitting is likely. Selecting instances by the posterior predictive mitigates this. 
 


%We will make use of the latter with regard to the marginal likelihood. Recall that we have specified a prior $\pi(\theta)$ on the parameters. This allows us to marginalize out the parameters, i.e.

% \begin{equation}
% \label{eq:marginal-likelihood}
%     p({y} \mid {x})=\int p({y} \mid {x}, \theta )\pi(\theta )\,d\theta,
% \end{equation}


% While typically used to compare models with regard to how well they explain observed data, we will use the marginal likelihood to compare data with regard to how well they \textit{can be explained} by a given model averaged over its parameter values. We assess $\left(x_{j}, \mathcal{Y}_{j}\right) \in \mathcal{U}$ by how it possibly affects the marginal likelihood. %A high marginal likelihood of  $\mathcal{D} \cup \left(x_{i}, \mathcal{Y}_{i}\right)$ corresponds to a high probability of $\left(x_{i}, \mathcal{Y}_{i}\right)$ stemming from the same data generating process as $\mathcal{D}$, assuming model $M$. 
% The marginal likelihood thus measures how confident we can be in labeling a particular $\left(x_{j}, \mathcal{Y}_{j}\right)$ as $\left(x_{j}, \hat {y}_{j}\right)$ averaged over all possible $\theta$-values. Contrary to classical confidence measures (see Section~\ref{sec:self-training}), the marginal likelihood is no function of the estimated parameters $\hat \theta$.\footnote{In Section~\ref{sec:} we will even suggest attenuating the dependence on the model choice by bivariate optimization of the marginal likelihood with regard to both the data and the model('s size).} When comparing instances in $\mathcal{U}$ we thusask \say{how well can all possible (fitted) models within the chosen model class explain the data point?} instead of asking \say{how well can this one particular model explain the data point?}.    
   

%However, using the marginal likelihood (equation~\ref{eq:marginal-likelihood}) can be too much of a good thing in case of independently distributed data and uninformative prior. Note that we can decompose the marginal likelihood  $p({y}\mid {x})= \prod_i^n p(y_i \mid {y}_{<i}, {x}_{\leq i})$, where ${y}_{<i} = (y_1, \dots, y_{i-1})$ and ${x}_{\leq i} = (x_1, \dots, x_{i})$. For independent observations this simplifies to $p({y}\mid {x})= \prod_i^n p(y_i \mid x_{i})$. In this case the individual marginal likelihood contributions $p(y_i \mid x_{i})$ of unobserved data correspond to their prior predictive distribution (equation~\ref{eq:prior-pred}). With an uninformative prior, however, these prior predictive distribution of single pseudo-labeled instances will not differ, rendering PLS with regard to the marginal likelihood (prior predictive) arbitrary. In other words, for PLS by marginal likelihood to be meaningful, there has to be actual information in the labeled data for the pseudo-labeled data. That is, they have to be dependent. Then the marginal likelihood contributions of pseudo-labeled data do not correspond to the uninformative prior predictive distributions, rendering BPLS with regard to the marginal likelihood a viable robust alternative.
%To this end, we would thus need a model that captures the dependency structure in the data. The example of logistic regression in Section~\ref{sec:results} assumes the data to be (conditionally) independent; hence, we will restrict ourselves to the posterior predictive in this case. 


\subsection{Bayes-Optimality of Pseudo Posterior Predictive}
\label{sec:bayes-opt}

In the following, we show that selecting pseudo-labels with regard to their posterior predictive is Bayes-optimal. We further show the same holds for selection with regard to the marginal likelihood in case of a non-updated prior. To this end, we formalize the selection of data points to be pseudo-labeled as a canonical decision problem, where an action corresponds to the selection of an instance from the set of unlabeled data $\mathcal{U}$.   

\begin{definition}[PLS as Decision Problem]
\label{def:dec-probl}    
Consider the decision-theoretic triple $(\mathcal{U}, \Theta, u(\cdot))$ with an action space of unlabeled data\footnote{We assume absence of tied observations for simplicity such that we can understand $\mathcal{U}$ as a set.} to be selected, i.e., instances $(x_i, \mathcal{Y})$ as actions, a space of unknown states of nature (parameters) $\Theta$ and a utility function $u : \mathcal{U} \times \Theta \to \mathbb{R}$. 
% For the decision problem of selecting instances to label given the observed data $ \mathcal{D} \in \left(\mathcal{X} \times \mathcal{Y}\right)^{n}$ the decision function
% \begin{align*}
%   s \colon \left(\mathcal{X} \times \mathcal{Y}\right)^{n} &\to \mathcal{U}\\
%   \mathcal{D} &\mapsto d(\mathcal{D}) = (x_i, \mathcal{Y})
% \end{align*}
% shall be called selection function. 
\end{definition}

Loosely inspired by \cite{cattaneo2007statistical}, we now define the utility of a selected data point $(x_i, \mathcal{Y})$ as the plausibility of being generated jointly with $\mathcal{D}$ by a model with parameters $\theta \in \Theta$ if we include it with pseudo-label $\hat y_i \in \mathcal{Y}$ (obtained through any predictive model) in $\mathcal{D} \cup (x_i, \hat{y}_i)$. This is incorporated by the likelihood of $\mathcal{D} \cup (x_i, \hat{y}_i)$, which shall be called \textit{pseudo-label likelihood} and written as $p(\mathcal{D} \cup (x_i, \hat{y}_i) \mid \theta)$. We thus condition the selection problem on a model class as well as on already predicted pseudo-labels. The former conditioning is not required (see the extension in Section~\ref{sec:ext}) for the well-definedness of the pseudo-label likelihood while the latter is. 

\begin{definition}[Pseudo-Label Likelihood as Utility]
\label{def:pseud-lik}
Let~$(x_i, \mathcal{Y})$ be any decision (selection) from $\mathcal{U}$. We assign utility to each $(x_i, \mathcal{Y})$ given $\mathcal{D}$ and the pseudo-label $\hat{y}_i \in \mathcal{Y}$ by the following measurable utility function
\begin{align*}
  u \colon \mathcal{U} \times \Theta &\to \mathbb{R}\\
  ((x_i, \mathcal{Y}), \theta) &\mapsto u((x_i, \mathcal{Y}), \theta) = p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \theta),
  \end{align*}
  which is said to be the pseudo-label likelihood.
\end{definition}

This utility function is a natural probabilistic choice to assign utilities to selected pseudo-labels given the predicted pseudo-labels. With a prior $\pi(\theta)$, we get the following result.

\begin{Theorem}
\label{th:bayes-opt}
In the decision problem $(\mathcal{U}, \Theta, u(\cdot))$ (Definition~\ref{def:dec-probl}) with the pseudo-label likelihood as utility function (Definition~\ref{def:pseud-lik}) and a prior $\pi(\theta)$ on $\Theta$, the standard Bayes criterion 
\begin{align*}
    \Phi(\cdot,\pi) \colon \mathcal{U} &\to \mathbb{R}\\
    a &\mapsto \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) 
\end{align*}

corresponds to the pseudo marginal likelihood $p(\mathcal{D}~\cup~(x_i, \hat{y}_i))$.
    
\end{Theorem}

%\begin{Proof}
%    All proofs are in the supplementary material.
%\end{Proof}



\begin{Corollary}
   For any prior $\pi(\theta)$ on $\Theta$, the action $a_{m}^* = \argmax_i p(\mathcal{D} \cup (x_i, \hat{y}_i))$ is Bayes-optimal.
\end{Corollary}

Taking the observed labeled data $\mathcal{D}$ into account by updating the prior $\pi(\theta)$ to a posterior $p(\theta \mid \mathcal{D})$, we end up with an analogous result for the \textit{pseudo posterior predictive}. The Theorem requires only the Proposition by \cite[section 4.4.1]{berger1985statistical} stating that posterior loss equals prior risk. That is, conditional Bayes-optimality equals unconditional Bayes-optimality.  

\begin{Theorem}
\label{th:ppp}
In the decision problem $(\mathcal{U}, \Theta, u(\cdot))$ with the pseudo-label likelihood as utility function as in Theorem~\ref{th:bayes-opt}, but with the prior updated to the posterior $\pi(\theta) = p(\theta \mid \mathcal{D})$ on $\Theta$, the standard Bayes criterion 
$\Phi(\cdot, \pi) \colon \mathcal{U} \to \mathbb{R}; \, a \mapsto \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) $
corresponds to the \textit{pseudo posterior predictive} $p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$.
\end{Theorem}

% \begin{Proof} See supplementary material. 
%     % Analogous to Proof 1, we have $ \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) = \int u(a, \theta) d \pi(\theta).$ Now with the updated prior $\pi(\theta) = p(\theta \mid \mathcal{D})$ it follows $ \int u(a, \theta) d \pi(\theta)= \int p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid\theta) d p(\theta \mid \mathcal{D}) = p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$.
% \end{Proof}


\begin{Corollary}
   Action $a_{p}^* = \argmax_i p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$ is Bayes-optimal for any updated prior $\pi(\theta) = p(\theta \mid \mathcal{D})$.
\end{Corollary}

%Due to the aforementioned reasons, we will proceed with the posterior predictive.
%TODO: cite \cite{kofler2013entscheidungen} in published version

Further note that directly maximizing the likelihood with regard to $a$ corresponds to the optimistic max-max-criterion, see Theorem~\ref{th:max-max}. 


\begin{Theorem}
\label{th:max-max}
In the decision problem $(\mathcal{U}, \Theta, u(\cdot))$ with the pseudo-label likelihood as utility function as in Theorem~\ref{th:bayes-opt}, the max-max criterion 
\begin{align*}
    \Phi(\cdot) \colon \mathcal{U} &\to \mathbb{R}\\
    a &\mapsto \Phi(a) = \max_\theta (u(a,\theta)) 
\end{align*}

corresponds to the (full) likelihood.
\end{Theorem}

The max-max-criterion advocates deciding for an action (here: selection of pseudo-labeled data) with the highest utility (here: likelihood) according to the most favorable state of nature $\theta$, e.g. see \cite{rapoport1998decision}. It can hardly be seen as a robust criterion, as it reflects \say{wishful thinking} \cite[page 57]{rapoport1998decision}. We thus abstain from it in what follows. Our roughest approximation of the PPP in Section~\ref{sec:approx}, however, will correspond to this case as well as the more general concept of optimistic superset learning (OSL) \cite{hullermeier2014learning, rodemann2022levelwise}.

%TO DO: To which decision criterion does the likelihood correspond? --> maxmax (i.e. Hüllermeiers OSL) 
%TODO: pessimisitc superset learning is minimax!!!!
% whole new perspective on superset learning (evtl separat aufschreiben)

%TODO: use this as an argument pro ppp: 
%For generative models whose likelihood is intractable but allows sampling from, the \textit{pseudo posterior predictive} can be a viable alternative to the (then inaccessible) marginal likelihood. Likelihood-free inference allows approximating the posterior in such cases, see \cite{zhang2022unifying}, for instance. Yet, we will abstain from updating the prior this way in the following, i.e. we will select instances with regard to the pseudo marginal likelihood instead of the \textit{pseudo posterior predictive}. We consider not updating the prior a feature, not a bug. By explicitly forgoing the information on $\theta$ in $\mathcal{D}$ through $M$, we avoid selecting with regard to an overfitted $\hat \theta$, thus making the selection more robust with regard to the initial fit. Consulting the posterior predictive would correspond to asking how well those parameter that are likely given the data can explain the data point under consideration. Instead, we ask how well all possible parameters within the chosen model class can explain the data point. 



\section{Approximately Bayes-Optimal PLS Criteria}
\label{sec:approx}


The \textit{pseudo posterior predictive} (PPP) $p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D}) = p(\hat{y} \mid x,  y,  x)$ (Theorem~\ref{th:ppp}) is computationally costly to evaluate via Markov Chain Monte Carlo (MCMC) in the context of PLS. This is due to the fact that for iterative PLS procedures, the selection criterion (here: PPP) generally has to be evaluated $m + (m - 1) + \dots + 1 = \frac{m^2 + m}{2}$ times, with $m = \lvert \mathcal{U} \rvert $ the number of unlabeled data points. Though there are possible simplifications, see Section~\ref{sec:results}, the general complexity is $\mathcal{O}(m^2)$. 
Using MCMC to approximate the PPP would imply running MCMC-sampling $\frac{m^2 + m}{2}$ times. The cost of MCMC-sampling grows linearly in the number of draws~$d$. Hence, PLS with MCMC-approximation of the PPP is $\mathcal O(m^2 d) $ as opposed to $\mathcal{O}(m^2)$ for other methods. For large data sets and thus high $m$, this scales badly. For instance, using 20 chains of length 1000 (as we did in the experiments in supplement~G) makes the MCMC approach roughly 20000 times more expensive than the analytical approximation.

%For \textit{i.i.d.} data we could focus on the single PPP contributions $p(y_i \mid x_{i}, \mathcal{D})$ instead of the joint. Still, we would have to deal with a possibly intractable integral and end up with similar computational hustle. We thus opt for approximating the joint directly. \footnote{Moreover, considering the joint quantities instead of the distributions implies no loss of generality, with possible extensions for dependent data in mind.}  

Thus, we will approximate the joint PPP directly. Our method hence does not need an \textit{i.i.d.} assumption, which makes it very versatile.
Due to the aforementioned similarity of the PPP and the marginal likelihood, we are in the fortunate position of borrowing from some classical marginal likelihood approximations, see \cite{llorente2020marginal}. Especially popular are approximations based on Laplace's method as in \cite{schwarz1978estimating}. 
%We will now transfer Laplace's method to the posterior predictive. 
Our main motivation, however, is to obtain a Gaussian integral \cite{gauss1877theoria}, which we can then compute explicitly.



\subsection{Approximation of the PPP}


We will start by transferring Laplace's method to the PPP. 
Recall that the posterior predictive of a pseudo-sample $(x_i, \hat y_i)$ (the PPP) given data $\Dcal$ is defined as 
\begin{align*}
    p(\Dcal \cup (x_i, \hat y_i) | \Dcal)  &=  \int_\Theta  p(\Dcal \cup (x_i, \hat y_i) \mid \theta) p(\theta \mid \Dcal) d\theta,
\end{align*}
where Bayes' theorem gives
\begin{align*}
  p(\theta \mid \Dcal) =  p(\Dcal \mid \theta) \pi(\theta) / p(\Dcal).
\end{align*}
Denoting $\ell_{\Dcal}(\theta) = \log p(\Dcal \mid \theta)$ and $\tilde \ell(\theta) = \ell_{\Dcal\cup (x_i, \hat y_i)}(\theta)  + \ell_{\Dcal}(\theta)$, we can write the integrand as
\begin{align*} 
  p(\Dcal \cup (x_i, \hat y_i) \mid \theta) p(\theta \mid \Dcal) = \exp[\tilde \ell(\theta)] \pi(\theta) / p(\Dcal).
\end{align*}
Let $\mathcal I(\theta) = -\tilde \ell''(\theta)/n$ denote the observed Fisher information matrix. Further denote by  $\tilde \theta = \arg\max_{\theta} \tilde \ell(\theta)$ the maximizer of $\tilde \ell(\theta)$. It holds $\tilde \ell'(\tilde \theta) = 0$ by definition of $\tilde \theta$. A Taylor expansion around $\tilde \theta$ thus gives
\begin{align*} 
    \tilde \ell(\theta) \approx \tilde \ell(\tilde \theta)   - \frac{n}{2} (\theta - \tilde  \theta)' \mathcal{I}(\tilde  \theta) (\theta - \tilde \theta).
\end{align*}
The integrand decays exponentially in $n\|\theta - \tilde \theta\|$, so we can approximate it locally around $\tilde \theta$ by also taking $\pi(\theta)~\approx~ \pi(\tilde \theta)$ inside the integral with an analogous Taylor series. We refer to \cite[Section 3.7]{miller2006applied} and \cite[Theorem 2]{lapinski2019} for a rigorous treatment of the remainder terms and regularity conditions. 
%Specifically, see \cite[Theorem 2]{lapinski2019} for a detailed justification of~$\pi(\theta)~\approx~ \pi(\tilde \theta)$.

We can eventually approximate $p(\Dcal \cup (x_i, \hat y_i) | \Dcal)$ by
\begin{align*} 
     \frac{\exp[\tilde \ell(\tilde \theta)] \pi(\tilde \theta)}{p(\mathcal{D})}  \int_{\Theta} \exp\biggl[- \frac{n}{2} (\theta - \tilde  \theta)'  \mathcal{I}(\tilde  \theta) (\theta - \tilde \theta)\biggr] d \theta.
\end{align*}
 The integral on the right is a Gaussian integral. Defining $\Sigma = [n \mathcal{I}(\tilde  \theta)]^{-1}$ and $\phi_\Sigma$ as the density of the $\mathcal N(0, \Sigma)$ distribution, it equals
\begin{align*}
   (2\pi)^{q/2} |\Sigma|^{1/2} \int_{\Theta} \phi_\Sigma(\theta - \tilde \theta) d \theta 
    \approx \biggl(\frac{2\pi}{n}\biggr)^{q/2} |\mathcal I(\tilde \theta)|^{-1/2}.
\end{align*}
Altogether, we have shown that 
\begin{align} \label{eq:laplace}
     p(\Dcal \cup (x_i, \hat y_i) | \Dcal) 
    &\approx\biggl(\frac{2\pi}{n}\biggr)^{q / 2} \frac{\exp[\tilde \ell(\tilde \theta) ]  \pi(\tilde \theta)}{ | \mathcal I(\tilde \theta)  |^{1/2} p(\Dcal) }.
\end{align}

\subsection{Approximate selection criteria}

To find the pseudo-sample $(x_i, \hat y_i)$ maximizing the PPP, we can equivalently maximize its logarithm, i.e. maximize

\begin{equation*}    
\begin{split}
   \frac{q}{2} \, \log\biggl(\frac{2\pi}{n}\biggr) + \tilde \ell(\tilde \theta) + \log \pi(\tilde \theta) - \frac{1}{2} \log|\mathcal{I}(\tilde \theta)| - \log p(\mathcal{D}).   
\end{split} 
\end{equation*}

Dropping all terms that do not depend on $(x_i, \hat y_i)$ leads to the selection criterion 
\begin{align} \label{eq:psl-informative-0}
  \tilde \ell(\tilde \theta) - \frac 1 2 \log |\mathcal I(\tilde \theta)| + \log \pi(\tilde \theta).
\end{align}

The term
\[\tilde \ell(\theta) = \ell_{\Dcal\cup (x_i, \hat y_i)}(\theta)  + \ell_{\Dcal}(\theta)\]
quantifies how well the pseudo-sample $(x_i, \hat y_i)$ conforms with the data set $\Dcal$ given a parameter $\theta$, e.g. the optimal (argmax) parameter $\tilde \theta$ in Equation~\eqref{eq:psl-informative-0}. 
It is curious that samples in $\Dcal$ contribute twice to $\tilde \ell$, but $(x_i, \hat y_i)$ only once. However, this is irrelevant when comparing two pseudo-samples $(x_i, \hat y_i)$ and $(x_j, \hat y_j)$. To see this, we expand $\ell_{\Dcal}$ around its maximizer $\hat \theta$, so that $\ell_{\Dcal}(\tilde \theta) =  \ell_{\Dcal}(\hat \theta) + O(\|\hat \theta - \tilde \theta\|^2)$. Since  $\Dcal \cup (x_i, \hat y_i)$ and $\Dcal$ differ in only one sample, the difference $\hat \theta - \tilde \theta$ is of order $O(n^{-1})$. Thus,
\[ \tilde \ell(\tilde \theta) = \ell_{\Dcal\cup (x_i, \hat y_i)}(\tilde \theta) + \ell_{\Dcal}(\hat \theta) + O(n^{-2}). \]
The remainder is negligible compared to the other terms in \eqref{eq:psl-informative-0} and $\ell_{\Dcal}(\hat \theta)$ does not depend on the pseudo-sample $(x_i, \hat y_i)$. This suggests the simplified \emph{informative BPLS criterion}
\begin{align} \label{eq:psl-informative}
   \operatorname{iBPLS} = \ell_{\Dcal \cup (x_i, \hat y_i)}(\tilde \theta) - \frac 1 2 \log | \mathcal I(\tilde \theta)| + \log \pi(\tilde \theta).
\end{align}
Equivalence of \eqref{eq:psl-informative-0} and \eqref{eq:psl-informative} is verified numerically for small $n$ by experiments on real-world and simulated data in Supplement~F.


The ability to incorporate prior information into the selection is generally a strength of our criterion. By default, however, we cannot assume that such information is available. We can instead choose an uninformative prior where $\pi(\theta)$ is constant with respect to $\theta$. Recall that we assume $\Theta$ to be compact, which allows us to specify a uniform prior as uninformative prior. Then \eqref{eq:psl-informative} simplifies to the \emph{uninformative BPLS criterion} 
\begin{align} \label{eq:psl-uninformative}
    \operatorname{uBPLS} = \ell_{\Dcal \cup (x_i, \hat y_i)}(\tilde \theta) - \frac 1 2 \log | \mathcal I(\tilde \theta)|.
\end{align}



Our novel PLS criteria provide great intuition.

\begin{itemize}
    \item The first term is the joint likelihood of the pseudo-sample $(x_i, \hat y_i)$ and $\Dcal$ under the optimal parameter $\tilde \theta$. It measures how well the pseudo-sample complies with the previous model and previously seen data $\Dcal$. It tells the value of this joint likelihood at its maximum. Loosely speaking, this maximum height of the likelihood can be seen as a very rough approximation of the area under it, i.e., the integral with uniform weights.\footnote{Technically, we also need that $\lambda(\Theta) = 1$ with $\lambda$ a Lebesgue-measure for this interpretation.} 
    \item The second term penalizes high curvature of the pseudo-label likelihood function $\ell_{\Dcal \cup (x_i, \hat y_i)}(\theta)$ around its peak, since the Fisher-information is its second derivative. Due to the negative sign, the criterion prefers pseudo-samples that lead to flatter maxima of the likelihood. In line with recent insights into sharp and flat minima of loss surfaces \cite{dinh2017sharp, li2018visualizing, andriushchenko22a}, such a penalty can be expected to improve generalization. The lower the curvature, the more probability mass (area under the likelihood) is expected on $\Theta \setminus B_{\epsilon}(\tilde \theta)$ with $B_{\epsilon}(\tilde \theta) = \{\theta \in \Theta \mid \|\theta - \tilde \theta \| < \epsilon \}$ an $\epsilon$-ball for fixed $\epsilon > 0$ around $\tilde \theta$ in the uninformative case. Intuitively, this corrects the very rough approximation of the area under the likelihood by the likelihood's maximal height, see above.  
    \item The third term in the informative BPLS criterion adjusts the selection for our prior beliefs $\pi$ about $\theta$. Here, the effect of $(x_i, \hat y_i)$ is only implicit, because it affects the maximizer $\tilde \theta$. The more likely the updated parameter $\tilde \theta$ is under $\pi$, the higher the PPP.
\end{itemize}

In summary, our approximation of the PPP 
%(being the integral of the pseudo-labeled instance's likelihood with regard to the posterior over $\theta$) 
grows in the absolute value of the likelihood's peak, decreases in its curvature at this point, and increases in the prior likelihood of the updated parameter.


When $n \to \infty$, the criteria iBPLS and uBPLS are dominated by the likelihood, thus
$$
    \log  p(\Dcal \cup (x_i, \hat y_i) | \Dcal)   \overset{n \to \infty}{\propto}  \,  \ell_{\Dcal \cup (x_i, \hat y_i)}(\tilde \theta).
$$
%The respective optimal action is then simply $a^* \overset{n \to \infty}{\approx} \argmax_a \ell(\hat \theta(\mathcal{D} \cup \left(x_{i}, \hat y_i\right))).$ 
This approximation is computationally cheaper to evaluate, as it does not involve the Fisher-information. However, this comes at the cost of poor accuracy in case of small $n$. Selection with regard to this rough approximation of the PPP corresponds to selection with regard to the likelihood.
As pointed out in Section~\ref{sec:bayes-opt}, this corresponds to the overly optimistic max-max-criterion.


% \section{Old stuff}

% \subsection{Uninformative Prior}

% In the following, we will write $\mathcal{L}(\theta) = \mathcal{L}_{ y \mid  x}(\theta)$ for brevity. The concept behind Laplace's method is to compute the Maximum Likelihood (ML) estimator $\hat \theta_{ML} = \argmax \mathcal {L}(\theta) = \argmax \ell(\theta)$, where $\ell(\theta) = \log \mathcal{L}(\theta) = \log p( y \mid  x, \theta) $, and then approximate $\ell(\theta)$ (required to be differentiable) by a Taylor expansion around $\hat \theta = \hat \theta_{ML}$: $\ell(\theta) = \ell(\hat \theta) + (\theta - \hat \theta)' \ell'(\hat \theta) - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta) + \dots,$ where $\mathcal{I}$ is the observed Fisher-information matrix. Note that the second summand in the Taylor series is zero, since $\ell'(\hat \theta)$ is a stationary point per definition of $\hat \theta$. We can disregard the Taylor summands of higher orders, since the ML-estimator converges to $\theta$ in probability, $\hat \theta \go \xrightarrow{\mathbb{P}} \theta$. Here, we consider a non-informative prior. The fact of $\Theta$ being compact allows us to specify a uniform prior as non-informative prior. Thus, we have $p(\theta \mid  \mathcal{D}) = \mathcal{L}(\theta)$. We can hence plug the Taylor expansion into the PPP $p(\Tilde{y} \mid \Tilde{x}, {y}, {x})=\int_{\Theta} p(\Tilde{y} \mid \Tilde{x}, \theta) \, p(\theta \mid {y}, {x})\,d\theta = \int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta$ (Equation~\ref{eq:pp}) for $\ell(\theta)$ in $\mathcal{L}(\theta) = \exp(\ell(\theta))$ twice, for the posterior as well as for the likelihood. This results in the following approximation of the PPP:


% \begin{equation}
% \begin{split}
% \label{eq:marginal-l-approx}
%       p(\hat{y} \mid x,  y,  x)  &\approx  \mathcal{L}(\hat \theta)^2 \, \int_{\Theta} \exp(2 \cdot \mathcal{T}_3(\theta, \hat \theta)) d\theta,      
% \end{split}
% \end{equation}

% % % And in complete analogy, we have for the PPP $p(\hat{y} \mid x,  y,  x) &\approx \exp(\ell(\hat \theta)) \, p(\hat \theta \mid \mathcal{D}) \, \int \exp(\frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d\theta.$ 

% % % Approximating the prior $\pi(\theta )$ in a similar manner yields

% % % \begin{equation}
% % % \label{eq:taylor-prior}
% % %     \pi(\theta ) = \pi(\hat \theta ) + (\theta - \hat \theta) \cdot \pi'(\hat \theta ),
% % % \end{equation}

% % % and analogously for the posterior $p(\theta \mid \mathcal{D}) = p(\hat \theta \mid \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})$.

% % % The fact of $\Theta$ being compact allows us to specify a uniform prior as uninformative prior. Thus, we have $\log p(\theta \mid \mathcal{D}) = \ell(\theta)$ and eventually: 
% % % Plugging Equation~\ref{eq:taylor-likelihood} into~\ref{eq:pp} 
% % % %and exploiting the fact that the ML-estimator converges to $\theta$ in probability $\hat \theta \go \xrightarrow{\mathbb{P}} \theta$ 
% % % gives the following approximation of the marginal likelihood:

% % where $\mathcal{T}_3(\theta, \hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ the third summand of the Taylor series. We can rewrite Equation~\ref{eq:marginal-l-approx} as $p(\hat{y} \mid x,  y,  x) \approx \mathcal{L}(\hat \theta)^2 \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta$. It then becomes evident that we can apply Jensen's inequality to the integral with regard to $\theta$:

% % \begin{equation}
% %     \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta \overset{Jensen}{\geq} [ \int_\Theta \exp(\mathcal{T}_3(\theta, \hat \theta)) d \theta ]^2
% % \end{equation}

% % Now note that Jensen's inequality holds with equality when $\exp(\mathcal{T}_3(\theta, \hat \theta))$ is constant in $\theta$. This is approximately the case for $\lvert \mathcal{I}(\hat \theta) \rvert \to \infty$, i.e. assuming high curvature of the likelihood at $\hat \theta_{ML}$ (meaning data appears very informative about $\theta$ under the assumption of our model). Since we want to hedge against this case, we assume the Jensen inequality holds with equality: $ \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta = [ \int_\Theta \exp(\mathcal{T}_3(\theta, \hat \theta)) d \theta ]^2$. As $\hat \theta$ is the ML-estimator, we are now in the fortunate position to make use of its famous property of asymptotic normality, obtaining $ \int \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta \approx (2 \pi)^{\frac{q}{2}} n^{- \frac{q}{2}} |\mathcal{I}(\hat \theta)|^{-\frac{1}{2}}$ with $q = dim(\Theta)$. Wrapping things up, we have $p(\hat{y} \mid x,  y,  x)  \approx  \mathcal{L}(\hat \theta)^2  (2 \pi)^q n^{-q} |\mathcal{I}(\hat \theta)|^{-1}$. Taking the natural logarithm delivers our cautious approximation of the logarithmic posterior predictive: 


% % \begin{equation}
% % \label{eq:laplaxe-approx-final-posterior}
% % \begin{split}
% % \log p(\hat{y} \mid x,  y,  x) \approx 2 \ell (\hat \theta) + q \log(\frac{2 \pi}{n}) - \log \lvert \mathcal{I(\hat \theta)} \rvert 
% % \end{split}
% % \end{equation} 

% % % \propto \ell(\hat \theta ) + \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|    


% % % %old: 
% % % By exploiting the latter and taking the natural logarithm of Equation~\ref{eq:marginal-l-approx} we get the approximate logarithmic posterior predictive:
% % % $\log p(\hat{y} \mid x,  y,  x) \approx 2 \ell (\hat \theta) + q \log(\frac{2 \pi}{n}) - \log \lvert \mathcal{I(\hat \theta)} \rvert \propto \ell(\hat \theta ) + \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)| $ . 


% % -BEGIN- Alternative (by Thomas Nagler) without Jensen Inequality
% % (starte nach eq. 6)

% where $\mathcal{T}_3(\theta, \hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ is the third summand of the Taylor series. We can rewrite Equation~\ref{eq:marginal-l-approx} as $p(\hat{y} \mid x,  y,  x) \approx \mathcal{L}(\hat \theta)^2 \int_\Theta \exp( - \frac{1}{2} (\theta - \hat \theta)' \,2\,n\, \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta$. Now note that we can set $\Sigma^{-1} = \,2\,n\, \mathcal{I}(\hat \theta)$ as precision matrix. With this formulation, we can identify $ \exp(- \frac{1}{2} (\theta - \hat \theta)' \Sigma^{-1}  (\theta - \hat \theta)') $ as $q$-dimensional Gaussian function \cite{gauss1877theoria}. The respective Gaussian integral is 

% \begin{equation}
%     \int_\Theta - \frac{1}{2} (\theta - \hat \theta)' \,2\,n\, \mathcal{I}(\hat \theta) (\theta - \hat \theta) d \theta = \lvert \Sigma \rvert^{\frac{1}{2}} (2 \pi)^{\frac{q}{2}}, 
% \end{equation}

% see \cite{zinn2021quantum} for a modern textbook proof. Plugging this result into Equation~\ref{eq:marginal-l-approx}, we obtain $p(\hat{y} \mid x,  y,  x) \approx  \mathcal{L}(\hat \theta)^{2} \lvert \Sigma \rvert^{\frac{1}{2}} \, (2 \pi)^{\frac{q}{2}} = \mathcal{L}(\hat \theta)^{2} \, 2^{- \frac{q}{2}} \, n^{- \frac{q}{2}} \, \lvert\mathcal{I}(\hat \theta) \rvert^{- \frac{1}{2}} \, (2 \pi)^{\frac{q}{2}}$. Taking the  logarithm delivers our final approximation of the logarithmic posterior predictive: 


% \begin{equation}
% \label{eq:laplaxe-approx-final-posterior}
% \begin{split}
% \log p(\hat{y} \mid x,  y,  x) \approx 2 \ell (\hat \theta) + \frac{q}{2} \log\left(\frac{ \pi}{n}\right) - \frac{1}{2} \log \lvert \mathcal{I(\hat \theta)} \rvert 
% \end{split}
% \end{equation} 


% Approximation~\ref{eq:laplaxe-approx-final-posterior} provides great intuition: Its first summand tells the value of the likelihood function at its maximum; that is, how well-supported the ML-estimator is by the data. Loosely speaking, this maximum height of the likelihood can be seen as a very rough approximation of the area under it, i.e., the integral with regard to the posterior. Since the latter is based on an uninformative prior, it contains the same information as the likelihood. Thus, there is no need to consider the posterior explicitely in the approximation. The second summand in Equation~\ref{eq:laplaxe-approx-final-posterior} corrects for the dimension of the parameter space, $dim(\Theta) = q$. The more parameters are involved, the more probability mass (area under the likelihood) is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$ with $B_{\epsilon} = \{\theta \in \Theta \mid \|\theta - \hat \theta \| < \epsilon \}$ an $\epsilon$-Ball for fixed $\epsilon > 0$ around $\hat \theta$. The logarithm of $\frac{\pi}{n}$ results from the normalizing constant. Notably, the second term does not depend on $\hat \theta$ and thus can be neglected when maximizing the PPP with regard to pseudo-labels. That is: $\log p(\hat{y} \mid x,  y,  x) \propto \ell (\hat \theta) - \frac{1}{4} \log \lvert \mathcal{I(\hat \theta)} \rvert $. This is the formulation we use in our implementation, see supplementary material and Section~\ref{sec:algo}. The third summand penalizes high curvature of the likelihood function at its peak, since the Fisher-information is its second derivative. In the same manner as in dimensionality penalization, the lower the curvature, the more probability mass is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$. 
% To sum it up, the Laplace approximation of the PPP 
% %(being the integral of the pseudo-labeled instance's likelihood with regard to the posterior over $\theta$) 
% grows in the absolute value of the likelihood's peak and decreases in its curvature at this point. This is reminiscent of deliberations regarding sharp and flat minima of loss functions \cite{dinh2017sharp, li2018visualizing}. %Notably, we would end up with a similar approximation for the marginal likelihood. The main difference is that the prior requires a separate Taylor series, since it cannot be assumed to equal the likelihood like the posterior in case of uninformative prior. For details, refer to the supplementary material.

% When $n \to \infty$, Equation~\ref{eq:laplaxe-approx-final-posterior} is dominated by the likelihood, thus
% $
%    \log p(\hat{y} \mid x,  y,  x)  \overset{n \to \infty}{\approx} 2 \, \ell(\hat \theta) \propto \ell(\hat \theta).
% $
% The respective optimal action is then simply $a^* \overset{n \to \infty}{\approx} \argmax_a \ell(\hat \theta(\mathcal{D} \cup \left(x_{i}, \hat y_i\right))).$ This approximation is computationally much cheaper to evaluate, as it does not involve the Fisher-information. However, this comes at the cost of poor accuracy in case of small $n$. Further note that with such a rough approximation, selection with regard to the PPP then corresponds to selection with regard to the likelihood.
% As pointed out in Section~\ref{sec:bayes-opt}, this corresponds to the optimistic max-max-criterion, rendering the selection with regard to the likelihood the risk-loving max-max-action. %In other words, assuming $n \to \infty$ matches max-max-decisions.


% % where possible approximations of the \textit{robust} PPP are also discussed. Generally, if we allow for improper priors in $\Pi$ rendering the posterior $p^*(\hat \theta \mid \mathcal{D})$ uniform, the respective posterior predictive equals the marginal likelihood in case of independently distributed data. This makes sence: In such a limiting case, the selection most robust towards the initial fit given no other information is just random selection. 


% % \begin{equation}
% % \log p(x) \approx \ell(\hat \theta) + \log \pi(\hat \theta ) + \frac{p}{2} \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% % \end{equation} 


% % Note that by letting $n \to \infty$, the prior's influence on the marginal likelihood is dominated by the likelihood. Multiplying by $-2$ yields the Bayesian Information Criterion (BIC) \cite{schwarz1978estimating}. In the realm of model selection, the BIC is commonly used to find an optimal model $ M^* =  \argmin_M \{ -2\ell(\hat \theta) + p \log(n)\}$. Since we alter neither the number of data nor of the features when optimizing the pseudo marginal likelihood, we can ignore $p \log(n)$ and simply proceed with the likelihood:


% \subsection{Informative Prior}

% With informative prior, we cannot simply plug in the likelihood twice, but actually have to consider the posterior. As above, denote the Maximum Likelihood (ML) estimator by $\hat \theta_{ML} = \argmax \mathcal {L}(\theta) = \argmax \ell(\theta)$, where $\ell(\theta) = \log \mathcal{L}(\theta) = \log p(y \mid x, \theta) $. We easily get the same Taylor expansion around $\hat \theta = \hat \theta_{ML}$ as above: 
% $\ell(\theta) = \ell(\hat \theta) - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta) + \dots.$ Again, the second summand $\ell'(\hat \theta)$ in the Taylor series is zero per definition of $\hat \theta$, and we can disregard terms of higher orders, as the ML-estimator converges to $\theta$ in probability, $\hat \theta \go \xrightarrow{\mathbb{P}} \theta$. Contrary to the above approximation, we now have to approximate the posterior with a separate Taylor series around $\hat \theta = \hat \theta_{ML}$ as follows: $p(\theta \mid  \mathcal{D}) = p(\hat \theta \mid  \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D}) + \dots .$ Plugging these two Taylor series for $\ell(\theta)$ and $p(\theta \mid \mathcal{D})$ into the PPP gives the following approximation: $\int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta \approx \int_{\Theta} \exp(\ell(\hat \theta) + \mathcal{T}_3(\hat \theta)) \; [p(\hat \theta \mid  \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})] d \theta  $, where $\mathcal{T}_3(\hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ is again the third summand of the Taylor series for the likelihood. First, we can rewrite this approximation as $\int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta \approx \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot p(\hat \theta \mid  \mathcal{D}) + \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})] d \theta = \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot p(\hat \theta \mid  \mathcal{D}) d \theta + \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D}) d \theta$, exploiting that $\mathcal{L}(\theta)$ does not depend on the variable of integration. 
% For the second summand it now holds that $\mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})] d \theta = \mathcal{L}(\hat \theta) p'(\hat \theta \mid  \mathcal{D}) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta)d \theta = 0$, because $\int_{\Theta} (\theta - \hat \theta) \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta)  \cdot (\theta - \hat \theta))d \theta = 0$. This follows from the symmetry of the integrand. We thus end up with

% \begin{equation}
% \begin{split}
% \label{eq:approx-inf-prior}
%       p(\hat{y} \mid x,  y,  x) \approx \; \mathcal{L}(\hat \theta) \, p(\hat \theta \mid \mathcal{D}) \, \int \exp(\mathcal{T}_3(\hat \theta)) d\theta.     
% \end{split}
% \end{equation}


%  We can now make use of the ML-estimator's property of being asymptotically normally distributed, obtaining
% \begin{equation}
%     \int_{\Theta} \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta \approx (2 \pi)^{\frac{q}{2}} n^{- \frac{q}{2}} |\mathcal{I}(\hat \theta)|^{-\frac{1}{2}}. 
% \end{equation}

% By exploiting the latter and taking the logarithm of Equation~\ref{eq:approx-inf-prior}, we get the approximate logarithmic PPP for an informative prior as follows:

% \begin{equation}
% \label{eq:approx-informative-final}
% \begin{split}
% \log p(\hat{y} \mid x,  y,  x)  &\approx  \, \ell(\hat \theta) + \log p(\hat \theta \mid  \mathcal{D}) \\
% &+  \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|,   
% \end{split}
% \end{equation} 

% %Note: Can we really use the proportionaliy wrt. \theta (see belows) here?
% where $\log p(\hat \theta \mid  \mathcal{D}) \propto \log \pi(\hat \theta) + \ell(\hat \theta)$. This approximation follows the same intuition as for the uninformative case, with $\log p(\hat \theta \mid  \mathcal{D})$ additionally providing information on the plausibility of the fitted model in light of the posterior.
% Notably, this approximation does not require stronger assumptions than in the uninformative case: In order to get rid of the second term in the Taylor series of the prior, we simply need to exploit the symmetry of $(\theta - \hat \theta) \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta)  \cdot (\theta - \hat \theta))$. %The latter requires assuming $n \to \infty$.Arguably, this is a very strong assumption given the usually rather small $n$ in the initially labeled data. 
% The informative approximation is just a little more involved, since we do not have $p(\hat \theta \mid  \mathcal{D}) = \ell(\hat \theta)$ and thus $p'(\hat \theta \mid  \mathcal{D}) = \ell'(\hat \theta) = 0$ per definition of the ML-estimator as in the uninformative case. Just like the approximation in the uninformative case, Equation~\ref{eq:laplaxe-approx-final-posterior} is dominated by the likelihood $\ell(\hat \theta)$, when $n \to \infty$. This is due to the same reasoning as for the PPP's approximation and the fact that the posterior's influence (second term in Equation~\ref{eq:laplaxe-approx-final-posterior}) on the PPP is dominated by the likelihood asymptotically.

%\footnote{Multiplying by $-2$ yields the Bayesian Information Criterion (BIC) \cite{schwarz1978estimating}. In the realm of Bayesian model selection, the BIC is commonly used to find an optimal model $ M^* =  \argmin_M \{ -2\ell(\hat \theta) + q \log(n)\}$.} 


% \begin{figure*}[t!]
% \centering
% \includegraphics[width=\textwidth]{Sample UAI 2023 paper/figures/res-ionoshpere.png}
% \caption{Results on subsamples of varying size from ionosphere data ($q = 33$ features). Uninformative prior. Share of unlabeled: 0.8. Accuracy averaged over 40 SSL rounds. For $n < 240$ no PLS method outperforms supervised baseline (not shown here). }
% \label{fig:all-results}
% \end{figure*}



% \begin{figure*}[t!]
% \centering
% \includegraphics[width=\textwidth]{figures/plots-res.png}
% \caption{Results from 8 Classification Tasks in Descending Difficulty. Accuracy Averaged over 100 Repetitions.}
% \label{fig:all-results}
% \end{figure*}


\section{EXPERIMENTS}
\label{sec:results}



\textbf{Algorithmic Procedure:}
\label{sec:algo}
For all predicted pseudo-labels, we refit the model on $\mathcal{D}~\cup~(x_i, \hat{y}_i)$ and evaluate its PPP by means of the derived approximations iBPLS and uBPLS to select one instance to be added to the training data. Detailed pseudo code for BPLS can be found in Supplement A. The computational complexity depends on the evaluation of the PPP. With $\lvert \mathcal{U} \rvert = m$ unlabeled data points and no stopping criterion, $m + (m - 1) + \dots + 1 = \frac{m^2 + m}{2}$ PPPs have to be evaluated, see Section~\ref{sec:approx}. Hence, BPLS' complexity depends on the model's complexity and the amount of unlabeled data.

The bottleneck is the refitting of the whole model, which is needed because we evaluate the joint pseudo posterior predictive of both the pseudo-labeled and all the labeled data. Note, however, that this is required because we avoid the \textit{i.i.d.} assumption, in order to remain as general as possible. If we rely on the common regularity condition that the data are \textit{i.i.d.}, we can factorize the likelihood, Fisher information, and the pseudo posterior predictive. The contribution of the labeled data can then be dropped from the criterion, which renders its computation $\mathcal{O}(1)$
for each unlabeled sample. Similarly, the refitted model parameter could be approximated in $\mathcal{O}(1)$ time using an approximation similar to the infinitesimal jackknife, see \cite[Definition 2]{giordano2019swiss}. Overall, we could thus achieve $\mathcal{O}(m)$
complexity if the data were \textit{i.i.d.} We leave this to future work.



% \begin{Hypothesis}
% \label{hypo:BPLS-good}
% \textbf{(a)} Our approximate PPP with uninformative prior outperforms traditional PLS on data prone to initial overfitting (i.e., with a high ratio of features to data $\frac{q}{n}$).   
% \textbf{(b)} For low $\frac{q}{n}$ PPP is outperformed by traditional PLS.
% \end{Hypothesis}

% % \begin{Hypothesis}
% % \label{hypo:BPLS-initial-perf}
% % \textbf{(a)} Approximate PPP with uninformative prior outperforms traditional PLS in case of low initial generalization performance (hard tasks).   
% % \textbf{(b)} With high initial generalization, PPP is outperformed by traditional PLS.
% % \end{Hypothesis}

% \begin{Hypothesis}
% \label{hypo:likelihood}
%     Among all PLS methods, the pseudo-label likelihood (max-max-action) \textbf{(a)} reinforces the initial model fit the most and \textbf{(b)} hardly improves generalization. 
% \end{Hypothesis}

% \begin{Hypothesis}
% \label{hypo:informative}
%     Our approximate PPP with correctly specified informative prior outperforms traditional PLS methods universally.
% \end{Hypothesis}



\begin{Hypothesis}
\label{hypo:BPLS-good}
\textbf{(a)} PPP with uninformative prior outperforms traditional PLS on data prone to initial overfitting (i.e., with high ratio of features to data $\frac{q}{n}$ and poor initial generalization).   
\textbf{(b)} For low $\frac{q}{n}$ and high initial generalization, BPLS is outperformed by traditional PLS.
\end{Hypothesis}

\begin{Hypothesis}
\label{hypo:likelihood}
    \textbf{(a)} Among all PLS methods, the pseudo-label likelihood (max-max-action) reinforces the initial model fit the most and \textbf{(b)} hardly improves generalization. 
\end{Hypothesis}

\begin{Hypothesis}
\label{hypo:informative}
    PPP with informative prior outperforms traditional PLS methods universally.
\end{Hypothesis}

\textbf{Experimental Setup:} We formulate three hypotheses beforehand, all of which will be tested later by hypothesis tests along the lines of \cite{demvsar2006statistical}.
Hypothesis \ref{hypo:BPLS-good} corresponds to the main motivation behind BPLS; its second part is a logical consequence thereof: If we are skeptical towards the initial model in case it generalizes well, we expect to select pseudo-labels in a worse way than when trusting the initial model. Hypothesis \ref{hypo:likelihood} is based on the decision-theoretic insights regarding PLS by the likelihood, see Section \ref{sec:bayes-opt}: It embodies an optimistic reliance on the initial model and is thus expected to pick data that fits best into that model. We further expect (Hypothesis \ref{hypo:informative}) BPLS to unambiguously outperform non-Bayesian selection methods in case the prior provides actual information about the data generating process -- the latter is simply not available for non-Bayesian PLS.

We benchmark semi-supervised (parametric) generalized linear models (GLMs) and (non-parametric) generalized additive models (GAMs) \cite{hastie1987generalized, hastie2017generalized} with PPP and pseudo-label likelihood against two common selection criteria (probability score $\mathbb{P}(y = \hat y)$ and predictive variance $\Var[\hat y] = \mathbb{E}[(\hat y - \mathbb{E}[\hat y])^2]$) \cite{triguero2015self} as well as a supervised baseline. Additionally, we compare our method against MC-based approximations of the PPP in Supplement~G and showcase its applicability to Bayesian neural networks in Supplement~H. For the supervised baseline, we abstain from self-training and only use the labeled data for training. Experiments are run on simulated binomially distributed data as well as on eight data sets for binary classification from the UCI repository \cite{Dua:2019}. The binomially distributed data was simulated through a linear predictor consisting of normally distributed features. Details on the simulations as well as on the data sets can be found in Supplement~C and~L.
 %The initial generalization performance is measured by a supervised learner's accuracy on test data (purple line in Figure~\ref{fig:all-results}). After each selected pseudo-labeled instance, we evaluate the current accuracy on test data ($50$\% of raw data set).
The share of unlabeled data was set to $0.8$ and $0.9$. PLS methods were compared with regard to (\say{inductive}) accuracy of prediction on unseen test data. All data sets were found to be fairly balanced except for the EEG data (minority share: $0.29$).
%To assess Hypothesis~\ref{hypo:BPLS-good} in detail, we further focus on one particular high-dimensional ($n = 350, q = 33$) data set on radar signals from the ionosphere \cite{ionosphere}. The features are antenna pulses, based on which signals have to be classified as positive or negative. We test Hypothesis~\ref{hypo:BPLS-good} by taking random subsamples of increasing size, i.e., decreasing $\frac{q}{n}$.







\textbf{Results:}
Figures~\ref{fig:all-results} and~\ref{fig:res-sim} as well as Table \ref{tab:table} summarize the results in the uninformative case (grey figures) for real-world and simulated data, respectively. \say{Oracle stopping} in Table~\ref{tab:table} refers to comparing PLS methods with regard to their overall best accuracy as opposed to \say{final} comparisons after the whole data set was labeled. Figure~\ref{fig:res-sim} sheds further light on results for simulated data, while Figure~\ref{fig:results-inf} displays results from benchmarking BPLS to classical PLS methods in the informative case (black figures). Detailed figures displaying results from all experiments can be found in the supplementary material. 


\begin{figure*}[t!]
\centering
\includegraphics[width=\textwidth]{figures/plots-res.png}
\caption{Results from 8 classification tasks based on real-world data \cite{Dua:2019} in descending difficulty (measured by supervised test accuracy), where $p$ denotes the number of features here and the share of unlabeled data is 0.8. Accuracy averaged over 100 repetitions.}
\label{fig:all-results}
\end{figure*}

\begin{figure}
    \centering
    \includegraphics[width=\columnwidth]{figures/resl-sim.png}
    \caption{Results from simulated data. Accuracy averaged over 100 repetitions. Legend: see Figure \ref{fig:all-results}.}
    \label{fig:res-sim}
\end{figure}



\begin{figure*}[t!]
\centering
\includegraphics[width=\textwidth]{figures/res-plot-inf-GAMs.png}
\caption{Results of PPP with informative priors and non-parametric GAMs on simulated data with different shares of unlabeled data. Accuracy averaged over 100 repetitions.}
\label{fig:results-inf}
\end{figure*}




\begin{table}[h!]
\caption{Best performing PLS method (uninformative) on simulated data} 
\label{sample-table}
\begin{center}
\small
\begin{tabular}{c||c||ll}
\textbf{n} & \textbf{q} &\textbf{ORACLE STOPPING} & \textbf{FINAL} \\
\hline \hline
60 & 60 &  PPP         & PPP\\
100 & 60 & PPP & Supervised Learning \\
400 & 60 & PPP             & PPP \\
1000 & 60 & Probability Score & Probability Score \\
\end{tabular}
\end{center}
\label{tab:table}
\end{table}





\textbf{Interpretation:}
At first sight, comparing the accuracy gains in Figure~\ref{fig:all-results} on different data sets (in order of ascending baseline performance) clearly supports Hypothesis~\ref{hypo:BPLS-good}: For harder tasks like EEG or sonar with relatively high ratio of features to data $\frac{q}{n}$, Bayesian PPP outperforms traditional PLS, whilst being dominated by the probability score in case of easier tasks like banknote or breast cancer. For data sets with intermediate difficulty (mushrooms and ionosphere), PPP and other PLS methods compete head-to-head. The results on abalone data underpin a general fact in SSL (see Section~\ref{sec:intro}): Successful self-training requires at least some baseline supervised performance.

Results on simulated data (Table~\ref{tab:table}) further support the role of $\frac{q}{n}$ in Hypothesis~\ref{hypo:BPLS-good}. Their visualization (Figure~\ref{fig:res-sim}) nicely illustrates the inner workings of selection by PPP: By not trusting the initial model, PPP affects the model's test accuracy the most. While $n = 400$ leaves some room for improvement through mitigating the overfitting by pseudo-labeled data, PPP leads to a noisy performance in case of $n = 100$ close to $q$. Here, even the final model still overfits. These promising results should not hide an inconsistency: The fact that PPP is superior on the cars task but not on the ionosphere task contradicts Hypothesis~\ref{hypo:BPLS-good}, since cars is harder than ionosphere, while having almost identical $\frac{q}{n}$.

We find Hypothesis \ref{hypo:likelihood} to be partially supported by the results. While \ref{hypo:likelihood} (a) holds for both the majority of simulated (see supplementary material) and real-world data (likelihood generally the closest to supervised performance), \ref{hypo:likelihood} (b) is challenged by considerable generalization performance gain on ionosphere and breast cancer data. 

Figure \ref{fig:results-inf} clearly supports Hypothesis \ref{hypo:informative}: When using informative priors based on the true data-generating process, BPLS clearly outperforms traditional PLS methods. Results from experiments with different shares of unlabeled data (Supplement~D.3) as well as with Bayesian neural networks (Supplement~G) further back this finding. This comes as no big surprise, since non-Bayesian PLS simply lack ways to incorporate such prior knowledge. From this perspective, the uninformative case (Hypothesis~\ref{hypo:BPLS-good}) corresponds to raising the bar and clearly is the theoretically more interesting benchmarking setup. However, many practical applications of SSL entail a myriad of pre-existing knowledge. For practical purposes, thus, the informative situation might even be more relevant. %, e.g., radio spectrum identification \cite{cameloetal}.

\textbf{Hypothesis Tests:} We perform statistical non-parametric hypothesis tests tailored to comparing classification accuracies of different ML methods across multiple data sets. Using the multiple comparison approaches from \cite{demvsar2006statistical}\footnote{Since we only compare accuracies, relying on \cite{demvsar2006statistical} is sufficient. For multi-criteria comparison of classifiers, we refer to \cite{jansen2022statistical, jansen_uai2023}.}, we deploy the Friedman-test \cite{friedman1937use, friedman1940comparison} for overall differences in accuracies and (in case we reject the null in the Friedman-test) the post-hoc Nemenyi-test \cite{nemenyi1963distribution} for pairwise comparisons. For conducting the tests, we compare both the final and the oracle-stopping (best among all iterations) accuracies of all PLS methods across different classification tasks on both simulated and real-world data. All tests were conducted with significance level $\alpha = 0.05$. We summarize the test decision in what follows and provide more detailed background on the hypothesis tests in Supplement~I.

The visual interpretation of the results is generally supported by the statistical tests. Hypothesis \ref{hypo:BPLS-good} (a) is found valid, especially for oracle-stopping accuracies. Interestingly, there is not enough evidence for Hypothesis \ref{hypo:BPLS-good} (b), which was not immediately clear from the visualized results. The reason for this unexpected result could be a small sample size of classification tasks falling in the very specific category of having low $\frac{q}{n}$ and high initial generalization. This finding is actually promising: It tells us that there is not enough evidence yet for writing off BPLS in these scenarios (which it was not intended for). That is, we cannot state it is significantly ($\alpha = 0.05$) outperformed in situations of no risk of overfitting (Hypothesis~\ref{hypo:BPLS-good} (b)), while we can state it significantly ($\alpha = 0.05$) outperforms other PLS methods when overfitting is present (Hypothesis~\ref{hypo:BPLS-good} (a)), which was the motivation for the method.

Hypothesis \ref{hypo:likelihood} is not supported by the tests. In particular, there appears not to be enough evidence in the benchmarking results supporting Hypothesis \ref{hypo:likelihood} (b), as mentioned earlier. We could not find that likelihood (max-max) hardly improves generalization. Furthermore, the statistical tests confirm our visual impression that Hypothesis \ref{hypo:informative} is unanimously supported by the empirical results. 





% \begin{figure*}[t!]
% \centering
% \includegraphics[width=\textwidth]{figures/plot-res-inf-GLM.png}
% \caption{Results for logistic regression with informative priors.}
% \label{fig:all-results}
% \end{figure*}




% \textbf{Results:}
% Table~\ref{tab:table} summarizes the results in the uninformative case for GLMs on high-dimensional ($q = 60$) simulated data. \say{Oracle stopping} in Table~\ref{tab:table} refers to comparing PLS methods with regard to their overall best accuracy as opposed to \say{final} comparisons after the whole data set was pseudo-labeled. We display results from subsamples of ionosphere data (in order of ascending size) in Figure~\ref{fig:all-results}. 
% %Figure~\ref{fig:all-results} and Table~\ref{tab:table} summarize the results in the uninformative case for real-world and simulated data, respectively.%\footnote{Further results (i.a. with different shares of unlabeled data), detailed data sources and code to reproduce the experiments can be found in the supplementary material. We will provide access to the project's (non-anonymous) repository in case of acceptance.}
% Figure~\ref{fig:res-sim} sheds further light on results for simulated data, while Figure~\ref{fig:res:sim-inf} displays results from benchmarking BPLS with informative priors to classical PLS methods with GLMs and GAMs on simulated data. Detailed figures displaying results from all eight data sets as well as from simulated data can be found in the supplementary material. 


 

% \begin{table}[h!]
% \caption{Best performing PLS (with uninformative prior) on high-dimensional simulated data} 
% \label{sample-table}
% \begin{center}
% \small
% \begin{tabular}{c||c||ll}
% \textbf{n} & \textbf{q} &\textbf{ORACLE STOPPING} & \textbf{FINAL} \\
% \hline \hline
% 60 & 60 &  PPP         & PPP\\
% 100 & 60 & PPP & Supervised Learning \\
% 400 & 60 & PPP             & PPP \\
% 1000 & 60 & Probability Score & Probability Score \\
% \end{tabular}
% \end{center}
% \label{tab:table}
% \end{table}

% % \begin{figure}[h]
% %     \centering
% %     \includegraphics[width=\columnwidth]{figures/resl-sim.png}
% %     \caption{Results from Simulated Data. Averaged over 100 Repetitions. Legend: See Figure~\ref{fig:all-results}.}
% %     \label{fig:res-sim}
% % \end{figure}


% \begin{figure}[h]
%     \centering
%     \includegraphics[width=\columnwidth, trim={0 0 0 1.9cm}, clip]{figures/res-sim.png}
%     \caption{Results from low-dimensional simulated Data. Averaged over 40 Repetitions. Legend: See Figure~\ref{fig:all-results}.}
%     \label{fig:res-sim}
% \end{figure}

% % \begin{figure}[h]
% %     \centering
% %     \includegraphics[width=\columnwidth, trim={0 0.86cm 0 0}, clip]{figures/resl-sim.png}
% %     \caption{Results from simulated data. Uninformative prior. Left: $n=100$, right: $n=400$. Axis labels and legend: See Figure~\ref{fig:all-results}.}
% %     \label{fig:res-sim}
% % \end{figure}



% \textbf{Interpretation:}
% Comparing the accuracy gains in Figure~\ref{fig:all-results} on different subsamples of ionosphere data supports Hypothesis~\ref{hypo:BPLS-good}: For higher ratios of features to data $\frac{q}{n}$, our approximate PPP outperforms traditional PLS, whilst being dominated by the probability score in case of lower $\frac{q}{n}$. For subsamples of size $n < 240$ (not shown) all PLS methods are outperformed by supervised learning, see also results on mushrooms, abalone, and cars data in Supplement E. This underpins a general fact in SSL, see Section~\ref{sec:intro}: Successful self-training requires at least some baseline supervised performance.  
% %For data sets with intermediate difficulty (mushrooms and ionosphere), PPP and other PLS methods compete head-to-head. The results on abalone data underpin a general fact in SSL (see Section~\ref{sec:intro}): Successful self-training requires at least some baseline supervised performance.
% On the other extreme, self-training can hardly improve accuracy in case the initial model already generalizes well, see e.g. results on simulated data with $n=200$ and $q=6$ in Figure \ref{fig:res-sim}.

% Results on high-dimensional simulated data (Table~\ref{tab:table}) again highlight the role of $\frac{q}{n}$, further backing Hypothesis~\ref{hypo:BPLS-good}. 
% Visualizations thereof (see Supplement E) illustrate the inner working of selection by approximate PPP: By not trusting the initial model, PPP affects the model's test accuracy the most. While $n = 400$ leaves some room for improvement through curing the overfitting by pseudo-labeled data, PPP leads to a noisy performance in case of $n = 100$. Here, even the final model still severely overfits. At first sight, the compilation of results from all eight UCI data sets (see Supplement E) further supports Hypothesis~\ref{hypo:BPLS-good}: PPP's performance relative to competing PLS methods improves as $\frac{q}{n}$ increases. However, this should not hide some inconsistencies. While competing with all other PLS methods on the banknote data, PPP is outperformed by probability score on EEG and cancer data. This contradicts Hypothesis~\ref{hypo:BPLS-good}, as both EEG and cancer data have higher $\frac{q}{n}$. We suppose the general difficulty of the classification task might interact with $\frac{q}{n}$. That is, not all data sets prone to overfitting actually lead to an overfitted model that performs worse on test data. If the underlying relationship is easy enough, even overfitted models might generalize reasonably well.

% %The fact that PPP is superior on the cars task but not on the ionosphere task contradicts Hypothesis~\ref{hypo:BPLS-good} (a), since cars is harder than ionosphere, while having almost identical $\frac{q}{n}$. 

% We find Hypothesis~\ref{hypo:likelihood} to be partially supported by the results. While~\ref{hypo:likelihood} (a) holds for both simulated and real-world data (likelihood generally the closest to supervised performance),~\ref{hypo:likelihood} (b) is challenged by considerable generalization performance gain on banknote and cancer data (supplement~E). 
% Figure~\ref{fig:res:sim-inf} clearly supports Hypothesis~\ref{hypo:informative}: When using informative priors based on the true data-generating process, BPLS outperforms traditional PLS methods. This comes as no big surprise since non-Bayesian PLS simply lack ways to incorporate such prior knowledge. From this perspective, the uninformative case corresponds to raising the bar and clearly is the theoretically more interesting benchmarking setup. However, many practical applications of SSL entail a myriad of pre-existing knowledge, e.g., radio spectrum identification \cite{cameloetal}. For practical purposes, thus, the informative situation might even be more relevant.



% % \begin{figure*}[t!]
% % \centering
% % \includegraphics[width=\textwidth, crop = {5cm 0 0 0}]{figures/res-plot-inf-GAMs.png}
% % \caption{Results from Simulated Data (with Different Shares of Unlabeled Data) in Case Of Informative Priors.}
% % \label{fig:results-inf}
% % \end{figure*}






\section{RELATED WORK}
\label{sec:background}

% \subsection{Self-training}
% \label{sec:self-training}
% Following \cite{pise2008survey} and \cite{van2020survey}, semi-supervised learning (SSL) can be broadly categorized into self-training and co-training. We will focus on the former, whose general idea is commonly described as fitting a model on $\mathcal{D}$ by empirical risk minimization and then exploiting this model's predictions to label $\mathcal{U}$. Typically, those instances from $\mathcal{U}$ are added whose predictions score high on some confidence measure. The predicted probability (probability score) is among the most popular \cite{triguero2015self}. Besides, the predictions' variance as well as a linear combination of variance and score are used \cite{rizve2020defense}

% With regard to the addition of newly labeled data from $\mathcal{U}$ to $\mathcal{D}$, \cite{triguero2015self} and \cite{kostopoulos2018semi} distinguish between (instance-wise) incremental, batch-wise and amending mechanisms. While incremental approaches strictly label instances from $\mathcal{U}$ one-by-one in a step-wise manner, batch-wise and amending techniques allow for adding multiple data points at once or removing data, respectively. \cite{triguero2015self} further differentiates self-training methods into single- and multi-classifier ones, depending on how many learned classifiers $\hat{y}(x)$ are used during the labeling phase. %In case multiple classifiers are deployed, they can be constituted by the same model class (Hypothesis space) or multiple ones. \cite{triguero2015self} refer to this distinction as single- versus multi-learning. Using multiple classifiers additionally opens up several possible ways of combining and aggregating their predictions and confidence measures. 
% % Self-training algorithms further differ with regard to their stopping criterion, see \cite{triguero2015self} for details. identify three main approaches in the literature. Firstly and most obviously, one can simply label the complete set $\mathcal{U}$. Secondly, labeling can be restricted to a subsample of $\mathcal{U}$, see \cite{blum1998combining} for instance. A third way to stop the labeling process would be to account for changes in the predictive classifier $\hat y(x)$. That is, to stop as soon as $\hat y(x)$ does no longer change as a function of $\mathcal{U}$ and leave the remaining data in $\mathcal{U}$ unlabeled. 
% In this paper, we present an incremental approach with a single classifier that can deploy any stopping criterion aiming at inductive learning. 

% The rationale of consistency regularization, related to contrastive learning, has recently sparked a lot of research and innovation in semi-supervised learning methods, mainly developed for image data \cite{Berthelot2019, Sohn2020, Zhang2021c}. 
% While showing strong empirical performance, these models rely on domain- and data modality-dependent data augmentation strategies, limiting their applicability across different domains \cite{rizve2020defense}. Self-training, however, is employable universally \cite{amini2022self}. What is more, \cite{cascante2021curriculum, rizve2020defense} show that self-training can in fact compete with consistency regularization if the model is well-calibrated.

%The first version of our proposed algorithm will label the complete set $\mathcal{U}$, while the second version will come with a natural stopping criterion, which -- to the best of our knowledge -- does not appear in the literature so far.  

%\subsection{Robust Pseudo-Label Selection}

\textbf{Robust PLS:} Robustness of PLS is a widely discussed issue in the self-training literature. \cite{aminian2022information} propose information-theoretic PLS robust towards covariate shift. \cite{lienen2021credal} label instances in the form of sets of probability distributions (credal sets), weakening the reliance on a single distribution. \cite{vandewalle2013predictive} aim at robustness to modeling assumptions by allowing model selection through the deviance information criterion during semi-supervised learning. \cite{rizve2020defense} propose uncertainty-aware pseudo-label selection which proves to compete with state-of-the-art SSL based on consistency regularization. The idea is to select pseudo-labeled instances whose probability score is above and whose predictive uncertainty is below (tunable) thresholds. The latter is operationalized by the prediction's variance, and thus, unlike BPLS, fails to decompose approximation and modeling uncertainty, see Section~\ref{sec:intro}. Both predictive variance and probability score serve as benchmarks in Section~\ref{sec:results}.

%\subsection{Bayesian and Likelihood-Based SSL}

\textbf{Bayesian Self-Training:} There is a broad body of research on deploying Bayesian \textit{predictions} in SSL and particularly in self-training \cite{gordon2020combining, ng2018bayesian, cai2011bassum, adams2009archipelago}. The same holds for explicit likelihood-based inference, such as weighted likelihood \cite{sokolovska2008asymptotics}, conditional likelihood \cite{grandvalet2004semi}, and joint mixture likelihood \cite{amini2002semi}. % with applications ranging from facial expression recognition \cite{cohen2003semi} to radio spectrum identification \cite{cameloetal}.
Most of them use Bayesian models for \textit{predicting} pseudo-labels. In contrast, we prove that the argmax of the PPP is the Bayes-optimal \textit{selection} of pseudo-labels given \textbf{any} predictive model. %Differently put, one could argue we rely on Bayesian decision theory while they use Bayesian statistics. 

Regarding Bayesian or likelihood-based \textit{selection} of pseudo-labels, there exists only little (Bayesian) or hardly any (likelihood-based) work. \cite{li2020pseudo} quantify the uncertainties of pseudo-labels by mixtures of predictive distributions of a neural net, applying MC dropout. This could be seen as an expensive MC-based approximation of the PPP. 

Very recently, \cite{patel2022seq} proposed PLS with regard to (a sampling-based approximation of) the entropy of the pseudo-labels' posterior predictive distribution. The entropy is considered a measure of total uncertainty (aleatoric and epistemic) and often considered as regularization for PLS, see \cite{saporta2020esl, liu2021cycle} for instance. Abstaining from the entropy -- as we do -- effectively means not considering the aleatoric uncertainty. While including aleatoric uncertainty (e.g. measurement noise) generally makes sense, we consider it of minor importance in the concrete problem of initial overfitting, where we aim at disentangling epistemic uncertainty with regard to \textit{data} and \textit{parameters}: We want to choose pseudo-labels that are likely given the \textit{observed labeled data} but not necessarily likely given the estimated \textit{parameters of the (over-)fitted model}. 

%Hence, \cite{patel2022seq} also consult the posterior predictive for PLS. However, they use a conceptually different approach than this paper: Uncertainty of pseudo-labels (which is the selection criterion) is quantified by the entropy of their predictive distribution. %The latter just happens to be the posterior predictive,


% Our method is conceptually similar, but abstains from the aleatoric part completely, focusing on disentangling modeling and approximation uncertainty within the epistemic part. We argue that for our use case of data prone to overfitting, the latter decomposition has more relevance (see Section~\ref{sec:intro}), whilst aleatoric (irreducible noise in the data) does not affect the confirmation bias. 



% Bayesian SSL:
% https://www.sciencedirect.com/science/article/pii/S003132031930456X
% https://www.sciencedirect.com/science/article/abs/pii/S0031320310005078
% https://proceedings.neurips.cc/paper/2018/hash/1fc214004c9481e4c8073e85323bfd4b-Abstract.html
% https://icml.cc/Conferences/2009/papers/258.pdf

 % \cite{sokolovska2008asymptotics} introduce a weighted likelihood approach where the weight corresponds to the ratio of the unlabeled data's density to the labeled data's. They show that maximization of such a weighted likelihood is never outperformed by supervised learning without any assumption, that is, it is considered safe. \cite{kawakita2014safe} extend their results to a wide range of data modalities and highlight its connection to a statistical paradox regarding the estimation of nuisance parameters. \cite{grandvalet2004semi} derive entropy minimization in SSL through maximizing a conditional likelihood. Generally, likelihood-based inference has proven fruitful in situation of weak supervision. This is underpinned by several applications ranging from facial expression recognition \cite{cohen2003semi} to radio spectrum identification \cite{cameloetal}.  

% semi-supervised logistic regression (generative, joint mixture likelihood maximization):
%http://ama.liglab.fr/~amini/Publis/SemiSupLogReg_ecai02.pdf

%EM in SSL: https://zenodo.org/record/3484301#.Yta50YTP1mM
% Wichtig: EM is typically iterative, but not incremental
% https://dspace.mit.edu/bitstream/handle/1721.1/7202/AIM-1509.pdf?sequence=2
% Wichtig: Relation to EM
% https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm





% \subsection{Superset Learning}

% \textcolor{red}{shorten this paragraph}

% The notion of superset learning is closely related, yet different to semi-supervised learning. Instead of completely unlabeled (i.e. fully ambiguos) data $\mathcal{U}=\left\{\left(x_{j}, \mathcal{Y}_{j}\right)\right\}_{j=1}^{m} \in\left(\mathcal{X} \times 2^\mathcal{Y}\right)^{m}$, superset learning considers $\left\{\left(x_{j}, Y_{i}\right)\right\}_{j=1}^{m} \in\left(\mathcal{X} \times 2^\mathcal{Y}\right)^{m}$, where $Y_i \subseteq \mathcal{Y}$. In this context, $Y_i$ are regarded supersets of \say{true} underlying singletons $y_i$, thus the name. From this point of view, semi-supervised learning can be regarded a special case of superset learning. There exist optimistic as well as pessimistic variants of superset learning \cite{hullermeier2019learning}. The general idea is to find a singleton representation (often called instantiation) of the supersets that corresponds to the most predictive (optimistic) or least predictive (pessimistic) model when trained and evaluated on it. In the optimistic case, this can be achieved by minimizing an optimistic version of the empirical risk, the generalized empirical risk: $\frac{1}{n} \sum_{i=1}^n L^*(\hat{y}_i, Y_i) =   \frac{1}{n} \sum_{i=1}^n \min_{y \in Y_i} L(\hat{y}_i , y)$.



% While typically concerned with optimistic (pessimistic) versions of empirical risk minimization, there is also some work which approaches the supersets from a likelihood-perspective, see \cite{hullermeier2015superset}. Analogous to the generalized empirical risk, this translates to maximizing an optimistic version of the likelihood: $\prod_{i=1}^n \max_{y \in Y_i} p(Y_i|\theta)$, $\theta \in \Theta$, a (possibly regularized) parameter space. %Inspired by these formulations of optimistic superset learning, we will choose those labels for $\left(x_{i}, \mathcal{Y}_{i}\right) \in \mathcal{U}$ that maximize the likelihood of $\mathcal{D} \cup \left(x_{i}, \mathcal{Y}_{i}\right)$.
% Contrary to \cite{hullermeier2015superset} and \cite{hullermeier2019learning}, we consider the \textit{marginal} likelihood. That is, we first integrate out the parameters, rendering the selection of instances in $\mathcal{U}$ less dependent on the estimation of $\theta$ through $\mathcal{D}$.

% %\textcolor{red}{TO DO:} Cautious Superset learning, levelwise OSL

% Recently, \cite{lienen2021credal} proposed to apply methods from superset learning to self-training in semi-supervised learning. Our work again seeks some inspiration from this co-called \say{credal self-supervised learning}, but is conceptually different. \cite{lienen2021credal} assigns sets of probability distributions (credal sets) on the assigned (pseudo-)labels of $\mathcal{U}$ and interprets its size as a confidence measure. Our approach completely abstains from weakening the probabilistic assumptions involved in the learning process. 
% TODO 



% \subsection{Interpretable Self-training}
% \textcolor{red}{TO DO}
% % white-box approach: 
% % https://www.researchgate.net/profile/Jose-Alonso-14/publication/325870079_Hybrid_Data-Expert_Explainable_Beer_Style_Classifier/links/5b976bb7299bf147394865d2/Hybrid-Data-Expert-Explainable-Beer-Style-Classifier.pdf#page=58

% %https://ojs.aaai.org/index.php/AAAI/article/view/16934



\section{DISCUSSION}
\label{sec:discussion}

\textbf{Extensions:}
\label{sec:ext}
We briefly discuss four avenues for future work. %The first extension of BPLS gets rid of the dependency on the predicted pseudo-labels. Instead of assigning the predicted label to each data point under consideration, we could assign each possible label and evaluate the respective PPP and select from these set of possible pseudo-labels.
%This approach is outperformed by classical confidence measures as well as BPLS. We reason that completely abstaining from using predictions through $\mathcal{D}$ when selecting instances might be too much of a good thing. More details and a summary of results can be found in the supplementary material. 
The first extension loosens the restriction to one particular model class by performing model selection and PLS simultaneously. The idea would be to select those instances that can be best explained by the simplest learner (i.e., the one with the fewest parameters), see \cite{in-all-likelihoods} for preliminary results.
Further recall that both the framework of BPLS and our approximation of the PPP do not require data to be \textit{i.i.d.} Applying BPLS to dependent observations, such as in auto-correlated data like time series, is thus another promising line of further research.
Thirdly, recall that we showed that the max-max-actions correspond to optimistic superset learning \cite{hullermeier2014learning}. This connection might pave the way for further extensions that connect superset learning to decision theory.
Finally, the robustness towards confirmation bias might also be of help in the presence of a distributional shift between the labeled and the unlabeled dataset. The PPP averages the likelihood over all possible parameters (not only the fitted ones), see Section~\ref{sec:intro}. This renders it more robust towards the initial fit. Independently of overfitting, a distributional shift from labeled to unlabeled data might harm PLS methods: They rely on the labeled data that comes from a different distribution than the unlabeled data, from which pseudo-samples are selected \cite{rodemann2022not}. Downweighting the influence of the initial fit by averaging over all possible parameters with regard to the posterior could thus in principle also help here. Preliminary results in supplement~J support this intuition.
  

% Due to our aforementioned skepticism regarding a possibly overfitted $\hat \theta$, we would like to further weaken the influence of the likelihood on the posterior. This can be achieved by means of generalizing Bayesian analysis \cite{dempster1968generalization, walley1991statistical, insua2012robust, augustin2014introduction}. Again, we can avail ourselves of rich decision theoretical literature dating back to \cite{ellsberg1961risk}. We will borrow from the theory on Max-E-Min or $\Gamma$-maximin, see for instance \cite{guo2010decision}. To this end, we introduce a convex set of priors $\Pi \subseteq \{\pi(\theta) \mid \pi(\cdot) \, \text{a probabilty measure on } \left(\Theta, \sigma(\Theta) \right) \}$ with $\Theta$ compact as above and $\sigma(\cdot)$ an appropriate $\sigma$-algebra.    
% The rough idea now is this: After observing data, we base our selection (action) on the prior from $\Pi$ that corresponds to the lowest posterior from the set of resulting posteriors. In other words, we hedge against the worst-case prior by artificially inducing a prior-data conflict \cite{evans2006checking}. In a nutshell, we select the pseudo-labeled instance that would have had the highest utility (likelihood) if we had specified the prior in such a way that it contradicted the (potentially overfitted) model the most. The respective decision criterion would be the $\Gamma$-maximin criterion $\Phi(\cdot,\Pi) \colon \mathcal{U} \to \mathbb{R}; \,
%     a \mapsto \Phi(a, \pi) = \underline{\mathbb{E}}_\Pi(u(a,\theta))$ with $\underline{\mathbb{E}}_\Pi(u(a,\theta)) = \inf_{\pi \in \Pi} \mathbb{E}(u(a,\theta))$. It corresponds to the posterior that results from updating the prior $\pi^*(\cdot) \in \Pi$ that has the lowest value in $\hat \theta$. 

%\subsection{Interpretability}
% \section{Bivariate Self-Training}


% self-learning konkreter algo:
% https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4709606/

% self learning survey
% https://link-springer-com.emedien.ub.uni-muenchen.de/article/10.1007/s10115-013-0706-y

% \section{Extension to other Learners}

% ICE curves

% \section{Discussion}
% \label{sec:discussion}

% \section{Ease of Use}

\textbf{Limitations:}
BPLS' strength of being applicable to any learner can imply high computational costs in case of expensive-to-train models such as neural nets, because PPP approximations require refitting the model $\frac{m^2 + m}{2}$ times. However, we have demonstrated in Section~\ref{sec:algo} (algorithmic procedure) that assuming \textit{i.i.d.} data can reduce our method's complexity from $\mathcal{O}(m^2)$ to $\mathcal{O}(m)$.
Additionally, it might be difficult for practitioners to assess the risk of overfitting to the initial data set beforehand and opt for BPLS in response. Given the fact that BPLS is outperformed by traditional PLS in cases with no overfitting, this might be considered a drawback for practical application. However, Section~\ref{sec:results} demonstrated that $\frac{q}{n}$ and the baseline supervised performance (both easily accessible) provide sound proxies for initial overfitting scenarios that can induce a confirmation bias in PLS. These proxies can (alongside cross-validation) help practitioners to identify such scenarios.      


\textbf{Conclusion:}
BPLS renders self-training more robust with respect to the initial model. This improves final performance if the latter overfits and potentially harms it if not. Identifying overfitting scenarios is thus crucial for BPLS' usage. %Hence, we recommend further research in this direction. 
What is more, BPLS allows incorporating prior knowledge, with the help of which substantial performance gains can be achieved. Besides, our insights from formalizing PLS as a decision problem clear the way for promising future work exploiting rich literature on Bayesian decision theory. Ultimately, we conclude that a Bayesian view can add great value not only to predicting but also to selecting data for self-training.     


% First level headings are all caps, flush left, bold, and in point size
% 12. Use one line space before the first level heading and one-half line space
% after the first level heading.

% \subsection{Second Level Heading}

% Second level headings are initial caps, flush left, bold, and in point
% size 10. Use one line space before the second level heading and one-half line
% space after the second level heading.

% \subsubsection{Third Level Heading}

% Third level headings are flush left, initial caps, bold, and in point
% size 10. Use one line space before the third level heading and one-half line
% space after the third level heading.

% \paragraph{Fourth Level Heading}

% Fourth level headings must be flush left, initial caps, bold, and
% Roman type.  Use one line space before the fourth level heading, and
% place the Section text immediately after the heading with no line
% break, but an 11 point horizontal space.

% %%%
% \subsection{Citations, Figure, References}


% \subsubsection{Citations in Text}

% Citations within the text should include the author's last name and
% year, e.g., (Cheesman, 1985). 
% %Apart from including the author's last name and year, citations can follow any style, as long as the style is consistent throughout the paper.  
% Be sure that the sentence reads
% correctly if the citation is deleted: e.g., instead of ``As described
% by (Cheesman, 1985), we first frobulate the widgets,'' write ``As
% described by Cheesman (1985), we first frobulate the widgets.''


% The references listed at the end of the paper can follow any style as long as it is used consistently.

% %Be sure to avoid
% %accidentally disclosing author identities through citations.

% \subsubsection{Footnotes}

% Indicate footnotes with a number\footnote{Sample of the first
%   footnote.} in the text. Use 8 point type for footnotes. Place the
% footnotes at the bottom of the column in which their markers appear,
% continuing to the next column if required. Precede the footnote
% Section of a column with a 0.5 point horizontal rule 1~inch (6~picas)
% long.\footnote{Sample of the second footnote.}

% \subsubsection{Figures}

% All artwork must be centered, neat, clean, and legible.  All lines
% should be very dark for purposes of reproduction, and art work should
% not be hand-drawn.  Figures may appear at the top of a column, at the
% top of a page spanning multiple columns, inline within a column, or
% with text wrapped around them, but the Figure number and caption
% always appear immediately below the Figure.  Leave 2 line spaces
% between the Figure and the caption. The Figure caption is initial caps
% and each Figure should be numbered consecutively.

% Make sure that the Figure caption does not get separated from the
% Figure. Leave extra white space at the bottom of the page rather than
% splitting the Figure and Figure caption.
% \begin{figure}[h]
% \vspace{.3in}
% \centerline{\fbox{This Figure intentionally left non-blank}}
% \vspace{.3in}
% \caption{Sample Figure Caption}
% \end{figure}

% \subsubsection{Tables}

% All tables must be centered, neat, clean, and legible. Do not use hand-drawn tables.
% Table number and title always appear above the Table.
% See Table~\ref{sample-table}.

% Use one line space before the Table title, one line space after the Table title,
% and one line space after the Table. The Table title must be
% initial caps and each Table numbered consecutively.

% \begin{table}[h]
% \caption{Sample Table Title} \label{sample-table}
% \begin{center}
% \begin{tabular}{ll}
% \textbf{PART}  &\textbf{DESCRIPTION} \\
% \hline \\
% Dendrite         &Input terminal \\
% Axon             &Output terminal \\
% Soma             &Cell body (contains cell nucleus) \\
% \end{tabular}
% \end{center}
% \end{table}

% \section{SUPPLEMENTARY MATERIAL}

% If you need to include additional appendices during submission, you can include them in the supplementary material file.
% You can submit a single file of additional supplementary material which may be either a pdf file (such as Proof details) or a zip file for other formats/more files (such as code or videos). 
% Note that reviewers are under no obligation to examine your supplementary material. 
% If you have only one supplementary pdf file, please upload it as is; otherwise gather everything to the single zip file.

% You must use \texttt{aistats2023.sty} as a style file for your supplementary pdf file and follow the same formatting instructions as in the main paper. 
% The only difference is that it must be in a \emph{single-column} format.
% You can use \texttt{supplement.tex} in our starter pack as a starting point.
% Alternatively, you may append the supplementary content to the main paper and split the final PDF into two separate files.

% \section{SUBMISSION INSTRUCTIONS}

% To submit your paper to AISTATS 2023, please follow these instructions.

% \begin{enumerate}
%     \item Download \texttt{aistats2023.sty}, \texttt{fancyhdr.sty}, and \texttt{sample\_paper.tex} provided in our starter pack. 
%     Please, do not modify the style files as this might result in a formatting violation.
    
%     \item Use \texttt{sample\_paper.tex} as a starting point.
%     \item Begin your document with
%     \begin{flushleft}
%     \texttt{\textbackslash documentclass[twoside]\{article\}}\\
%     \texttt{\textbackslash usepackage\{aistats2023\}}
%     \end{flushleft}
%     The \texttt{twoside} option for the class article allows the
%     package \texttt{fancyhdr.sty} to include headings for even and odd
%     numbered pages.
%     \item When you are ready to submit the manuscript, compile the latex file to obtain the pdf file.
%     \item Check that the content of your submission, \emph{excluding} references, is limited to \textbf{8 pages}. The number of pages containing references alone is not limited.
%     \item Upload the PDF file along with other supplementary material files to the CMT website.
% \end{enumerate}

% \subsection{Camera-ready Papers}

% %For the camera-ready paper, if you are using \LaTeX, please make sure
% %that you follow these instructions.  
% % (If you are not using \LaTeX,
% %please make sure to achieve the same effect using your chosen
% %typesetting package.)

% If your papers are accepted, you will need to submit the camera-ready version. Please make sure that you follow these instructions:
% \begin{enumerate}
%     %\item Download \texttt{fancyhdr.sty} -- the
%     %\texttt{aistats2023.sty} file will make use of it.
%     \item Change the beginning of your document to
%     \begin{flushleft}
%     \texttt{\textbackslash documentclass[twoside]\{article\}}\\
%     \texttt{\textbackslash usepackage[accepted]\{aistats2023\}}
%     \end{flushleft}
%     The option \texttt{accepted} for the package
%     \texttt{aistats2023.sty} will write a copyright notice at the end of
%     the first column of the first page. This option will also print
%     headings for the paper.  For the \emph{even} pages, the title of
%     the paper will be used as heading and for \emph{odd} pages the
%     author names will be used as heading.  If the title of the paper
%     is too long or the number of authors is too large, the style will
%     print a warning message as heading. If this happens additional
%     commands can be used to place as headings shorter versions of the
%     title and the author names. This is explained in the next point.
%     \item  If you get warning messages as described above, then
%     immediately after $\texttt{\textbackslash
%     begin\{document\}}$, write
%     \begin{flushleft}
%     \texttt{\textbackslash runningtitle\{Provide here an alternative
%     shorter version of the title of your paper\}}\\
%     \texttt{\textbackslash runningauthor\{Provide here the surnames of
%     the authors of your paper, all separated by commas\}}
%     \end{flushleft}
%     Note that the text that appears as argument in \texttt{\textbackslash
%       runningtitle} will be printed as a heading in the \emph{even}
%     pages. The text that appears as argument in \texttt{\textbackslash
%       runningauthor} will be printed as a heading in the \emph{odd}
%     pages.  If even the author surnames do not fit, it is acceptable
%     to give a subset of author names followed by ``et al.''

%     %\item Use the file sample\_paper.tex as an example.

%     \item The camera-ready versions of the accepted papers are 8
%       pages, plus any additional pages needed for references.

%     \item If you need to include additional appendices,
%       you can include them in the supplementary
%       material file.

%     \item Please, do not change the layout given by the above
%       instructions and by the style file.

% \end{enumerate}



\begin{acknowledgements}
We thank all five anonymous reviewers for their helpful feedback. JR and TA gratefully acknowledge support by the Federal Statistical Office of Germany within the co-operation project ``Machine Learning in Official Statistics''. JR further acknowledges support by the LMU mentoring program and the bidt graduate school by the Bavarian Academy of Sciences (BAS).
JG was partially supported by the Bavarian Ministry of Economic Affairs, Regional Development, and Energy through the Center for Analytics – Data – Applications (ADA-Center) within the framework of BAYERN DIGITAL II (20-3410-2-9-8). JG, ED and TN were partially supported by the German Federal Ministry of Education and Research (BMBF) under Grant No. 01IS18036A, Munich Center for Machine Learning (MCML). 
% All acknowledgments go at the end of the paper, including thanks to reviewers who gave useful comments, to colleagues who contributed to the ideas, and to funding agencies and corporate sponsors that provided financial support. 
% To preserve the anonymity, please include acknowledgments \emph{only} in the camera-ready papers.

\end{acknowledgements}


%\newpage

\bibliography{rodemann_356/rodemann_356.bib}

\end{document}
