%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
% \usepackage{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{plainnat}
%     \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[space]{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{dirtytalk}
\usepackage{algorithm2e}
\usepackage{hyperref}
\usepackage{float}
\usepackage{bm}
\usepackage{graphics}

\usepackage{pdflscape}



% Define Commands:
% \bbone: blackboard-bold digit 1, typeset in the bbold font (U encoding).
% It is wrapped in \text so that it scales with the surrounding math style.
\DeclareRobustCommand{\bbone}{\text{\usefont{U}{bbold}{m}{n}1}}
% Operators (unstarred: subscripts are set beside the symbol).
\DeclareMathOperator{\EX}{\mathbb{E}}% expected value
\DeclareMathOperator{\Prob}{\mathbb{P}}% probability
% Operators (starred: subscripts are set below the symbol in display style).
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% \DeclareMathOperator already applies operator font and spacing, so no
% nested \operatorname is needed. The starred form is kept so that existing
% uses of \Var with subscripts render as before.
\DeclareMathOperator*{\Var}{Var}
% Shorthands.
\newcommand{\Dcal}{\mathcal{D}}% calligraphic D
\newcommand{\ppp}{\mathrm{PPP}}% upright "PPP"

%envs

% Theorem-like environments. None of the declarations passes a shared-counter
% or parent-counter argument, so each environment is numbered independently.
% Statements of results:
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
% Definitions and hypotheses:
\newtheorem{definition}{Definition}
\newtheorem{Hypothesis}{Hypothesis}
% Proofs are declared as a numbered theorem-like environment as well, so they
% are printed with a running number ("Proof 1", "Proof 2", ...).
\newtheorem{proof}{Proof}



% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>, i.e. the two mandatory
% arguments in reverse order joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{%Bayesian PLeaSe: 
Approximately Bayes-Optimal Pseudo-Label Selection  \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:rodemann@stat.uni-muenchen.de?Subject=Your paper on Bayes-Optimal Pseudo-Label Selection}{Julian Rodemann}{}}
\author[1,2,3]{Jann Goschenhofer}
\author[1,2,4]{Emilio Dorigatti}
\author[1,2]{Thomas Nagler}
\author[1]{Thomas Augustin}

% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    Ludwig-Maximilians-Universität (LMU)\\
    Munich, Germany
}
 \affil[2]{%
     Munich Center for Machine Learning (MCML)\\
     Munich, Germany
 }
  \affil[3]{%
       Fraunhofer Institute for Integrated Circuits (IIS)\\
     Erlangen, Germany
 }
   \affil[4]{%
     Institute of Computational Biology\\
     Helmholtz-Zentrum\\
     Neuherberg, Germany
 }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
%envs


  
  \begin{document}

%\pagenumbering{gobble}
  
\onecolumn %% Single-column layout for the supplement; remove this line if two columns are desired
\maketitle

\appendix

\section{PSEUDO-CODE FOR BPLS}

We summarize the procedure of Bayesian Pseudo-Label Selection (BPLS) with approximate Pseudo Posterior Predictive (PPP) in Algorithm~\ref{alg:main}. Pseudo-code describing the proposed extensions can be found in Section~\ref{sec:app-extensions} of this supplementary material. Notation and mathematical symbols follow the main paper. Notably, the number of unobserved data $\lvert \mathcal{U} \rvert$ was denoted $m$ in the main paper.


\RestyleAlgo{ruled}

%% This is needed if you want to add comments in
%% your algorithm with \Comment
\SetKwComment{Comment}{/* }{ */}

% Algorithm: greedy pseudo-label selection.
% After an initial fit on the labeled data D, each pass of the outer loop
% scores every unlabeled instance by its approximate PPP, selects the
% maximizer i^*, moves that instance with its predicted label into D, and
% removes it from U.
\begin{algorithm}[H]
\caption{Bayesian Pseudo-Label Selection (BPLS) with approximate Pseudo Posterior Predictive (PPP)}
\label{alg:main}

\KwData{$\mathcal{D}, \mathcal{U}$}
\KwResult{$\mathcal{D}$, fitted model $\hat y^*(x)$}
\textbf{Fit} model M on labeled data $\mathcal{D}$ to obtain prediction function $\hat y(x)$ \\
\While{stopping criterion not met}{
  \For{$i \in \{1, \dots, \lvert \mathcal{U} \rvert \}$}{
    \textbf{predict} $\mathcal{Y} \ni \hat y_i = \hat y(x_i)$ \\
    % \textbf{retrain} model $\hat y_r(x)$ on data $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$ \\
    \textbf{approximate} PPP $p(\mathcal{D} \cup (x_{i}, \hat y_i) \mid \mathcal{D})$ \\
  }
  % The selection index i^* (not the loop index i, which is no longer bound
  % after the for-loop) identifies the instance that is added and removed.
  \textbf{obtain} $i^* = \argmax_i \{ p(\mathcal{D} \cup (x_{i}, \hat y_i) \mid \mathcal{D}) \}$ \\
  \textbf{add} $(x_{i^*}, \hat y_{i^*})$ to labeled data: $\mathcal{D} \leftarrow \mathcal{D} \cup (x_{i^*}, \hat y_{i^*})$ \\
  \textbf{update} $\mathcal{U} \leftarrow \mathcal{U} \setminus (x_{i^*}, \mathcal{Y})$
}
\end{algorithm}








\newpage
\section{MISSING PROOFS}

We present the proofs for Theorems 1--3 in Section~2 of the main paper. For the sake of readability, we repeat the underlying theorems as well.

\subsection{Proof of Theorem 1}


\begin{theorem}
\label{th:bayes-opt}
In the decision problem $(\mathbb{A}, \Theta, u(\cdot))$ with $\mathbb{A} = \mathcal{U}$ (Definition~1), with the pseudo-label likelihood as utility function (Definition~2), and a prior $\pi(\theta)$ on $\Theta$, the standard Bayes criterion
\begin{align*}
    \Phi(\cdot,\pi) \colon \mathcal{U} &\to \mathbb{R}\\
    a &\mapsto \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta))
\end{align*}
corresponds to the pseudo marginal likelihood $p(\mathcal{D}~\cup~(x_i, \hat{y}_i))$.
    
\end{theorem}

\begin{proof}
    The definition of the expected value for measurable $u(\cdot, \cdot)$ directly delivers $ \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) = \int u(a, \theta) d \pi(\theta) = \int p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid\theta) d \pi(\theta) = p(\mathcal{D} \cup (x_i, \hat{y}_i))$.
\end{proof}

\subsection{Proof of Theorem 2}

\begin{theorem}
In the decision problem $(\mathbb{A}, \Theta, u(\cdot))$, using the pseudo-label likelihood as utility function as in Theorem~\ref{th:bayes-opt} but with the prior updated by the posterior $\pi(\theta) = p(\theta \mid \mathcal{D})$ on $\Theta$, the standard Bayes criterion
$\Phi(\cdot, \pi) \colon \mathcal{U} \to \mathbb{R}; \, a \mapsto \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) $
corresponds to the \textit{pseudo posterior predictive} $p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$.
\end{theorem}

\begin{proof}  
    Analogous to Proof 1, we have $ \Phi(a, \pi) = \mathbb{E}_\pi(u(a,\theta)) = \int u(a, \theta) d \pi(\theta).$ Now with the updated prior $\pi(\theta) = p(\theta \mid \mathcal{D})$ it follows that $ \int u(a, \theta) d \pi(\theta)= \int p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid\theta) d p(\theta \mid \mathcal{D}) = p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$.
\end{proof}


\subsection{Proof of Theorem 3}


\begin{theorem}
In the decision problem $(\mathbb{A}, \Theta, u(\cdot))$, using the pseudo-label likelihood as utility function as in Theorem~\ref{th:bayes-opt}, the max-max criterion
\begin{align*}
    \Phi \colon \mathcal{U} &\to \mathbb{R}\\
    a &\mapsto \Phi(a) = \max_\theta (u(a,\theta))
\end{align*}
corresponds to the (full) likelihood at $\hat \theta_{ML}$.
\end{theorem}


\begin{proof} 
Recall Definition~2 of the pseudo-label likelihood as utility function: $ u \colon \mathcal{U} \times \Theta \to \mathbb{R} \; ; \; ((x_i, \mathcal{Y}), \theta) \mapsto u((x_i, \mathcal{Y}), \theta) = p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \theta).$
Thus, it holds for the max-max criterion $\Phi(a) = \max_\theta (u(a,\theta)) = \max_\theta (p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \theta)) = p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \hat \theta_{ML})$, with $\hat \theta_{ML}$ the ML-estimator. 
\end{proof}

The max-max criterion hence corresponds to direct optimization with regard to $a$ of the likelihood, evaluated at $\hat \theta_{ML}$. The respective max-max-action is thus $ a^*_{\text{max-max}} = \argmax_a \max_\theta p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \theta) = \argmax_a p(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \hat \theta_{ML})$.



% \subsection{Proof of Theorem 4}

% \textcolor{red}{TO DO!!!}

% In the following, we will write $\mathcal{L}(\theta) = \mathcal{L}_{\bm y \mid \bm x}(\theta)$ for brevity. The concept behind Laplace's method is to compute the Maximum Likelihood (ML) estimator $\hat \theta_{ML} = \argmax \mathcal {L}(\theta) = \argmax \ell(\theta)$, where $\ell(\theta) = \log \mathcal{L}(\theta) = \log p(\bm y \mid \bm x, \theta) $, and then approximate $\ell(\theta)$ (required to be differentiable) by a Taylor expansion around $\hat \theta = \hat \theta_{ML}$: $\ell(\theta) = \ell(\hat \theta) + (\theta - \hat \theta)' \ell'(\hat \theta) - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta) + \dots,$ where $\mathcal{I}$ is the observed Fisher-information matrix. Note that the second summand in the Taylor series is zero, since $\ell'(\hat \theta)$ is a stationary point per definition of $\hat \theta$. We can disregard the Taylor summands of higher orders, since the ML-estimator converges to $\theta$ in probability, $\hat \theta \xrightarrow{\mathbb{P}} \theta$. Here, we consider a non-informative prior. The fact of $\Theta$ being compact allows us to specify a uniform prior as non-informative prior. Thus, we have $p(\theta \mid  \mathcal{D}) = \mathcal{L}(\theta)$. We can hence plug the Taylor expansion into the PPP $p(\Tilde{y} \mid \Tilde{x}, \bm{y}, \bm{x})=\int_{\Theta} p(\Tilde{y} \mid \Tilde{x}, \theta) \, p(\theta \mid \bm{y}, \bm{x})\,d\theta = \int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta$ (Equation~\eqref{eq:pp}) for $\ell(\theta)$ in $\mathcal{L}(\theta) = \exp(\ell(\theta))$ twice, for the posterior as well as for the likelihood. This results in the following approximation of the PPP:


% \begin{equation}
% \begin{split}
% \label{eq:marginal-l-approx}
%       p(\hat{y} \mid x, \bm y, \bm x)  &\approx  \mathcal{L}(\hat \theta)^2 \, \int_{\Theta} \exp(2 \cdot \mathcal{T}_3(\theta, \hat \theta)) d\theta,      
% \end{split}
% \end{equation}

% % % And in complete analogy, we have for the PPP $p(\hat{y} \mid x, \bm y, \bm x) &\approx \exp(\ell(\hat \theta)) \, p(\hat \theta \mid \mathcal{D}) \, \int \exp(\frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d\theta.$ 

% % % Approximating the prior $\pi(\theta )$ in a similar manner yields

% % % \begin{equation}
% % % \label{eq:taylor-prior}
% % %     \pi(\theta ) = \pi(\hat \theta ) + (\theta - \hat \theta) \cdot \pi'(\hat \theta ),
% % % \end{equation}

% % % and analogously for the posterior $p(\theta \mid \mathcal{D}) = p(\hat \theta \mid \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})$.

% % % The fact of $\Theta$ being compact allows us to specify a uniform prior as uninformative prior. Thus, we have $\log p(\theta \mid \mathcal{D}) = \ell(\theta)$ and eventually: 
% % % Plugging Equation~\eqref{eq:taylor-likelihood} into~\ref{eq:pp} 
% % % %and exploiting the fact that the ML-estimator converges to $\theta$ in probability $\hat \theta  \xrightarrow{\mathbb{P}} \theta$ 
% % % gives the following approximation of the marginal likelihood:

% % where $\mathcal{T}_3(\theta, \hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ the third summand of the Taylor series. We can rewrite Equation~\eqref{eq:marginal-l-approx} as $p(\hat{y} \mid x, \bm y, \bm x) \approx \mathcal{L}(\hat \theta)^2 \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta$. It then becomes evident that we can apply Jensen's inequality to the integral with regard to $\theta$:

% % \begin{equation}
% %     \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta \overset{Jensen}{\geq} [ \int_\Theta \exp(\mathcal{T}_3(\theta, \hat \theta)) d \theta ]^2
% % \end{equation}

% % Now note that Jensen's inequality holds with equality when $\exp(\mathcal{T}_3(\theta, \hat \theta))$ is constant in $\theta$. This is approximately the case for $\lvert \mathcal{I}(\hat \theta) \rvert \to \infty$, i.e. assuming high curvature of the likelihood at $\hat \theta_{ML}$ (meaning data appears very informative about $\theta$ under the assumption of our model). Since we want to hedge against this case, we assume the Jensen inequality holds with equality: $ \int_\Theta [\exp(\mathcal{T}_3(\theta, \hat \theta))]^2 d \theta = [ \int_\Theta \exp(\mathcal{T}_3(\theta, \hat \theta)) d \theta ]^2$. As $\hat \theta$ is the ML-estimator, we are now in the fortunate position to make use of its famous property of asymptotic normality, obtaining $ \int \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta \approx (2 \pi)^{\frac{q}{2}} n^{- \frac{q}{2}} |\mathcal{I}(\hat \theta)|^{-\frac{1}{2}}$ with $q = dim(\Theta)$. Wrapping things up, we have $p(\hat{y} \mid x, \bm y, \bm x)  \approx  \mathcal{L}(\hat \theta)^2  (2 \pi)^q n^{-q} |\mathcal{I}(\hat \theta)|^{-1}$. Taking the natural logarithm delivers our cautious approximation of the logarithmic posterior predictive: 


% % \begin{equation}
% % \label{eq:laplaxe-approx-final-posterior}
% % \begin{split}
% % \log p(\hat{y} \mid x, \bm y, \bm x) \approx 2 \ell (\hat \theta) + q \log(\frac{2 \pi}{n}) - \log \lvert \mathcal{I(\hat \theta)} \rvert 
% % \end{split}
% % \end{equation} 

% % % \propto \ell(\hat \theta ) + \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|    


% % % %old: 
% % % By exploiting the latter and taking the natural logarithm of Equation~\eqref{eq:marginal-l-approx} we get the approximate logarithmic posterior predictive:
% % % $\log p(\hat{y} \mid x, \bm y, \bm x) \approx 2 \ell (\hat \theta) + q \log(\frac{2 \pi}{n}) - \log \lvert \mathcal{I(\hat \theta)} \rvert \propto \ell(\hat \theta ) + \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)| $ . 


% % -BEGIN- Alternative (by Thomas Nagler) without Jensen Inequality
% % (starte nach eq. 6)

% where $\mathcal{T}_3(\theta, \hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ is the third summand of the Taylor series. We can rewrite Equation~\eqref{eq:marginal-l-approx} as $p(\hat{y} \mid x, \bm y, \bm x) \approx \mathcal{L}(\hat \theta)^2 \int_\Theta \exp( - \frac{1}{2} (\theta - \hat \theta)' \,2\,n\, \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta$. Now note that we can set $\Sigma^{-1} = \,2\,n\, \mathcal{I}(\hat \theta)$ as precision matrix. With this formulation, we can identify $ \exp(- \frac{1}{2} (\theta - \hat \theta)' \Sigma^{-1}  (\theta - \hat \theta)') $ as $q$-dimensional Gaussian function \cite{gauss1877theoria}. The respective Gaussian integral is 

% \begin{equation}
%     \int_\Theta - \frac{1}{2} (\theta - \hat \theta)' \,2\,n\, \mathcal{I}(\hat \theta) (\theta - \hat \theta) d \theta = \lvert \Sigma \rvert^{\frac{1}{2}} (2 \pi)^{\frac{q}{2}}, 
% \end{equation}

% see \cite{zinn2021quantum} for a modern textbook proof. Plugging this result into Equation~\eqref{eq:marginal-l-approx}, we obtain $p(\hat{y} \mid x, \bm y, \bm x) \approx  \mathcal{L}(\hat \theta)^{2} \lvert \Sigma \rvert^{\frac{1}{2}} \, (2 \pi)^{\frac{q}{2}} = \mathcal{L}(\hat \theta)^{2} \, 2^{- \frac{q}{2}} \, n^{- \frac{q}{2}} \, \lvert\mathcal{I}(\hat \theta) \rvert^{- \frac{1}{2}} \, (2 \pi)^{\frac{q}{2}}$. Taking the  logarithm delivers our final approximation of the logarithmic posterior predictive: 


% \begin{equation}
% \label{eq:laplaxe-approx-final-posterior}
% \begin{split}
% \log p(\hat{y} \mid x, \bm y, \bm x) \approx 2 \ell (\hat \theta) + \frac{q}{2} \log\left(\frac{ \pi}{n}\right) - \frac{1}{2} \log \lvert \mathcal{I(\hat \theta)} \rvert 
% \end{split}
% \end{equation} 


% Approximation~\ref{eq:laplaxe-approx-final-posterior} provides great intuition: Its first summand tells the value of the likelihood function at its maximum; that is, how well-supported the ML-estimator is by the data. Loosely speaking, this maximum height of the likelihood can be seen as a very rough approximation of the area under it, i.e., the integral with regard to the posterior. Since the latter is based on an uninformative prior, it contains the same information as the likelihood. Thus, there is no need to consider the posterior explicitly in the approximation. The second summand in Equation~\eqref{eq:laplaxe-approx-final-posterior} corrects for the dimension of the parameter space, $dim(\Theta) = q$. The more parameters are involved, the more probability mass (area under the likelihood) is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$ with $B_{\epsilon} = \{\theta \in \Theta \mid \|\theta - \hat \theta \| < \epsilon \}$ an $\epsilon$-Ball for fixed $\epsilon > 0$ around $\hat \theta$. The logarithm of $\frac{\pi}{n}$ results from the normalizing constant. Notably, the second term does not depend on $\hat \theta$ and thus can be neglected when maximizing the PPP with regard to pseudo-labels. That is: $\log p(\hat{y} \mid x, \bm y, \bm x) \propto \ell (\hat \theta) - \frac{1}{4} \log \lvert \mathcal{I(\hat \theta)} \rvert $. This is the formulation we use in our implementation, see supplementary material and Section~\ref{sec:algo}. The third summand penalizes high curvature of the likelihood function at its peak since the Fisher-information is its second derivative. In the same manner as in dimensionality penalization, the lower the curvature, the more probability mass is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$. 
% To sum it up, the Laplace approximation of the PPP 
% %(being the integral of the pseudo-labeled instance's likelihood with regard to the posterior over $\theta$) 
% grows in the absolute value of the likelihood's peak and decreases in its curvature at this point. This is reminiscent of deliberations regarding sharp and flat minima of loss functions \cite{dinh2017sharp, li2018visualizing}. %Notably, we would end up with a similar approximation for the marginal likelihood. The main difference is that the prior requires a separate Taylor series, since it cannot be assumed to equal the likelihood like the posterior in case of uninformative prior. For details, refer to the supplementary material.

% When $n \to \infty$, Equation~\eqref{eq:laplaxe-approx-final-posterior} is dominated by the likelihood, thus
% $
%    \log p(\hat{y} \mid x, \bm y, \bm x)  \overset{n \to \infty}{\approx} 2 \, \ell(\hat \theta) \propto \ell(\hat \theta).
% $
% The respective optimal action is then simply $a^* \overset{n \to \infty}{\approx} \argmax_a \ell(\hat \theta(\mathcal{D} \cup \left(x_{i}, \hat y_i\right))).$ This approximation is computationally much cheaper to evaluate, as it does not involve the Fisher-information. However, this comes at the cost of poor accuracy in case of small $n$. Further note that with such a rough approximation, selection with regard to the PPP then corresponds to selection with regard to the likelihood.
% As pointed out in Section~\ref{sec:bayes-opt}, this corresponds to the optimistic max-max-criterion, rendering the selection with regard to the likelihood the risk-loving max-max-action. %In other words, assuming $n \to \infty$ matches max-max-decisions.


% where possible approximations of the \textit{robust} PPP are also discussed. Generally, if we allow for improper priors in $\Pi$ rendering the posterior $p^*(\hat \theta \mid \mathcal{D})$ uniform, the respective posterior predictive equals the marginal likelihood in case of independently distributed data. This makes sence: In such a limiting case, the selection most robust towards the initial fit given no other information is just random selection. 


% \begin{equation}
% \log p(x) \approx \ell(\hat \theta) + \log \pi(\hat \theta ) + \frac{p}{2} \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% \end{equation} 


% Note that by letting $n \to \infty$, the prior's influence on the marginal likelihood is dominated by the likelihood. Multiplying by $-2$ yields the Bayesian Information Criterion (BIC) \cite{schwarz1978estimating}. In the realm of model selection, the BIC is commonly used to find an optimal model $ M^* =  \argmin_M \{ -2\ell(\hat \theta) + p \log(n)\}$. Since we alter neither the number of data nor of the features when optimizing the pseudo marginal likelihood, we can ignore $p \log(n)$ and simply proceed with the likelihood:


% \subsection{Informative Prior}

% \begin{theorem} \label{thm:laplace}
%     If $\tilde \ell$ and $\pi$ are sufficiently smooth,
%     \begin{align*}
%          p(\Dcal \cup (x_i, \hat y_i) | \Dcal) 
%        &\approx\biggl(\frac{2\pi}{n}\biggr)^{q / 2} \frac{\exp[\tilde \ell(\tilde \theta) ]  \pi(\tilde \theta)}{ |  \tilde \ell''(\theta) / n |^{1/2} p(\Dcal) },
%       \end{align*}
%       where $\tilde \theta = \arg\max_{\theta} \tilde \ell(\theta)$.
% \end{theorem}

% Recall that
% \begin{align*}
%     p(\Dcal \cup (x_i, \hat y_i) | \Dcal)  &=  \int_\Theta  p(\Dcal \cup (x_i, \hat y_i) \mid \theta) p(\theta \mid \Dcal) d\theta,
% \end{align*}
% where
% \begin{align} \label{eq:integrand} 
%   p(\Dcal \cup (x_i, \hat y_i) \mid \theta) p(\theta \mid \Dcal) = \exp[\tilde \ell(\theta) \bigr) ] \pi(\theta) / p(\Dcal).
% \end{align}
% Let $I(\theta) = -\tilde \ell'(\theta)/n$ denote the observed Fisher information matrix.
% We start with a Taylor expansion around $\tilde \theta$:
% \begin{align} \label{eq:ll-taylor}
%     \tilde \ell(\theta) = \tilde \ell(\tilde \theta)  + \tilde \ell'(\tilde \theta)(\theta - \tilde \theta) - \frac{n}{2} (\theta - \tilde  \theta)' \mathcal{I}(\tilde  \theta) (\theta - \hat \theta) + O(\|\theta - \tilde \theta\|^3).
% \end{align}
% The term $\ell'(\hat \theta)$ in the second summand is zero per definition of $\hat \theta$. And because $\tilde \theta$ maximizes $\tilde \ell(\theta)$, the right hand side in \eqref{eq:ll-taylor} decays exponentially in $\|\theta - \tilde \theta\|$. This justifies approximating the integrand \eqref{eq:integrand} locally around $\tilde \theta$. 
% Similarly approximating $\pi(\theta) \approx \pi(\tilde \theta)$ then gives 
% \begin{align} \label{eq:approx-1}
%     p(\Dcal \cup (x_i, \hat y_i) | \Dcal) &\approx  \exp[\ell(\hat \theta)] \pi(\tilde \theta)  \int_{\Theta} \exp\biggl[- \frac{n}{2} (\theta - \tilde  \theta)'  \mathcal{I}(\tilde  \theta) (\theta - \hat \theta)\biggr] d \theta.
% \end{align}
% The integral on the right is a Gaussian integral. Defining $\Sigma = (n \mathcal{I}(\tilde  \theta))^{-1}$ and $\phi_\Sigma$ as the density of the $\mathcal N(0, \Sigma)$ distribution, we have 
% \begin{align*}\label{eq:approx-2}
%     \int_{\Theta} \exp\biggl[- \frac{n}{2} (\theta - \tilde  \theta)'  \mathcal{I}(\tilde  \theta) (\theta - \hat \theta)\biggr] d \theta 
%     = (2\pi)^{q/2} |\Sigma|^{1/2} \int_{\Theta} \phi_\Sigma(\theta) d \theta 
%     =\biggl(\frac{2\pi}{n}\biggr)^{q/2} |I(\theta)|^{-1/2}.
% \end{align*}
% The claim follows from combining \eqref{eq:approx-1} and \eqref{eq:approx-2}.


% \newpage
% \section{APPROXIMATION OF MARGINAL LIKELIHOOD}
% In section 3 of the paper, we motivate PPP's approximation by approximations of the marginal likelihood by Laplace's method.\footnote{See \cite{schwarz1978estimating, bishop2006pattern,konishi2008information}} Based on our deliberations therein, we can approximate the marginal likelihood as well. Essentially, this approximation is in complete analogy to the approximation of the posterior predictive with informative prior. The only difference is that we integrate with regard to the prior instead of the posterior. Our approximation will result in (see equation 5 below). 


% \begin{equation*}
% \label{eq:laplaxe-approx-final-posterior}
% \begin{split}
% \log p(\bm y \mid \bm x) \approx  \, \ell(\hat \theta) + \log \pi(\hat \theta ) +  \frac{p}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% \end{split}
% \end{equation*} 
% Note that this equals our approximation of the PPP with informative prior -- but with prior $\pi(\hat \theta )$ instead of posterior $p(\hat \theta \mid \mathcal{D})$. For the sake of completeness, we will go through the approximation in what follows. For ease of exposition, we provide an approximation of the general marginal likelihood. The application to the pseudo-label marginal likelihood, that is, the marginal likelihood of the labeled data and the pseudo-labeled data, is straightforward. As in the main paper, consider the Maximum Likelihood (ML) estimator $\hat \theta_{ML} = \argmax \mathcal {L}(\theta) = \argmax \ell(\theta)$, where $\ell(\theta) = \log \mathcal{L}(\theta) = \log p(y \mid x, \theta) $, and then approximate $\ell(\theta)$ by a Taylor expansion around $\hat \theta = \hat \theta_{ML}$: 

% \begin{equation}  
% \label{eq:taylor-likelihood}
% \ell(\theta) = \ell(\hat \theta) - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta) + \dots,
% \end{equation}

% where $\mathcal{I}$ is the Fisher-information matrix. Note that the second summand $\ell'(\hat \theta)$ in the Taylor series is zero per definition of $\hat \theta$ since it is a stationary point. As mentioned in the main paper, we could focus on the marginal likelihood contributions $p(y_i \mid x_{i})$. However, we would still have to deal with a possibly intractable integral which would result in a high computational load. Moreover, recall that considering the joint quantities instead of the single distributions implies no loss of generality, with possible extensions for dependent data in mind. We thus opt for approximating the joint directly, in complete accordance with the main paper.

% In contrast to the main paper, however, we cannot use the Taylor-expansion of the likelihood twice, since the marginal likelihood integrates with regard to the prior and not with regard to the posterior, which would equal the likelihood in case of being uniform on compact $\Theta$. 
% Hence, we have to approximate the prior with a separate Taylor series around $\hat \theta = \hat \theta_{ML}$:

% \begin{equation}
% \label{eq:taylor-prior}
%     \pi(\theta ) = \pi(\hat \theta ) + (\theta - \hat \theta) \cdot \pi'(\hat \theta ),
% \end{equation}

% Plugging equation \ref{eq:taylor-likelihood} and \ref{eq:taylor-prior} into equation 3 from the main paper (definition of marginal likelihood), and again exploiting $\int_{\Theta} (\theta - \hat \theta)) \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta)  \cdot (\theta - \hat \theta)d \theta = 0$ which follows from the symmetry of the integrand, gives the following approximation of the marginal likelihood:

% \begin{equation}
% \begin{split}
% \label{eq:marginal-l-approx}
%       p(\bm y \mid \bm x)  &\approx \; \mathcal{L}(\hat \theta) \, \pi(\hat \theta ) \, \int \exp(\mathcal{T}_3(\hat \theta)) d\theta,      
% %\end{split}
% %\end{equation}


% %where $\mathcal{T}_3(\hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ is the third summand to the Taylor series. In analogy to the PPP's approximations, we can again make use of the ML-estimator's property of asymptotic normality, obtaining
% \begin{equation}
%     \int \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta \approx (2 \pi)^{\frac{p}{2}} n^{- \frac{p}{2}} |\mathcal{I}(\hat \theta)|^{-\frac{1}{2}}. 
% \end{equation}

% By exploiting the latter and taking the natural logarithm of equation \ref{eq:marginal-l-approx}, we get the approximate logarithmic marginal likelihood:

% \begin{equation}
% \label{eq:laplaxe-approx-final-posterior}
% \begin{split}
% \log p(\bm y \mid \bm x) \approx  \, \ell(\hat \theta) + \log \pi(\hat \theta ) +  \frac{p}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% \end{split}
% \end{equation} 

% As the approximations in the main paper, this approximation provides great intuition: its first summand tells the value of the likelihood function at its maximum; that is, how well-supported the ML-estimator is by the data. The second summand does so for the prior, while the third summand corrects for the dimension of the likelihood function's domain  (i.e., the parameter space), $dim(\Theta) = q$. That is, the more parameters are involved, the more probability mass is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$ with $B_{\epsilon} = \{\theta \in \Theta \mid \|\theta - \hat \theta \| < \epsilon \}$ an $\epsilon$-ball for fixed $\epsilon > 0$ around $\hat \theta$. The natural logarithm of $2\pi$ results from the normalizing constant. Just like for the posterior predictive, the third term does not depend on $\hat \theta$ and thus can be neglected when maximizing $\log p(x)$ with regard to pseudo-labels. The last summand penalizes high curvature of the likelihood function at its peak since the Fisher-info is its second derivative. In the same manner, as in dimensionality penalization, the lower the curvature, the more probability mass is expected on $\Theta \setminus B_{\epsilon}(\hat \theta)$. 

% Conclusively, the Laplace approximation of the marginal likelihood (and thus of the pseudo-label marginal likelihood) grows in the absolute value of the likelihood's and the prior's peak and decreases in the likelihood's curvature at this point. Notably, this approximation requires stronger assumptions than PPP's approximation in the main paper: In order to get rid of the second term in the Taylor series of the prior (equation \ref{eq:taylor-prior}), we need to exploit that the ML-estimator converges to $\theta$ in probability $\hat \theta \xrightarrow{\mathbb{P}} \theta$. The latter requires assuming $n \to \infty$. Arguably, this is a very strong assumption given the usually rather small $n$ in the initially labeled data. 

% Just like the approximation of the PPP, equation \ref{eq:laplaxe-approx-final-posterior} is dominated by the likelihood $\ell(\hat \theta)$, when $n \to \infty$. This is due to the same reasoning as for the PPP's approximation and the fact that the log prior's influence (second term in equation \ref{eq:laplaxe-approx-final-posterior}) on the marginal likelihood is dominated by the likelihood.\footnote{Multiplying by $-2$ yields the Bayesian Information Criterion (BIC) \cite{schwarz1978estimating}. In the realm of Bayesian model selection, the BIC is commonly used to find an optimal model $ M^* =  \argmin_M \{ -2\ell(\hat \theta) + p \log(n)\}$.} 



% % \section{Proof of Theorem 4}


% With informative prior, we cannot simply plug in the likelihood twice, but actually have to consider the posterior. As above, denote the Maximum Likelihood (ML) estimator by $\hat \theta_{ML} = \argmax \mathcal {L}(\theta) = \argmax \ell(\theta)$, where $\ell(\theta) = \log \mathcal{L}(\theta) = \log p(y \mid x, \theta) $. We easily get the same Taylor expansion around $\hat \theta = \hat \theta_{ML}$ as above: 
% $\ell(\theta) = \ell(\hat \theta) - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta) + \dots.$ Again, the second summand $\ell'(\hat \theta)$ in the Taylor series is zero per definition of $\hat \theta$, and we can disregard terms of higher orders, as the ML-estimator converges to $\theta$ in probability, $\hat \theta  \xrightarrow{\mathbb{P}} \theta$. Contrary to the above approximation, we now have to approximate the posterior with a separate Taylor series around $\hat \theta = \hat \theta_{ML}$ as follows: $p(\theta \mid  \mathcal{D}) = p(\hat \theta \mid  \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D}) + \dots .$ Plugging these two Taylor series for $\ell(\theta)$ and $p(\theta \mid \mathcal{D})$ into the PPP gives the following approximation: $\int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta \approx \int_{\Theta} \exp(\ell(\hat \theta) + \mathcal{T}_3(\hat \theta)) \; [p(\hat \theta \mid  \mathcal{D}) + (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})] d \theta  $, where $\mathcal{T}_3(\hat \theta) = - \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)$ is again the third summand of the Taylor series for the likelihood. First, we can rewrite this approximation as $\int_{\Theta} \mathcal{L}(\theta) p(\theta \mid \mathcal{D}) d \theta \approx \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot p(\hat \theta \mid  \mathcal{D}) + \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D})] d \theta = \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot p(\hat \theta \mid  \mathcal{D}) d \theta + \mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D}) d \theta$, exploiting that $\mathcal{L}(\theta)$ does not depend on the variable of integration. 
% For the second summand it now holds that $\mathcal{L}(\hat \theta) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta) \cdot p'(\hat \theta \mid  \mathcal{D}) d \theta = \mathcal{L}(\hat \theta) p'(\hat \theta \mid  \mathcal{D}) \int_{\Theta} \exp(\mathcal{T}_3(\hat \theta)) \cdot (\theta - \hat \theta)d \theta = 0$, because $\int_{\Theta} (\theta - \hat \theta) \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta)  \cdot (\theta - \hat \theta))d \theta = 0$. This follows from the symmetry of the integrand. We thus end up with

% \begin{equation}
% \begin{split}
% \label{eq:approx-inf-prior}
%       p(\hat{y} \mid x, \bm y, \bm x) \approx \; \mathcal{L}(\hat \theta) \, p(\hat \theta \mid \mathcal{D}) \, \int \exp(\mathcal{T}_3(\hat \theta)) d\theta.     
% \end{split}
% \end{equation}


%  We can now make use of the ML-estimator's property of being asymptotically normally distributed, obtaining
% \begin{equation}
%     \int_{\Theta} \exp(- \frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d \theta \approx (2 \pi)^{\frac{q}{2}} n^{- \frac{q}{2}} |\mathcal{I}(\hat \theta)|^{-\frac{1}{2}}. 
% \end{equation}

% By exploiting the latter and taking the logarithm of Equation~\ref{eq:approx-inf-prior}, we get the approximate logarithmic PPP for an informative prior as follows:

% \begin{equation}
% \label{eq:approx-informative-final}
% \begin{split}
% \log p(\hat{y} \mid x, \bm y, \bm x)  &\approx  \, \ell(\hat \theta) + \log p(\hat \theta \mid  \mathcal{D}) \\
% &+  \frac{q}{2} \, \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|,   
% \end{split}
% \end{equation} 




% Laplace Approximation's need to    

%  Other summands grow less than $\mathcal{O}(1)$ in $n$. Thus,
% $
%    \log p(\hat{y} \mid x, \bm y, \bm x)  \overset{n \to \infty}{\approx} 2 \, \ell(\hat \theta) \propto \ell(\hat \theta).
% $
% The respective optimal action is then simply $a^* \overset{n \to \infty}{\approx} \argmax_a \ell(\hat \theta(\mathcal{D} \cup \left(x_{i}, \hat y_i\right))).$
% In case of such a rough approximation, selection with regard to the PPP then corresponds to selection with regard to the likelihood.
% As pointed out in section \ref{sec:bayes-opt}, this corresponds to the optimistic max-max-criterion, rendering the selection with regard to the likelihood the risk-loving max-max-action. In other words, assuming $n \to \infty$ matches max-max-decisions.









% \subsection{Robust PPP}


% JUST WARNING THAT THIS IS YET A PLACEHOLDER



% For a large enough set $\Pi$, we i.a. have

% $ \pi^*(\hat \theta ) \to 0$ and thus for the posterior $p^*(\hat \theta \mid \mathcal{D}) \to 0$.


% % = c \int_{\Theta} \mathcal{L}(\theta) \pi(\theta ) d \theta$, $c \in (0,1)$ a constant. This induces for the respective posterior $p

% For a large enough set $\Pi$, we i.a. get $ \pi^*(\hat \theta ) 
% = c \int_{\Theta} \mathcal{L}(\theta) \pi(\theta ) d \theta$, $c \in (0,1)$ a constant. This induces for the respective posterior $p^*(\hat \theta \mid \mathcal{D}) = c \; \mathcal{L}(\hat \theta)$. Hence, we write

% \begin{equation}
% \label{eq:laplaxe-approx-final-posterior}
% \begin{split}
% \log p^*(\hat{y} \mid x, \bm y, \bm x) \propto & \, \ell(\hat \theta) + \log(c) \, \ell(\hat \theta)  \\ &+\frac{p}{2} \log(\frac{2\pi}{n})  - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% \end{split}
% \end{equation} 

% By letting $c \to 1$, the respective posterior is $p^*(\hat \theta \mid \mathcal{D})$. By considering a large enough set $\Pi$, we get $\pi^*(\hat \theta ) \to 1$ and let the prior dominate the posterior, hence $p^*(\hat \theta \mid \mathcal{D}) \to 1$. We thus have $p^*(\hat{y} \mid x, \bm y, \bm x) \approx \exp(\ell(\hat \theta)) \, \int \exp(\frac{n}{2} (\theta - \hat \theta)' \mathcal{I}(\hat \theta) (\theta - \hat \theta)) d\theta.$
% So we approximate the logarithmic robust PPP as follows:


% $\log p(y\mid x) \approx \ell(\hat \theta) + \log \pi(\hat \theta ) + \frac{p}{2} \log(2\pi) - \frac{p}{2} \log(n) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.$


% JUST WARNING THAT THIS IS YET A PLACEHOLDER; SECTION 3.2 NOT READY!

% note: everything in beetween 0 and 1 for posterior is hurwitz







% where possible approximations of the \textit{robust} PPP are also discussed. Generally, if we allow for improper priors in $\Pi$ rendering the posterior $p^*(\hat \theta \mid \mathcal{D})$ uniform, the respective posterior predictive equals the marginal likelihood in case of independently distributed data. In such an edge case, the selection is most robust towards the initial fit given no other information rendering it a random selection.

% \begin{equation}
% \log p(x) \approx \ell(\hat \theta) + \log \pi(\hat \theta ) + \frac{p}{2} \log(\frac{2\pi}{n}) - \frac{1}{2} \log|\mathcal{I}(\hat \theta)|.   
% \end{equation} 





\newpage
\section{EXPERIMENTAL SETUP}

We describe the setup for the experiments with both the simulated and the real-world data along with additional empirical results comparing our approximate PPP with predominant PLS methods in section \ref{sec:add-res}. 

\subsection{Benchmarks}
\label{sec:exp-setup}
Throughout our experiments, we compare our proposed approximate PPP with a set of baseline and competing approaches: 


\begin{itemize}
    \item \textit{Likelihood (max-max)}: Self-training using the Likelihood max-max action as selection criterion 
    \item \textit{Predictive Variance}: Self-training using the predictive variance $\Var[\hat y] = \mathbb{E}\bigl[(\hat y - \mathbb{E}[\hat y])^2\bigr]$ of the model predictions as a selection criterion 
    \item \textit{Probability Score}: Self-training using the predicted probabilities (scores) $\mathbb{P}(y = \hat y)$ as a selection criterion 
    \item \textit{Supervised Learning}: regular supervised model fitting using the labeled training data only
\end{itemize}

All data sets reflect binary classification tasks with a fairly balanced class label distribution.
Hence, we report and compare model performance as measured by accuracy on the holdout test data sets.

\subsubsection{Generalized Linear Models}

We choose generalized linear models (GLMs) \cite{nelder1972generalized} as predictive models for BPLS with PPP as well as for all competing methods listed in section \ref{sec:exp-setup}. By considering the binomial distribution from the exponential family this yields logistic regression:

\begin{equation}
  \mathrm {P} (Y=1\mid X=x_{i})=\mathrm {P} (Y_{i}=1)={\frac {\exp(\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }})}{1+\exp(\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }})}}={\frac {1}{1+\exp(-\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }})}},
\end{equation}

with $\boldsymbol{\beta }=(\beta _{0},\beta _{1},\ldots ,\beta _{k})^{\top }$ and $\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }}=\beta _{0}+x_{i1}\beta _{1}+x_{i2}\beta _{2}+\dotsc +x_{ik}\beta _{k}$. Such a regression with additive linear predictor $\mathbf {x} _{i}^{\top }{\boldsymbol {\beta }}$ can easily be extended to target variables that follow a multinomial distribution (i.e., multi-class problems). Our setup described in section \ref{sec:exp-setup} can thus be extended in a straightforward manner to such learners for multi-class classification tasks. 




\subsubsection{Generalized Additive Models}
 
We also use non-parametric generalized additive models (GAMs) \cite{fahrmeir2013regression, hastie2017generalized} as predictive models. Here, the response variable depends on unknown smooth functions of some feature variables: 

\begin{equation}
    g(\mathbb{E}(Y))=\beta _{0}+f_{1}(x_{1})+f_{2}(x_{2})+\cdots +f_{m}(x_{m}).
\end{equation}

As above, we assume $Y$ to follow a binomial distribution in our experiments, since we only consider binary classification. Like GLMs, GAMs can be easily extended to multi-class problems.



\subsubsection{Simulation Design}

For the simulation study, we created a simulated dataset with $n$ samples for a binary classification based on a varying amount of $q$ features. This simulation follows the model equation

\begin{equation}
    y_i \sim \mathrm{Bin}(1, p_i), \;
    \text{with} \; p_i = \left(1 + \exp(- x_{i,0} + x_{i,1} + \dots + x_{i,q})\right)^{-1}
\end{equation}

where $x_i \sim \mathcal{N}(\mu, \sigma^2)$ independently with varying $\mu$ and $\sigma^2$. 


\subsubsection{Pre-Processing and Gathering of Real-world Data}

Detailed information on sources, features, and target variables of all data sets \cite{Dua:2019} that were used in the experiments can be found in section \ref{sec:data-sets}. The data sets were selected randomly after filtering according to the following criteria:
\begin{itemize}
    \item We only consider binary classification tasks, since we test the PLS methods based on semi-supervised logistic regression.
    \item We choose from datasets with a low number of missing values in order to minimize algorithm differences in missing value handling.
    \item We restrict ourselves to datasets with $q < 100$ to avoid massive overfitting and computational trouble.
\end{itemize}


In order to benchmark BPLS against classical PLS methods, we split the data sets into train and test data first, before removing labels from a pre-defined share of training data. Our detailed splitting procedure for the real-world datasets with a total size of $n$ samples each is the following:


\begin{enumerate}
    \item draw $n_{test}$ samples to create the holdout test set $D_{test}$ where the remainder constitutes the training set $D_{train}$ of size $n_{train}$ such that $n_{train} = n_{test}$  (share of test data thus $50 \%$). 
    \item draw $n_{labeled}$ samples from $D_{train}$ to create the labeled training data $D_{train}^{labeled}$ 
    \item Remove labels from remaining samples in $D_{train}$ and treat them as unlabeled data $D_{train}^{unlabeled}$
\end{enumerate}

Throughout our experiments, we repeat self-training $R$ times and use varying shares of unlabeled data $\frac{n_{unlabeled}}{n_{train}}$. 



\subsubsection{Hypotheses}

For interpretation purposes, recall our hypotheses that we specified before running the experiments: 




\begin{Hypothesis}
\label{hypo:BPLS-good}
\textbf{(a)} PPP with uninformative prior outperforms traditional PLS on data prone to initial overfitting (i.e., with high ratio of features to data $\frac{p}{n}$ and poor initial generalization).   
\textbf{(b)} For low $\frac{p}{n}$ and high initial generalization, BPLS is outperformed by traditional PLS.
\end{Hypothesis}

\begin{Hypothesis}
\label{hypo:likelihood}
    \textbf{(a)} Among all PLS methods, the pseudo-label likelihood (max-max-action) reinforces the initial model fit the most and \textbf{(b)} hardly improves generalization. 
\end{Hypothesis}

\begin{Hypothesis}
\label{hypo:informative}
    PPP with informative prior outperforms traditional PLS methods universally.
\end{Hypothesis}


\newpage

\section{FURTHER RESULTS}
\label{sec:add-res}

In this section, we present additional results. %Section~\ref{sec:res-real} presents results on all eight real-world data sets with a share of unlabeled data $\frac{n_{unlabeled}}{n_{train}} = 0.8$. 
Section \ref{sec:res-sim} has the complete results for simulated data with $q = 60$ features.\footnote{Results for $n = 100$ and $n = 400$ were already included in the paper, but are also shown here for the sake of completeness of the setup with $q = 60$. (Note that this is an exception; all other results presented herein have not been included in the paper.)} In section \ref{sec:res-sim-further}, we show additional results for smaller $q \in \{10,15,20,30\}$ with varying $n \in \{300, 400, 800, 1000\}$. 


% \subsection{Results on Real-World Data}
% \label{sec:res-real} 

% We visualize the results on the eight real-world data sets (section \ref{sec:data-sets}) from UCI machine learning repository \cite{Dua:2019} in figure \ref{fig:res-all-uci}.



% \begin{figure}[H]
%     \centering
%     \includegraphics[width=\textwidth]{Sample UAI 2023 paper/figures/res-all.png}
%     \caption{Results on Real-World Data. $R = 40$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$. Plots are ordered according to $\frac{q}{n}$ in the data sets. }
%     \label{fig:res-all-uci}
% \end{figure}


% At first sight, comparing the accuracy gains in figure \ref{fig:all-results} on different data sets (in order of ascending baseline performance) clearly supports hypothesis~\ref{hypo:BPLS-good}: For harder tasks like EEG or sonar with relatively high ratio of features to data $\frac{p}{n}$, Bayesian PPP outperforms traditional PLS, whilst being dominated by the probability score in case of easier tasks like banknote or breast cancer. For data sets with intermediate difficulty (mushrooms and ionosphere), PPP and other PLS methods compete head-to-head. The results on abalone data underpin a general fact in SSL (see section~\ref{sec:intro}): Successful self-training requires at least some baseline supervised performance.

% Results on simulated data (table~\ref{tab:table}) further support the role of $\frac{p}{n}$ in hypothesis~\ref{hypo:BPLS-good}. Their visualization (figure~\ref{fig:res-sim}) nicely illustrates the inner working of selection by PPP: By not trusting the initial model, PPP affects the model's test accuracy the most. While $n = 400$ leaves some room for improvement through curing the overfitting by pseudo-labeled data, PPP leads to a noisy performance in case of $n = 100$ close to $p$. Here, even the final model still overfits. These promising results should not hide an inconsistency: The fact that PPP is superior on the cars task but not on the ionosphere task contradicts hypothesis~\ref{hypo:BPLS-good} (a), since cars is harder than ionosphere, while having almost identical $\frac{p}{n}$. 

% We find hypothesis \ref{hypo:likelihood} to be partially supported by the results. While \ref{hypo:likelihood} (a) holds for both simulated (see supplementary material) and real-world data (likelihood generally the closest to supervised performance), \ref{hypo:likelihood} (b) is challenged by considerable generalization performance gain on ionosphere and breast cancer data. 

% Figure \ref{fig:results-inf} clearly supports hypothesis \ref{hypo:informative}: When using informative priors based on the true data-generating process, BPLS clearly outperforms traditional PLS methods. This comes at no big surprise, since non-Bayesian PLS simply lack ways to incorporate such prior knowledge. From this perspective, the uninformative case corresponds to raising the bar and clearly is the theoretically more interesting benchmarking setup. However, many practical applications of SSL entail a myriad of pre-existing knowledge, e.g., radio spectrum identification \cite{cameloetal}. For practical purposes, thus, the informative situation might even be more relevant.





% Notably, we are currently running additional experiments with lower shares of unlabeled data (not visualized here). Preliminary results suggest that this leads to an increased initial generalization performance, rendering the tasks easier. For classification tasks like Breast Cancer that have already been fairly easy with $\frac{n_{unlabeled}}{n_{train}} = 0.8$, the order of best-performing methods does not change. The differences in performances on the Banknote and EEG tasks are minor (note the scale of the Y-axis). Other results are yet to arrive.


\subsection{Results on Simulated Data with $q = 60$}
\label{sec:res-sim}

\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/res-sim-p=60.png}
    \caption{Complete Results on Simulated Data for $q = 60$. $R = 100$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$.}
    \label{fig:res-sim-q60}
\end{figure}

\subsection{Further Results on Simulated Data with $q \in \{10,15,20,30\}$}
\label{sec:res-sim-further}


\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/res_simulated_n=300.png}
    \caption{Results on Simulated Data, $n = 300$ and (from left to right) $q \in \{10,15,20,30\}$. $R = 100$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$.}
    \label{fig:res-sim-n300}
\end{figure}


\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/res-n=400.png}
    \caption{ Results on Simulated Data, $n = 400$ and (from left to right) $q \in \{10,15,20,30\}$. $R = 100$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$.}
    \label{fig:res-sim-n400}
\end{figure}


\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/res-sim-800.png}
    \caption{ Results on Simulated Data, $n = 800$ and (from left to right) $q \in \{10,15,20,30\}$. $R = 100$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$.}
    \label{fig:res-sim-n800}
\end{figure}




\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/res-sim-n=1000.png}
    \caption{ Results on Simulated Data, $n = 1000$ and (from left to right) $q \in \{10,15,20,30\}$. $R = 100$; $\frac{n_{unlabeled}}{n_{train}} = 0.8$.}
    \label{fig:my_label}
\end{figure}



\newpage


\begin{landscape}


\subsection{Informative Prior: Further Results on Simulated Data}


\begin{figure*}[h!]
\centering
\begin{minipage}[b]{0.99\linewidth}
\centering
\includegraphics[scale = 0.23]{figures/Rplot06.png}
\end{minipage}
\begin{minipage}[b]{0.99\linewidth}
\centering
\includegraphics[scale = 0.23]{figures/Rplot001(1).png}
\end{minipage}
\caption{Results from simulated data in case of informative priors with simple GLMs (logistic regression, first row) and more complex non-parametric GAMs (second row). Note that resolution allows zooming in.}
\label{fig:res:sim-inf}
\end{figure*}
% \begin{figure*}[t!]
% \centering
% \includegraphics[width=\textwidth]{figures/plot-res-inf-GLM.png}
% \caption{Results for logistic regression with informative priors.}
% \label{fig:all-results}
% \end{figure*}

\vfill


\end{landscape}


\newpage

\subsection{Summary of Results on Simulated Data}
Table \ref{tab:table} summarizes the results on simulated data in an ordinal manner. That is, it shows the best-performing method on the different setups. As in the main paper, \say{Oracle stopping} in table \ref{tab:table} refers to comparing PLS methods with regard to their overall best accuracy as opposed to \say{final} comparisons after the whole data set was labeled.

\begin{table}[H]
\caption{Best performing PLS on Simulated Data} 
\begin{center}
\begin{tabular}{c||c||ll}
\textbf{n} & \textbf{p} &\textbf{ORACLE STOPPING} & \textbf{FINAL} \\
\hline \hline
60 & 60 &  PPP         & PPP\\
100 & 60 & PPP & Supervised Learning \\
400 & 60 & PPP             & PPP \\
1000 & 60 & Probability Score & Probability Score \\
\hline
300 & 30 & Probability Score & Probability Score \\
300 & 20 & PPP & PPP \\
300 & 15 & PPP & PPP \\
300 & 10 & PPP/Probability Score & PPP/Probability Score \\
\hline
400 & 30 & Probability Score & PPP/Probability Score \\
400 & 20 & Probability Score & Probability Score \\
400 & 15 & PPP/Probability Score & Probability Score \\
400 & 10 & PPP/Probability Score & PPP/Probability Score \\
\hline
800 & 30 & PPP & PPP \\
800 & 20 & PPP/Probability Score & PPP/Probability Score \\
800 & 15 & PPP & PPP \\
800 & 10 & PPP & PPP/Probability Score \\
\hline
1000 & 30 & PPP & PPP/Probability Score  \\
1000 & 20 & PPP & PPP \\
1000 & 15 & PPP & PPP \\
1000 & 10 & Predictive Variance & Predictive Variance \\
\end{tabular}
\label{tab:table}
\end{center}
\end{table}

\newpage






\newpage

\section{EXTENSIONS}
\label{sec:app-extensions}

We provide further details on the extensions suggested in section 6 of the main paper. Besides, we briefly discuss other potential extensions.

\subsection{Extensions proposed in the Paper}

We summarize the proposed extensions' procedure from section 6 in the paper by pseudo-code as follows.  


\subsubsection{Bivariate Pseudo-Label Selection}

The idea of bivariate BPLS would be to touch the model class $M$. When comparing PPPs, one could then take into account the required model size $q$. The rough idea would be to prefer pseudo-labels that have high plausibility (high likelihood) even with simpler models (small $q$).


\RestyleAlgo{ruled}

%% This is needed if you want to add comments in
%% your algorithm with \Comment
\SetKwComment{Comment}{/* }{ */}

\begin{algorithm}[H]
\caption{Bivariate Bayesian Pseudo-Label Selection (BPLS)}

\KwData{$\mathcal{D}, \mathcal{U}$}
\KwResult{$\mathcal{D}$, fitted model $\hat y^*(x)$}
% $y \gets 1$\;
% $X \gets x$\;
% $N \gets n$\;
\textbf{Fit} model M on labeled data $\mathcal{D}$ to obtain prediction function $\hat y(x)$ \\
\While{stopping criterion not met}{
\For{$i \in \{1, \dots, \lvert \mathcal{U} \rvert \}$}{
\textbf{predict} $\mathcal{Y} \ni \hat y_i = \hat y(x_i)$ with models of varying $dim(\Theta)$ \\
% \textbf{retrain} model $\hat y_r(x)$ on data $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$ \\
\textbf{evaluate} PPP $p(\mathcal{D} \cup \left(x_{i}, \hat y_i\right) | \mathcal{D}) $ with predictions from the best performing (on training data) model and save respective $dim(\Theta)$  %of $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$, i.e. $p((\hat y_i, y_1, \dots, y_n,) \mid (x_i, x_1, \dots, x_n, \mathcal{D}))$ 
\\
}
\textbf{obtain} $i^* = \argmax_i \{f(p(\mathcal{D} \cup \left(x_{i}, \hat y_i\right) | \mathcal{D}), dim(\Theta)) \} $, with $f(\cdot, \cdot)$ some linear combination of the PPP and the model size $dim(\Theta)$ \\ 
 \textbf{retrain} M on $\mathcal{D} \cup \left(x_{i^*}, \hat y_{i^*}\right)$ \\
\textbf{predict} $\mathcal{Y} \ni \hat y_i^*(\textbf{x} \cup x_i), \textbf{x} \in \mathcal{D} $ \\
\textbf{add} $(x_{i^*}, \hat y_{i^*})$ to labeled data: $\mathcal{D} \leftarrow \mathcal{D} \cup (x_{i^*}, \hat y_{i^*}) $ \\
\textbf{update} $\mathcal{U} \leftarrow \mathcal{U} \setminus \left(x_{i^*}, \mathcal{Y}\right)_{i^*} $

}
\end{algorithm}




\subsection{Additional Extensions}



\subsubsection{Robust PPP}


We further propose a robust extension of PPP based on generalized Bayesian inference \cite{dempster1968generalization, walley1991statistical, insua2012robust, augustin2014introduction}. Recall that for the \textit{robust} PPP, now denoted as $p^*(\hat{y} \mid x, \bm y, \bm x)$, we consider the prior $\pi^*(\theta )$ among all priors from a convex set of priors $\Pi$ that has the smallest value $\pi^*(\hat \theta )$ at the ML-estimator $\hat \theta$. Recall that $\Pi \subseteq \{\pi(\theta) \mid \pi(\cdot) \, \text{a probability measure on } \left(\Theta, \sigma(\Theta) \right) \}$ with $\Theta$ compact as throughout the paper and $\sigma(\cdot)$ an appropriate $\sigma$-algebra.



More formally and encapsulating the notion of $\Gamma$-Maximin as in \cite{guo2010decision}, for instance, we have the decision problem $(\mathbb{A}, \Theta, u(\cdot))$ with $\mathbb{A} = \mathcal{U}$ (definition 1 in paper) with the pseudo-label likelihood as utility function (definition 2) and a set of priors $\Pi$ as above. Then the $\Gamma$-maximin criterion 

\begin{equation}
\Phi(\cdot,\Pi) \colon \mathcal{U} \to \mathbb{R}; \,
a \mapsto \Phi(a, \Pi) = \underline{\mathbb{E}}_\Pi(u(a,\theta))
\end{equation}

    
with $\underline{\mathbb{E}}_\Pi(u(a,\theta)) = \inf_{\pi \in \Pi} \mathbb{E}_\pi(u(a,\theta))$ corresponds to the \textit{robust pseudo posterior predictive} $p^*(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$ that results from updating the prior $\pi^*(\cdot) \in \Pi$ that has the lowest value in $\hat \theta$. Action $a_{\Gamma}^* = \argmax_i p^*(\mathcal{D} \cup (x_i, \hat{y}_i)\mid \mathcal{D})$ is $\Gamma$-maximin-optimal for prior $\pi^*(\cdot)$.




In practice, the proposed extension heavily depends on the exact nature of $\Pi$. For illustrative purposes, suppose that we can specify $\Pi$ such that the most contradicting prior is such that the resulting posterior is uniform. Effectively, we then end up with the same situation as with the marginal likelihood when the prior is uniform in case of independent observations, see the end of section 3.1 in the main paper: We randomly select pseudo-labeled instances. Quite intuitively, the selection that is most robust toward the initial fit given no other information is just such a random selection.



\subsubsection{Bayesian Pseudo-Label Selection without predictions}
\label{sec:no-preds}
The idea here would be to directly assign all possible $q$ classes in $\mathcal{Y}$ to the unlabeled data points with $q = \lvert \mathcal{Y} \rvert$. The following pseudo-code outlines the procedure. Note that the inner loop thus requires $\lvert \mathcal{U} \rvert \cdot \lvert \mathcal{Y}\rvert$ assignments and respective PPP evaluations.

\RestyleAlgo{ruled}

%% This is needed if you want to add comments in
%% your algorithm with \Comment
\SetKwComment{Comment}{/* }{ */}

\begin{algorithm}[H]
\caption{Bayesian Pseudo-Label Selection (BPLS) without predictions}

\KwData{$\mathcal{D}, \mathcal{U}$}
\KwResult{$\mathcal{D}$, fitted model $\hat y^*(x)$}
% $y \gets 1$\;
% $X \gets x$\;
% $N \gets n$\;
\textbf{Fit} model M on labeled data $\mathcal{D}$ to obtain prediction function $\hat y(x)$ \\
\While{stopping criterion not met}{
\For{$i \in \{1, \dots, \lvert \mathcal{U} \rvert \}$}{
\textbf{assign}  all possible $\hat y_i \in \mathcal{Y}$ to $\left(x_{i}, \hat y_i\right)$ \\
% \textbf{retrain} model $\hat y_r(x)$ on data $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$ \\
\textbf{evaluate} all possible PPP $p(\mathcal{D} \cup \left(x_{i}, \hat y_i\right) | \mathcal{D}) $ %of $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$, i.e. $p((\hat y_i, y_1, \dots, y_n,) \mid (x_i, x_1, \dots, x_n, \mathcal{D}))$ 
\\
}
\textbf{obtain} $i^* = \argmax_i \{p(\mathcal{D} \cup \left(x_{i}, \hat y_i\right) | \mathcal{D}) \} $ \\ 
 \textbf{retrain} M on $\mathcal{D} \cup \left(x_{i^*}, \hat y_{i^*}\right)$ \\
\textbf{predict} $\mathcal{Y} \ni \hat y_i^*(\textbf{x} \cup x_i), \textbf{x} \in \mathcal{D} $ \\
\textbf{add} $(x_{i^*}, \hat y_{i^*})$ to labeled data: $\mathcal{D} \leftarrow \mathcal{D} \cup (x_{i^*}, \hat y_{i^*}) $ \\
\textbf{update} $\mathcal{U} \leftarrow \mathcal{U} \setminus \left(x_{i^*}, \mathcal{Y}\right)_{i^*} $

}
\end{algorithm}





\subsubsection{Fantasy PPP}

In complete analogy to the proposed extension in section \ref{sec:no-preds}, we consider assignment of all possible classes instead of predictions of single classes. As opposed to selecting from all possible pseudo-labels directly, we could also combine the PPPs from pseudo-labels for each instance to a fantasy PPP by a weighted sum. See the following pseudo-code for details. The formulation allows for different ways of how to define the weighted sum $\Sigma$. Regarding one instance, we would have a PPP for each class $y \in \mathcal{Y}$. One way to define $\Sigma$ would be to consider the maximal and minimal PPP only and compute a weighted sum thereof, leaning on the Hurwicz-criterion in decision theory \cite{hurwicz1951generalized}. The weight assigned to the maximal PPP is then regarded as the decision-maker's degree of optimism.  


\begin{algorithm}[H]
\caption{Bayesian Pseudo-Label Selection (BPLS) with fantasy PPPs}

\KwData{$\mathcal{D}, \mathcal{U}$}
\KwResult{$\mathcal{D}$, fitted model $\hat y^*(x)$}
% $y \gets 1$\;
% $X \gets x$\;
% $N \gets n$\;
\textbf{Fit} model M on labeled data $\mathcal{D}$ to obtain prediction function $\hat y(x)$ \\
\While{stopping criterion not met}{
\For{$i \in \{1, \dots, \lvert \mathcal{U} \rvert \}$}{
\textbf{assign} all possible $y_i \in \mathcal{Y}$ to $\left(x_{i}, y_i\right)_i$ \\
% \textbf{retrain} model $\hat y_r(x)$ on data $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$ \\
\textbf{evaluate} weighted sum $\Sigma$ of respective PPPs $p(\mathcal{D} \cup \left(x_{i}, y_i\right) | \mathcal{D}) $ %of $\mathcal{D} \cup \left(x_{i}, \hat y_i\right)$, i.e. $p((\hat y_i, y_1, \dots, y_n,) \mid (x_i, x_1, \dots, x_n, \mathcal{D}))$ 
\\
}
\textbf{obtain} $i^* = \argmax_i \Sigma $ \\ 
 \textbf{retrain} M on $\mathcal{D} \cup \left(x_{i^*}, \hat y_{i^*}\right)$ \\
\textbf{predict} $\mathcal{Y} \ni \hat y_i^*(\textbf{x} \cup x_i), \textbf{x} \in \mathcal{D} $ \\
\textbf{add} $(x_{i^*}, \hat y_{i^*})$ to labeled data: $\mathcal{D} \leftarrow \mathcal{D} \cup (x_{i^*}, \hat y_{i^*}) $ \\
\textbf{update} $\mathcal{U} \leftarrow \mathcal{U} \setminus \left(x_{i^*}, \mathcal{Y}\right)_{i^*} $

}
\end{algorithm}






\section{NUMERICAL EXPERIMENTS VERIFYING THE SIMPLIFIED APPROXIMATION}

\subsection{Simplified Approximation}

We test the equivalence of PLS with regard to the approximate PPP criterion (Equation (6) in main paper)

\begin{align*} %\label{eq:psl-informative-0}
  \tilde \ell(\tilde \theta) - \frac 1 2 \log |\mathcal I(\tilde \theta)| + \log \pi(\tilde \theta)
\end{align*}

with $\tilde \ell(\tilde \theta) = \ell_{\Dcal\cup (x_i, \hat y_i)}(\tilde \theta)  + \ell_{\Dcal}(\tilde \theta)$, and our simplified version thereof (Equation (7) in main paper):

\begin{align*} %\label{eq:psl-informative}
   \ell_{\Dcal \cup (x_i, \hat y_i)}(\tilde \theta) - \frac 1 2 \log | \mathcal I(\tilde \theta)| + \log \pi(\tilde \theta).
\end{align*}


Recall that these terms are approximately equivalent when comparing pseudo-samples $(x_i, \hat y_i)$ and $(x_j, \hat y_j)$. We expanded $\ell_{\Dcal}$ around its maximizer $\hat \theta$, so that $\ell_{\Dcal}(\tilde \theta) =  \ell_{\Dcal}(\hat \theta) + O(\|\hat \theta - \tilde \theta\|^2)$. Since  $\Dcal \cup (x_i, \hat y_i)$ and $\Dcal$ differ in only one sample, the difference $\hat \theta - \tilde \theta$ is of order $O(n^{-1})$. Thus,
\[
  \tilde \ell(\tilde \theta) = \ell_{\Dcal\cup (x_i, \hat y_i)}(\tilde \theta) + \ell_{\Dcal}(\hat \theta) + O(n^{-2}).
\]
The remainder is negligible compared to the other terms in Equation (6) and $\ell_{\Dcal}(\hat \theta)$ does not depend on the pseudo-sample $(x_i, \hat y_i)$. This suggests the simplified \emph{informative BPLS criterion} : $   \ell_{\Dcal \cup (x_i, \hat y_i)}(\tilde \theta) - \frac 1 2 \log | \mathcal I(\tilde \theta)| + \log \pi(\tilde \theta).$

\subsection{Experimental Setup}

In addition to this theoretical argument, we provide empirical evidence for this equivalence. It is verified numerically for small $n$ by experiments on the ionosphere data \cite{ionosphere}, EEG data \cite{zhang1995event}, banknote data \cite{Dua:2019}, abalone data \cite{waugh1995extending} as well as on simulated binomially distributed data, see section \ref{sec:exp-setup}. For all data sets, we compare semi-supervised GLM performance of BPLS with simplified criterion (\say{rough PPP}, eq. 7) and unsimplified criterion (\say{fine PPP}, eq. 6) with regard to test accuracy averaged over 40 repetitions. 

\subsection{Results}

Figure~\ref{fig:eeg} shows the results for EEG data, Figure~\ref{fig:banknote} for banknote data, Figure~\ref{fig:abalone} for abalone data, while Figure~\ref{fig:simulated} displays results for the simulated binomially distributed data. In order to assess the \textit{ceteris paribus} effect of growing $n$, we take random subsamples of the ionosphere data with varying size $n \in \{220, 260, 300\}$ and the full data set with $n = 350$. Figures~\ref{fig:iono-1} through \ref{fig:iono-4} show the respective results.  

It becomes apparent that with growing $n$ the differences between the performances of the two approximations diminish. Already for small $n$, the two criteria perform comparably, in line with the theoretical argument above.

\clearpage


\begin{figure}[!h]
    \centering
    \begin{minipage}{.4\textwidth}
        \centering
        \includegraphics[scale=0.4, trim={0 0.8cm 5cm 0},clip]{figures/Rplot11.pdf}
        \caption{Approximations' performances \\ on ionosphere subsample of size $n=220$.}
        \label{fig:iono-1}
    \end{minipage}%
    \begin{minipage}{0.4\textwidth}
        \centering
        \includegraphics[scale=0.4, trim={0 0.8cm 0cm 0},clip]{figures/Rplot10.pdf}
        \caption{Approximations' performances \\ on ionosphere subsample of size $n=260$.}
        \label{fig:iono-2}
    \end{minipage}
\end{figure}

\begin{figure}[!h]
    \centering
    \begin{minipage}{.4\textwidth}
        \centering
        \includegraphics[scale=0.4, trim={0 0.8cm 5cm 0},clip]{figures/Rplot09.pdf}
        \caption{Approximations' performances \\ on ionosphere subsample of size $n=300$.}        \label{fig:iono-3}
    \end{minipage}%
    \begin{minipage}{0.4\textwidth}
        \centering
        \includegraphics[scale=0.4, trim={0 0.8cm 0cm 0},clip]{figures/Rplot12.pdf}
        
        \caption{Approximations' performances \\ on ionosphere data set of size $n=350$.}
        \label{fig:iono-4}
    \end{minipage}
\end{figure}


\begin{figure}[H]
    \centering
    \includegraphics[scale=0.7]{figures/Rplot13.pdf}
    \caption{Approximations' performances on EEG data set ($n=185$, $q = 13$).}
    \label{fig:eeg}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[scale=0.7]{figures/Rplot14.pdf}
    \caption{Approximations' performances on banknote data set ($n=200$, $q = 3$).}
    \label{fig:banknote}
\end{figure}


\begin{figure}[H]
    \centering
    \includegraphics[scale=0.7]{figures/Rplot16.pdf}
    \caption{Approximations' performances on abalone data set ($n=400$, $q = 4$).}    \label{fig:abalone}
\end{figure}


\begin{figure}[H]
    \centering
    \includegraphics[scale=0.7]{figures/Rplot15.pdf}
    \caption{Approximations' performances on simulated data set ($n=120$, $q = 4$).}    \label{fig:simulated}
\end{figure}

\newpage
\newpage
\section{MCMC BASELINE}

In order to compare our method against MCMC approximations of the pseudo posterior predictive (PPP), we compare BPLS with our approximation against selecting pseudo-samples according to an MCMC-approximation of the PPP. 

The latter is reminiscent of \cite{li2020pseudo}: They propose to use mixtures of predictive distributions of a neural net (applying MC dropout) as a selection criterion. Essentially, this approach thus considers (MC)MC-based approximations of the posterior predictive of single pseudo-labeled data. Our approach differs by considering the joint posterior predictive but is similar with regard to the (Bayesian) concept. The main difference, however, is our analytical approximation of the posterior predictive. Li et al., 2020, thus propose a valid alternative to our approximate Bayes optimal criteria iBPLS (eq. 7 in main paper) and uBPLS (eq. 8 in main paper). 

We benchmark this PLS method against our uBPLS on the smallest data set (cars) from the 8 UCI data sets in the paper as well as on a balanced subsample of another small data set (Pima diabetes data, see \cite{chang2022pima}), since the MC-based approximation is computationally very costly for high $n$. The following tables show the final (after all data were pseudo-labeled) mean accuracy on unseen test data, averaged over 100 replications.



The experiments demonstrate that our analytical approximations can compete with MC-sampling based approximations of the pseudo posterior predictive. The results can be smoothly added to the existing results, as they simply entail an additional baseline. All in all, we emphasize these additional results do not change the takeaway from the empirical evaluation of our method in the main paper. 

The fact that our Laplace-based approximation outperforms MC-based ones is slightly reminiscent of recent interesting trends in Bayesian uncertainty quantification in deep learning, where Laplace-based analytical approximations of the Hessian matrix were shown to outperform sampling-based (MC) ones, see \cite{benzing2022gradient,daxberger2021bayesian, izmailov2021dangers, wenzel2020good}.


\begin{table}[!h]
    \centering
    \begin{tabular}{l|l}
 
        \textbf{ Cars Data} & ~ \\ \hline
        PLS Method & Mean Accuracy (Final) \\ \hline
        MC-based approximation of PPP (Li et al.) & 0.658 \\ 
        uBPLS approximation of PPP (our paper) & 0.760 \\ 
        Likelihood (max-max) & 0.719 \\ 
        Predictive Variance & 0.691 \\ 
        Probability Score & 0.733 \\ 
        Supervised Learning & 0.727 \\ 
\end{tabular}
\caption{Comparison of BPLS with uBPLS approximation of PPP against MC-based approximation and other baselines on cars data.}

\end{table}

\begin{table}[!h]
    \centering

    \begin{tabular}{l|l}
     
         \textbf{Pima Data} & ~ \\ \hline
        PLS Method & Mean Accuracy (Final) \\ \hline
        MC-based approximation of PPP (Li et al.) & 0.603 \\ 
        uBPLS approximation of PPP (our paper) & 0.670 \\ 
        Likelihood (max-max) & 0.667 \\ 
        Predictive Variance & 0.663 \\ 
        Probability Score & 0.677 \\ 
        Supervised Learning & 0.675 \\ 
\end{tabular}
\caption{Comparison of BPLS with uBPLS approximation of PPP against MC-based approximation and other baselines on Pima data.}

\end{table}

\begin{table}[!h]
    \centering

    \begin{tabular}{l|l}
      
         \textbf{Cervical Cancer Data} & ~ \\ \hline
        PLS Method & Mean Accuracy (Final) \\ \hline
        MC-based approximation of PPP (Li et al.) & 0.556 \\ 
        uBPLS approximation of PPP (our paper) & 0.701 \\ 
        Likelihood (max-max) & 0.611 \\ 
        Predictive Variance & 0.644 \\ 
        Probability Score & 0.688 \\ 
        Supervised Learning & 0.611 \\ 
\end{tabular}
\caption{Comparison of BPLS with uBPLS approximation of PPP against MC-based approximation and other baselines on cervical cancer data.}

\end{table}

\begin{table}[!h]   
\centering
    \begin{tabular}{l|l}
        
         \textbf{EEG Data} & ~ \\ \hline
        PLS Method & Mean Accuracy (Final) \\ \hline
        MC-based approximation of PPP (Li et al.) & 0.549 \\ 
        uBPLS approximation of PPP (our paper) & 0.551 \\ 
        Likelihood (max-max) & 0.544 \\ 
        Predictive Variance & 0.541 \\ 
        Probability Score & 0.547 \\ 
        Supervised Learning & 0.537 \\ 
\end{tabular}
\caption{Comparison of BPLS with uBPLS approximation of PPP against MC-based approximation and other baselines on EEG data.}

\end{table}

\begin{table}[!h]   
\centering
    \begin{tabular}{l|l}
 
         \textbf{Sonar Data} & ~ \\ \hline
        PLS Method & Mean Accuracy (Final) \\ \hline
        MC-based approximation of PPP (Li et al.) & 0.521 \\ 
        uBPLS approximation of PPP (our paper) & 0.550 \\ 
        Likelihood (max-max) & 0.534 \\ 
        Predictive Variance & 0.535 \\ 
        Probability Score & 0.521 \\ 
        Supervised Learning & 0.52 \\ 
    \end{tabular}
    \caption{Comparison of BPLS with uBPLS approximation of PPP against MC-based approximation and other baselines on sonar data.}

\end{table}

\newpage
\newpage
\section{APPLICATION ON BAYESIAN NEURAL NETWORKS}

We have implemented BPLS on Bayesian neural nets (BNNs) and run experiments on simulated data to benchmark BPLS against other PLS methods (exactly the same setup as in the paper, just with BNNs used to predict pseudo-labels instead of GLMs and GAMs).

We opt for BNNs, because they come with out-of-the-box uncertainty quantification. As model architecture, we use a simple feed-forward neural network with one layer consisting of 128 hidden neurons with a tanh activation function and one output neuron with a sigmoid activation for the binary classification case. For computing iBPLS (eq. 7 in main paper) and uBPLS (eq. 8 in main paper), we simply access the log-likelihood of the trained network. As we use variational inference by posterior mean-field approximation in the BNN with a multivariate normal prior with covariance of 0 for all weights, the evaluation of the log-determinant of the Fisher-info matrix (being a diagonal matrix here) simplifies to summing up the weights’ variances. 

We present preliminary results in the following tables. They show the mean accuracies (on test data) for different PLS methods on simulated data with uninformative and informative prior and BNNs trained with 50 and 150 epochs each. For the informative setup, we simulate data from a BNN, while for the uninformative setup, we simulate from a simple binomial distribution, which makes the classification task easier (see the generally higher accuracies in the uninformative setup). The general simulation setup follows the one described in supplement C. 

The results confirm those for GLMs and GAMs reported in the main paper: In scenarios of low initial generalization (tables 2-4) inducing a high risk of overfitting, our method clearly outperforms other PLS methods, see experiments on hypothesis 1 a) in the paper. This is particularly pronounced in settings with informative priors (tables 3 and 4), see hypothesis 3 in the paper. In scenarios of high initial generalization (table 1) with low risk of overfitting, other methods have higher mean accuracies than our method, in line with hypothesis 1 b) in the paper. 

\begin{table}[H]
    \centering
    \begin{tabular}{l|l}
        \textbf{Uninformative Setup, 150 epochs} & ~ \\ \hline
        PLS Method & Mean Accuracy \\ \hline
        Likelihood (max-max) & 0.889 \\ 
        PPP (bayes-optimal) & 0.884 \\ 
        Predictive Variance & 0.884 \\ 
        Probability Score & 0.880 \\ 
        Supervised Learning & 0.890 \\ 
    \end{tabular}
    \caption{Comparison of mean accuracies of different PLS methods on simulated data with uninformative priors.}
\end{table}


\begin{table}[H]
    \centering
    \begin{tabular}{l|l}
        \textbf{Uninformative Setup, 50 epochs} & ~ \\ \hline
        PLS Method & Mean Accuracy \\ \hline
        Likelihood (max-max) & 0.562 \\ 
        PPP (bayes-optimal) & 0.677 \\ 
        Predictive Variance & 0.657 \\ 
        Probability Score & 0.557 \\ 
        Supervised Learning & 0.662 \\ 
    \end{tabular}
        \caption{Comparison of mean accuracies of different PLS methods on simulated data with uninformative priors.}
\end{table}

\begin{table}[H]
    \centering
    \begin{tabular}{l|l}
        \textbf{Informative Setup, 150 epochs} & ~ \\ \hline
        PLS Method & Mean Accuracy \\ \hline
        Likelihood (max-max) & 0.671 \\ 
        PPP (bayes-optimal) & 0.702 \\ 
        Predictive Variance & 0.695 \\ 
        Probability Score & 0.637 \\ 
        Supervised Learning & 0.583 \\ 
    \end{tabular}
        \caption{Comparison of mean accuracies of different PLS methods on simulated data with informative priors.}
\end{table}


\begin{table}[H]
   \centering
    \begin{tabular}{l|l}
       \textbf{Informative Setup, 50 epochs} & ~ \\ \hline
        PLS Method & Mean Accuracy \\ 
        \hline
        Likelihood (max-max) & 0.513 \\ 
        PPP (bayes-optimal) & 0.587 \\ 
        Predictive Variance & 0.520 \\ 
        Probability Score & 0.564 \\ 
        Supervised Learning & 0.578 \\ 
    \end{tabular}
        \caption{Comparison of mean accuracies of different PLS methods on simulated data with informative priors.}
\end{table}


\clearpage
\newpage

\section{EXPERIMENTS: STATISTICAL HYPOTHESIS TESTING}

As mentioned in section 4 of the main paper, we perform several non-parametric hypothesis tests tailored to comparing classification accuracies of different ML methods across multiple data sets, see especially
\cite{demvsar2006statistical}. All hypotheses formulated in the paper (1a), 1b), 2a), 2b), and 3)) were tested. For conducting the tests, we compare both
the final and the oracle-stopping (best among all iterations) accuracies
of all PLS methods across different classification tasks on both
simulated and real-world data. We present the results in what follows.

\textbf{Hypothesis 1} \textbf{(a)} \emph{PPP with uninformative prior
outperforms traditional PLS on data prone to initial overfitting (i.e.,
with high ratio of features to data} \(\frac{q}{n}\) \emph{and poor
initial generalization).} 
\textbf{(b)} \emph{For low} \(\frac{q}{n}\) \emph{and high initial
generalization, BPLS is outperformed by traditional PLS.} 


\textbf{1 a)}
\emph{Final:} Using the multiple comparison approaches from \cite{demvsar2006statistical}, the Friedman-test \cite{friedman1937use, friedman1940comparison} for overall
differences in final accuracies indicates a significant
(\(\alpha = 0.05\)) difference between performances of all PLS methods
(likelihood, PPP with uninformative prior, predictive variance,
probability score and supervised baseline) on tasks prone to initial
overfitting (with high ratio of features to data). A post-hoc
Nemenyi-test \cite{nemenyi1963distribution} for pairwise comparisons indicates a statistically significant (\(\alpha = 0.05\)) difference between PPP and the supervised baseline and no statistically significant
(\(\alpha = 0.05\)) difference between all other PLS methods. 

\textbf{1 b)} \emph{Final:} On tasks with low \(\frac{q}{n}\) and high initial generalization, the Friedman-test suggests no significant difference
among all the PLS methods. A post-hoc test for pairwise comparisons can
thus not be conducted. 

\textbf{1 a)} \emph{Oracle-stopping:} Again, the
Friedman-test \cite{friedman1937use, friedman1940comparison} for overall differences in oracle-stopping accuracies indicates a significant (\(\alpha = 0.05\)) difference between performances of the PLS methods on tasks prone to initial overfitting (with high ratio of features to data). This time,
however, the post-hoc Nemenyi-test \cite{nemenyi1963distribution} for pairwise comparisons indicates a statistically significant (\(\alpha = 0.05\)) pairwise difference between PPP with uninformative prior and all other methods. This confirms our heuristic reasoning in the interpretation section in section 4 in the main paper.

\textbf{1 b)} \emph{Oracle-stopping:} On tasks with a low ratio of features to data, the Friedman-test indicated a significant difference
(\(\alpha = 0.05\)) between all PLS methods. The post-hoc Nemenyi-test
for pairwise comparisons, however, does not indicate significant
differences (\(\alpha = 0.05\)) between any of the PLS methods.

\textbf{Hypothesis 2} \textbf{(a)} Among all PLS methods, the
pseudo-label likelihood (max-max-action) reinforces the initial model
fit the most and \textbf{(b)} hardly improves generalization.

For this hypothesis, we do not compare (final and oracle-stopping)
accuracies but the differences of them to the initial model's test
accuracy. Further note that for \textbf{2 b)} we do not need the
multiple comparison approaches from \cite{demvsar2006statistical}, because we only
compare the likelihood (max-max) PLS method to the supervised baseline.
A standard Wilcoxon rank sum test will do. 

\textbf{2 a)} \emph{Final:}
The Friedman-test \cite{friedman1937use, friedman1940comparison} for overall differences 
indicates a significant (\(\alpha = 0.05\)) difference between all PLS
methods' improvements compared to the supervised baseline. The post-hoc
Nemenyi-test \cite{nemenyi1963distribution} indicates a statistically significant (\(\alpha = 0.05\)) difference between the pseudo-label likelihood's (max-max-action's) improvements and the improvements of our PPP method.

\textbf{2 b)} \emph{Final:} The Wilcoxon rank sum test \cite{wilcoxon1992individual} does not reject
the (one-sided) null hypothesis that likelihood performs better than the
initial model. (Note that, in order to be able to control the error
probability when searching for evidence for our hypothesis \textbf{2 b)},
we test the complementary hypothesis as null.) As mentioned in section 4
(paragraph on interpretation) of the paper, there seems to be not enough
evidence that the likelihood method cannot improve generalization.
\textbf{Oracle-Stopping:} The test results for \textbf{Oracle-Stopping}
accuracies exactly match those for \textbf{final} accuracies regarding
Hypotheses \textbf{2 a)} and \textbf{2 b)}.

\textbf{Hypothesis 3} PPP with informative prior outperforms traditional
PLS methods universally.

\textbf{3)} \emph{Final:} The Friedman-test shows a significant
(\(\alpha = 0.05\)) difference between performances of PLS methods,
and, indeed, the post-hoc Nemenyi-test \cite{nemenyi1963distribution} for pairwise
comparisons indicates a statistically significant (\(\alpha = 0.05\))
pairwise difference between our PPP with informative prior and all other
methods. \textbf{Oracle-Stopping:} The test decisions for final and
oracle-stopping accuracy metrics do not differ.
\clearpage
\newpage
\section{EXPERIMENTS ON PLS UNDER DISTRIBUTIONAL SHIFT}



In order to check whether the robustness towards confirmation bias is also helpful in the presence of distributional shifts, see section 6 of the main paper, we have conducted some preliminary experiments: We simulated labeled and unlabeled data from two different binomial distributions with the test data from the same distribution as the unlabeled data (share of unlabeled: 0.8 and 0.9, train/test-ratio: $\frac{1}{9}$, $q = 7$, GAMs with non-parametric splines). We closely followed the experimental setup described in supplement C.1.3.

The preliminary results below support the intuition that our Bayesian approach robustifies PLS also towards distributional shift. The tables below depict mean accuracy after varying number of self-training iterations (columns of tables) of PPP with informative priors and competing PLS methods with non-parametric GAMs on simulated binomially distributed (with mean shift from labeled to unlabeled) data of varying sizes, just like in the experiments presented in figure 3 of the main paper.

At first sight, the results closely resemble the results from the experimental setup without distributional shift (see figure 3 in the paper and figure 6 in supplement D.3 as well as supplement C, in particular C.1.3.). However, there are differences: PPP only needs very few iterations ($< 20$) to outperform other PLS methods here. In the experiments without distributional shift, PPP also achieves accuracy gains over other methods after 10-40 iterations in some setups, see results for share of unlabeled = 0.9 in figure 6 in supplement D.3. The extreme speed of this process for data with a distributional shift, however, appears a bit odd. We do not have an explanation for this phenomenon yet. All in all, however, it appears as if indeed our Bayesian approach to the selection problem of pseudo-labeled data robustifies PLS not only towards initial overfitting and confirmation bias but also towards distributional shift. This of course requires more careful empirical evaluation. We leave this to future work.

\begin{landscape}

\begin{table}[!ht]
    \centering
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l}
        $n = 500$, share of unlabeled: $0.8$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 \\ \hline
        Likelihood (max-max) & 0.9020 & 0.9005 & 0.8965 & 0.8950 & 0.8940 & 0.8915 & 0.8915 & 0.8910 & 0.8840 & 0.8830 \\ \hline
        PPP (bayes-optimal) & 0.9515 & 0.9540 & 0.9530 & 0.9520 & 0.9530 & 0.9560 & 0.9520 & 0.9530 & 0.9550 & 0.9555 \\ \hline
        Predictive Variance & 0.9065 & 0.8975 & 0.8975 & 0.8990 & 0.8990 & 0.8985 & 0.8980 & 0.8985 & 0.8985 & 0.8990 \\ \hline
        Probability Score & 0.9005 & 0.8980 & 0.8950 & 0.8940 & 0.8950 & 0.8940 & 0.8940 & 0.8940 & 0.8885 & 0.8790 \\ \hline
        Supervised Learning & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 & 0.9035 \\ 
    \end{tabular}
    \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 500$.}
\end{table}

\begin{table}[!ht]
    \centering
    
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l}
    
    \footnotesize
        $n = 500$, share of unlabeled: $0.9$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 \\ \hline
        Likelihood (max-max) & 0.8600 & 0.8600 & 0.8655 & 0.8590 & 0.8530 & 0.8520 & 0.8505 & 0.8500 & 0.8515 & 0.8525 \\ \hline
        PPP (bayes-optimal) & 0.9100 & 0.9160 & 0.9150 & 0.9180 & 0.9110 & 0.9145 & 0.9120 & 0.9105 & 0.9105 & 0.9110 \\ \hline
        Predictive Variance & 0.8655 & 0.8540 & 0.8550 & 0.8580 & 0.8565 & 0.8555 & 0.8550 & 0.8555 & 0.8555 & 0.8565 \\ \hline
        Probability Score & 0.8605 & 0.8600 & 0.8585 & 0.8580 & 0.8530 & 0.8540 & 0.8535 & 0.8505 & 0.8540 & 0.8625 \\ \hline
        Supervised Learning & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 & 0.8585 \\ 
    \end{tabular}
        \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 500$.}
\end{table}



\begin{table}[!ht]
    \centering
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l|l|l|l|l}
    
        $n = 1000$, share of unlabeled: $0.8$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 & 220 & 240 & 260 & 280 \\ \hline
        Likelihood (max-max) & 0.9128 & 0.9128 & 0.9128 & 0.9124 & 0.9120 & 0.9112 & 0.9112 & 0.9104 & 0.9104 & 0.9096 & 0.9084 & 0.9084 & 0.9068 & 0.9072 \\ \hline
        PPP (bayes-optimal) & 0.9752 & 0.9752 & 0.9752 & 0.9724 & 0.9744 & 0.9756 & 0.9756 & 0.9760 & 0.9760 & 0.9760 & 0.9764 & 0.9764 & 0.9760 & 0.9760 \\ \hline
        Predictive Variance & 0.9208 & 0.9188 & 0.9228 & 0.9204 & 0.9208 & 0.9200 & 0.9212 & 0.9200 & 0.9208 & 0.9208 & 0.9212 & 0.9212 & 0.9212 & 0.9212 \\ \hline
        Probability Score & 0.9128 & 0.9128 & 0.9128 & 0.9120 & 0.9120 & 0.9112 & 0.9112 & 0.9104 & 0.9104 & 0.9096 & 0.9084 & 0.9080 & 0.9068 & 0.9064 \\ \hline
        Supervised Learning & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 & 0.9128 \\ 
    \end{tabular}    
    \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 1000$.}
\end{table}

\clearpage
\newpage

\begin{tiny}
    
\begin{table}[!ht]
    \centering
    \resizebox{\columnwidth}{!}{%
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l|l}
    
        $n = 2000$, share of unlabeled: $0.8$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 & 220 \\ \hline
        Likelihood (max-max) & 0.9169444 & 0.9158333 & 0.9147222 & 0.9122222 & 0.9080556 & 0.9063889 & 0.9108333 & 0.9108333 & 0.9169444 & 0.9158333 & 0.9147222 \\ \hline
        PPP (bayes-optimal) & 0.9655556 & 0.9688889 & 0.9691667 & 0.9691667 & 0.9688889 & 0.9688889 & 0.9675000 & 0.9686111 & 0.9655556 & 0.9688889 & 0.9691667 \\ \hline
        Predictive Variance & 0.9363889 & 0.9200000 & 0.9155556 & 0.9136111 & 0.9191667 & 0.9188889 & 0.9105556 & 0.9111111 & 0.9363889 & 0.9200000 & 0.9155556 \\ \hline
        Probability Score & 0.9163889 & 0.9152778 & 0.9152778 & 0.9125000 & 0.9080556 & 0.9036111 & 0.9072222 & 0.9063889 & 0.9163889 & 0.9152778 & 0.9152778 \\ \hline
        Supervised Learning & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 & 0.9111111 \\ 
    \end{tabular}
    }
        \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 2000$.}
\end{table}


\begin{table}[!ht]
    \centering
    \resizebox{\columnwidth}{!}{%
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l|l}
    
        $n = 4000$, share of unlabeled: $0.8$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 & 220 \\ \hline
        Likelihood (max-max) & 0.9551852 & 0.9550926 & 0.9550926 & 0.9546296 & 0.9540741 & 0.9556481 & 0.9545370 & 0.9540741 & 0.9531481 & 0.9523148 & 0.9518519 \\ \hline
        PPP (bayes-optimal) & 0.9665741 & 0.9676852 & 0.9673148 & 0.9679630 & 0.9665741 & 0.9657407 & 0.9656481 & 0.9654630 & 0.9661111 & 0.9673148 & 0.9673148 \\ \hline
        Predictive Variance & 0.9623148 & 0.9680556 & 0.9711111 & 0.9740741 & 0.9749074 & 0.9751852 & 0.9722222 & 0.9725926 & 0.9725000 & 0.9722222 & 0.9723148 \\ \hline
        Probability Score & 0.9550926 & 0.9550000 & 0.9550000 & 0.9546296 & 0.9541667 & 0.9535185 & 0.9524074 & 0.9518519 & 0.9508333 & 0.9501852 & 0.9497222 \\ \hline
        Supervised Learning & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 & 0.9552778 \\ 
    \end{tabular}
    }
            \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 4000$.}
\end{table}


\begin{table}[!ht]
    \centering
    \resizebox{\columnwidth}{!}{%
    \begin{tabular}{l|l|l|l|l|l|l|l|l|l|l|l}
    
        $n = 8000$, share of unlabeled: $0.8$ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ & ~ \\ \hline
        Method & 20 & 40 & 60 & 80 & 100 & 120 & 140 & 160 & 180 & 200 & 220 \\ \hline
        Likelihood (max-max) & 0.9132237 & 0.9125000 & 0.9123026 & 0.9116447 & 0.9112500 & 0.9107895 & 0.9097368 & 0.9111184 & 0.9136184 & 0.9138816 & 0.9146053 \\ \hline
        PPP (bayes-optimal) & 0.9673684 & 0.9676974 & 0.9586842 & 0.9586184 & 0.9536184 & 0.9536184 & 0.9551974 & 0.9581579 & 0.9584211 & 0.9584868 & 0.9596711 \\ \hline
        Predictive Variance & 0.9351316 & 0.9363816 & 0.9397368 & 0.9406579 & 0.9403289 & 0.9409868 & 0.9410526 & 0.9410526 & 0.9410526 & 0.9411842 & 0.9412500 \\ \hline
        Probability Score & 0.9133553 & 0.9124342 & 0.9123684 & 0.9116447 & 0.9112500 & 0.9107237 & 0.9098684 & 0.9112500 & 0.9134868 & 0.9138816 & 0.9150000 \\ \hline
        Supervised Learning & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 & 0.9132895 \\ 
    \end{tabular}
    }
            \caption{Mean Accuracies after iterations $\{20,40,60, \dots\}$ from experiments on simulated binomially distributed data with distribution shift. $n = 8000$.}
\end{table}


\end{tiny}

\end{landscape}





\clearpage
\newpage
\section{REPRODUCIBILITY AND OPEN SCIENCE}

The implementation of the proposed methods as well as
reproducible scripts for the experiments are provided in the following repository named \textbf{Bayesian-pls} (\say{\textit{Bayesian, please!}}): \url{https://github.com/rodemann/Bayesian-pls}. Please follow the instructions in the README file to reproduce the experiments. %After the reviewing process, the whole (non-anonymous) repository will be made public.



\section{DATA SETS}
\label{sec:data-sets}

The following tables provide details on data sources as well as features and target variables of the eight real-world datasets from the UCI machine learning repository \cite{Dua:2019}.

\begin{table}[H] \centering \renewcommand*{\arraystretch}{1.1}\caption{Breast Cancer Data, Details: \cite{street1993nuclear}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
target & factor & '0' '1' \\ 
radius\_mean & numeric & Num: 6.981 to 28.11 \\ 
texture\_mean & numeric & Num: 9.71 to 33.81 \\ 
perimeter\_mean & numeric & Num: 43.79 to 188.5 \\ 
area\_mean & numeric & Num: 143.5 to 2501 \\ 
smoothness\_mean & numeric & Num: 0.053 to 0.145 \\ 
compactness\_mean & numeric & Num: 0.019 to 0.311 \\ 
concavity\_mean & numeric & Num: 0 to 0.427 \\ 
concave\_points\_mean & numeric & Num: 0 to 0.201 \\ 
symmetry\_mean & numeric & Num: 0.117 to 0.304 \\ 
fractal\_dimension\_mean & numeric & Num: 0.05 to 0.097 \\ 
radius\_se & numeric & Num: 0.112 to 2.873 \\ 
texture\_se & numeric & Num: 0.36 to 4.885 \\ 
perimeter\_se & numeric & Num: 0.757 to 21.98 \\ 
area\_se & numeric & Num: 6.802 to 542.2 \\ 
smoothness\_se & numeric & Num: 0.002 to 0.031 \\ 
compactness\_se & numeric & Num: 0.002 to 0.106 \\ 
concavity\_se & numeric & Num: 0 to 0.396 \\ 
concave\_points\_se & numeric & Num: 0 to 0.053 \\ 
symmetry\_se & numeric & Num: 0.008 to 0.061 \\ 
fractal\_dimension\_se & numeric & Num: 0.001 to 0.03 \\ 
radius\_worst & numeric & Num: 7.93 to 36.04 \\ 
texture\_worst & numeric & Num: 12.02 to 49.54 \\ 
perimeter\_worst & numeric & Num: 50.41 to 251.2 \\ 
area\_worst & numeric & Num: 185.2 to 4254 \\ 
smoothness\_worst & numeric & Num: 0.071 to 0.223 \\ 
compactness\_worst & numeric & Num: 0.027 to 1.058 \\ 
concavity\_worst & numeric & Num: 0 to 1.252 \\ 
concave\_points\_worst & numeric & Num: 0 to 0.287 \\ 
symmetry\_worst & numeric & Num: 0.156 to 0.664 \\ 
fractal\_dimension\_worst & numeric & Num: 0.055 to 0.208\\ 
\hline
\hline
\end{tabular}
\end{table}

\begin{table}[H] \centering 
\tiny \renewcommand*{\arraystretch}{1.1}\caption{Sonar Data Set, Details: \cite{gorman1988analysis}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
V1 & numeric & Num: 0.002 to 0.137 \\ 
V2 & numeric & Num: 0.001 to 0.234 \\ 
V3 & numeric & Num: 0.002 to 0.306 \\ 
V4 & numeric & Num: 0.006 to 0.426 \\ 
V5 & numeric & Num: 0.007 to 0.401 \\ 
V6 & numeric & Num: 0.01 to 0.382 \\ 
V7 & numeric & Num: 0.003 to 0.373 \\ 
V8 & numeric & Num: 0.005 to 0.459 \\ 
V9 & numeric & Num: 0.007 to 0.683 \\ 
V10 & numeric & Num: 0.011 to 0.711 \\ 
V11 & numeric & Num: 0.029 to 0.734 \\ 
V12 & numeric & Num: 0.024 to 0.706 \\ 
V13 & numeric & Num: 0.018 to 0.713 \\ 
V14 & numeric & Num: 0.027 to 0.997 \\ 
V15 & numeric & Num: 0.003 to 1 \\ 
V16 & numeric & Num: 0.016 to 0.999 \\ 
V17 & numeric & Num: 0.035 to 1 \\ 
V18 & numeric & Num: 0.038 to 1 \\ 
V19 & numeric & Num: 0.049 to 1 \\ 
V20 & numeric & Num: 0.066 to 1 \\ 
V21 & numeric & Num: 0.051 to 1 \\ 
V22 & numeric & Num: 0.022 to 1 \\ 
V23 & numeric & Num: 0.056 to 1 \\ 
V24 & numeric & Num: 0.024 to 1 \\ 
V25 & numeric & Num: 0.024 to 1 \\ 
V26 & numeric & Num: 0.092 to 1 \\ 
V27 & numeric & Num: 0.048 to 1 \\ 
V28 & numeric & Num: 0.028 to 1 \\ 
V29 & numeric & Num: 0.014 to 1 \\ 
V30 & numeric & Num: 0.061 to 1 \\ 
V31 & numeric & Num: 0.048 to 0.966 \\ 
V32 & numeric & Num: 0.04 to 0.931 \\ 
V33 & numeric & Num: 0.048 to 1 \\ 
V34 & numeric & Num: 0.021 to 0.965 \\ 
V35 & numeric & Num: 0.022 to 1 \\ 
V36 & numeric & Num: 0.008 to 1 \\ 
V37 & numeric & Num: 0.035 to 0.95 \\ 
V38 & numeric & Num: 0.038 to 1 \\ 
V39 & numeric & Num: 0.037 to 0.986 \\ 
V40 & numeric & Num: 0.012 to 0.93 \\ 
V41 & numeric & Num: 0.036 to 0.899 \\ 
V42 & numeric & Num: 0.006 to 0.825 \\ 
V43 & numeric & Num: 0 to 0.773 \\ 
V44 & numeric & Num: 0 to 0.776 \\ 
V45 & numeric & Num: 0 to 0.703 \\ 
V46 & numeric & Num: 0 to 0.729 \\ 
V47 & numeric & Num: 0 to 0.552 \\ 
V48 & numeric & Num: 0 to 0.334 \\ 
V49 & numeric & Num: 0 to 0.198 \\ 
V50 & numeric & Num: 0 to 0.082 \\ 
V51 & numeric & Num: 0 to 0.1 \\ 
V52 & numeric & Num: 0.001 to 0.071 \\ 
V53 & numeric & Num: 0 to 0.039 \\ 
V54 & numeric & Num: 0.001 to 0.035 \\ 
V55 & numeric & Num: 0.001 to 0.045 \\ 
V56 & numeric & Num: 0 to 0.039 \\ 
V57 & numeric & Num: 0 to 0.035 \\ 
V58 & numeric & Num: 0 to 0.044 \\ 
V59 & numeric & Num: 0 to 0.036 \\ 
V60 & numeric & Num: 0.001 to 0.044 \\ 
V61 & matrix & Num: 1 to 2\\ 
\hline
\hline
\end{tabular}
\end{table}

\begin{table}[!htbp] \centering \renewcommand*{\arraystretch}{1.1}\caption{Mushrooms Data Set, Details: \cite{schlimmer1987concept}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
cap.diameter & numeric & Num: 0.71 to 54.6 \\ 
stem.height & numeric & Num: 0 to 28.33 \\ 
stem.width & numeric & Num: 0 to 52.22 \\ 
target & factor & `0' `1'\\ 
\hline
\hline
\end{tabular}
\end{table}

\begin{table}[!htbp] \centering \renewcommand*{\arraystretch}{1.1}\caption{Banknote Data Set, Details: \href{https://archive.ics.uci.edu/ml/datasets/banknote+authentication}{\nolinkurl{archive.ics.uci.edu/ml/datasets/banknote+authentication}}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
target & factor & `0' `1' \\ 
Length & numeric & Num: 213.8 to 216.3 \\ 
Left & numeric & Num: 129 to 131 \\ 
Right & numeric & Num: 129 to 131.1 \\ 
Bottom & numeric & Num: 7.2 to 12.7 \\ 
Top & numeric & Num: 7.7 to 12.3 \\ 
Diagonal & numeric & Num: 137.8 to 142.4\\ 
\hline
\hline
\end{tabular}
\end{table}

\begin{table}[!htbp] \centering \renewcommand*{\arraystretch}{1.1}\caption{Abalone Data Set, Details: \cite{waugh1995extending}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
target & factor & `0' `1' \\ 
rings & numeric & Num: 4 to 29 \\ 
length & numeric & Num: 0.165 to 0.775 \\ 
weight & numeric & Num: 0.024 to 2.493 \\ 
height & numeric & Num: 0.04 to 0.24 \\ 
diameter & numeric & Num: 0.125 to 0.605 \\ 
shell\_weight & numeric & Num: 0.008 to 0.885\\ 
\hline
\hline
\end{tabular}
\end{table}


\begin{table}[!htbp] \centering \renewcommand*{\arraystretch}{1.1}\caption{Cars Data Set, Details: \cite{ezekiel1930methods}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
wt & numeric & Num: 1.513 to 5.424 \\ 
qsec & numeric & Num: 14.5 to 22.9 \\ 
vs & factor & `0' `1' \\ 
\hline
\hline
\end{tabular}
\end{table}


\begin{table}[!htbp] \centering \renewcommand*{\arraystretch}{1.1}\caption{EEG Data Set, Details: \cite{zhang1995event}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
V1 & numeric & Num: -2.035 to 1 \\ 
V2 & numeric & Num: -1.005 to 1 \\ 
V3 & numeric & Num: -0.912 to 1 \\ 
V4 & numeric & Num: -1.107 to 1 \\ 
V5 & numeric & Num: -1.078 to 1 \\ 
V6 & numeric & Num: -1.073 to 1 \\ 
V7 & numeric & Num: -1.651 to 1 \\ 
V8 & numeric & Num: -1.024 to 1 \\ 
V9 & numeric & Num: -1.864 to 1 \\ 
V10 & numeric & Num: -1.604 to 1 \\ 
V11 & numeric & Num: -0.883 to 1 \\ 
V12 & numeric & Num: -1.087 to 1 \\ 
target & factor & `0' `1'\\ 
\hline
\hline
\end{tabular}
\end{table}

\vfill

\begin{table}[H] \centering \renewcommand*{\arraystretch}{1.1}\caption{Ionosphere Data Set, Details: \cite{sigillito1989classification}}
\begin{tabular}{p{0.290909090909091\textwidth}p{0.145454545454545\textwidth}p{0.363636363636364\textwidth}}
\hline
\hline
Name & Class & Values \\ 
\hline
V1 & integer & Num: 0 to 1 \\ 
V3 & numeric & Num: -1 to 1 \\ 
V4 & numeric & Num: -1 to 1 \\ 
V5 & numeric & Num: -1 to 1 \\ 
V6 & numeric & Num: -1 to 1 \\ 
V7 & numeric & Num: -1 to 1 \\ 
V8 & numeric & Num: -1 to 1 \\ 
V9 & numeric & Num: -1 to 1 \\ 
V10 & numeric & Num: -1 to 1 \\ 
V11 & numeric & Num: -1 to 1 \\ 
V12 & numeric & Num: -1 to 1 \\ 
V13 & numeric & Num: -1 to 1 \\ 
V14 & numeric & Num: -1 to 1 \\ 
V15 & numeric & Num: -1 to 1 \\ 
V16 & numeric & Num: -1 to 1 \\ 
V17 & numeric & Num: -1 to 1 \\ 
V18 & numeric & Num: -1 to 1 \\ 
V19 & numeric & Num: -1 to 1 \\ 
V20 & numeric & Num: -1 to 1 \\ 
V21 & numeric & Num: -1 to 1 \\ 
V22 & numeric & Num: -1 to 1 \\ 
V23 & numeric & Num: -1 to 1 \\ 
V24 & numeric & Num: -1 to 1 \\ 
V25 & numeric & Num: -1 to 1 \\ 
V26 & numeric & Num: -1 to 1 \\ 
V27 & numeric & Num: -1 to 1 \\ 
V28 & numeric & Num: -1 to 1 \\ 
V29 & numeric & Num: -1 to 1 \\ 
V30 & numeric & Num: -1 to 1 \\ 
V31 & numeric & Num: -1 to 1 \\ 
V32 & numeric & Num: -1 to 1 \\ 
V33 & numeric & Num: -1 to 1 \\ 
V34 & numeric & Num: -1 to 1 \\ 
target & factor & `0' `1'\\ 
\hline
\hline
\end{tabular}
\end{table}


\vfill


\newpage

\vfill

\clearpage



\section{REFERENCES FOR THE SUPPLEMENTARY MATERIAL}
\bibliographystyle{apalike}
\bibliography{rodemann_356/rodemann_356}





\end{document}


