\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsfonts}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage[ruled,vlined]{algorithm2e}
\usepackage[thinc]{esdiff}
\usepackage{xspace}
\usepackage{adjustbox}
\usepackage{mathtools}

\newcommand{\xhdr}[1]{\vspace{1.3mm}\noindent{{\bf #1.}}}
\newcommand{\mname}{\textsc{Pseud$\sigma$}\xspace}

\newcommand{\kexin}[1]{{\color{green}[[K: #1]]}}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
\newcommand{\revision}[1]{{\color{black} {#1}}}
\newcommand{\revisionVS}[1]{{\color{black} {#1}}}
\title{Uncertainty-Aware Pseudo-labeling for Quantum Calculations\\ (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Kexin Huang}
\author[1]{Vishnu Sresht}
\author[1]{Brajesh Rai}
\author[1]{Mykola Bordyuh}
\affil[1]{%
     Machine Learning and Computational Sciences\\
    Pfizer Inc.\\
    Cambridge, Massachusetts, USA
}
\affil[2]{%
    Department of Computer Science\\
    Stanford University\\
    Stanford, California, USA
}
  
\begin{document}
\maketitle

\section{Entropy of Student's t-distribution}\label{sec:appedixA}
While the entropy of the Student's t-distribution is well known, we derive it for completeness. Student's probability distribution defined in terms of location $\gamma$, scale factor $\sigma_{st}^2$ and $\nu_{st}$ degrees of freedom is 
\begin{equation}\label{student}
\begin{aligned}
     & p(y; \gamma, \sigma_{st}^2,  \nu_{st})  = \mathrm{St}(y; \gamma, \sigma_{st}^2,  \nu_{st}) \\ 
    & = \frac{\Gamma(\frac{\nu_{st}+1}{2})} {\sqrt{\nu_{st}\, \pi \sigma_{st}^2}\,\Gamma(\frac{\nu_{st}}{2})} \left(1+\frac{1}{\nu_{st}}\frac{(y - \gamma)^2}{\sigma_{st}^2} \right)^{\!-\frac{\nu_{st}+1}{2}},
\end{aligned}
\end{equation}
where $\Gamma$ is the gamma function. Student's t-distribution can be written in terms of the beta function $\mathrm{B}(a, b) = \frac{\Gamma(a) \Gamma(b)}{\Gamma(a + b)}$ if we take advantage of the fact that $\Gamma(\frac{1}{2}) = \sqrt{\pi}$:
\begin{equation}
\begin{aligned}
    & \mathrm{p}(y; \gamma, \sigma_{st}^2, \nu_{st})  = \mathrm{St}(y; \gamma, \sigma_{st}^2,  \nu_{st}) \\ & =\frac{1}{\sqrt{\nu_{st}\, \sigma_{st}^2}\,\mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})} \left(1+\frac{1}{\nu_{st}}\frac{(y - \gamma)^2}{\sigma_{st}^2} \right)^{\!-\frac{\nu_{st}+1}{2}}\, .
\end{aligned}
\end{equation}
In the main text of the manuscript, we used an empirical estimate of Student's t-distribution, which corresponds to evaluating the density at its mode $y = \gamma$. The empirical estimate of the probability becomes:
\begin{equation}\label{eq:empirical_estimate}
\begin{aligned}
    \mathrm{p}^{\mathrm{emp}}(y=\gamma; \sigma_{st}^2, \nu_{st}) & \approx \mathrm{St}(y=\gamma; \sigma_{st}^2,  \nu_{st}) \\
    & =\frac{1}{\sqrt{\nu_{st}\, \sigma_{st}^2}\,\mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})}. 
\end{aligned}
\end{equation}
If we introduce a new variable  $t = \frac{y - \gamma}{\sigma_{st}}$, Student's t-distribution converts into the standard form with probability density 
\begin{equation}
    \mathrm{p}(t; \nu_{st}) = \mathrm{St}(t; \nu_{st}) =\frac{1}{\sqrt{\nu_{st}}\,\mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})} \left(1+\frac{t^2}{\nu_{st}} \right)^{\!-\frac{\nu_{st}+1}{2}} \, .
\end{equation}




\subsection{Proposition}
\textbf{Proposition}: The entropies of the generalized and standard Student's t-distributions are related via the formula
\begin{equation}\label{eq: entropytransform}
    \mathcal{H}(y; \sigma_{st}^2, \nu_{st}) = \mathcal{H}(t; \nu_{st}) + \frac{1}{2}\mathrm{log} \, \sigma_{st}^2 \,.
\end{equation}
\textbf{Proof:} The transformation $t = g(y) =  \frac{y - \gamma}{\sigma_{st}}$ is bijective, with the inverse transformation $y = g^{-1}(t) = \sigma_{st}\, t + \gamma$. The Jacobian of the transformation $g$ is $\frac{\mathrm{d}}{\mathrm{d} y} \, g(y) = \frac{1}{\sigma_{st}}$.
According to the change-of-variables formula for probability densities,
\begin{align}
&\mathrm{p}_{y}\,(y; \sigma_{st}^2, \nu_{st})  =\mathrm{p}_{t}\,(g(y); \nu_{st}) \, \left\lvert \frac{\mathrm{d}}{\mathrm{d} y} \, g(y) \right\rvert.
\end{align}
The entropy relation in Eq.~\eqref{eq: entropytransform} then follows directly from the definition of the entropy, because $-\log \mathrm{p}_{y} = -\log \mathrm{p}_{t} + \frac{1}{2} \log \sigma_{st}^2$.

To find the generalized entropy, we just need to calculate the entropy of the standard Student's t-distribution
\begin{equation}
    \begin{aligned}
         \mathcal{H}(t; \nu_{st})  = & -\int_{-\infty}^{+\infty} \, \mathrm{p}(t; \nu_{st}) \log \mathrm{p}(t; \nu_{st})~ \mathrm{d}t  \\ =
 &\log \left(\sqrt{\nu_{st}} \, \mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})\right)  \int_{-\infty}^{+\infty} \mathrm{p}(t; \nu_{st}) ~\mathrm{d}t \;  \\ +
& \frac{\nu_{st}+1}{2}\int_{-\infty}^{+\infty} \mathrm{log} (1 + \frac{t^2}{\nu_{st}})\, \mathrm{p}(t; \nu_{st}) ~ \mathrm{d}t 
\\ = & \log \left(\sqrt{\nu_{st}} \, \mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})\right) \\ + 
 &\frac{\nu_{st}+1}{2}\int_{-\infty}^{+\infty} \,\mathrm{log} (1 + \frac{t^2}{\nu_{st}}) \, \mathrm{p}(t; \nu_{st}) ~ \mathrm{d}t \, (*).
    \end{aligned}
\end{equation}
To evaluate the second integral, denoted $(*)$, we use the fact that the integrand is even in $t$, make the substitution $x = \frac{t^2}{\nu_{st}}$ and obtain
\begin{equation}
\begin{aligned}
    (*) & = \frac{\nu_{st}+1}{\revision{2}\, \mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2})}\int_{0}^{+\infty} \, \mathrm{log}(1 + x) \, (1 + x)^{\!-\frac{\nu_{st}+1}{2}} \frac{\mathrm{d}x}{\sqrt{x}}  
    \\ & = -\frac{\nu_{st}+1}{\mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2})} \, \diffp{}{{\nu_{st}}}\,\int_{0}^{+\infty} \, (1 + x)^{\!-\frac{\nu_{st}+1}{2}} \,\frac{\mathrm{d}x}{\sqrt{x}} 
    \\ & = -\frac{\nu_{st}+1}{\mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2})} \, \diffp{}{{\nu_{st}}}\,\int_{0}^{1} \, x^{\!\frac{\nu_{st}}{2}-1}\, (1-x)^{\frac{1}{2}-1} \,\mathrm{d}x 
    \\ & = -\frac{\nu_{st}+1}{\mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2})} \, \diffp{}{{\nu_{st}}}\,\mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2}) 
    \\ & = -(\nu_{st}+1) \, \diffp{}{{\nu_{st}}}\,\mathrm{log} \, \mathrm{B}(\frac{1}{2}, \frac{\nu_{st}}{2}) 
    \\ & = -({\nu_{st}+1})\, \diffp{}{{\nu_{st}}}\,\left(\mathrm{log} \, \Gamma(\frac{\nu_{st}}{2}) - \mathrm{log}\,\Gamma(\frac{\nu_{st}+1}{2})\right) 
    \\ & = 
    \frac{\nu_{st}+1}{2} \, \left( \Psi(\frac{\nu_{st}+1}{2}) -  \, \Psi(\frac{\nu_{st}}{2}) \right), 
\end{aligned}
\end{equation}
where the digamma function is defined as $\Psi(x) = \frac{\Gamma'(x)}{\Gamma(x)}$. Putting all the terms together, the entropy of the standard Student's t-distribution becomes
\begin{equation}
\begin{aligned}
     \mathcal{H}(t; \nu_{st}) & = \frac{\nu_{st}+1}{2} \, \left( \Psi(\frac{\nu_{st}+1}{2}) -  \, \Psi(\frac{\nu_{st}}{2}) \right) \\ & + \log \left(\, \sqrt{\nu_{st}} \,\mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})\right)\, .
\end{aligned}
\end{equation}

The final formula for the entropy of the labels $y$ is given by
\begin{equation}
\begin{aligned}
    \mathcal{H}(y; \sigma_{st}^2, \nu_{st}) & = \frac{\nu_{st}+1}{2} \, \left( \Psi(\frac{\nu_{st}+1}{2}) -  \, \Psi(\frac{\nu_{st}}{2}) \right) \\ & + \log \left(\, \sqrt{\nu_{st}} \,\mathrm{B} (\frac{1}{2}, \frac{\nu_{st}}{2})\right) + \frac{1}{2} \,\log \, \sigma^2_{st} \, .
\end{aligned}
\end{equation}

\section{Pseudo-labeling, Entropy minimization and Aleatoric uncertainty}\label{sec:appedixB}
In the main section of the text, we have considered the case where observed targets $(y_1, \cdots, y_i)  \sim \mathcal{N}(\mu, \sigma^2)$ are drawn from the Normal distribution with unknown mean and variance $\mu$ and $\sigma^2$ and we have imposed a prior on them.  The problem is significantly simplified if we treat $\mu$ and $\sigma^2$ in a deterministic way, such that our model $f$ outputs two parameters $\mu$ and $\sigma^2$. The model here is able to estimate aleatoric (data) uncertainty $\sigma^2$ but unable to model epistemic (model) uncertainty. 
When minimizing the negative log-likelihood, the loss is significantly simpler than Eq.~4 in the main text:
\begin{equation}\label{eq:simple_loss}
\mathcal{L}_i = -\log \,  \mathcal{N}  ( y_i; \mu_i, \sigma_i^2 ) = \frac{1}{2} \log \left( 2 \pi \sigma^2_i \right) + \frac{(y_i - \mu_i)^2}{2 \sigma_i^2}.
\end{equation}
The empirical estimate of the entropy on the unlabeled data set becomes 
\begin{equation}
\begin{aligned}
     \mathcal{H}(\mathcal{Y} \, \vert \,  \mathcal{U}) & = \sum_{\mathbf{x}_i \in \, \mathcal{U}}\mathrm{E}_{y \sim \mathrm{p}_\theta (y \, \vert \, \mathbf{x}_i)}[-\mathrm{log} \, \mathrm{p}_\theta (y \, \vert \, \mathbf{x}_i)] \\ & \approx  - \sum_{\mathbf{x}_i \in \, \mathcal{U}} \mathcal{E}_i^{emp} \, [\mathrm{log} \, \mathrm{p}_\theta (y \, \vert \, \mathbf{x}_i)] 
\end{aligned}
\end{equation}
with log-probability weights $\mathcal{E}_i^{emp} = \frac{1}{\sqrt{2 \pi  \sigma^2_i}}$. One can notice that the weights are inversely related to the aleatoric uncertainties, $\mathcal{E}_i^{emp} \propto (\sigma_i^2)^{-\frac{1}{2}}$.


\end{document}
