\documentclass[accepted]{uai2022} 

\usepackage[american]{babel}

\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
%
\usepackage{booktabs} 
\usepackage{tikz} 

%
%
%
%
%


%
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} 
\usepackage{url}
%
%
\usepackage{amsmath,amsfonts,bm}
\usepackage{amsthm,amssymb}
\usepackage{bbm}   
\usepackage{diagbox}
\usepackage{booktabs}  
%
\usepackage{color}
\usepackage{xcolor}
\definecolor{darkblue}{RGB}{25, 50, 112}
\usepackage{float}
\usepackage{lipsum}  
\usepackage{xr}

% \addFileDependency{<file>}
%   Records <file> as a dependency of this document:
%     1. echoes "(<file>)" via \typeout,
%     2. appends <file> to LaTeX's internal file list via \@addtofilelist,
%     3. emits "No file <file>." via \typeout if the file does not exist.
%   \makeatletter/\makeatother are required because \@addtofilelist contains `@'.
%   NOTE(review): presumably meant to let build tools that parse the log
%   (e.g. latexmk) notice the dependency -- confirm against the build setup.
\makeatletter
\newcommand*{\addFileDependency}[1]{
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<basename>}
%   Wrapper around \externaldocument (xr package) for cross-referencing labels
%   defined in another document. After calling \externaldocument{<basename>},
%   it registers both <basename>.tex and <basename>.aux through
%   \addFileDependency.
%   <basename> is given without a file extension.
\newcommand*{\myexternaldocument}[1]{
    \externaldocument{#1}
    \addFileDependency{#1.tex}
    \addFileDependency{#1.aux}
}

\myexternaldocument{rolf_637-supp}

\usepackage{amsmath,amsfonts,amssymb}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{url}
%
\usepackage{newtxmath}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{tablefootnote}

\usepackage{bbm}
\usepackage{wrapfig}
\usepackage{nicefrac}


\usepackage{chngcntr}


% Shorthand for the latent label variable: \h expands to \ell, so \h and \ell
% typeset identically and are used interchangeably in the text.
\newcommand{\h}{\ell}

%
% Defines \theHalgorithm as the arabic value of the `algorithm' counter.
% NOTE(review): presumably provides hyperref with an anchor name for algorithm
% floats; no algorithm package is loaded in the visible preamble -- confirm it
% is still needed.
\newcommand{\theHalgorithm}{\arabic{algorithm}}



%
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

%
\usepackage[capitalize,noabbrev]{cleveref}  

%
%
%
% Theorem-like environments (amsthm).
% `theorem' is numbered within each section; every other environment below
% shares the `theorem' counter, so all of them are numbered in one sequence.
% Style `plain': theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}

% Style `definition': definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}

% Style `remark': remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% \tocite{<what>}: gray placeholder text "cite <what>" marking a missing citation.
\newcommand{\tocite}[1]{\textcolor{gray}{cite #1}}


% Math operators: \KL typesets "KL"; \argmax typesets "arg max" and, being the
% starred form, places its subscript underneath in display style.
\DeclareMathOperator{\KL}{KL}
\DeclareMathOperator*{\argmax}{arg\ max}

% \prob{<event>}: \mathbb{P} followed by <event> in auto-sized parentheses.
% \probt{<event>}: same, with a subscript t on \mathbb{P}.
\newcommand{\prob}[1]{\mathbb{P}\left(#1 \right)}
\newcommand{\probt}[1]{\mathbb{P}_t\left(#1 \right)}

% \newtext{<text>}: wraps <text> in \textcolor{black}.
% NOTE(review): presumably a revision marker whose highlight colour has been
% reset to black for the final version -- confirm.
\newcommand{\newtext}[1]{\textcolor{black}{#1}}

\title{Resolving Label Uncertainty with Implicit Posterior Models}


\author[1,6]{\href{mailto:esther_rolf@berkeley.edu}{Esther~Rolf$^{*\ }$}}
\author[2,6]{Nikolay~Malkin$^*$}
\author[3,6]{Alexandros~Graikos}
\author[4]{Ana~Jojic}
\author[5]{Caleb~Robinson}
\author[6]{Nebojsa~Jojic}

\affil[1]{
    University of California,
    Berkeley, CA, USA
}
\affil[2]{
    Mila and Universit\'e de Montr\'eal,
    Montreal, QC, Canada
}
\affil[3]{
    Stony Brook University,
    Stony Brook, NY, USA
}
\affil[4]{
    Paul G. Allen School of Computer Science and Engineering, University of Washington,
    Seattle, WA, USA
}
\affil[5]{
    Microsoft AI for Good, 
    Redmond, WA, USA
}
\affil[6]{
    Microsoft Research, 
    Redmond, WA, USA
}

\begin{document}
\maketitle

\begin{abstract}
We propose a method for jointly inferring labels across a collection of data samples, where each sample consists of an observation and a \emph{prior belief} about the label. By implicitly assuming the existence of a generative model for which a differentiable predictor is the posterior, we derive a training objective that allows learning under weak beliefs. This formulation unifies various machine learning settings; the weak beliefs can come in the form of noisy or incomplete labels, likelihoods given by a different prediction mechanism on auxiliary input, or common-sense priors reflecting knowledge about the structure of the problem at hand. We demonstrate the proposed algorithms on diverse problems: classification with negative training examples, learning from rankings, weakly and self-supervised aerial imagery segmentation, co-segmentation of video frames, and coarsely supervised text classification.
\end{abstract}

\section{Introduction}

In prediction problems, coarse and imprecise sources of input can provide rich information about labels.
%
Negative labels (what an instance is \emph{not}), rankings (which of two instances is larger), or coarse labels (aggregated by taxonomy or geography) give clues on what the ground truth label of an instance \emph{might} be, but not what it \emph{is} directly.
%
%
%
%
%
%
We consider a collection of data samples, indexed by $i$, consisting of observations (features) $x_i$ and corresponding sample-specific \emph{prior beliefs} about their latent label variables, $p_i(\h)$. This paper proposes algorithms to \textbf{resolve the uncertainty in these prior beliefs} by jointly inferring an assignment of target labels $\h_i$ and a model that predicts $\h_i$ given $x_i$.

Partial or aggregate annotations and auxiliary data sources are often more widely available and convenient to collect than ``ground-truth'' or high-resolution labels, but they are not readily used by discriminative learners. Supervision from probabilistic targets can result in uncertain predictions (\S\ref{sec:background_and_approach}). Most approaches to resolve these uncertainties involve iterative generation of hard pseudolabels~\citep{zhang2021refining} or loss functions promoting low entropy of predictions~\citep{nguyen2008classification,yu2016maximum,zou2020pseudoseg,yao2020ambiguous}. Typically, these approaches are application-specific \citep{han2014object,zheng2021Weakly,bao2021MRTA,li2021Change}. In many settings, fusing weak input data into a probability distribution over classes is a more natural alternative to transforming the weak input into hard labels~\citep{mac2019presence}. Further connections and comparisons to prior work are made throughout this paper and synthesized in \S\ref{sec:related_work_extras} and \S\ref{sec:implicit_details}.

Our key modeling insight (\S\ref{sec:implicit_generative_model}) is to identify the output distribution of a discriminative model, a feed-forward neural network $q$, with an approximate posterior over latent variables in a \emph{generative} model of features, of which the given prior belief is a part. Bayesian reasoning about the generative model and its posterior makes it possible to learn the inference network \emph{without instantiating the full generative model}, while reaping the benefits of generative modeling: high certainty in the posterior under soft priors and rich opportunities to model structure in the prior beliefs.

Prior beliefs about labels can arise from many sources (\S\ref{sec:priors}). We validate the effectiveness of our approach with experiments (\S\ref{sec:experiments}, \S\ref{sec:additional_exp}) on multiple domains and data modalities that highlight: prior beliefs as a natural way to fuse weak inputs, graceful degradation of performance with increasingly noisy or incomplete inputs, and comparison with explicitly generative modeling approaches.


\begin{figure*}[t]
    \centering
    \hspace{-4mm}
    \setlength{\fboxrule}{2mm}
    \definecolor{nicecolor}{rgb}{0.94,0.94,1.0}
    \definecolor{xi_color1}{rgb}{0,0,0}
    \definecolor{xi_color2}{rgb}{1,1,1}
    \fcolorbox{white}{white}{
    \hspace{-1em}
    \scalebox{1.35}{
    \includegraphics[width=0.105\textwidth,trim=0 160 0 0,clip]{figures/mnist4.png}
    \hspace{-16.5mm}\raisebox{15mm}{\textcolor{xi_color2}{$x_i$}\hspace{13mm}}
    \includegraphics[width=0.125\textwidth,trim=0 0 0 180,clip]{figures/mnist4.png}
    
        \includegraphics[width=0.105\textwidth,trim=0 160 0 0,clip]{figures/mnist2.png}
    \hspace{-16.5mm}\raisebox{15mm}{\textcolor{xi_color2}{$x_i$}\hspace{13mm}}
    \includegraphics[width=0.125\textwidth,trim=0 0 0 180,clip]{figures/mnist2.png}
        \includegraphics[width=0.105\textwidth,trim=0 160 0 0,clip]{figures/mnist1.png}
    \hspace{-16.5mm}\raisebox{15mm}{\textcolor{xi_color2}{$x_i$}\hspace{13mm}}
    \includegraphics[width=0.125\textwidth,trim=0 0 0 180,clip]{figures/mnist1.png}
    }
    \hspace{-1em}
    }
    
    
    \begin{tabular}{@{}c@{\hspace{8pt}}c@{\hspace{8pt}}c@{}}
    \hline \\
    \includegraphics[width=0.31\textwidth]{figures/seducer_v1_img_annotated.pdf} 
 
    \hspace{-48mm}\raisebox{27mm}{\textcolor{xi_color1}{\huge $x_i$}\hspace{45mm}}
    & \includegraphics[width=0.31\textwidth]{figures/seducer_v1_prior.png}\hspace{3mm}
    & \includegraphics[width=0.31\textwidth]{figures/seducer_v1_q.png}\\
    (a) $\{x_i\}$: \textit{Le s\'educteur}, Ren\'e Magritte
    & (b) $p_i(\ell)$: \textit{Boat Prior}, anonymous artist
    & (c) $q_i(\ell)$: Inferred segmentation
    \end{tabular}
    \caption{\textbf{Above:} Inference of latent MNIST digit classes with negative label supervision using a small CNN trained on the \textbf{RQ} criterion (\S\ref{sec:implicit_generative_model}).  \textbf{Below:} (a) Joint inference of latent pixel classes in an image. (b) Prior beliefs $p_i(\ell)$ over three classes -- sky (red), boat (green), water (blue) -- are manually set. (c) A small CNN trained on $(x_i, p_i(\ell))_i$ infers the posterior classes.}
    \label{fig:examples}
\end{figure*}

\section{Background and approach}
\label{sec:background_and_approach}

\paragraph{Two motivating examples.}

Two illustrative examples are shown in Fig.~\ref{fig:examples}. In the first example, the $x_i$ are 784-dimensional vectors representing 28$\times$28 MNIST digits. We aim to infer the digit classes $\h_i \in \{0,1,\dots,9\}$ for all images in the given collection based on data in which we are given just one \emph{negative} label per sample, i.e., the prior beliefs $p_i(\h)$ (top row) are uniform over all classes except for one incorrect class. The procedure described in this paper produces inferred distributions over labels (bottom row) that are usually peaky and place the maximum at the correct digit 97\% of the time (see Fig.~\ref{fig:mnist_cifar} and \S\ref{sec:negative_labels}).

In the second example, the observations $\{x_i\}_{i \in \textrm{pixels}}$ are image patches centered around each pixel coordinate $i$ in a Surrealist painting, with patch size ($11\times11$) equal to the receptive field of a 5-layer convolutional neural network used in our inference procedure. The prior beliefs $p_i(\h)$ are distributions over 3 classes (sky, boat, water) depending on the coordinate $i$. 
The joint inference of all labels in this image yields a feasible segmentation despite the high similarity in colors and textures (see \S\ref{sec:seducer} for more details).

These examples illustrate the problem of training on weak beliefs, which is often encountered in some form in machine learning. Weak supervision, semi-supervised learning, domain transfer, and integration of modalities are all settings where coarse, partial, or inexact sources of data can provide rich information about the state of a prediction instance, though not always a ``ground truth'' label for each instance. 
An inference technique that uses
weak beliefs as the sole source of supervision
needs to estimate statistical links between observations $x_i$ and corresponding latents $\h_i$. 
%
These links should simultaneously be highly confident (i.e., lead to low entropy in the posterior distributions) and explain the varying prior beliefs, which typically have low confidence (high entropy in the prior distributions). 


\paragraph{Supervised learning on prior beliefs.}

Supervised learning models, including many neural nets, are typically trained to minimize the cross-entropy $-\sum_i\sum_\h p_i^d(\h)\log q_i(\h)$ between a ``hard'' distribution over labels with $p_i^d(\h) \in \{0,1\}$ and the distribution $q_i(\h)=q(\h| x_i;\theta)$ output by a predictor $q$ using data features $x_i$.
This is equivalent to minimizing the KL divergence $\sum_i \KL(p^d_i \| q_i)$,
%
minimized when the two distributions $p^d_i(\h)$ and $q_i(\h)$ are equal. 
%
Thus, when $p^d_i(\h)$ is a ``softer'' prior over latent labels, $p_i(\h)$,
%
the trained model $q$ will reflect this, and also be highly uncertain. 

Transforming soft labels into hard training targets (e.g., training on $\mathbbm{1}[\h=\argmax_{\h'} p^d_i(\h')]$) can introduce the opposite bias. In these cases, the cost would be minimized by predictions with zero entropy, but learning such a prediction function faces difficulty with overconfident labels which are often wrong, and the possibility that certain labels often receive substantial weight in the prior, but never the maximum. These issues are illustrated in Fig.~\ref{fig:sammamish_loss_comparison}.

\paragraph{Generative modeling resolves the prior's uncertainty.}

The approach to classification problems through \emph{generative} modeling, instead of targeting the conditional probability of latents given the data features, assumes that there is a forward (generative) distribution $p(x_i|\h)$ and optimizes the log-likelihood of the observed features, $\sum_i \log p(x_i)=\sum_i \log \sum_\h p(x_i|\h)p_i(\h)$, with respect to the parameters of that distribution. The posterior under the model $q(\h|x_i)\propto p(x_i|\h)p_i(\h)$ is then used to infer latent labels for individual data points \citep{seeger}. The generative modeling approach does not suffer from uncertainty in the posterior distribution over latents given the input features, even when the priors $p_i(\h)$ are soft. (Recall that the posterior distributions in a mixture of high-dimensional Gaussians are often peaky even when the priors are flat.)
%

However, expressive generative models are typically harder and more expensive to train compared to supervised neural networks, as they often require sampling (e.g., sampling of the posterior in variational auto-encoders \citep[VAEs;][]{kingma2014auto} and sampling of the generator in GANs \citep{gan}). Furthermore, the modeling often requires doubling of parameters to express both the forward (generative) model \emph{and} the reverse (posterior) model. And, in the case of GANs, the learning algorithms may not even cover all modes in the data, which would prevent joint inference for \emph{all} data points.
(See \S\ref{sec:implicit_details} for further discussion.)
%

\subsection{Optimizing implicit posterior models}
\label{sec:implicit_generative_model}


Suppose 
that there exists a generative model $p(x|\h)$ of observed features conditioned on latent labels. Optimization of the log-likelihood of observed features, $\sum_i\log p(x_i)=\sum_i\log(\sum_\h p(x_i|\h)p_i(\h))$, can be achieved by introducing a variational posterior distribution $q(\h|x_i)$ over the latent variable for each instance $x_i$ and minimizing the free energy (a negated evidence lower bound (ELBO)), 
%
%
defined as
\begin{equation}
    - \sum_i\sum_\h q(\h|x_i)\log\frac{p(x_i|\h)p_i(\h)}{q(\h|x_i)}
    \geq-\sum_i \log p(x_i).
    \label{eq:free_eng}
\end{equation}
Minimizing the free energy involves estimating both the forward distributions $p(x_i|\h)$ and the posteriors $q(\h|x_i)$. 

One could parametrize both $p(x|\h)$ and $q(\h|x)$ as functions $p(x|\h,\theta_p)$ and $q(\h|x,\theta_q)$ using neural networks, as done by VAEs (although VAEs use continuous latent variables $\ell$ and do not involve sample-specific priors).
However, in our algorithms, we only parametrize $q(\ell|x; \theta)$  as a neural network taking input $x$ and producing a distribution over $\ell$. The generative conditional $p(x_i|\ell)$ is defined only on data points $x_i$ and is calculated by minimizing 
\eqref{eq:free_eng} for fixed $q(\h|x)$,
subject to the constraint that
$\sum_{i} p(x_i|\h)=1$ for all $\h$.\footnote{\newtext{This constraint allows nonzero likelihood under the generative model only for the observed data points $x_i$. The derivation still holds if the assumption is relaxed to $\sum_{i} p(x_i|\h) \leq 1$. Subject to this weaker condition, the minimum of free energy is achieved on the boundary of the constraint domain, when $\sum_{i} p(x_i|\h) = 1$.}} The optimum is achieved by:
\begin{equation}
    p(x_i|\h)=a_{i,\ell}=\frac{q(\h|x_i)}{\sum_j q(\h|x_j)}~.
    \label{eq:aux_p}
\end{equation}
%
Here the generative conditional $p(x|\ell)$ is not fully specified for all values $x$. Rather, it is represented as a matrix of numbers $a_{i,\ell}$
%
describing the conditional probabilities of different values of $x_i$  given different latent labels $\ell$. The probabilities $p(x_i|\ell)$  are greater for the data points $i$ for which $q(\h|x_i)$ is more certain, 
relative to how popular assignment to class $\h$ is across data points (denominator in \eqref{eq:aux_p}).

In our formulation, $q$ plays the role of a variational posterior, but \emph{implicitly}, in a generative model consisting of varying instance-specific priors $p_i(\ell)$ and a complex conditional $p(x|\ell)$ that is never fully estimated, but is instead maximized for the data points studied. The full link between $x$ and $\h$ 
is left entirely to the neural network $q$ to capture explicitly. 
%




In variational methods, the free energy (\ref{eq:free_eng}) is usually rewritten as $\sum_i \bigl[\KL(q(\ell|x_i)\|r_i(\ell))-\log p(x_i)\bigr]$, where $r$ is the posterior of the forward model,
i.e., for the  points $i$, $r_i(\ell) \propto p_i(\ell)p(x_i|\ell)$. The minimization of free energy then reduces to minimizing the KL divergence between $r$ and $q$.

We define $q_i(\h)=q(\h|x_i;\theta)$. After our reduction of $p(x_i|\ell)$ to the auxiliary matrix in \eqref{eq:aux_p}, the posterior $r$ has the form  
\begin{equation}
    r_i(\h) = c_i \cdot p_i(\h)  p(x_i|\h) = c_i \frac{p_i(\h)q_i(\h)}{\sum_j q_j(\h)}~,
    \label{eq:true_post}
\end{equation}
where $c_i$ are scalars making $\sum_\ell r_i(\h) = 1$. 
%
%
For each instance $i$ we have two outputs: the direct model outputs of the variational posterior $q_i$  and their \emph{implied posterior} $r_i$, which is computed by multiplying the renormalized model outputs with the provided prior at each instance as in \eqref{eq:true_post}. 
%
Using these two outputs, we can optimize a single set of model parameters $\theta$ to minimize \eqref{eq:free_eng}:
\begin{align}
\label{eq:KL_written_out}
   & \min_\theta \sum_i \text{KL}( q_i \| r_i) = \\[-4em]
&  \min_\theta \sum_i \text{KL}\bigg(  
\underbracket{\Big(q(\ell|x_i;\theta)\Big)_\ell }_{
\substack{\text{model output } \\ \text{with input $x_i$}}}
\Big\| 
\Big(
c_i \cdot\hspace{-2mm}
\overbracket{p_i(\ell)
\vphantom{\dfrac{q(|)}{\sum_jq(|)}} 
}^{
\substack{\text{per-} \\ \text{\phantom{p}instance\phantom{p}} \\ \text{priors}}}
\overbracket{\frac{q(\ell | x_i; \theta)}{\sum_j q(\ell | x_j; \theta)}}^{
\substack{
\text{model output} \\ \text{\phantom{p}normalized\phantom{p}} \\ \text{per-class} \\ \text{as in Eq.~\eqref{eq:aux_p}}
}}
\Big)_\ell 
 \bigg) \nonumber ~.
\end{align}
%
While \eqref{eq:KL_written_out} optimizes the free energy \eqref{eq:free_eng}  by minimizing $\textrm{KL}(q_i \| r_i)$, minimizing $\textrm{KL}(r_i \| q_i)$ would also find solutions for which the direct model and its implied posterior are close.
%
We propose to optimize either of these two objectives with respect to the model parameters $\theta$ by gradient steps. We iterate over data instances $x_i$ with priors $p_i(\h)$:
\begin{enumerate}[label*=(\arabic*)]
    \item Calculate the distributions $r_i$ in terms of $q_i$ as in (\ref{eq:true_post}).
    \item {Update the parameters of $q$ with a gradient step:  \\
    \mbox{$\bullet$ Option \textbf{QR}: $\theta \leftarrow \theta -\eta \nabla_\theta \sum_i \KL(q_i \| r_i)$.}\\
    \mbox{$\bullet$ Option \textbf{RQ}: $\theta \leftarrow \theta -\eta \nabla_\theta \sum_i \KL(r_i \| q_i)$.}
    }
\end{enumerate}
Gradients of the objectives are propagated to the expression of $r_i$ through $q_i$ (see (\ref{eq:KL_written_out}) and Fig.~\ref{fig:qr_losses_torch}). 
Both losses have a stable point when $q_i=r_i$, and \textbf{RQ} reduces to the cross-entropy loss in the case of priors which put all mass on one label (e.g. $p_i(\h) = \mathbbm{1}[\h = \ell_i]$). A discussion of the relative benefits and limitations of the \textbf{QR} and \textbf{RQ} losses is given in \S\ref{sec:practical_considerations}, along with practical considerations for implementation.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.48\textwidth]{figures/qr-losses-torch.png}
    \caption{Cross-entropy and  implicit \textbf{QR} / \textbf{RQ} losses in PyTorch. Here the normalization in (\ref{eq:aux_p}) is done within batches.
    }
    \label{fig:qr_losses_torch}
\end{figure}
%


By defining the conditional model $p(x|\ell)$ as an auxiliary matrix of probabilities $a_{i,\ell}$ that 
is fit to the
reverse model $q$ during learning, we avoid parametrizing both directions of the link $\ell-x$ with highly nonlinear models.\footnote
{Note that the use of an auxiliary matrix $a_{i,\ell}$ is also found in expectation-maximization \citep[EM;][]{dempster1977em}, which also minimizes the free energy. However, in EM, it is the variational posterior $q(\ell|x_i)$ which is optimized as a matrix of numbers $a_{i,\ell}$ only on data points, while the \emph{generative} model $p$ is fully parametrized (see Table~\ref{tab:em_vae_comparison}).} We thus manage to keep the problem in the realm of training a single feed-forward network $q$ as a predictor of variables $\ell$, but in a way that treats the instance-specific priors $p_i(\ell)$ as they would be in generative modeling. 

 
%


%
%
%
%

Next, we discuss the consequences of implicitly modeling the generative model $p$ with an auxiliary distribution.  
Option \textbf{QR} uses the KL divergence in the direction it appears in (\ref{eq:free_eng}) and thus guarantees continual improvements in free energy and convergence to a local minimum (with the exception of the effects of stochasticity in minibatch sampling).
%
Substituting $r_i$ from (\ref{eq:true_post}), 
%
the free energy (\ref{eq:free_eng}) becomes:
\begin{equation}
    F=\sum_{i,\h} q_i(\h) \log \left(\sum_{j} q_j(\h)\right)  - 
    \sum_{i,\h} q_i(\h)\log \left(p_i(\h) \right)
    \label{eq:QR}
\end{equation}
This criterion does not encourage entropy of individual $q_i$ distributions, but of their \emph{average}. The second term alone would be minimized if $q$ could put all the mass on $\argmax_\h p_i(\h)$ for each data point, but the first term promotes diversity in assignment of latents (labels) $\h$ across the entire dataset. Thus a network $q$ can optimize 
%
(\ref{eq:QR})
if it makes different confident predictions for different data points. 

To illustrate this, 
consider the case when all data points have the \emph{same} prior, $p_i(\h)=p(\h)$. Then
%
(\ref{eq:QR}) and the \textbf{RQ} objective
are minimized when $\frac{1}{N}\sum_i q_i(\h) = p(\h)$. This can be achieved when $q$ learns a constant distribution $q(\h|x_i; \theta)=p(\h)$. But both objectives are also minimized if $q$ predicts only a single label for each data point with high certainty, but it varies in predictions so that the counts of label predictions match the prior. 

As demonstrated in Fig.~\ref{fig:examples} and in our experiments, avoiding degenerate solutions is not hard. We attribute this to two factors. First, the situations of interest typically involve uncertain, but varying priors $p_i(\h)$ which break symmetries that could lead to predictors ignoring the data features $x_i$. Second, the neural networks used to model $q$, and their training algorithms, come with their own constraints and inductive biases. 
In fact, as discussed in \S\ref{sec:priors} and \S\ref{sec:aaai_regret}, even unsupervised clustering is possible with suitably chosen priors that break symmetry, allowing this approach to be used for self-supervised training. See also \S\ref{sec:related_work_extras}, \S\ref{sec:implicit_details} for more on relationships with other approaches.

\newtext{
In practice, the normalization in (\ref{eq:aux_p}) is done within batches,  rather than across the entire dataset (see Fig.~\ref{fig:qr_losses_torch}). This may be sufficient if batches are large and representative of the diversity in the data. 
%
Experiments in \S\ref{sec:practical_considerations} examine the effect of batch size  on performance. While our algorithm is relatively tolerant to moderate batch sizes, performance degrades for small batches, in particular when batches are likely to be missing samples of some classes.
%
Addressing this problem in more general settings is an interesting subject for future work. When intra-batch diversity is an issue, the denominator in  (\ref{eq:true_post}) may need to be updated in an online fashion or even replaced by a learned parametric estimate. 
}  


\section{Sources of label priors}
\label{sec:priors}

Having detailed our approach for learning from prior beliefs as weak supervision in \S\ref{sec:background_and_approach}, 
%
we now describe a range of machine learning settings where priors $p_i(\h)$ emerge. 
All of these settings are illustrated by experiments in \S\ref{sec:experiments} and \S\ref{sec:additional_exp}.


\paragraph{Negative or partial labels (\S\ref{sec:negative_labels}).}

When we are given a set of equally possible labels $L_i$ for each data point $i$, instead of a single label $\h_i$, we set the prior $p_i(\h)=\frac{1}{|L_i|}\mathbbm{1}[\h \in L_i]$. An extreme example is when one negative label is given and hence can be ``ruled out'' (Fig.~\ref{fig:examples}).

\paragraph{Joint labels and learning from rankings (\S\ref{sec:ranking}).}

Priors may also come in the form of joint distributions over labels of multiple instances. For example, \textit{ranking supervision} -- the knowledge of which example in a pair is greater with respect to an ordering of the labels -- gives prior beliefs about \emph{pairs} of labels. Suppose our data is organized into pairs of images of digits $T_j=\{x_{j,1},x_{j,2}\}$, and for each pair we are told which image represents the digit (0--9) which is greater (or equal). This sets a prior $p(\h_1,\h_2)$ over pairs of labels in each pair, represented by either an upper or a lower triangular matrix, depending on which digit in the pair is known to be greater, with all nonzero entries equal to $\nicefrac{1}{55}$.

We assume the underlying generative model has the form $p(x_1,x_2|\h_1,\h_2)=p(x_1|\h_1)p(x_2|\h_2)$. We aim to fit its posterior model $q(\h|x;\theta)$. For each pair $T_j$, we have two outputs of the predictor network, $q(\h_1|x_{j,1})$ and $q(\h_2|x_{j,2})$, for the two images in the pair. The joint posterior under the generative model is
\begin{align}
    r_j(\h_1,\h_2) \propto p(\h_1,\h_2)p(x_{j,1}|\h_1)p(x_{j,2}|\h_2) \propto\nonumber\\\propto \frac{ p(\h_1,\h_2)q(\h_1|x_{j,1})q(\h_2|x_{j,2})}{ \sum_{k} q(\h_1|x_{k,1}) \sum_{k} q(\h_2|x_{k,2})},
    \label{eq:pair_posterior}
\end{align}
and we can now use \textbf{QR} or \textbf{RQ} loss to fit $q(\h_1|x_{j,1})$ to the marginal $r_j(\h_1)$ and $q(\h_2|x_{j,2})$ to $ r_{j}(\h_2)$.

\paragraph{Coarse data in weakly supervised segmentation (\S\ref{sec:chesapeake_experiments}, \S\ref{sec:lymphocytes}, \S\ref{sec:seducer}).}

We often have side information $z$ associated to each instance $i$ that allows setting the priors $p_i(\h)=p(\h|z_i)$ for each point directly by hand. These include situations when we have beliefs about labels for different points, as in the \emph{Seducer} example (Fig.~\ref{fig:examples}). 
%
Interesting weak supervision settings also arise in remote sensing (\S\ref{sec:chesapeake_experiments}) and medical pathology (\S\ref{sec:lymphocytes}) applications. For example, in a task of segmenting aerial imagery into land cover classes, we often have coarse labels $c$ associated to large \emph{blocks} of pixels, but not the target labels $\ell$ for individual pixels. If the conditional $p(\ell|c)$ is known, it sets a belief about the high-resolution labels $\ell$ for pixels in a block of class $c$.

\paragraph{Fusing models and data sources (\S\ref{sec:enviroatlas_experiments}, \S\ref{sec:nlp}).}

Auxiliary information $z$ may not always come with a known correspondence $p(\h|z)$. In the land cover mapping problem, auxiliary information includes different modalities and resolutions (road maps, sparse point labels, etc.). While these sources can be fused into a prior by hand-coded rules, the prior may be more accurately set as the output of a model $p(\h|z_i)$ \emph{trained} on a separate dataset of points $(\h_i,z_i)$. This is especially useful when the data $x_i$ (imagery) is informative about the latents $\h_i$ but is prone to domain shift problems, while the auxiliary data $z_i$ does not suffer from domain shift issues but is not sufficient on its own to predict the labels. In a text classification problem, $z_i$ might be the encoding of text $x_i$ by a pretrained language model, and $p(\h|z_i)$ a noisy distribution over labels given by their likelihoods under the language model as continuations of a prompt. 


\paragraph{Priors for self-supervision (\S\ref{sec:aaai_regret}).}
In \S\ref{sec:implicit_generative_model} we discussed the pitfalls of using a constant prior $p_i(\h)=p(\h)$ for all data points in training models under the \textbf{QR} loss as a potential method for unsupervised clustering. However, in \S\ref{sec:aaai_regret} we give an example of \emph{joint} learning of the posterior model $q$ and an energy model (Markov random field) on the latent labels $\h_i$ that expresses local structure of labels in an image. This results in unsupervised clusterings that are useful in downstream segmentation tasks. Such an approach is an example of a benefit of generative modeling -- the possibility of learning a parametrized distribution over latents -- being inherited by implicit posterior models.

\paragraph{Priors with latent structure (\S\ref{sec:tracking}).}
  
Implicit posterior modeling allows building hierarchical latent structure into the prior (another benefit of classical generative models), as we demonstrate in \S\ref{sec:tracking} on a video segmentation task. The prior is an admixture of possible segmentations with a structure similar to \citet{jojic2009stel}, but using a set of mask proposals $p(\h_i|m)$ from a Mask R-CNN model \citep{he2017maskrcnn}, indexed by a latent $m$. The prior is $p_i(\h)=\sum_m p(\h_i|m)p(m)$, where $p(m)$, a probabilistic selection of the masks for the admixture in the given frame, is estimated by minimizing the free energy.


\section{Experiments}
\label{sec:experiments}

\newtext{
The experiments in this section and in \S\ref{sec:additional_exp} cover a variety of domains, illustrating the sources of label priors listed in \S\ref{sec:priors}. 
%
%
%
%
%
The experimental baselines are chosen to reflect the different goals of each experiment. 
%
Experiments on classification with negative training examples (\S\ref{sec:negative_labels}) 
and learning from rankings (\S\ref{sec:ranking})
serve to illustrate how our algorithm works in different conditions.  
%
For experiments on label super-resolution in image segmentation (\S\ref{sec:chesapeake_experiments}, \S\ref{sec:enviroatlas_experiments}, \S\ref{sec:lymphocytes})
and text classification (\S\ref{sec:nlp}), self-supervision for image clustering (\S\ref{sec:aaai_regret}), and video segmentation (\S\ref{sec:tracking}), baseline methods provide a comparison by which to benchmark performance, showing that our unified approach reaches, or comes close to, state-of-the-art accuracy across these domains.
 }
 

\subsection{Partial labels in MNIST and CIFAR-10}
\label{sec:negative_labels}


%
%
%
%
%
%
%
%
%
%
%

\begin{figure}[t!]
    \centering
    \includegraphics[width=0.45\textwidth]{figures/negative-mnist.png}
    \\
    \includegraphics[width=0.45\textwidth]{figures/negative-cifar.png}
    \\
    \caption{Accuracies of MNIST and CIFAR-10 classifiers trained with varying numbers of negative labels per example; the lighter variant of each color and marker shows the peak accuracy over 300 training epochs. (Average of 10 runs with standard error region.)}
    \label{fig:mnist_cifar}
\end{figure}


In this experiment, we compare algorithms for learning with partial labels on two 10-class image classification datasets, MNIST and CIFAR-10. 
To each training example $x_i$, we randomly assign a set $N_i$ of $k$ negative labels, chosen from the 9 labels distinct from the ground truth. The prior $p_i(\h)$ is set to be uniform over $\h\notin N_i$ and 0 for $\h\in N_i$. We vary $k$ from 1 (one negative label per example) to 9 (one-hot prior, full supervision). The data of $k$ negative labels carries $-\log_2(1-k/10)$ bits of label information; for $k=1$, this is about $22\times$ less label information than in the fully supervised setting.

For both datasets, the base model $q$ is taken to be a small convolutional network, with four layers of ReLU-activated $3\times3$ convolutions with stride 2 and a linear map to the 10 output logits ($\sim$33k learnable parameters for MNIST, $\sim$34k for CIFAR-10). We experiment with four training losses:\\
$\bullet\;$ \textbf{CE:} cross-entropy between predictions $q(\h|x_i;\theta)$ and the prior $p_i(\h)$.\\
$\bullet\;$ \textbf{NLL (union):} negative logarithm of the sum of likelihoods assigned by $q$ to labels $\h\notin N_i$, or, equivalently up to an additive constant, $-\log\sum_\h p_i(\h)q(\h|x_i;\theta)$, as done, e.g., by \citet{jin2002learning,nlnl}.\\
$\bullet\;$ The \textbf{QR} and \textbf{RQ} losses defined in \S\ref{sec:implicit_generative_model}.

The \textbf{CE}, \textbf{NLL (union)}, and \textbf{RQ} loss objectives are equivalent when $k=9$. 
%
The \textbf{RQ} and \textbf{NLL (union)} losses are equivalent when $\sum_iq_i(\h)$ is uniform over $\h$ \newtext{(see derivation in \S\ref{sec:related_work_extras})}, which approximately holds after a sufficient number of training epochs. 


All models are trained for 300 epochs on batches of 256 images with the Adam optimizer \citep{kingma2014adam} and a learning rate of $10^{-4}$. After each epoch, we compute the accuracy of the predictor $q$ on the ground truth labels in the train and test sets. Fig.~\ref{fig:mnist_cifar} shows the final train and test set accuracies, as well as the maximum accuracies achieved at any epoch. Reported results are averaged over 10 choices of partial label sets and random initializations. 

Models trained on \textbf{RQ} loss perform best, with the greatest benefit over \textbf{CE} seen for very few negative labels. 
%
\newtext{This reinforces the claim in \S\ref{sec:background_and_approach} that optimizing the \textbf{CE} loss results in uncertain predictions when the priors are highly ambiguous}.
%
As expected, the performance of \textbf{RQ} and \textbf{NLL (union)} is very similar across $k$. We hypothesize that the small advantage of \textbf{RQ} over \textbf{NLL (union)} loss can be attributed to regularization in early training. Meanwhile, \textbf{QR} performs as well as \textbf{CE} for very uncertain priors at the peak epoch (light curves), but its predictions degenerate -- usually toward uniform predictions -- with longer training.


%

\subsection{Multiple-instance supervision: Learning from ranks}
\label{sec:ranking}

\begin{figure}[t!]
    \centering
    \includegraphics[width=0.45\textwidth,trim=5 5 360 2,clip]{figures/mnist-rank.png}\\
    \includegraphics[width=0.45\textwidth,trim=360 10 5 0,clip]{figures/mnist-rank.png}\\
    \caption{ Confusion matrices of MNIST classifiers in the course of training on batches of 128 ranked pairs of digits. The trajectory of convergence to the diagonal shows that uncertainty is first resolved for the digits 0/9, then 1/8, etc.}
    \label{fig:mnist_pairs}
\end{figure}

We train a CNN of the same architecture as in \S\ref{sec:negative_labels} on MNIST, but with the only supervision coming in the form of pairs of images in which it is known which image represents the greater digit. The training set of 60k images is divided into pairs that are fixed throughout the training procedure; each image appears in exactly one pair. We optimize to match the predictor $q$ with the implicit posterior model (\ref{eq:pair_posterior}) using the \textbf{RQ} loss. Fig.~\ref{fig:mnist_pairs} shows the confusion matrices at initial iterations of training. The learned classifier has 97\% accuracy on both training and testing sets, which means that from pairwise comparisons alone, we can group the digit images and place them in order.
 

\subsection{Label super-resolution}
\label{sec:chesapeake_experiments}


\begin{table}[t!]
    \centering
    \caption{Pixel accuracy and class mean intersection over union on the Chesapeake Land Cover dataset. All models use only coarse NLCD labels as supervision. For our proposed methods, we evaluate both the trained predictor ($q_i$) and the posterior under the generative model ($r_i$). The score of the best overall model is \textbf{bolded}. }
    \resizebox{\linewidth}{!}{
\begin{tabular}{@{}lllllll}
    \toprule
   
    & \multicolumn{2}{c}{PA} & \multicolumn{2}{c}{NY} & \multicolumn{2}{c}{Chesapeake} \\
    \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} 
    Model & acc \% & IoU \% & acc \% & IoU \% & acc \% & IoU \% \\
    \midrule
    
    
    Self-epitomic$^a$  
             & \textbf{86.2} & 67.6 & 86.4  & 70.5 & 86.3 & 69.7 \\
    Hard na\"ive$^b$  & 85.3 & 63.0 & 83.6 & 59.8 & 83.6 & 59.7 \\ 
    \midrule
    \textbf{QR} ($q$) & 85.9 & 69.3 & 87.3 & 73.0 & 86.4 & 71.1 
 \\
    \textbf{QR} ($r$) & \textbf{86.2}  & \textbf{69.9} & \textbf{87.9}  & \textbf{74.4} & \textbf{86.8}  & \textbf{72.1} 
  \\
    \textbf{RQ} ($q$) & 81.5 & 63.1 & 77.4 & 60.2 & 79.8 & 62.2 
\\
    \textbf{RQ} ($r$) & 81.5  & 63.2 & 77.5  & 60.3 & 79.8  & 62.4 
\\
%
%

%
%
%
%
%
%
%
    
    
    
    
    
    
    \bottomrule
\end{tabular}
}

\footnotesize
$^a$\citep{malkin2020mining}
$^b$\citep{malkin2019label}
    \label{tab:landcover_results_chesapeake}
\end{table}




\begin{figure*}[t!]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/chesapeake_predictions-95compressed.jpg}
    \caption{Predictions of models trained with \textbf{QR} loss on the  NLCD-only prior in the Chesapeake region, shown on regions of 1000$\times$1000 pixels in Pennsylvania and 500$\times$500 pixels in New York.}
    \label{fig:chesapeake_predictions}
\end{figure*}

We benchmark our method's performance on the Chesapeake Land Cover dataset,\footnote{\href{https://lila.science/datasets/chesapeakelandcover}{https://lila.science/datasets/chesapeakelandcover}}
a large 1m-resolution land cover dataset used previously for label super-resolution \citep{robinson2019large,malkin2019label}. It consists of several aligned data layers, including: NAIP (4-channel high-resolution aerial imagery at about 1m/px), NLCD (16-class, 30m-resolution coarse land cover labels), and high-resolution land cover labels (LC) in four classes. The task is to train high-resolution segmentation models, in the four target classes, using only NLCD labels as supervision. The NLCD layer is at 30$\times$ lower resolution than the imagery and target labels and follows a different class scheme. Cooccurrence statistics of NLCD classes $c$ and LC labels $\h$ are assumed to be known (Fig.~\ref{fig:cooccurrence_matrices}).

To form a prior over land cover classes $\h$ at each pixel position, we map the 
NLCD classes to probabilities over the target LC classes using these known cooccurrence counts and apply a spatial blur to reduce low-resolution block artifacts (Fig.~\ref{fig:chesapeake_predictions}, ``Prior"). We then train small convolutional networks (receptive field $11\times11$) to predict high-resolution land cover from input imagery.
We evaluate both the \textbf{QR} and \textbf{RQ} variants of our approach on the two states that comprise the ``Chesapeake North" test set: Pennsylvania (PA) and New York (NY), and the two states combined, after picking hyperparameters based on an independent validation set in Delaware (details in \S\ref{app:experiment_details_landcover}). A depiction of the data and prediction results is given in Fig.~\ref{fig:chesapeake_predictions}.



\Cref{tab:landcover_results_chesapeake} compares our algorithms against the algorithmic technique with the best published performance on the Chesapeake dataset, self-epitomic LSR \citep{malkin2020mining}, and the hard na\"ive baseline from \citet{malkin2019label}. Self-epitomic LSR, a generative modeling approach that explicitly produces likelihoods $p(x|\h)$, analyzes small patches of data by making a large number of comparisons between sampled $7\times7$ image patches and \emph{all other} image patches. It does not produce a trained feedforward inference model, and the inference procedure is at least an order of magnitude slower than evaluation of our convolutional model. The hard na\"ive baseline maps the NLCD classes to LC classes based on a given cooccurrence matrix, then trains a standard semantic segmentation model on these pseudo-labels. 

Training on the \textbf{QR} loss outperforms (in one case, matches) the performance of self-epitomic LSR (\Cref{tab:landcover_results_chesapeake}), and the generative model for $p(x|c)$ from (\ref{eq:aux_p}) is largely consistent with the epitomic generative model (Fig.~\ref{fig:p_x_given_c_figure}). Moreover, our methods handle \emph{batched input},
%
where self-epitomic LSR trains on one data tile at a time. Similar per-tile approaches have been shown to degrade in performance and exhaust computation capacity when training on multiple tiles \citep{malkin2020mining}.
%
Optimization under an implied generative model has the computational advantage of scaling naturally to large training data while maintaining the benefits of leading generative modeling approaches. (See also \S\ref{sec:lymphocytes}.)

\subsection{Data fusion and learned priors}
\label{sec:enviroatlas_experiments}


\begin{figure*}[t!]
    \centering
    
    \includegraphics[width=0.9\textwidth]{figures/learned_prior-95compressed.jpg}
    \caption{Prior generation for land cover mapping: ``NLCD only prior" (\S\ref{sec:chesapeake_experiments}) and ``$\{$Hand-coded, Learned$\}$ prior" (\S\ref{sec:enviroatlas_experiments}). }
    \label{fig:prior_generation_landcover_mapping}
\end{figure*}


In this set of experiments, we augment NLCD with information about the presence of buildings, road networks, and waterbodies/waterways from public sources (see Fig.~\ref{fig:prior_generation_landcover_mapping} and \S\ref{app:landcover_data_sources}). To evaluate the ability of models to generalize across regions, we use 1m 5-class land cover labels from the geographically diverse EnviroAtlas dataset \citep{pickard2015enviroatlas} in four cities in the US: Pittsburgh, PA, Durham, NC, Austin, TX, and Phoenix, AZ. The NLCD-based prior model from \S\ref{sec:chesapeake_experiments} is augmented with the auxiliary information to obtain a hand-coded prior for each image (see \S\ref{app:forming_priors_landcover}). These types of priors can be made everywhere in the United States, while hard 1m-resolution labels are rarely available.

An alternative to performing local inference under such priors is to simply apply supervised models trained on hard labels elsewhere, hoping that the domain shift is tolerable. \Cref{tab:landcover_results_enviroatlas} compares the performance of a model (of the same architecture as in \S\ref{sec:chesapeake_experiments}) trained on Pittsburgh high-resolution data (HR) in each of the three other cities with that of models tuned on the hand-coded prior in each other city. 
%
The \textbf{QR} method trained on the local handmade prior outperforms the HR model in each evaluation city. This may be attributed to the extra data in each city given to our method in the form of prior beliefs. To isolate this effect, we also compare to a high-resolution model that consumes the prior belief as \emph{input} data, concatenated with the NAIP imagery (HR + aux). While the HR + aux model does increase performance substantially from the HR model with NAIP imagery alone as input, the \textbf{QR} model remains the highest-fidelity approach in two of the three cities. These results illustrate that information that generalizes across domains may find its best use within a separate model -- to build a prior in our setting -- which is then used to supervise local inference. 

In practice, prior beliefs could be crafted by a domain expert to reflect the unique geographic and structural features of each city.
%
We emulate incorporating such context-specific knowledge  by training (on a disjoint set of instances) a neural network that consumes the inputs to the handmade prior function (NLCD and auxiliary map data), and predicts high-resolution labels
%
(Fig.~\ref{fig:prior_generation_landcover_mapping}, ``Learned prior"). 
%
Alongside structural interactions between the inputs that generalize across cities (e.g., tree canopy supersedes rivers, roads supersede water), the learned prior
captures region-specific knowledge (e.g., buildings in Durham tend to have grass surrounding them and trees farther out, while in Austin,  this is reversed, and in Phoenix, riverbeds surrounded by barren land are likely to be dry).  Using these tailored prior beliefs during \textbf{QR} training tends to increase scores (Table~\ref{tab:landcover_results_enviroatlas}).

The final row in \Cref{tab:landcover_results_enviroatlas} benchmarks the performance of a high-resolution land cover model trained on imagery and labels over the entire contiguous US~\citep{robinson2019large}. This large model takes NAIP, Landsat 8 satellite imagery, and building footprints as inputs. Small, local models with priors created from only weak supervision outperform the US-wide model in all cities. (See \S\ref{app:additional_results} for details.)

\begin{table}[t]
    \centering
    \caption{Land cover classification experiments for generalizing across cities. In each column, the score of the best model not depending on auxiliary data as input is \textit{italicized} and the score of the best overall model is \textbf{bolded}. (A larger set of experimental results is given in \Cref{tab:landcover_results_enviroatlas_full}.)}
    \resizebox{\linewidth}{!}{
\begin{tabular}{@{}llllllll}
\toprule
& 
& 
\multicolumn{2}{c}{Durham, NC} & \multicolumn{2}{c}{Austin, TX} & \multicolumn{2}{c}{Phoenix, AZ} \\
\cmidrule(lr){3-4}
\cmidrule(lr){5-6}\cmidrule(lr){7-8}
Train region & Model 
& acc  & IoU  & acc  & IoU  & acc & IoU  \\\midrule
Pittsburgh & HR & 74.2 & 35.9 & 71.9 & 36.8 & 6.7 & 13.4 
 \\
(supervised) & HR + aux 
& 78.9 & 47.9 & 77.2 & 50.5 & 62.8 & 24.2 
 \\\midrule
Local  & \textbf{QR} ($q$) 
& 78.9 & 47.7 & 76.6 & 49.1 & \textit{75.8} & \textit{45.4} 
 \\
(hand-coded prior) & \textbf{QR} ($r$) 
& 79.0  & 48.4 & 76.6  & 49.5 & \textbf{76.2}  & \textbf{46.0} \\
\midrule
Local   & \textbf{QR} ($q$) 
& \textit{79.0} & \textit{48.7} & \textit{\textbf{79.4}} & \textit{51.3} & 73.4 & 42.8  \\
(learned prior) & \textbf{QR} ($r$) 
& \textbf{79.2}  & 49.5 & 79.1  & \textbf{51.9} & 73.6  & 43.1  \\ \midrule
Full US$^a$ & U-Net Large  
& 77.0 & \textbf{49.6} & 76.5 & 51.8 & 24.7 & 23.6 \\
\bottomrule
\end{tabular}
}

\footnotesize
$^a$\citep{robinson2019large}
    \label{tab:landcover_results_enviroatlas}
\end{table}


\subsection{Text classification}
\label{sec:nlp}



This experiment follows the recent work of \citet{mekala2021coarse} and illustrates the effectiveness of learning on prior beliefs beyond computer vision. We work with a dataset of $\sim$12k New York Times news articles. Each article belongs to one of 20 fine categories (e.g., `energy companies', `tennis', `golf'), which are grouped into 5 coarse categories (e.g., `business', `sports'). The goal is to train text classifiers that predict fine labels, but only the coarse label for each article is available in training. 

Some external knowledge about the fine categories is necessary to resolve the coarse labels into fine labels. Past work on this problem \citep{meng2018weakly,mekala2020contextualized,meng2020text,wang2021xclass} has trained supervised models on pseudolabels created by mechanisms such as propagation of seed words and querying large pretrained models. On the other hand, \citet{mekala2021coarse} create training data by sampling additional \emph{features} (articles) from a finetuned version of the large generative language model GPT-2 \citep{radford2019language} conditioned on fine categories, then tune a classifier based on the almost equally large model BERT \citep{devlin2019bert} in a supervised manner.

We obtain comparable results using an elementary predictor, far less computation, and no finetuning of massive language models (Table~\ref{tab:coarse_text_results}). We form a prior $p_i(\ell)$ on the fine class $\ell$ of each article $x_i$ by querying GPT-2 for the likelihood of each fine category name $\ell$ compatible with the known coarse label  following the prompt ``[article text] Topic: '' and normalizing over $\ell$. We then divide $p_i(\ell)$ by the mean likelihood of $\ell$ over all articles $x_i$ and renormalize.
%
We represent each article as a vector of alphabetic trigram counts ($26^3$ features, of which only 8k are ever nonzero) and train a logistic regression with the \textbf{RQ} objective against this `GPT-2 prior'.
%
After ten epochs of training ($\sim$10s on a Tesla K80 GPU), the trained classifier nears or exceeds the performance of models requiring at least $100\times$ longer to train, even excluding the time to generate any pseudo-training data.

\begin{table}[t]
\centering 
\caption{F1-scores of various models on the coarsely supervised text classification task. The first five rows are taken from \citet{mekala2021coarse}. The last two rows use the GPT-2 prior defined in \S\ref{sec:nlp} as weak supervision  with cross-entropy and \textbf{RQ} loss, respectively (mean of 10 random trials).}
\resizebox{1\linewidth}{!}{
\begin{tabular}{@{}llcc}
\toprule
 & Algorithm & Micro-F1 \% & Macro-F1 \% \\\midrule
%
%
\multirow{4}{*}{pseudolabeling}
 & {WeSTClass}$^a$ & 76.23 & 69.82 \\
 & {ConWea}$^b$    & 73.96 & 65.03 \\
 & {LOTClass}$^c$  & 15.00 & 20.21 \\
 & {X-Class}$^d$   & 91.16 & 81.09 \\\midrule
 pseudodata & C2F$^e$         & 92.62 & \textbf{87.01} \\\midrule
%
%
 \multirow{3}{*}{\begin{minipage}{25mm}GPT-2 prior\\(trigram features)\end{minipage}}
 & prior argmax    & 86.33 & 77.61 \\
 & CE              & 87.18 & 77.90 \\
 & \textbf{RQ}     & \textbf{93.18} & 84.26 \\
 \bottomrule
\end{tabular}
}

\footnotesize
$^a$\cite{meng2018weakly}
$^b$\cite{mekala2020contextualized}
$^c$\cite{meng2020text}
$^d$\cite{wang2021xclass}
$^e$\cite{mekala2021coarse}
\label{tab:coarse_text_results}
\end{table}

\section{Discussion and conclusion}

In summary, we found that the generative distribution in a free energy criterion can be left implicit to the minimization process in posterior (discriminative) model training. This allowed us to unite the training of neural networks $q(\h|x_i; \theta)$ for prediction of labels $\h$ from features $x$ with the modeling of the prior $p_i(\h)$, possibly with its own latent structure. Implicit modeling of the conditional generative distributions removes the burden of training accurate (and therefore large or deep) generative models, but still allows natural generative approaches to modeling priors.

Learning a discriminative network $q$ and its implicit posterior model $r$ via the \textbf{QR} and \textbf{RQ} methods can unify common supervised learning paradigms with realistic label supervision settings, enabling high-fidelity predictions from weak supervision sources carrying far less information. 
%
%
%
%
%
The additional experimental results in \S\ref{sec:additional_exp} detail further results for weakly supervised image segmentation, self-supervised learning, and co-segmentation in video data.  
%
%

Code is available in an accompanying GitHub repository (see \S\ref{sec:code}): \url{https://github.com/estherrolf/implicit-posterior}.



\begin{contributions}
\newtext{       
E.R., N.M., A.G., and N.J. jointly conceived the main ideas and their analysis and presentation in this work. 
E.R. conducted the land cover experiments.
N.M. conducted the experiments on negative labels and ranks, text, and lymphocytes and ran the land cover baselines.
A.G. conducted the experiments on video tracking and the \textit{Le s\'educteur} experiments.
A.J. and N.J. conducted the experiments on self-supervised image clustering.
C.R. helped with compute and storage resources and with implementation of land cover experiments in TorchGeo. 
All authors collaboratively wrote the paper.
}
\end{contributions}

\begin{acknowledgements}

\newtext{
We thank Anthony Ortiz for helpful feedback during the ideation and writing stages of this work. We also thank the anonymous reviewers for their comments and suggestions.}

\newtext{The main contributions of this work were conceptualized and conducted while E.R. and A.G. were interns at Microsoft Research, Redmond.
%
Computation resources were provided by Microsoft AI for Earth. E.R. additionally acknowledges the support of a Google PhD Fellowship.}
\end{acknowledgements}

\bibliography{references}

\end{document}
