%\documentclass{uai2022}  % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\pdfoutput=1

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{siunitx} % for proper typesetting of numbers and units

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{amssymb}
\usepackage{bm}
\usepackage{bbm}
\usepackage{amsmath}  % Define \boldsymbol (in amsbsy too) and align
\usepackage{amsthm}
\usepackage{xcolor}
\usepackage{textcomp}
\usepackage{multirow}
\usepackage{mathtools}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{thmtools}

%\usepackage[margin=1in]{geometry}

\usepackage{xr}
\externaldocument{chen_560-supp}

\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{positioning}

% TikZ node styles. The arn_* names and the original (French) comments refer
% to a red-black tree ("arbre rouge noir"), but as defined here both circular
% styles draw black on a white fill and differ only in text width.
%   treenode: shared base (centered text, no inner padding, sans-serif)
%   arn_n / arn_r: bold circular nodes, text width 1.5em / 1.0em
%   arn_x: small rectangular node (0.5em x 0.5em minimum size)
\tikzset{
  treenode/.style = {align=center, inner sep=0pt, text centered,
    font=\sffamily},
  arn_n/.style = {treenode, circle, black, font=\sffamily\bfseries, draw=black,
    fill=white, text width=1.5em},% red-black tree, black node
  arn_r/.style = {treenode, circle, black, font=\sffamily\bfseries, draw=black,
    fill=white, text width=1.0em},% red-black tree, red node
  arn_x/.style = {treenode, rectangle, draw=black,
    minimum width=0.5em, minimum height=0.5em}% red-black tree, nil
}

\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}


% bolded matrix
\usepackage{bm}
\usepackage{enumitem}

% Uncomment the next line to define \ShowNotes; the \ifdefined\ShowNotes
% tests further down then enable \colornote output and bold red \num values.
% \newcommand*{\ShowNotes}{}
% \yell{<text>}: typeset <text> in red.
\newcommand{\yell}[1]{\textcolor{red}{#1}}
% \assume{<text>}: currently passes <text> through unchanged; the
% commented-out variant below would render it in bold blue instead.
% \newcommand{\assume}[1]{\textcolor{blue}{\textbf{#1}}}
\newcommand{\assume}[1]{#1}

% Theorem-like environments. None of the declarations passes a shared-counter
% or parent-counter argument, so each environment is numbered by its own
% independent counter (Theorem 1, Lemma 1, Definition 1, ...).
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{observation}{Observation}
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}

%% OPERATORS:
% Upright names are set with \operatorname / \mathrm (amsmath) instead of
% \textrm / \text: the text-mode commands inherit the surrounding text shape,
% so the names would be typeset in italics inside theorem-like environments.
% Norm: proper double-bar delimiters rather than two adjacent single bars
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% argmin / argmax: #1 is the subscript, #2 the expression that follows
\newcommand{\argmin}[2]{\operatorname{argmin}_{#1}~#2}
\newcommand{\argmax}[2]{\operatorname{argmax}_{#1}~#2}
% Inner product
\newcommand{\ip}[2]{\left\langle#1, #2\right\rangle}
% Trace
\newcommand{\tr}{\operatorname{tr}}
% Expected value: #1 is the subscript (may be empty), #2 the argument
\newcommand{\E}[2]{\mathbb{E}_{#1}\left[#2\right]}
% Sample mean
\newcommand{\Ehat}[1]{\hat{\mathbb{E}}\left[#1\right]}
% Variance: #1 is the subscript, #2 the argument
\newcommand{\Var}[2]{\operatorname{Var}_{#1}\left[#2\right]}
% Covariance (bold operator name): #1 is the subscript, #2 the argument
\newcommand{\Cov}[2]{\operatorname{\mathbf{Cov}}_{#1}\left[#2\right]}
% Indicator
\newcommand{\ind}[1]{\mathbbm{1}\left\{#1\right\}}
\newcommand{\indpm}[1]{\mathbbm{1}^{\pm}\left\{#1\right\}}
% sech
\newcommand{\sech}{\operatorname{sech}}
% diag
\newcommand{\diagm}[1]{\operatorname{diagm}\left(#1\right)}
% supp
\newcommand{\supp}{\operatorname{supp}}
% independent: doubled \perp symbol, built via \mathpalette so that it
% follows the current math style
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
% sign
\newcommand{\sgn}{\operatorname{sgn}}
% labeling function
\newcommand{\lf}{\lambda}
% extended labeling function
\newcommand{\lfbar}{\bar{\lambda}}
% estimated probability
\newcommand{\phat}{\hat{p}}
% b with an upright "min" label as subscript
\newcommand{\bmin}{b_{\mathrm{min}}}
% nearest-neighbor operator name
\newcommand{\NN}{\operatorname{NN}}

%% variables
% Calligraphic shorthands. Among those used in the visible text: \D is the
% unlabeled dataset, \X the input space, \Y the label set, \Z the embedding
% space and \C the partition of the embedding space.
\newcommand*{\D}{\mathcal{D}}
\newcommand*{\T}{\mathcal{T}}
\newcommand*{\X}{\mathcal{X}}
\newcommand*{\Y}{\mathcal{Y}}
\newcommand*{\p}{\mathcal{P}}
\newcommand*{\loss}{\mathcal{L}}
\newcommand*{\C}{\mathcal{C}}
\newcommand*{\A}{\mathcal{A}}
\newcommand*{\Z}{\mathcal{Z}}

% Name of the proposed system, set in small caps
\newcommand*{\sysname}{\textsc{Liger}}

%\newenvironment{bsmallmatrix}
%  {\left[\begin{smallmatrix}}
%  {\end{smallmatrix}\right]}

% Reviewer notes, shown only when \ShowNotes is defined (see the commented-out
% \newcommand*{\ShowNotes}{} earlier in the preamble).
% \colornote{<color>}{<tag>}{<text>} prints "<tag> <text>" in bold <color>;
% without \ShowNotes all three arguments are discarded.
% \normalfont\bfseries replaces the obsolete \bf switch (which also reset the
% font before selecting bold); the group ends the font change, so no trailing
% \normalfont is needed.
\ifdefined\ShowNotes
  \newcommand{\colornote}[3]{{\color{#1}\normalfont\bfseries #2 #3}}
\else
  \newcommand{\colornote}[3]{}
\fi

% Custom colors (rgb components in [0,1]), used by the note macros.
\definecolor{darkred}{rgb}{0.7,0.1,0.1}
\definecolor{darkgreen}{rgb}{0.1,0.5,0.1}
% NOTE(review): this redefines the name `cyan' as (0.7,0,0.7), i.e. a purple
% tone rather than xcolor's predefined cyan -- confirm this is intended.
\definecolor{cyan}{rgb}{0.7,0.0,0.7}
\definecolor{dblue}{rgb}{0.2,0.2,0.8}
\definecolor{maroon}{rgb}{0.76,.13,.28}
\definecolor{burntorange}{rgb}{0.81,.33,0}
\definecolor{royalpurple}{rgb}{0.47,.31,0.66}

% Per-author note macros: each forwards its text to \colornote with a fixed
% color and tag, so they print nothing unless \ShowNotes is defined.
% \newcommand{\note}[1]{\colornote{maroon}{}{#1}}
\newcommand{\todo}[1]{\colornote{cyan}{TODO}{#1}}
\newcommand{\mayee}[1]{\colornote{darkgreen}{Mayee:}{#1}}
\newcommand{\authortwo}[1]{\colornote{burntorange}{A2:}{#1}}
\newcommand{\authorthree}[1]{\colornote{red}{A3:}{#1}}

% Bold task/dataset names, one macro per name
\newcommand*{\spam}{\textbf{Spam}}
\newcommand*{\spouse}{\textbf{Spouse}}
\newcommand*{\weather}{\textbf{Weather}}
\newcommand*{\recommender}{\textbf{Recommender}}
\newcommand*{\interview}{\textbf{Interview}}
\newcommand*{\commercial}{\textbf{Commercial}}
\newcommand*{\tennis}{\textbf{Tennis}}
\newcommand*{\basketball}{\textbf{Basketball}}

% \num{<value>}: marks a reported number. With \ShowNotes defined the value
% is highlighted in bold red; otherwise it is printed unchanged.
% not a colornote since we don't want these to ever be removed from the document
% \normalfont\bfseries replaces the obsolete \bf switch; the group ends the
% font change, so no trailing \normalfont is needed.
% NOTE(review): siunitx also defines \num, so this definition would clash if
% the commented-out \usepackage{siunitx} were enabled.
\ifdefined\ShowNotes
  \newcommand{\num}[1]{{\color{red}\normalfont\bfseries #1}}
\else
  \newcommand{\num}[1]{#1}
\fi

% Headers
% \minihead{<title>}: run-in heading. Adds .45em of vertical space, suppresses
% paragraph indentation, and prints "<title>." in bold followed by a space.
\newcommand{\minihead}[1]{{\vspace{.45em}\noindent\textbf{#1.} }}

% Build switches. \newif leaves a new conditional false until its ...true
% command is issued, so \ifarxiv is false here.
\newif\ifarxiv

% \ifsinglecolumn selects between the one-line and the line-broken variants of
% the displayed equations in the body; it is explicitly set to false
% (two-column layout).
\newif\ifsinglecolumn
\singlecolumnfalse

\title{Shoring Up the Foundations: Fusing Model Embeddings and Weak Supervision}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% Add authors
\author[1]{\href{mailto:mfchen@cs.stanford.edu}{Mayee~F.~Chen}$^*$}
\author[1]{\href{mailto:danfu@cs.stanford.edu}{Daniel~Y.~Fu}\thanks{Equal Contribution. A preliminary version of the results in this paper can be found at https://arxiv.org/abs/2006.15168.}}
\author[2]{Dyah~Adila}
\author[1]{Michael~Zhang}
\author[2]{Frederic~Sala}
\author[1]{Kayvon~Fatahalian}
\author[1]{Christopher~R\'e}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science \\
    Stanford University\\
    Stanford, CA, USA
}
\affil[2]{%
    Department of Computer Science\\
    University of Wisconsin-Madison\\
    Madison, WI, USA
}
  
\begin{document}

\maketitle

% for compiling before there's content
% \nocite{*}

\begin{abstract}
  %!TEX root = ../main.tex

Foundation models offer an exciting new paradigm for constructing models with out-of-the-box embeddings and a few labeled examples.
%
However, it is not clear how to best apply foundation models without labeled data.
%
A potential approach is to fuse foundation models with weak supervision frameworks, which use weak label sources---pre-trained models, heuristics, crowd-workers---to construct pseudolabels.
%
The challenge is building a combination that best exploits the signal available in both foundation models and weak sources.
%
We propose \sysname, a combination that uses foundation model embeddings to improve two crucial elements of existing weak supervision techniques.
%
First, we produce finer estimates of weak source quality by partitioning the embedding space and learning per-part source accuracies.
Second, we improve source coverage by extending source votes in embedding space.
%
Despite the black-box nature of foundation models,
%
we prove results characterizing how our approach improves performance and show that lift scales with the smoothness of label distributions in embedding space.
%
On six benchmark NLP and video tasks,
\sysname\ outperforms vanilla weak supervision by \num{14.1} points, weakly-supervised kNN and adapters by \num{11.8} points, and kNN and adapters supervised by traditional hand labels by \num{7.2} points.

\end{abstract}

%!TEX root = ../main.tex




\section{Introduction}
\label{sec:intro}

\input{figs/banner.tex}

%What is the problem?
Foundation models---large pretrained models such as GPT-3, BERT, CLIP, and DALL-E \citep{brown2020language, devlin-2019-bert, radford2021learning, dalle}---offer powerful representations that can be used in a broad array of settings~\citep{Bommasani21FM}. 
%
These models have achieved state-of-the-art performance on many tasks.
%
However, it remains unclear how to best apply foundation models in situations where users lack access to any labeled data but do have some weak signals.
%
These are the cases where another class of techniques---weak supervision \citep{Ratner18, fu2020fast}---shines. 


%Why is it interesting and important?
The broad success of foundation models (FMs) suggests that fusing them with weak supervision may offer substantial benefits.
%
Intuitively, the signals present in both can be used to replace large amounts of hand-labeled data in supervised learning. These signals are complementary.
%
Foundation models are trained on huge amounts of data and thus offer powerful general-purpose embeddings.
%
Weak supervision frameworks rely on multiple weak sources of signal that can be synthesized into pseudolabels for downstream training.
%
These weak sources typically express specialized domain expertise. 
%
The fusion may enable each component to be improved: FM embeddings can be used without labeled data, while weak sources may be extended to be more general-purpose.


%Why is it hard? (E.g., why do naive approaches fail?) %Why hasn't it been solved before? (Or, what's wrong with previous proposed solutions? How does mine differ?)
%
Our goal is to combine these complementary signals to address two challenges in existing approaches to weak supervision.
%
The first challenge is performing fine-grained estimation of source quality.
%
Current weak supervision approaches typically coarsely model source quality by assuming error distributions are uniform over unlabeled points~\citep{ratner2019training, fu2020fast}, but source quality may vary across points in actuality.
%
The second challenge is producing votes on points where sources abstain. 
Weak sources often abstain, so that current approaches suffer from low coverage and have many points lacking any signal. 
%
We seek to exploit the powerful embeddings from FMs---and the geometry induced by them---to address these challenges.


%What are the key components of my approach and results? Also include any specific limitations.
We propose \sysname\ (named after a well-known fusion of powerful animals), a new weak supervision approach based on the notion of \emph{local} quality of weak sources in the FM embedding space.
%
We introduce an efficient algorithm that partitions the embedding space and learns per-part local source accuracies. 
%
\sysname\ also extends weak sources into nearby regions of the embedding space that they previously abstained on, improving coverage.
%
Despite the fact that FMs are typically black-box, our localized approach exploits a simple measurable notion of their signal: the smoothness of the label distribution in the embedding space.
%
When the distribution of label values does not vary significantly over an embedding region, local source accuracies can be estimated well, and local source extensions maintain their accuracy.
%
We introduce generalization error bounds that individually characterize the impact of partitioning and extending.
These error bounds scale in the embedding smoothness and involve a bias-variance tradeoff in 
the number of partitions and the radii that specify extensions, suggesting that careful incorporation of the FM's signal into our approach is necessary.

We evaluate \sysname\ on six benchmark NLP and video weak supervision tasks, fusing weak sources with GPT-3 embeddings~\citep{brown2020language, neelakantan2022text} for the NLP tasks, and with image embeddings from CLIP~\citep{radford2021learning} for the video tasks.
We compare \sysname\ against using FMs or weak supervision on their own, as well as baseline techniques for fusing them together.
First, \sysname\ outperforms two strong baselines for traditional supervision of FMs, kNN and adapters~\citep{houlsby2019parameter}, by \num{7.2} points, and outperforms traditional weak supervision by \num{14.1} points.
Next, \sysname\ outperforms kNN or adapter-based fusions of weak supervision and FMs by \num{11.8} points.
We find that lift scales with embedding smoothness---confirming our theoretical findings.
We measure the smoothness of CLIP embeddings against BiT-M~\citep{kolesnikov2020big}, ResNet-101 embeddings pretrained on ImageNet~\citep{ILSVRC15}, and raw pixels on a video task.
We find that CLIP embeddings are smoothest and result in the best performance.
Similarly, we find that using the right prompt for GPT-3 has a strong effect on smoothness and performance on a relation extraction task.

In summary, we contribute:
\begin{itemize}[leftmargin=*]
    \item \sysname, a new approach for fusing foundation models with weak supervision by exploiting local smoothness of labels and weak sources in embedding space.
    \item Finite-sample generalization error bounds of our algorithm that scale in this smoothness.
    \item Evaluation of \sysname\ on six benchmark NLP and video weak supervision tasks, where \sysname\ outperforms simple fusions of foundation models and weak supervision, as well as either on its own.
\end{itemize}


%!TEX root = ../main.tex

\section{Background}
\label{sec:background}

We describe the problem setting for weak supervision (Section~\ref{sec:setup}). We introduce two general challenges in weak supervision that our approach using foundation model embeddings can mitigate.
We then propose a model and explain its two stages---source quality estimation and pseudolabel inference (Section~\ref{sec:lm}). 
We provide a brief background on the estimation technique from~\cite{fu2020fast}, on top of which we build our approach.

\subsection{Problem Setup} \label{sec:setup}
Our goal is to predict label $y \in \Y = \{-1,+1\}$ from datapoints $x \in \X$. If we had access to pairs $(x, y)$, we could train a supervised model. However, we do not have access to any samples of $y$; instead, we observe $m$ \emph{weak sources} $\bm{\lf} = \{\lf_1, \dots, \lf_m\}$, each
voting or abstaining on each point $x$ via a probabilistic \textit{labeling function}
$\lf_j: \X \rightarrow \Y \cup \{0\}$ for all $j \in [m]$. 
We refer to $\lf_j(x) = 0$ as an abstain, which occurs when a source is uncertain or not applicable on a point. 

We also have access to FM embeddings. These embeddings are the outputs of a mapping $f: \X \rightarrow \Z$ from input space to an embedding space $\Z$ equipped with metric $\rho: \Z \times \Z \rightarrow \mathbb{R}^+$. 
This mapping is fixed and obtained from an off-the-shelf model.
Overall, we have an unlabeled dataset $\D = \{x_i\}_{i = 1}^n$ of $n$ i.i.d.\ points, as well as access to $m$ weak sources and the embedding map $f$.

Given an input $x$ and $\bm{\lf}(x)$, we aim to learn a \emph{label model} that predicts $y$ by estimating $\hat{\Pr}(y | \bm{\lf}, x)$ (we drop the $x$ in $\bm{\lf}(x)$ when obvious). The goal of the label model is to combine sources based on their individual accuracies (i.e., $\lf_i$'s rate of agreement with $y$) by upweighting high-quality sources and downweighting low-quality ones. The resulting pseudolabels given by $\hat{\Pr}(y | \bm{\lf}, x)$ can be used to train a downstream supervised \emph{end model} or used directly as predictions. The latter case is often ideal, since users need not train an additional model. We focus on this setting.

\textbf{Two Challenges and Opportunities.} 
Next, we describe two challenges common to weak supervision techniques. Fusing weak supervision with FM embeddings presents opportunities to mitigate these challenges.
\begin{itemize}[itemsep=0.5pt,topsep=0pt,leftmargin=*]
\item {\bf Coarse Accuracy Modeling.} The most common assumption in weak supervision is to model $\hat{\Pr}(y | \bm{\lf}, x)$ as $\hat{\Pr}(y | \bm{\lf})$. That is, conditioned on the weak sources, the true label is viewed as independent of the features, so only one set of accuracies is learned over the data. Removing this assumption is desirable, since the feature space may have information about the task not captured fully by weak sources. However, naively attempting to model per-point accuracies leads to noisy estimation.
\item {\bf Low Coverage.} Weak sources frequently abstain, leading to low coverage---a situation where much of the dataset has no votes. A simple mitigation is to extend votes from nearby non-abstaining points, but this is risky if the notion of distance is not well-aligned with the label distribution. 
\end{itemize}


An intuitive way to tackle these two challenges is to operate \emph{locally}. 
Suppose the source votes and the true label satisfy some level of smoothness such that within some local region of the feature space, they have a low probability of changing values. 
We can then model accuracies specific to such local regions and can extend source votes to points they abstain on within the regions. 
However, raw image and text features may lack signal and not offer sufficient smoothness to permit operating locally. 
By acting on the embedding space, the desired smoothness property is improved (see Figure~\ref{smoothness}). We can thus obtain finer-grained accuracy estimation and improved coverage by using FM embeddings to model local accuracies and extend locally.

Next, we make these notions concrete by presenting the explicit model for $\Pr(y, \bm{\lf} | x)$.

\subsection{Label Model} \label{sec:lm}
We model $\Pr(y, \bm{\lf} | x)$ as a probabilistic graphical model. 
%
Our use of this model has two steps. First, in training, we must estimate the accuracy parameters of $\Pr(y, \bm{\lf} | x)$ without access to $y$. Then, at inference, we compute $\hat{\Pr}(y | \bm{\lf}, x)$.

Let the graphical model be based on $G = (V, E)$, where $V = \{y\} \cup \bm{\lf}$ and $E$ consists of edges from $y$ to each $\lf_j$ (see Figure~\ref{fig:banner} middle).
For simplicity, we assume there are no dependencies between the weak sources, although the dependencies can be learned~\citep{varma2019learning} and handled by our choice of base estimator from~\citet{fu2020fast}. Therefore, our approach can be extended to that case as well.
We model the data distribution as
\ifsinglecolumn
\begin{align}
\Pr(y, \bm{\lf} | x) = \frac{1}{Z} \exp &\Big(\underbrace{\theta_y(x) y}_{\text{Class Balance}} + \sum_{i = 1}^m \underbrace{\theta_i(x) \lf_i y}_{\text{Source Accuracy}} + \sum_{i = 1}^m \underbrace{\theta_{i, 0}(x) \ind{\lf_i = 0}}_{\text{Abstain Rate}} \Big) 
\label{eq:pgm} 
\end{align}
\else
\begin{align}
\Pr(y, \bm{\lf} | x) = \frac{1}{Z} \exp &\Big(\underbrace{\theta_y(x) y}_{\text{Class Balance}} + \sum_{i = 1}^m \underbrace{\theta_i(x) \lf_i y}_{\text{Source Accuracy}} \nonumber \\
&\qquad   + \sum_{i = 1}^m \underbrace{\theta_{i, 0}(x) \ind{\lf_i = 0}}_{\text{Abstain Rate}} \Big) 
\label{eq:pgm} 
\end{align}
\fi
with partition function $Z$ and a set of canonical parameters per $x$, $\Theta(x) = \{ \theta_y(x), \theta_i(x), \theta_{i, 0}(x) \; \forall i \in [m]\}$. 
%
An important property above is that $\lf_i \independent \lf_j | y, x\; \forall i \neq j \in [m]$.

The model concretely portrays the two challenges in weak supervision. 
%
First, canonical parameters $\Theta(x)$ that are a function of the input can capture varying accuracy across the data. 
This is less strict than prior formulations that model the marginal $\Pr(y, \bm{\lf})$ with one set of canonical parameters without considering input data.
However, estimating $\Theta(x)$ is challenging; parametric approaches require certain assumptions on the function $\Theta$ as well as the distribution of $x$ in order to recover the ground truth labels, but these assumptions (e.g., Gaussian $x$) are often not realistic. Standard nonparametric approaches have a high computational complexity and rely on smoothness of the input space $\X$.
%
Second, when $\lf_i(x) = 0$, the weak source provides no information on $x$ at inference and is thus typically ignored on that point in previous approaches. 
This is reflected in the graphical model by Lemma~\ref{lemma:abstain} in Appendix~\ref{sec:supp_pgm}, by which $\Pr(y | \lf_i = 0, \bm{\lf} \backslash \lf_i, x) = \Pr(y | \bm{\lf} \backslash \lf_i, x)$. 
In fact, the weak sources provide no direct signal on $x$  when $\bm{\lf}(x) = \vec{0}$. 

\textbf{Pseudolabel Inference.} To perform inference, we compute $\hat{\Pr}(y | \bm{\lf}, x)$ for some $x\in\X$.  This is done via Bayes' rule and the conditional independence of weak sources: $\Pr(y | \bm{\lf}, x) = \prod_{i = 1}^m \Pr(\lf_i | y, x) \Pr(y | x) / \Pr(\bm{\lf} | x)$. The latent parameter of interest in this decomposition is $\Pr(\lf_i | y, x)$, which corresponds to the accuracy of $\lf_i$.


\textbf{Source Parameter Estimation.} Previous approaches have considered how to estimate $\Pr(\lf_i | y)$ in a model of $\Pr(\bm{\lf}, y)$ via the \emph{triplet method}~\citep{fu2020fast}, using conditional independence properties.
For our setting, \eqref{eq:pgm} tells us that $\lf_i y \independent \lf_j y | \lf_i \wedge \lf_j \neq 0, x$ for any $i \neq j$ (Lemma~\ref{lemma:triplet_independence} in Appendix~\ref{sec:supp_pgm}). 
As a result, $\E{}{\lf_i y | \lf_i \neq 0, x} \times \E{}{\lf_j y | \lf_j \neq 0, x} = \E{}{\lf_i \lf_j y^2 | \lf_i \wedge \lf_j \neq 0, x} = \E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, x}$, which consists of observable variables. 
Define $a_i(x) = \E{}{\lf_i y | \lf_i \neq 0, x}$ as the \emph{accuracy} of $\lf_i$ on $x$. 
If we introduce a third $\lf_k$, we can generate a system of equations over $a_i(x), a_j(x), a_k(x)$ in terms of the conditional expected products of pairs of $\lf_i, \lf_j, \lf_k$. 
As a result, 
\ifsinglecolumn
\begin{align}
|a_i(x) | :=  \sqrt{\bigg| \frac{\E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, x} \E{}{\lf_i \lf_k | \lf_i \wedge \lf_k \neq 0, x}}{\E{}{\lf_j \lf_k | \lf_j \wedge \lf_k \neq 0, x}}\bigg|}, \label{eq:triplet}
\end{align}
\else
\begin{align}
&|a_i(x) | := \label{eq:triplet} \\
&\sqrt{\bigg| \frac{\E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, x} \E{}{\lf_i \lf_k | \lf_i \wedge \lf_k \neq 0, x}}{\E{}{\lf_j \lf_k | \lf_j \wedge \lf_k \neq 0, x}}\bigg|}, \nonumber
\end{align}
\fi
and likewise for $a_j(x), a_k(x)$.
More details are in Appendix~\ref{sec:supp_triplet}.
\eqref{eq:pgm} allows us to write $\Pr(\lf_i | y, x) = \frac{1 + \sgn(\lf_i y) a_i(x)}{2} \times \Pr(\lf_i \neq 0 | x)$ (Lemmas~\ref{lemma:abstain} and~\ref{lemma:symmetry}), so the desired probability estimate is just a linear transformation of $a_i(x)$ scaled by $\lf_i$'s coverage.



%!TEX root = ../main.tex

\section{Fusion Algorithm}
\label{sec:method}


We are ready to present \sysname, our approach to fusing foundation model embeddings and weak supervision.
We explain the two components: first, how to compute conditional estimates of the label model parameters over local regions of the partitioned embedding space for finer-grained accuracy estimation;
second, how to extend weak sources via a kNN-like augmentation in the embedding space, improving their coverage and hence the signal available at inference.
The full approach is shown in Algorithm~\ref{alg:main}. 

\begin{algorithm}[t]
	\caption{\sysname}
	\begin{algorithmic}
		\STATE \textbf{Input:}
		Dataset $\D = \{x_i\}_{i = 1}^n$, weak sources $\bm{\lf}$,  embedding mapping $f$ and metric $\rho$, threshold radii $r_1, \dots, r_m$, partition $\C$ and class balances $\Pr(y | C_j)$ for $j \in [s]$.
		\STATE \textbf{Returns:} Label model $\hat{\Pr}(y | \bm{\lfbar}, x)$.
		\FOR{$\lf_i \in \bm{\lf}$}
			\STATE Construct extended source $\lfbar_i$ using $r_i, f, \rho$ as in~\eqref{eq:extended}.
		\ENDFOR
		\FOR{$C_j \in \C$}
			\FOR {$\lfbar_i \in \bm{\lfbar}$}
				\STATE Compute accuracy $\hat{a}_i(C_j)$ using Algorithm~\ref{alg:triplet} on $\lfbar_i$ over $C_j$, and compute coverage $\hat{\Pr}(\lfbar_i \neq 0 | C_j)$ on $\D$. 
				\STATE Set $\hat{\Pr}(\lfbar_i | y, C_j)$ equal to $\frac{1 + \sgn(\lfbar_i y) \hat{a}_i(C_j)}{2} \hat{\Pr}(\lfbar_i \neq 0 | C_j) $ for $\lfbar_i \in \{-1, 1\}$, $\hat{\Pr}(\lfbar_i = 0 | C_j)$ otherwise.
			\ENDFOR
			\STATE Compute $\hat{\Pr}(\bm{\lfbar} | C_j)$ on $\D$.
		\ENDFOR
		\RETURN For test point $x \in \X$, compute $\hat{\Pr}(y | \bm{\lfbar}, x) = \hat{\Pr}(y | \bm{\lfbar}, C(x)) = \frac{\prod_{i = 1}^m \hat{\Pr}(\lfbar_i | y, C(x)) \Pr(y | C(x))}{\hat{\Pr}(\bm{\lfbar} | C(x))}$.
	\end{algorithmic}
	\label{alg:main}
\end{algorithm}


\paragraph{Local Parameter Estimation}

Our first task is to compute the label model's local parameters. Based on~\eqref{eq:triplet}, the quantities to estimate are of the form $\E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, x}$, $\Pr(\lf_i \neq 0 | x)$, $\Pr(\bm{\lf} | x)$, $\Pr(y | x)$. 
These conditional statistics can be estimated using nonparametric approaches such as the Nadaraya-Watson estimator, but they require $\mathcal{O}(n)$ computations per point at inference. 

Instead of estimating parameters per point, we partition the embedding space and compute \emph{per-part} statistics. Intuitively, this choice exploits smoothness.
If label distributions are smooth, i.e., they do not vary greatly within a local region, it is sufficient to estimate per-point statistics using a part given that parts are not too large. Controlling the size of the partition is thus important in determining how well we can approximate per-point statistics.

Concretely, partition $\Z$ into $s$ subsets $\C = \{C_1, \dots, C_s\}$ of equal size $n' = \frac{n}{s}$ (we use K-means clustering with $K=s$ in practice).
Denote $C(x)$ as the subset $f(x)$ belongs to. 
Instead of estimating statistics and performing inference conditioned on $x$, we condition on $C(x)$, producing $s$ sets of parameters overall. 
We estimate $\E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, C(x)}$, yielding a local accuracy estimate $\hat{a}_i(C(x))$ formalized in Algorithm~\ref{alg:triplet}, as well as $\Pr(\lf_i \neq 0 | C(x))$, $\Pr(\bm{\lf} | C(x))$, $\Pr(y | C(x))$. Then, we use $\hat{\Pr}(y | \bm{\lf}, x) = \hat{\Pr}(y | \bm{\lf}, C(x))$ as our label model prediction on $x$.
These estimates are done over the subsets; for instance, $\Pr(\bm{\lf} | C(x)) \approx \frac{1}{n'}\sum_{x' \in C(x)} \ind{\bm{\lf}(x') \texttt{=} \bm{\lf}}$.
We assume that the class balances on subsets, $\Pr(y | C(x))$, are known.
There are also several techniques that can be used to estimate these~\citep{ratner2019training}, or they can be treated as hyperparameters.


\paragraph{Weak Source Extension}
Next, we improve the model of $\hat{\Pr}(y | \bm{\lf}, x)$ by increasing source coverage. 
Let $\lfbar_i$ be an extended labeling function with corresponding threshold radius $r_i > 0$ for $i \in [m]$.
The extension works as follows.
For any $x$, let $\NN(x) = \argmin{x' \in \D: \lf_i(x') \neq 0} \rho(f(x), f(x'))$ be the nearest neighbor of $x$ in embedding space from $\D$ such that $\lf_i$ has coverage on it.
$\lfbar_i$ uses nearest neighbors to weakly label points within $r_i$ of $\lf_i$'s support on $\D$. Formally,
\begin{align}
\lfbar_i(x) := \begin{cases}
\lf_i(x) & \lf_i(x) \neq 0 \\
\lf_i(\NN(x)) & \rho(f(x), f(\NN(x))) \le r_i \\ 
0 & \text{o.w.}
\end{cases}. \label{eq:extended}
\end{align}

We can view $\lfbar_i$ as an augmentation on $\lf_i$ using $\D$ and $f$.
We thus perform parameter estimation and inference using $\bm{\lfbar}$ instead of $\bm{\lf}$, namely learning $\Pr(y | \bm{\lfbar}, C(x))$. 

There are two advantages to using extended sources.
First, extended sources improve sampling error, since expressions like $\E{}{\lf_i \lf_j | \lf_i \wedge \lf_j \neq 0, x}$ are estimated over more data in $\D$.
Second, $\lfbar_i$ provides signal at inference on points that $\lf_i$ previously abstains on. However, the quality of this signal greatly depends on $r_i$. If $\lf_i$ is overextended and the embedding space is not sufficiently smooth, points far away from $\lf_i$'s support may receive incorrect extended source votes, suggesting that careful choice of $r_i$ is needed.

Our approach combines the two components discussed---partitioning the embedding space and extending sources---to output predictions $\hat{\Pr}(y | \bm{\lfbar}, C(x))$ as in Algorithm~\ref{alg:main}. Note that our approach builds on the algorithm from~\cite{fu2020fast}, but partitioning and extending can also be done on top of other weak supervision algorithms that model things differently. 

%!TEX root = ../main.tex

\section{Theoretical Analysis}
\label{sec:theory}
Now we turn to analyzing Algorithm~\ref{alg:main}. Our goal is to understand how performance depends on the key parameters: fineness of the partition $\C$, radii $r_i$ of the extensions used to improve coverage, and smoothness of the embedding space.

We begin with a result on the generalization error of the label model $\hat{\Pr}(y | \bm{\lf}, x)$, which relies on the number of partitions $s$ to control the granularity of the estimated parameters (Theorem~\ref{thm:gen_err}). Then, we discuss the improvement from using $\bm{\lfbar}$ instead of $\bm{\lf}$. We first bound the local accuracy of an extended source in a region where it previously abstained (Lemma~\ref{lemma:extended_acc}), and then we show that as long as this local accuracy is better than random, we can further reduce the generalization error (Theorem~\ref{thm:lift}). Theorem~\ref{thm:gen_err} presents a bias-variance tradeoff depending on $s$, while Theorem~\ref{thm:lift} has a tradeoff dependent on the threshold radius $r_i$. In both cases, $s$ and $r_i$ must be carefully set based on the signal in the FM embeddings, namely the smoothness of label distributions in the FM embedding space, in order to optimize performance. We provide proofs in Appendix~\ref{sec:supp_proofs}, synthetic experiments supporting our findings in Appendix~\ref{sec:supp_exp_synthetics}, and smoothness measurements on real data in Section~\ref{sec:exp-smoothness} and Appendix~\ref{sec:supp_smooth}.

Define the generalization error of the label model using weak sources $\bm{\lf}$ as the expected cross-entropy loss, $L(\bm{\lf}) = \mathbb{E}_{\D, x, y, \bm{\lf}}[-\log \hat{\Pr}(y | \bm{\lf}, x)]$. 



\subsection{Label Model Generalization Error} \label{subsec:gen_err}
We bound the generalization error $L(\bm{\lf})$ of the label model using the unextended, initial weak sources. The key quantity in this analysis is embedding smoothness: %We present our smoothness assumption for this result.
\begin{definition}[Lipschitzness]
The distributions $\Pr(y | x)$ and $\Pr(\lf_i | y, x)$
are \emph{Lipschitz-smooth} on the metric space $(\Z, \rho)$ with constants $K_y, K_{\lf}, K_{\lf, 0} > 0$ if for all $i \in [m]$,
\ifsinglecolumn
\begin{align*}
&|\Pr(y = 1 | x) - \Pr(y = 1 | x')| \le K_y \rho(f(x), f(x')), \\
&|\Pr(\lf_i = 1 | y, \lf_i \neq 0, x) - \Pr(\lf_i = 1 | y, \lf_i \neq 0, x')| \le K_{\lf}\rho(f(x), f(x')), \\
&|\Pr(\lf_i \neq 0 | x) - \Pr(\lf_i \neq 0 | x')| \le K_{\lf, 0} \rho(f(x), f(x')),
\end{align*}
\else
\begin{align*}
&|\Pr(y = 1 | x) - \Pr(y = 1 | x')| \le K_y \rho(f(x), f(x')), \\
&|\Pr(\lf_i = 1 | y, \lf_i \neq 0, x) - \Pr(\lf_i = 1 | y, \lf_i \neq 0, x')| \\
& \quad \le K_{\lf}\rho(f(x), f(x')), \\
&|\Pr(\lf_i \neq 0 | x) - \Pr(\lf_i \neq 0 | x')| \le K_{\lf, 0} \rho(f(x), f(x')),
\end{align*}
\fi
We refer to these three properties as label, source, and coverage Lipschitzness, respectively.

\label{assumption:lipschitzness}
\end{definition}
In words, if the constants $K_y, K_{\lf}, K_{\lf, 0}$ are small, the class balance of $y$ and the way each source votes (or doesn't) do not vary significantly over a local region of the embedding space. %The support itself per source is also smooth over the embedding space.

We define some additional quantities. Set $\alpha = \max_i \E{x}{\frac{1}{p_{ij}} \; \big| \; p_{ij} \neq 0}$, where $p_{ij} = \Pr(\lf_i \neq 0 | f(x) \in C_j)$ is the coverage of $\lf_i$ on $C_j$, to be the largest average inverse source coverage over the subsets. $\alpha$ corresponds to how often sources abstain. Assume that $a_i(C_j) > 0$ for all $\lf_i$ and $C_j$, meaning that the average source accuracy on a subset is better than random. Then, define $a_{\max} = \max_{i,j} a_i(C_j)$, and $b_{\min} = \min\limits_{i, j, k} \{\E{}{\lf_i \lf_k | \lf_i \wedge \lf_k \neq 0, C_j}, \Ehat{\lf_i \lf_k | \lf_i \wedge \lf_k \neq 0, C_j} \}$ as the minimum rate of agreement between sources over subsets, where $\hat{\mathbb{E}}$ denotes the empirical estimate on $\D$. Define $d_{C_j}=\max_{f(x), f(x') \in C_j} \rho(f(x), f(x'))$ as the diameter of $C_j$ and $d_{\C}=\E{x}{d_{C(x)}}$ as its average.

\begin{restatable}[]{theorem}{generr}
Suppose that data $x, y, \bm{\lf}$ follows the model in~\eqref{eq:pgm} and $\Pr(y | x)$ and $\Pr(\lf_i | y, x)$ for each $\lf_i$ are Lipschitz-smooth. The generalization error of the label model $\hat{\Pr}(y | \bm{\lf}, x)$ in Algorithm~\ref{alg:main} when $r_i = 0 \; \forall i$ can be decomposed into $L(\bm{\lf}) = \text{Bias} + \text{Variance} + \text{Irreducible Error} + o(1/n)$, where
\ifsinglecolumn
\begin{align*}
&\text{Bias} \le 2 d_{\C}(K_y + mK_{\lf} + mK_{\lf, 0}), \\
&\text{Variance} \le \frac{ms}{ n} \bigg(\frac{3 \alpha (1 - b_{\min}^2)}{8b_{\min}^2 (1 - a_{\max}^2)} \Big(\frac{1}{b_{\min}^4} + \frac{2}{b_{\min}^2} \Big) + 1 \bigg), \\
&\text{Irreducible Error} = H(y | \bm{\lf}, x),
\end{align*}
\else
\begin{align*}
&\text{Bias} \le 2 d_{\C}(K_y + mK_{\lf} + mK_{\lf, 0}), \\
&\text{Variance} \le \frac{ms}{n} \bigg(\frac{3\alpha (1 - b_{\min}^2)}{8b_{\min}^2 (1 - a_{\max}^2)} \Big(\frac{1}{b_{\min}^4} + \frac{2}{b_{\min}^2} \Big) + 1 \bigg), \\
&\text{Irreducible Error} = H(y | \bm{\lf}, x),
\end{align*}
\fi
where $H(y | \bm{\lf}, x)$ denotes conditional entropy.
\label{thm:gen_err}
\end{restatable}

We discuss each term of this bound.
\begin{itemize}[itemsep=0.5pt,topsep=0pt,leftmargin=*]
\item The bias comes from the partition $\C$, since conditional statistics on $C(x)$ are not equivalent to those on $x$. When the embedding space is smooth with small $K_y, K_{\lf}, K_{\lf, 0}$, the bias is low. Note that making the average subset diameter $d_{\C} \rightarrow 0$ makes the bias go to zero.
\item The variance comes from sampling error in Algorithm~\ref{alg:triplet} and $\hat{\Pr}(\lf_i \neq 0 | C_j)$. This quantity scales in $\mathcal{O}(s\alpha /n)$ and also depends on accuracy and agreement among weak sources.
\item The irreducible error depends on the quality of $\bm{\lf}$. If knowledge of $\bm{\lf}$ significantly reduces uncertainty in $y$, i.e., the sources contain lots of signal, this quantity is low. On the other hand, $H(y | \bm{\lf}, x)$ is maximized when $\bm{\lf} \independent y | x$, i.e., there is no signal about $y$ in $\bm{\lf}$.
\end{itemize}

Our result reveals a bias-variance tradeoff dependent on the number of parts $s$. As $s$ increases, subset diameter $d_{\C}$ tends to decrease, resulting in lower bias because the subset parameters estimated will be closer in true value to those conditional on $x$. The variance increases in $s$ because there are fewer points per subset for estimation. 
The $s = 1$ case, which incurs a large bias, is algorithmically equivalent to the approach in~\cite{fu2020fast}. Such approaches thus suffer from model misspecification in our setting---and likely in most practical cases---as they assume uniform quality per source. 



\subsection{Improvement from Extensions}\label{subsec:lift}
Suppose that $x, y, \bm{\lfbar}$ follows~\eqref{eq:pgm}.
When we use $\bm{\lfbar}$ rather than $\bm{\lf}$ (i.e. $r_i \neq 0$), there are several changes to the decomposition in Theorem~\ref{thm:gen_err}:
\begin{itemize}[itemsep=0.5pt,topsep=0pt,leftmargin=*]
\item The bias is now bounded by $2d_{\C}K_y + 2m(d_{\C} + 2\max_i r_i) (K_{\lf} + K_{\lf, 0})$ (see Lemma~\ref{lemma:ext_bias} in Appendix~\ref{sec:supp_proofs}). We must consider when $\NN(x)$ is not in $C(x)$, essentially resulting in a wider subset diameter.
\item The variance is still $\mathcal{O}(1/n)$, but multiplicative factors change. For instance, $\alpha$ decreases due to improved coverage, thus decreasing the variance. 
\item The irreducible error is now $H(y | \bm{\lfbar}, x)$. 
\end{itemize}

We analyze $H(y | \bm{\lfbar}, x)$ in this section.
$\lfbar_i$ provides more signal than $\lf_i$ at inference on points where $\lf_i(x) = 0$, but the signal about $y$'s value may be incorrect. Extending $\lf_i$ using too large an $r_i$ could yield incorrect source votes, resulting in lower accuracy of the extended weak source.

We first present a result on how $r_i$ controls the extended source's accuracy. Define $a_i = \E{}{\lf_i y | \lf_i \neq 0}$ as the average accuracy of $\lf_i$, and $\bar{a}_i(r_i) = \E{}{\lfbar_i y | \lfbar_i \neq 0, \lf_i = 0}$ as $\lfbar_i$'s average accuracy on the extended region.
We also need a notion of smoothness of $y$ between the original support and the extended region. We define a local notion of \emph{probabilistic Lipschitzness} (PL), originally introduced in~\cite{urner2013probabilistic}. 

\begin{definition}[Probabilistic Lipschitzness]
Define $\p_{\lf_i} = \Pr_{x, y}(\cdot | \lf_i \neq 0)$ to be the distribution of $(x, y)$ over the support of $\lf_i$, and let $\p_{\lf_i, x}$ be its marginal distribution on $x$. Then $\p_{\lf_i}$ is $M$-\emph{probabilistically Lipschitz} for an increasing function $M: \mathbb{R}^+ \rightarrow [0, 1]$ if for any $r > 0$,
\ifsinglecolumn
\begin{align*}
\Pr_{x, y \sim \p_{\lf_i}}(\exists (x', y') &\in \X \backslash \supp(\p_{\lf_i, x}) \times \Y: \rho(f(x), f(x')) \le r, y' \neq y) \le M(r).
\end{align*}
\else
\begin{align*}
\Pr_{x, y \sim \p_{\lf_i}}(\exists (x', y') &\in \X \backslash \supp(\p_{\lf_i, x}) \times \Y: \\
& \rho(f(x), f(x')) \le r, y' \neq y) \le M(r).
\end{align*}
\fi
We refer to this property as local label PL. 
\label{def:pl}
\end{definition}

In words, the probability that there is a point outside of the support of $\lf_i$ but within $r$ of $(x, y) \sim \p_{\lf_i}$ with a different label from $y$ is bounded by an increasing function of $r$. We also define $\beta_i = \mathbb{E}[\lf_i y | \lf_i \neq 0, \exists (x', y'): \lf_i(x') = 0, \rho(f(x), f(x')) \le r_i, y' \neq y]$ as $\lf_i$'s accuracy over a region close to where $\lf_i$ is extended and $y$ changes value.

With this definition, we show that:
\begin{restatable}[]{lemma}{extendedacc}
Suppose $\p_{\lf_i}$ is $M$-probabilistically Lipschitz. The average accuracy of $\lfbar_i$ on the extended region is at least
$\bar{a}_i(r_i) \ge a_i - (1 + \beta_i) M(r_i)$.
\label{lemma:extended_acc}
\end{restatable}

Our result provides local accuracy guarantees on $\lfbar_i$ as a function of the original $\lf_i$'s accuracy, the probabilistic Lipschitzness of the embedding space, and the $r_i$ the user sets.
Extending a source with higher original accuracy will yield stronger accuracy guarantees in the extended region. 
On the other hand, if $M(r_i)$ is too large due to improper $r_i$ or lack of smoothness, the true label is more likely to change value, and hence accuracy in the extended region worsens. 

Now we can use our result on $\bar{a}_i(r_i)$ to analyze the improvement in irreducible error. We extend just one weak source $\lf_i$ by $r_i$ and keep $\lf_{-i} := \bm{\lf} \backslash \lf_i$ unextended. 
Define $p_i = \Pr(\lfbar_i \neq 0, \lf_i = 0)$ as the proportion of the region where $\lfbar_i$ is extended and $p(\lf_{-i}) = \E{y', \lf_{-i}, \lfbar_i \neq 0, \lf_i = 0}{\Pr(y = y' | \lf_{-i}, x)}$ as the label model's probability of outputting the correct label in the extension region when only using $\lf_{-i}$.

\begin{restatable}[]{theorem}{lift}
Suppose that data follows the model in~\eqref{eq:pgm}. The irreducible error decreases by at least the following amount when using $\lfbar_i$ rather than $\lf_i$ in Algorithm~\ref{alg:main}:
\begin{align*}
H(y | \bm{\lf}, x) - H(y | \bm{\lfbar}, x) \ge 2 p_i (1 - p(\lf_{-i}))^2 \cdot \bar{a}_i(r_i)^2.
\end{align*}
\label{thm:lift}
\end{restatable}

Lift increases with probability mass $p_i$ on the extended region since more of the data is impacted by $\lfbar_i$. Lift is not as significant if $p(\lf_{-i})$ is large because the other weak sources already are providing sufficient signal for $y$. Most importantly, lift scales with how far $\bar{a}_i(r_i)$ is from $0$ (random voting). This highlights a tradeoff in $r_i$: as $r_i$ increases, $p_i$ increases but the lower bound on $\bar{a}_i(r_i)$ from Lemma~\ref{lemma:extended_acc} decreases. This shows that threshold radii must be selected carefully; if the embedding space has strong probabilistic Lipschitzness (i.e. small $M$) or the original weak source has high accuracy, then the source can be extended further while providing lift. However, overextension of the source can yield low local accuracy and thus less lift.

Our results demonstrate that $s$ and $r_i$ control the label model's performance, and setting these terms depends on how smooth label distributions are in the embedding space.


%!TEX root = ../main.tex

\section{Experiments}
\label{sec:exp}
\input{tables/main_results.tex}

This section evaluates the following claims about \sysname:
\begin{itemize}[itemsep=0.5pt,topsep=0pt,leftmargin=*]
    \item \textbf{Performance (Section~\ref{sec:exp-performance}):} \sysname\ outperforms vanilla weak supervision, as well as baseline approaches for using foundation models directly, either with traditional weak supervision or hand supervision.
    \item \textbf{Smoothness (Section~\ref{sec:exp-smoothness}):} Lift is correlated with the smoothness of the label distribution in the representation space.
    We measure smoothness and performance of CLIP against three other embedding methods on a video task, and measure three prompting strategies for GPT-3 on a relation extraction task.
    \item \textbf{Ablations (Section~\ref{sec:exp-ablations}):} Both components of \sysname---partitioning the representation space and extending labeling function votes---are important for performance.
\end{itemize}

\paragraph{Datasets}
We evaluate \sysname\ on six benchmark NLP and video tasks used to evaluate previous weak supervision methods~\citep{fu2020fast,zhang2021wrench}.
In NLP, \spam\ identifies spam YouTube comments~\citep{alberto2015tubespam}; \weather\ identifies the sentiment of weather-related tweets~\citep{CrowdflowerWeather}; and \spouse\ identifies spouse relationships in newspaper articles~\citep{corney2016million}.
In video, \commercial\ identifies commercial segments in TV news~\citep{hong2021analysis, fu2019rekall}; \tennis\ identifies rallies in tennis segments; and \basketball\ identifies basketball videos in a subset of ActivityNet~\citep{caba2015activitynet}. Each dataset consists of a large unlabeled training set, a smaller hand-labeled \textit{development set} (train/dev split sizes from 187/50 points to 64,130/9,479 points), and a held-out test set. We use the unlabeled training set to train label models and use the development set for a) training of traditional supervision baselines, and b) hyperparameter tuning of the label models, including $s$ and $r_i$.

\paragraph{Pre-trained embeddings} For the NLP datasets, we use pre-trained GPT-3 \citep{brown2020language} embeddings from OpenAI's Ada model.
For \spam\ and \weather, we simply embed the text directly.
For \spouse, we add a prompt ``Are [person 1] and [person 2] spouses?'' after the end of the sentence.
We discuss further prompting strategies in Section~\ref{sec:exp-smoothness}.
For video datasets, we use image embeddings from CLIP~\citep{radford2021learning} over individual frames of the videos.

\input{tables/video_smoothness}

\subsection{Performance}\label{sec:exp-performance}

We compare \sysname\ against baseline approaches for fusing foundation models with weak supervision, as well as against using either on their own.
We split our evaluation into two parts: methods that only have access to weak sources, and methods that additionally have access to the dev set.

\paragraph{Weak Sources Only}
We compare the performance of \sysname\ against vanilla weak supervision's label model (WS-LM)~\citep{fu2020fast}, as well as two end models, weakly-supervised kNN (WS-kNN), and weakly-supervised adapters (WS-Adapter).
In the latter two methods, we use the predictions from WS-LM to generate pseudolabels for the train set and use the FM embeddings as input data (since we do not access the full FM) to the kNN and adapter approaches.
We consider an adapter that is a linear layer on the FM embeddings. We also provide results on 3-layer MLP adapters in Appendix~\ref{sec:supp_exp}.
%We report results on a single layer for our adapters (MLP in Appendix~\ref{sec:supp_exp}), and we only use adapters on the final embeddings since we do not have access to the full foundation model.

Table~\ref{table:main_results} (left) shows the results, as well as statistics on the additive change in coverage ($\%$ of the dataset that sources vote on) between \sysname\ and WS-LM.
\sysname\ outperforms WS-LM and has better coverage (33.2 points on average).
\sysname\ also outperforms both of the baseline approaches for fusing foundation models with weak supervision, WS-kNN and WS-Adapter.

\paragraph{Weak Sources and Dev Labels}
Next, we compare performance against methods that have access to a small hand-labeled dev set.
We compare against two baselines: kNN and Adapter, both trained over the dev set labels.
For our method \sysname-Adapter, we train an adapter over \sysname\ labels on the train set, as well as the dev labels.
In some cases, \sysname\ labels are too noisy to provide good signal on the train set; in these cases, our solution automatically downsamples the pseudolabels on the train set.
We also provide the original \sysname\ prediction as input to the adapter.
See Appendix~\ref{sec:supp_details} for the details.

Table~\ref{table:main_results} (right) shows the results.
\sysname-Adapter outperforms Adapter and kNN.
On the datasets where \sysname\ labels are very accurate, we see additional lift from the adapters because we have more points to train on.
When the labels are not very accurate, our downsampling prevents the noisy labels from harming adapter performance.
In one case, learning an adapter over the embeddings is very hard (\spouse).
Here, providing the \sysname\ prediction as input is critical for performance.

\subsection{Embedding Smoothness}\label{sec:exp-smoothness}

We measure how smoothness of the embedding space affects the performance of \sysname.
First, we compare embeddings from CLIP against BiT-M embeddings~\citep{kolesnikov2020big}, a ResNet-101 pretrained on ImageNet~\citep{ILSVRC15}, and raw pixels.
Second, we vary the GPT-3 prompting strategy for \spouse\ and compare against two alternative methods that result in a less smooth representation.
We report label Lipschitzness---the smoothness of embeddings with respect to ground-truth labels---in this section.
See Appendix~\ref{sec:supp_smooth} for additional measures of Lipschitzness.

Figure~\ref{smoothness} (left) shows the performance of CLIP, BiT-M, ResNet-101, and raw pixels as embeddings for \sysname, as well as measures of Lipschitzness for each method (lower is smoother).
CLIP embeddings are smoother than the other methods---which matches their performance when used in \sysname.

\paragraph{Comparing Prompting Strategies}

Next, we examine the impact of prompting strategies for \spouse.
\spouse\ is a relation extraction dataset, where the task is to predict whether two entities in a sentence are married.
Since there may be multiple entities in a sentence, \spouse\ contains multiple duplicate sentences in the dataset, with different labels.
To alleviate this problem, we introduce a prompt ``Are [person 1] and [person 2] spouses?'' after the end of the sentence, where ``[person 1/2]'' are replaced by the names of the first/second entity in the sentence.
We compare this prompting strategy against two others: prepending the same prompt to the beginning of the sentence, and leaving the original sentence as-is, without any prompting.

Figure~\ref{smoothness} (right) shows the performance and smoothness of each of these prompting methods.
Adding the prompt to the end of the sentence results in the best performance and smoothest embeddings.
Both prompting methods perform better than leaving the sentence alone (the flat line is a result of multiple sentences with different labels having the same embedding).

\subsection{Ablations}\label{sec:exp-ablations}

\input{tables/ablations_k}

We report ablations on each component of \sysname.
Table~\ref{table:ablations_k} removes the partitioning component and the extensions component.
Partitioning improves performance on \num{four} tasks, and extensions improve performance on all tasks (\num{13.1} points of lift on average from partitioning, \num{3.8} points from extensions).
Combining both additionally offers the best performance on \num{four} tasks.


%!TEX root = ../main.tex

\section{Related Work}
\label{sec:related}

We present an abbreviated discussion of related work here. See Appendix~\ref{sec:supp_related} for an extended treatment.

Weak supervision frameworks typically model source accuracies to generate weak labels and then fine-tune an end model for generalization~\citep{Ratner18, bach2018snorkel, khetan2017learning, sheng2020gmail, fu2020fast, zhan2019sequentialws, safranchik2020weakly, boecking2019pairwise}.
One framework models the end-to-end process all at once~\citep{cachay2021endtoend}, but requires training the end model at the same time---which is computationally expensive with large foundation models.
Our work removes the fine-tuning step completely.

Our work is similar to transfer learning techniques, which adapt pretrained models for downstream tasks~\citep{kolesnikov2020big,devlin2018bert}.
Foundation models introduce a new requirement for the transfer learning setting: it may be impossible to fine-tune the original models~\citep{Bommasani21FM}.
We build on approaches such as prompting~\citep{lester2021power, brown2020language}, embedding search~\citep{neelakantan2022text}, and adapters~\citep{houlsby2019parameter,alain2016understanding}.


%!TEX root = ../main.tex

\section{Conclusion}
\label{sec:conc}
We present \sysname, a system for fusing foundation models and weak supervision.
We use embeddings to produce finer-grained estimates of weak source accuracies and improve weak source coverage.
We prove a series of results on how the performance of this approach scales with the smoothness of the embeddings, and demonstrate \sysname\ on six benchmark NLP and video weak supervision datasets.
We hope our work will encourage further work in combining foundation models and weak supervision and in utilizing the signal from foundation models to help with other tasks.


\begin{contributions}
The first two authors contributed equally. Co-first authors can prioritize their names when adding this paper's reference to their resumes.
\end{contributions}

\begin{acknowledgements}
%\paragraph{Authors' Note}
% The first two authors contributed equally. Co-first authors can prioritize their names when adding this paper's reference to their resumes.

%\section*{Acknowledgments}

We thank Fait Poms and Ravi Teja Mullapudi for helpful discussions.
We thank Neel Guha, Megan Leszczynski, Vishnu Sarukkai, and Maya Varma for feedback on early drafts of this paper.
We gratefully acknowledge the support of NIH under No. U54EB020405 (Mobilize), NSF under Nos. CCF1763315 (Beyond Sparsity), CCF1563078 (Volume to Velocity), 1937301 (RTML), and CCF2106707 (Program Synthesis for Weak Supervision); ARL under No. W911NF-21-2-0251 (Interactive Human-AI Teaming); ONR under No. N000141712266 (Unifying Weak Supervision); ONR N00014-20-1-2480: Understanding and Applying Non-Euclidean Geometry in Machine Learning; N000142012275 (NEPTUNE); NXP, Xilinx, LETI-CEA, Intel, IBM, Microsoft, NEC, Toshiba, TSMC, ARM, Hitachi, BASF, Accenture, Ericsson, Qualcomm, Analog Devices, Google Cloud, Salesforce, Total, the HAI-GCP Cloud Credits for Research program,  the Stanford Data Science Initiative (SDSI), Department of Defense (DoD) through the National Defense Science and
Engineering Graduate Fellowship (NDSEG) Program, Wisconsin Alumni Research Foundation (WARF), and members of the Stanford DAWN project: Facebook, Google, and VMWare. The U.S. Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright notation thereon. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views, policies, or endorsements, either expressed or implied, of NIH, ONR, or the U.S. Government. 
\end{acknowledgements}


%\bibliographystyle{plain}
\bibliography{chen_560}
%\appendix
%\input{sections/supp_frontmatter.tex}
%\input{sections/supp_related.tex}
%\input{sections/supp_glossary}
%\input{sections/supp_alg}
%\input{sections/supp_proofs}
%\input{sections/supp_details}
%\input{sections/supp_exp}

\end{document}
