\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{rotating}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{On the Informativeness of Supervision Signals\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<is2961@princeton.edu>?Subject=Your UAI 2023 paper}{Ilia~Sucholutsky}{}}
\author[1]{Ruairidh~M.~Battleday}
\author[2]{Katherine~M.~Collins}
\author[3]{Raja~Marjieh}
\author[1]{Joshua~C.~Peterson}
\author[1]{Pulkit~Singh}
\author[2,4]{Umang~Bhatt}
\author[5]{Nori~Jacoby}
\author[2,4]{Adrian~Weller}
\author[2,1]{Thomas~L.~Griffiths}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Computer Science\\
    Princeton University\\
}
\affil[2]{%
    Dept. of Engineering\\
    University of Cambridge\\
  }
  \affil[3]{%
    Dept. of Psychology\\
    Princeton University\\
}
  \affil[4]{%
    Alan Turing Institute\\
  }
\affil[5]{%
    Max Planck Institute for Empirical Aesthetics
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\vspace{10mm}
% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix

\section{Practical guidance on human soft label elicitation}

Our framework provides users with a way to quantify the relative amount of information contained in each type of label so that they can optimize which labels to collect for their dataset. Specifically, the findings from our theory, simulations, and experimental results are that the relative informativeness of labels depends on three factors associated with the dataset (the number of labeled examples, the number of classes, and the latent dimensionality) and two factors associated with labels (error rate and sparsity). Out of these, most factors are not under the user’s control but the primary factor that users can control (in supervised learning settings) is label sparsity. Our guidance to users is thus the following:
\begin{itemize}
    \item Use our framework to estimate the relative informativeness of the label types you are considering collecting. The key parameter to optimize is label sparsity so compute the informativeness of soft labels with different levels of sparsity. 
    \item Pick out promising label types and run a small pilot study collecting each label type for a small set of objects. Compute error rates and per-label costs for each label type. 
    \item Update the relative informativeness estimates based on error rates and calculate the cost-benefit tradeoff for each label type. Pick the type with the most favorable tradeoff.

\end{itemize}
For users who want a simpler procedure, we offer the following rule-of-thumb guidance:
\textit{
Generally, softer labels are preferable in smaller data regimes (e.g., one-shot and less-than-one-shot learning) while harder labels are preferable in big data regimes (i.e., many-shot learning).
}




\section{Label optimization simulations}
See Figure~\ref{fig:fig4}.
\begin{figure*}[htb!]
    \centering
    \includegraphics[width=0.24\textwidth]{fig4/fig4A.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4E.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4C.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4H.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4B.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4F.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4D.png}
    \includegraphics[width=0.24\textwidth]{fig4/fig4G.png}
    \caption{Loss curves for soft labels (solid green), sparse labels (purple), top-class labels (red), and hard labels (dashed green) based on subjective utility function ($u(\rho)$), cost weighting parameter ($\beta$), sparsity ($\hat k$), number of points ($n$), and number of classes ($k$). Global minima for sparse labels and top-class labels are marked with a point.}
    \label{fig:fig4}
\end{figure*}

\section{Elicited-Human vs Model-Predicted Entropy}
We investigate the hypothesis that the labels which confer the best downstream performance may strike a natural resonance with the models they are used as supervision signals for. In Figure~\ref{fig:modeltraininglabelent}, we compare the entropy of the training labels against the entropy of the trained models' predicted distributions. In other words, we compare the probability distributions produced by each model to the probability distributions that the model was trained on. We find a remarkable alignment between the entropy of the training labels and the entropy of the predictions of models trained on the \texttt{CIFAR-10S} varieties. Future work could investigate the links between the inductive biases of models and the labels best suited for training specific architectures.

% \begin{figure*}[htb!]
%     \centering
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_katie_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_katie_DS_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_raja_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_smoothed_low_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/control_ent_comp.png}
%     \includegraphics[width=0.24\textwidth]{ent_comps/aggregate_smoothed_high_ent_comp.png}
%     % \includegraphics[width=0.24\textwidth]{fig4/fig4G.png}
%     \caption{Comparing the entropy of the models' predictions against the entropy of the labels used to train them. The training label type is listed as the title for the respective histogram.}
%     \label{fig:modeltraininglabelent}
% \end{figure*}

\begin{figure*}[htb!]
    \centering
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_katie_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_katie_DS_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_raja_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_typicality_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_smoothed_low_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/control_ent_comp.png}
    \includegraphics[width=0.18\textwidth]{ent_comps/aggregate_smoothed_high_ent_comp.png}
    % \includegraphics[width=0.24\textwidth]{fig4/fig4G.png}
    \caption{Comparing the entropy of the models' predictions against the entropy of the labels used to train them. The training label type is listed as the title for the respective histogram.}
    \label{fig:modeltraininglabelent}
\end{figure*}

\section{Additional Details on Human Soft Labels} 
\subsection{Collecting \texttt{CIFAR-10DS} and Similarity Judgments}
Soft labels for \texttt{CIFAR-10DS}, as well as similarity judgments, were collected on Amazon Mechanical Turk (AMT). The recruitment and experimental pipelines were automated using the PsyNet framework for online experiment design \cite{harrison2020gibbs}. Prior to participation in the studies, participants provided informed consent in accordance with an Institutional Review Board (IRB), and were paid at a rate of \$12 per hour. In addition, participants were required to have successfully completed at least 2000 tasks on AMT.

To collect \texttt{CIFAR-10DS}, participants observed individual images and were given a set of sliders (10, one for each category) ranging from 0 to 1 and were asked to move the sliders in accordance with how well they thought each category matched a given image, with 0 being ``not at all matching'' and 1 being ``completely matching''. We aimed for about 10 multi-ratings per image and each participant completed 50 such multi-ratings.

As for similarity judgments, participants were presented with pairs of unlabeled images and were required to rate their similarity on a 7-point Likert scale ranging from 0 (``completely dissimilar'') to 6 (``completely similar''). Here we aimed for 5 judgments per pair of images and each participant completed an average of 80 such judgments.

\subsection{In-Filling \texttt{CIFAR-10S} Labels}
The \texttt{CIFAR-10S} labels collected in \cite{collins2022eliciting} cover only 1,000 of the 10,000 images in the full \texttt{CIFAR-10} test set. Note, however, that these 1,000 examples were already enriched to be those that are naturally more confusing -- so it can be considered a sensible sampling of what -10S labels \textit{may} look like more generally. However, for adequate comparison against the other label types, we needed to choose a labeling method to label the remaining 9,000. We elected two variants: 1) using hard labels, or 2) simulating \texttt{CIFAR-10S} labels via a sparsified version of \texttt{CIFAR-10DS}. The former represents a real-world cost efficient scenario; we could imagine a researcher only having the budget to annotate a subset of a dataset with soft labels. The second case is designed to mimic what the labels may have been like had we elicited \texttt{CIFAR-10S} over the full set. Taking only the scalar values for the top two highest sliders from \texttt{CIFAR-10DS} offered a nice entropy- and conceptual-match (entropy of 0.69 for \texttt{CIFAR-10S} to 0.75 for the adjusted \texttt{-10DS} labels). Future work could explore automated measures to extend label conversions (e.g., learning a mapping from \texttt{CIFAR-10DS} to simulated \texttt{CIFAR-10S} labels). We note that the \texttt{CIFAR-10S} labels used in this work are the T2 Clamp varieties, with a redistribution factor of $10\%$ following \citeauthor{collins2022eliciting}.

\textbf{CIFAR-10T} The \texttt{CIFAR-10T} labels are a novel set of labels we crowdsourced, comprising over 350,000 typicality ratings of images under their ground truth category (about 35 judgements per image). 1759 unique participants were recruited on Amazon Mechanical Turk, and presented with a sequence of 200 randomly sampled \texttt{CIFAR-10} test set images, upsampled to $160 \times 160$ pixels (see \cite{peterson2019human, battleday2020capturing}). Participants were given the category of each image, and asked to rate how
typical it was of the category on a sliding scale
from ``Not at all typical'' to ``Extremely typical''. We interpret an image's typicality as the probability of the ground truth class, and spread the remaining probability mass over the 9 remaining labels---a smoothed version of a \textit{sparse} soft label with $K=1$.

\subsection{Additional Similarity Judgment Studies}
We extend the GNMDS analyses in the main text by examining the similarity structure of image representations extracted from the penultimate layer of each network. For each image and network, we derive an abstract vector representation by storing the unit activations of the last layer during classification. Then, for each pair of images, we compute the cosine similarity between the representations derived from our classifiers. We correlate these to the ground truth similarity ratings, and present the results in Figure~\ref{fig:gnmdsHumanOther}. The images used for these analyses are discussed below, and displayed in Figures~\ref{fig:entropy_images} and~\ref{fig:relentropy_images}.

\begin{figure}[htb!]
    \centering
    \includegraphics[width=0.8\textwidth]{gnmds_label_compmodel.png}
    \caption{Correlation between ground-truth similarity judgments and the cosine similarity of image representations for different model architectures.}
    \label{fig:gnmdsHumanOther}
\end{figure}


\section{Additional Computational Experiment Details and Observations} 

\subsection{Models}
We use ten-fold cross-validation to partition the images of the \texttt{CIFAR-10} test subset into train and validation sets for each set of soft labels. We train a number of models using stochastic gradient descent over a range of learning rates and seeds, and use the best performing seed for all subsequent analyses (Table~\ref{tab:models}).

\begin{table}[!h]
    \centering
    \caption{Image Classifiers.} \label{tab:models}
    \begin{tabular}{rccc}
      \toprule % from booktabs package
      \bfseries Model & \bfseries Key Features & \bfseries  Parameters & \bfseries Citation \\
      \midrule % from booktabs package
      VGG & very deep connections & 14{,}728{,}266 & \cite{simonyan2014very}\\
      ResNet & residual connections &  & \cite{he2016deep}\\
      WRN & wide residual connections & 36{,}479{,}194 & \cite{zagoruyko2016wide}\\
      DenseNet & dense connections & 769{,}162 & \cite{huang2017densely}\\
      Shake shake & shake shake regularization & 11{,}709{,}514& \cite{gastaldi2017shake}\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\subsection{Datasets}
 In order of increasing distributional shift, \texttt{CIFAR-10 50K} is the \textit{training} subset of \texttt{CIFAR10} (50{,}000 images; \cite{krizhevsky2009learning}), \texttt{CIFAR10.1v6,v4} are two near-sample datasets constructed from the same TinyImages classes \cite{torralba200880} as \texttt{CIFAR-10} (2{,}000 images each; \cite{recht2018cifar}), our subset of \texttt{CINIC10} contains rescaled images from ImageNet using the \texttt{CIFAR-10} classes (210{,}000 images; \cite{cinic}), and \texttt{ImageNet-Far} contains a label-coarsened version of rescaled \texttt{ImageNet} images such that the CIFAR classes now contain a more diverse range of examples (for example, now ``deer'' contains ``ibex'' and ``gazelle''; 63{,}895 images; \cite{peterson2019human, cinic}). 

% \subsection{Learning from small data (generalization results)}
% We use the images selected by \cite{collins2022eliciting} to reduce the number of training examples, and provide a difficult set of images for models to learn. We train all models in the same way, except now using five-fold cross validation on 1{,}000, rather than 10{,}000. In Figure \ref{fig:80shot_gen} we present the cross-label and generalization results in this restricted data regime (80 high-label-entropy exemplars per class). We see the pattern from the full data regime become more extreme, with models trained on soft labels generalizing more robustly, and the most informative \textit{dense} soft labels (\texttt{CIFAR-10DS}) providing the best representational signal for far out-of-training distribution generalization.

% % \begin{figure}[h!]
% %     \centering
% %     \includegraphics[width=\textwidth]{80shot_gen.png}
% %     \caption{Generalization results from models trained on 80 examples per class.}
% %     \label{fig:80shot_gen}
% % \end{figure}

% \begin{figure*}[t!]
%     \centering
%     % \includegraphics[width=0.9\textwidth]{80shot_crosslabel_eval.png}
%     % \includegraphics[width=0.9\textwidth]{80shot_generalization_checks.png}
%     \includegraphics[width=0.9\textwidth]{FS80_crosslabel_eval.png}
%     \includegraphics[width=0.9\textwidth]{FS80_generalization_checks.png}
%     \vspace{-1mm}
%     \caption{Cross-label (top) and generalization results (bottom) results from models trained on 80 examples per class. }
%     \vspace{-3mm}
%     \label{fig:80shot_gen}
% \end{figure*}

\subsection{Softness, Task Accuracy, and Information Content}

In the main text, we depicted the relationship between label softness and task performance using crossentropy (CE) as our principal metric. We focus on CE as it better captures the fidelity of the models’ predictive distributions. This is particularly important when we evaluate the model on held-out soft labels; CE offers more information about model performance than just top-1 accuracy. However, we include top-1 accuracy in Figure \ref{fig:full_score_acc} for completeness.

\begin{figure*}[t!]
    \centering
    % \includegraphics[width=0.9\textwidth]{80shot_crosslabel_eval.png}
    % \includegraphics[width=0.9\textwidth]{80shot_generalization_checks.png}
    \includegraphics[width=0.9\textwidth]{FULL_crosslabel_eval_acc.png}
    \includegraphics[width=0.9\textwidth]{FULL_generalization_checks_acc.png}
    \vspace{-1mm}
    \caption{Cross-label (top) and generalization results (bottom), scored by top-1 accuracy against the respective labels. }
    \vspace{-3mm}
    \label{fig:full_score_acc}
\end{figure*}

% \subsection{Small-data accuracy results}

% \begin{table}[!b]
%     \centering
%     \caption{Small data results (accuracy).} \label{tab:few-shot}
%     \begin{tabular}{lcccc}
%       \toprule % from booktabs package
%       \bfseries Labels & \bfseries 80 l/c & \bfseries 8 l/c \\
%       \midrule % from booktabs package
%       \texttt{CIFAR-10} & 0.36 & 0.2\\
%       \texttt{CIFAR-10S+hard} & 0.38 & 0.21 \\
%       \texttt{CIFAR-10H} &  \textbf{0.39} & 0.22\\
%       \texttt{CIFAR-10S+dense} & 0.38 & \textbf{0.23}\\
%       \texttt{CIFAR-10LS (Low)} & 0.37& 0.18\\
%       \texttt{CIFAR-10DS} & 0.38 & 0.21 \\
%       \texttt{CIFAR-10LS (High)} & 0.40 & 0.22\\
%      \texttt{CIFAR-10T} & 0.35 & 0.18\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

We also depict performance in Figure~\ref{fig:infoContentPerf} as a function of the information content of the labels. Here, we use the Spearman rank correlation coefficient between the \texttt{CIFAR-10} GNMDS and elicited similarity judgments as a proxy for information content. Note, here, \texttt{CIFAR-10S+hard} and \texttt{CIFAR-10S+dense} have the same score, as the similarity judgments are collected over the 1000 shared original \texttt{CIFAR-10S} examples.

\begin{figure*}[t!]
    \centering
    % \includegraphics[width=0.9\textwidth]{crosslabel_eval.png}
    % \includegraphics[width=0.9\textwidth]{generalization_checks.png}
    \includegraphics[width=0.9\textwidth]{crosslabel_eval_info.png}
    \includegraphics[width=0.9\textwidth]{generalization_checks_info.png}
    \vspace{-1mm}
    \caption{\textbf{Top:} Model performance on different label types at test time, as a function of information content of the labels. Information content is approximated by the Spearman rank correlation with similarity judgments. \textbf{Bottom:} Generalization performance under increasing distributional shift, as a function of training label information content.}
    \vspace{-3mm}
    \label{fig:infoContentPerf}
\end{figure*}




\subsection{Effective dimensionality of soft labels}
Our theory and simulations address the number of features available for representation learning but do not discuss the nature of these features---i.e., whether they are essential or superfluous. Estimating the effective dimensionality of a dataset is tied to the nature of the computational task required. For classification, there is a range of methods for estimating this \citep[e.g.,][]{doi:10.1137/1116025,jha2023extracting}.

\subsection{Varying Label Softness}


% We thank the reviewer for this insightful comment, which has improved the work. We now provide a selective ablation study for increasing levels of sparsity in the Appendix, which interpolates between the previous sparse and dense labels and is consistent with the rest of our results. Again, the reason for choosing a different level of sparsity is dependent on the downstream task, a consideration pertinent to all of our findings that we address in a new paragraph in the discussion.

% The sparsity cross-entropy results are presented in the table below, averaged across architectures for clarity here (they will be presented in full in the final work):

% | **Classes per label** | **CIFAR10-50k** | **CIFAR10.1v4** | **CIFAR10.1v6** | **CINIC-10** | **ImageNet-Far** |
% |------------|---------|----------|----------|--------------|------------------|
% | k=2        | 0.46    | 0.769    | 0.761    | 1.32         | 1.565            |
% | k=3        | 0.475   | 0.769    | 0.759    | 1.297        | 1.51             |
% | k=4        | 0.488   | 0.77     | 0.768    | 1.255        | 1.47             |
% | k=10       | 0.76    | 1.086    | 1.081    | 1.40         | 1.597            |


% These empirical results corroborate the theory and empirical patterns described in the main paper. We provide two further levels of sparsity, interpolating between the dense (k=10) and sparse (k=2) soft labels with k=3 and k=4 sparse soft labels. For near-generalization performance (measured via validation set loss), training with sparser labels results in better performance. However, as the distributional shift of the validation set increases, training with denser soft labels provides an increasing benefit. As a reminder, the collection experiment for the dense soft labels (k=10) was noisier than for the sparse soft labels (k=2, 3, 4), meaning the effect in the last row of this table, although consistent, is dampened. 

% We next provide a selective ablation study for increasing levels of sparsity, which interpolates between the previous sparse and dense labels. Again, the reason for choosing a different level of sparsity is dependent on the downstream task, a consideration pertinent to all of our findings that we address in a new paragraph in the discussion.

We further investigate how the amount of softness we elicit from humans when constructing our supervision signals impacts downstream performance, via a selective ablation over increasing levels of sparsity (Table~\ref{tab:varyK}). In our real-world soft label experiments, we in-fill missing \texttt{CIFAR-10S} labels by simulating what the \texttt{CIFAR-10DS} labels would have been had they only provided uncertainty over $\hat{k}=2$ labels. As we have softness over all $k=10$ classes, we can simulate varying $\hat{k}$. We train additional models in-filling with $\hat{k} = 3$ and $4$, respectively. We find that the extra softness does not add substantial value over $\hat{k}=2$ when evaluating on near-domain generalization (i.e., CIFAR10-50k, .1v4, and .1v6), but does appear to have a positive effect when evaluating on further out-of-domain generalization (e.g., CINIC10 and ImageNet-Far). 

%Again, the reason for choosing a different level of sparsity is dependent on the downstream task, a consideration pertinent to all of our findings.

\begin{table}[]
    \centering
    \caption{Crossentropy (lower is better) as a function of varying the classes we permit human uncertainty specification over.}
    \label{tab:varyK}
    \begin{tabular}{@{}llllll@{}}
    \toprule
    Classes per Label & CIFAR10-50k & CIFAR10.1v4 & CIFAR10.1v6 & CINIC10 & ImageNet-Far \\ \midrule
    k = 2             & \textbf{0.46}        & \textbf{0.77}      & \textbf{0.76}       & 1.32     & 1.57         \\
    k = 3             & 0.48        & \textbf{0.77}        & \textbf{0.76}        & 1.30     & 1.51         \\
    k = 4             & 0.49        & \textbf{0.77}        & 0.77        & \textbf{1.26}     & \textbf{1.47}         \\
    k = 10            & 0.76        & 1.09        & 1.08        & 1.40     & 1.60         \\ \bottomrule
    \end{tabular}
    
\end{table}

% We further investigate how the amount of softness we elicit from humans when constructing our supervision signals impacts downstream performance. In .... [main experiments] ..., we in-fill missing \texttt{CIFAR-10S} labels via simulating if \texttt{CIFAR-10DS} labels had only provided uncertainty over $\hat{k}=2$ labels. As we have softness over all $k=10$, we can simulate varying $\hat{k}$. We train additional models in-filling with $\hat{k} = 3$ and $4$, respectively. We find that the extra softnessness does not add substantial value over $\hat{k}=2$, in either the full data (Figure \ref{fig:full_vary_k}) or few-shot (Figure \ref{fig:80shot_vary_k}) setting. While further work is needed to...., these empirical observations ... [something about cost-effectivness of elicitation and connecting to Ilia's top-class soft label theory]. 

% \begin{figure*}[t!]
%     \centering
%     % \includegraphics[width=0.9\textwidth]{80shot_crosslabel_eval.png}
%     % \includegraphics[width=0.9\textwidth]{80shot_generalization_checks.png}
%     \includegraphics[width=0.9\textwidth]{FULL_crosslabel_eval_varyk.png}
%     \includegraphics[width=0.9\textwidth]{FULL_generalization_checks_varyk.png}
%     \vspace{-1mm}
%     \caption{Cross-label (top) and generalization results (bottom) results from models trained on labels constructed from varied $k$. }
%     \vspace{-3mm}
%     \label{fig:full_vary_k}
% \end{figure*}

% \begin{figure*}[t!]
%     \centering
%     % \includegraphics[width=0.9\textwidth]{80shot_crosslabel_eval.png}
%     % \includegraphics[width=0.9\textwidth]{80shot_generalization_checks.png}
%     \includegraphics[width=0.9\textwidth]{FS80_crosslabel_eval_varyk.png}
%     \includegraphics[width=0.9\textwidth]{FS80_generalization_checks_varyk.png}
%     \vspace{-1mm}
%     \caption{Cross-label (top) and generalization results (bottom) results from models trained on 80 examples per class, when varying $k$. }
%     \vspace{-3mm}
%     \label{fig:80shot_vary_k}
% \end{figure*}

\subsection{Representative images and model predictions}
In Figures \ref{fig:entropy_images} and \ref{fig:relentropy_images} we present the images used as the basis of the similarity experiments (see above for details on label and similarity judgment collection). These images were chosen \textit{using our trained classification models} to include images that were likely to have high label entropy (Figure \ref{fig:entropy_images}), and images where model predictions diverged (Figure \ref{fig:relentropy_images}). The models making the predictions had not been trained on these images (i.e., the predictions were based on the held-out cross-validation folds). 

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{entropy_images.pdf}
    \caption{100 images from the \texttt{CIFAR-10} testing subset. These were chosen to include images that were likely to have high label entropy.}
    \label{fig:entropy_images}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{relentropy_images.pdf}
    \caption{100 images from the \texttt{CIFAR-10} testing subset. These were chosen to include images that were likely to cause model disagreement.}
    \label{fig:relentropy_images}
\end{figure}

In Figures~\ref{fig:plane}--\ref{fig:truck}, we present four exemplars from each class, along with the soft labels and model predictions. We see that this analysis picks out genuinely ambiguous images, with borderline cases between two classes, many classes, noisy images, and categorically uncertain images. For each class, the top row shows images on which models agree that label entropy is likely high (high average prediction entropy). The bottom row shows images on which models maximally disagree (high average symmetric relative entropy between pairwise comparisons of models).

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{plane.png}
    \caption{Representative ambiguous plane images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:plane}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{auto.png}
    \caption{Representative ambiguous automobile images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:auto}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{bird.png}
    \caption{Representative ambiguous bird images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:bird}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{cat.png}
    \caption{Representative ambiguous cat images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:cat}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{deer.png}
    \caption{Representative ambiguous deer images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:deer}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{dog.png}
    \caption{Representative ambiguous dog images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:dog}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{frog.png}
    \caption{Representative ambiguous frog images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:frog}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{horse.png}
    \caption{Representative ambiguous horse images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:horse}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{boat.png}
    \caption{Representative ambiguous ship images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:ship}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\textwidth]{truck.png}
    \caption{Representative ambiguous truck images. Top row: model agreement on high entropy image. Bottom row: maximal model disagreement.}
    \label{fig:truck}
\end{figure}

\section{Enlarged main results figure}
See Figure~\ref{fig:genCheck}. 

\begin{sidewaysfigure}[h!]
    \centering
    % \includegraphics[width=0.9\textwidth]{crosslabel_eval.png}
    % \includegraphics[width=0.9\textwidth]{generalization_checks.png}
    \includegraphics[width=0.9\textwidth]{FULL_crosslabel_eval.png}
    \includegraphics[width=0.9\textwidth]{FULL_generalization_checks.png}
    \vspace{-1mm}
    \caption{\textbf{Top:} Model performance on different label types at test time. \textbf{Bottom:} Generalization performance under increasing distributional shift. Each point represents the average score for a single model architecture (specified by color), trained on a particular label type (indicated via shape). Vertical lines represent points for a given label type.}
    \vspace{-3mm}
    \label{fig:genCheck}
\end{sidewaysfigure}

% \smallsec{Test Datasets}
% A key prediction from section \ref{section:theory} is that the uncertainty in our labels will be increasingly informative when generalizing to increasingly out-of-training-sample distributions. We test this prediction empirically by examining generalization ability to the following datasets:

% {\bf \texttt{CIFAR10}:} This is the standard within-dataset evaluation. Since our \texttt{CIFAR10H} soft labels are for the \texttt{CIFAR10} test set, here we use the 50{,}000-images of the standard \texttt{CIFAR10} training set to instead evaluate the models.

% {\bf \texttt{CIFAR10.1v6,v4}:} These are two 2{,}000-image near-sample datasets created by \cite{recht2018cifar} to assess overfitting to \texttt{CIFAR10} ``test'' data often used for validation. The images are taken from TinyImages \cite{torralba200880}  and match the sub-class distributions in \texttt{CIFAR10}. \texttt{v6} has 200 images per class while \texttt{v4} is the original class-unbalanced version (90\% overlap).

% {\bf \texttt{CINIC10}:} This is an out-of-sample generalization test. The \texttt{CINIC10} dataset collected by \cite{cinic} contains both \texttt{CIFAR10} images and rescaled ImageNet images from equivalent classes \cite{cinic}. For example, images from the {\em airplane, aeroplane, plane (airliner)} and {\em airplane, aeroplane, plane (bomber)} ImageNet classes were allocated to the {\em airplane} \texttt{CIFAR10} top-level class. Here we use only the 210{,}000 images taken from ImageNet.

% {\bf \texttt{ImageNet-Far}:} Finally, as stronger exemplar of distributional shift, we built ImageNet-Far. As above,  we used rescaled ImageNet images, but chose classes that might not be under direct inheritance from the \texttt{CIFAR10}-synonymous classes. For example, for the \texttt{CIFAR10} label {\em deer}, we included the ImageNet categories {\em ibex}, {\em gazelle}, and for the \texttt{CIFAR10} label {\em horse} we included the ImageNet category {\em zebra}, which was not included in \texttt{CINIC10}.

% \section{Example}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.
\clearpage
\bibliography{uai2023-template}

\end{document}
