% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
  \usepackage{xcolor}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
\definecolor{deepred}{HTML}{940000}
\hypersetup{linkcolor=deepred}
\hypersetup{citecolor=[rgb]{0.4,0.15,0.95}}
\usepackage{url}  
\title{Human-in-the-Loop \textit{Mixup}\\(Supplementary Material)}
\appendix
% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Katherine M. Collins\thanks{Correspondence to: \href{mailto:kmc61@cam.ac.uk}{kmc61@cam.ac.uk}}
}\author[1,2]{Umang Bhatt}
\author[1,3]{Weiyang Liu}
\author[1]{Vihari Piratla}
\author[4]{Ilia Sucholutsky}
\author[2,5]{Bradley Love}
\author[1,2]{Adrian Weller}
\affil[1]{University of Cambridge}
\affil[2]{The Alan Turing Institute}
\affil[3]{Max Planck Institute for Intelligent Systems}
\affil[4]{Princeton University}
\affil[5]{University College London}

  \begin{document}
  
\onecolumn %% Sets the supplement in a single column; remove this if a two-column layout is desired
\maketitle

\section{Related Work}

Our work connects most closely to human-in-the-loop data augmentation and the expansive literature surrounding human categorical perception from the cognitive science community, as well as ongoing efforts in the machine learning community to develop more efficacious \textit{mixup}-based data and label mixing functions. 

\subsection{Human-in-the-Loop Data Augmentation}
Incorporating expert feedback into the learning procedure has received increasing attention~\citep{chen2022perspectives}.
In particular, previous work has considered incorporating humans ``in the loop'' for data augmentation. For instance, DatasetGAN~\citep{datasetGAN} employs human participants to label GAN-generated images and feeds these back to the model to generate more synthetic data. \citet{counterfactuallyAugmentedHuman} similarly incorporate human feedback by having humans \textit{create} counterfactual samples, an approach which has been shown to be an efficient method to adjust model behavior~\citep{efficacyCounterfactual}. Other works have considered employing humans to provide ``rationales'' about examples to improve data-efficiency and downstream modeling performance~\citep{zaidan2007using}. Here, we marry these ideas in the context of \textit{mixup} by eliciting data and label-mixing function parameters to align with human percepts.
%\todo{Umang -- do you have thouhts on what else we should say, or other key works to cite, particulary based on your work with Valerie?}

\subsection{Human Categorical Perception} In cognitive science, eliciting humans' judgments over synthetically-constructed examples is a tried-and-true method to characterize human category boundaries \citep{newell2002categorical,folstein2013category,feldmanCategoricalPerception, folsteinFactorizedvBlended}. Such studies often reveal a non-linear structure of humans' percepts. For instance, in the audio domain, the identification of vowel categories has been found to demonstrate ``warping'' close to prototypical category members -- known as the ``perceptual magnet effect'' \citep{perceptualMagnetEffect, perceptualMagnetStats}. Similar nonlinearities have been found in the perception of boundaries between face identities \citep{beale1995categorical} and the transitions between 3D shapes \citep{newell2002categorical, morphSpacesShape}. Our linearly interpolated stimuli are similar in spirit to the morphological trajectories used in these works, as well as other synthetically-combined images \citep{hybridImages}. \citet{gruber2018perceptual} also consider 50/50 mixed images; however, their elicitation involves open-ended judgments, which do not permit the same kind of data and label mixing alignment studies as our methods, which more directly elicit human-inferred generative parameters. Our work also connects to other non-linear perceptual phenomena encountered in the visual domain; namely, binocular rivalry, whereby presenting participants with a different image in each eye has been shown to induce oscillatory percepts \citep{blake2002visual, tong2006neural}.

% Perhaps closest to our work, \citep{gruber2018perceptual} also study linear combinations in pixel space of \texttt{ImageNet} \citep{imagenet}; however, they focus on the 50/50 point only and instead elicit open-ended descriptions of what a participant sees in each example. In our work, we emphasize the benefits of being able to precisely investigate human alignment of the data and label mixing policies by informing participants of the endpoints. Though we begin to explore more open-ended elicitation in Appendix C. 

% We also study a kind of perceptual dominance; however, we always present the same image to both eyes and  are only eliciting a snapshot of people's perception. Perhaps closest to our work, \citep{gruber2018perceptual} also study linear combinations in pixel space of \texttt{ImageNet} \citep{imagenet}; however, they focus on the 50/50 point only and instead elicit open-ended descriptions of what a participant sees in each example. In our work, we emphasize the benefits of being able to precisely investigate human alignment of the data and label mixing policies by informing participants of the endpoints. Though we begin to explore more open-ended elicitation in Appendix B. Lastly, we note that there other synthetically-constructed stimuli similar to ours have been studies, such as ``hybrid images'' wherein frequency composition of images is controlled to create observer-distance-dependent percepts \citep{hybridImages}. In our work, we do not modulate the frequency of images; we only combine in pixel space.  

\subsection{Other \textit{mixup}-Based Synthetic Data Schemes} Many alternative \textit{mixup} data and label mixing functions have been proposed \citep{manifoldMixUp, cutMix, puzzleMix, coMixup, hendrycks2022pixmix}. Closest to our work, \citet{sohn2022genlabel} highlight particular issues with the linear interpolation in label space on the learned topology of the model's category boundaries and instead utilize a Gaussian Mixture Model (GMM)-based relabeling scheme to construct ``better'' labels than those used in baseline \textit{mixup}. Additional work on learning better pseudo-labels over \textit{mixup} samples has been proposed \citep{pseudoLabelMixup, cascante2020curriculum, fixmatch, Qiu2022DHT}. Similarly, Between-class (BC) learning \citep{bcLearningAudio,bcLearning} proposes hand-crafted adjustments to label construction to better align with human perception based on waveform modulations; however, to our knowledge, no previous works have \textit{directly} considered incorporating humans in-the-loop for either the construction of \textit{mixup} samples or associated relabeling.

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 


% \section{Analyzing Human Uncertainty}

% Additionally, while intuitive, we investigate whether there are specific predictors of when and why a mixed image may be hard to label -- e.g., perhaps images which are naturally ambiguous become even more muddled when combined. We use the entropy of the \texttt{CIFAR-10H} labels as a measure of image ``ambiguity''\citep{peterson2019human, battleday2020capturing}. Recall, \texttt{CIFAR-10H} labels are constructed from many annotator's judgments about the most probable image category; entropy is therefore computed over the frequencies of these class selections and captures some sense of the amount of disagreement between annotators. 

% \begin{figure}[h!]
%  \begin{center}
%  \includegraphics[width=0.6\linewidth]{figures/decomp_endpoint_ent.pdf}
% \caption{Confidence reported by annotators in their inference of $\lambda$, as a factor of whether the combined labels $y_i, y_j$ are high or low entropy. Entropy is measured over the \texttt{CIFAR-10H} human-derived labels. }
%  \label{fig:decompEndpoints}
%  \end{center}
% \end{figure}

% We compare humans' elicited confidence in their mixing coefficient, and the amount of relabeling ($| \lambda_h - \lambda_f |$) against the entropy of the \texttt{CIFAR-10H} labels of the images being combined. We find in Fig. \ref{fig:decompEndpoints} that if both endpoints are high entropy under \texttt{CIFAR-10H} (where we consider ``high'' being entropy $\geq$ 0.5), participants report markedly lower confidence in their inference than if both endpoints have low entropy (entropy $\leq$ 0.1). However, we do not find a significant effect of endpoint entropy and amount of relabeling. This suggests that the ambiguity of the underlying images being mixed plays some role in determining when the resulting synthetic image may be hard to label, but there remains a question as to what can predict high amounts of relabeling from participants. We leave these questions for future investigation.


\section{Additional Notes on \texttt{H-Mix}}

\subsection{Human Subject Experiments}

We include additional details on our human elicitation studies. For all experiments, we require participants speak English as a first language and reside in the United States. Across all experiments, the mean age for participants was 37.5 years old ($\pm$ standard deviation of 12.7 yrs). The self-reported sex breakdown was approximately 57\% male and 43\% female.

\paragraph{Elicitation (RQ1)} Each participant sees a total of 32 mixed images, where the final two are repeats. Repeats are primarily used here to measure raters' internal consistency\footnote{Participants' selections, for each interface type, change by a median of $0.1$ in repeat trials, suggesting some inconsistencies in participants' judgments which persist across elicitation methods.}. The median time taken per participant per image was 9.30 and 11.01 seconds for the \texttt{Construct} and \texttt{Select-Shuffled} interfaces, respectively. A bonus was offered to encourage participants to provide responses which would match what other participants would provide; we applied this bonus to all participants post-hoc resulting in the average participant being paid at a rate of \$11.78.

\paragraph{Multiple Interface Styles (RQ1)} Why do we consider two styles of elicitation interfaces? We reason that the first interface could be prone to ordering effects -- an astute participant could realize that they can count out where the midpoint is located. This led us to design the second variety (\texttt{Select-Shuffled}) wherein the participant sees all images shuffled simultaneously. We hypothesize that \texttt{Construct} could induce responses biased by the participant's starting position. To probe this, we run two sub-variants wherein participants start from either $\lambda_f = 0.1$ or $\lambda_f = 0.9$. 

\paragraph{Elicitation (RQ2)} Each participant sees 59--62 images, where two images are repeated. Repeats are placed at the end and correspond to the images presented on trials 15 and 20, respectively\footnote{We observe a median difference of $0.03$ and $0.05$ in the inferred mixing coefficient and confidence on repeat trials, indicative of high intra-annotator consistency.}. The order of the images presented in a batch, as well as the order of the endpoint labels displayed for a given image, are shuffled across participants. We follow the same third-person perspective prompting in Section 3 from \citet{efficientElic}. Participants are asked ``what combinations of classes'' they thought other participants would say is ``used to make'' each image, and ``how confident'' they thought other participants would be in their estimate. Responses are indicated on a slider per question. An example survey screen can be seen in Fig.~\ref{fig:inferCoeffInterface}. Subjects took a median of 8.41 seconds per image and were paid at a rate of \$8/hr, with an optional bonus which sought to encourage participants to provide calibrated confidence estimates, similar to that of \citet{vodrahalli2021humans}; the bonus was applied to all participants post-hoc. Each mixed image was seen by at least two different participants.

% \subsection{Relationship Between Participant Uncertainty and Mixing Coefficient} 

% A natural 

\subsection{Break from Monotonicity}

For users of \texttt{H-Mix}, it is worth noting that we do encounter some breaks with monotonicity (see Fig.~\ref{fig:nonMonotonic}) in a few of the aggregated ``category boundaries.'' We reason this could be in part due to several aspects of our set-up. First, our study involved irregular sampling across the space of mixing coefficients we consider: the 50/50 point is enriched. We ran two phases of elicitation: in the first, we sampled $6$ image classes per pair to be shown for three mixing coefficients: 0.5, and one chosen randomly from each of the sets \{0.1, 0.25\} and \{0.75, 0.9\}, respectively (810 images of the 2070). All 1260 other images are shown for a single mixing coefficient sampled uniformly from the set. Second, while we have human judgments for over 2000 total images, there are fewer than 50 synthetic images considered for each category pair, giving any participant noise -- or the odd image -- greater leverage to impact trends. We encourage others to use \texttt{HILL MixE Suite} and continue to scale this work and elucidate the stability of the inferred mixing coefficient category boundaries we begin to hint at here.

\begin{figure}[h!]
 \begin{center}
 \includegraphics[width=0.8\linewidth]{figures/non_monotonic_example_rev.png}
   \caption{Category boundary elicited from human participants involves a break with monotonicity.}
 \label{fig:nonMonotonic}
 \end{center}
 \end{figure}
 
\section{Confidence-Based Smoothing Details}

We include further details of our methodology for leveraging human-provided confidence to construct $\tilde{y}$ introduced in Section 5. Human-derived soft labels have been demonstrated to be valuable for learning \citep{softlossMed, peterson2019human, selfCiteSoftLabel, sandersambiguous}. We transform humans' reported confidence into a smoothing parameter to induce softness using an exponentially-decaying function of human-provided confidence $\omega$: $a \cdot b^{\omega}$; here, $a = 50, b = 0.0001$. We use the transformed confidence for additive smoothing on the two-category $\tilde{y}$, spreading mass accordingly across the full gamut of classes. That is, we smooth the mass between a completely uniform distribution and a ``two-hot'' label which uses the human-derived relabeling. Parameters $a, b$ are selected using a held-out set of regular \texttt{CIFAR-10} images (from $a \in \{5, 10, 15, 25, 50, 100\}, b \in \{0.00001, 0.0001, 0.001, 0.01, 0.1\}$). We recommend the consideration of alternate smoothing functions, which could, for instance, account for miscalibration in humans' reported confidence.

Further, we compare the impact of learning with aggregated versus de-aggregated participants' predictions. In Section 5, we considered learning with relabelings averaged across participants for a mixed image, and smoothed with confidence reports averaged across participants. Here, we consider instead separating out participants' responses to learn with individual relabelings smoothed by individual confidence, closely related to \citet{wei2022aggregate}. We find in Table~\ref{tab:trainHumanConf} that learning with \textit{de-aggregated} data could potentially offer greater performance gains. However, as \citet{wei2022aggregate} discuss, whether to aggregate can depend on many factors. Our empirical findings support the need for tailoring label construction in context.

\begin{table*}[]
    \centering
        \caption{Varying whether to aggregate when incorporating human confidence $\omega$ in label construction.}
\begin{tabular}{llll}
\toprule
 Label Type                                      & CE             & FGSM       & Calib   \\
\midrule
Ours (Avg with $\omega$)      & 1.48$\pm$0.06 & 8.89$\pm$1.59  & \textbf{0.19$\pm$0.01}      \\
 Ours (Separated with $\omega$) & \textbf{1.44$\pm$0.11}  & \textbf{8.33$\pm$1.92}  & \textbf{0.19$\pm$0.01}      \\
\bottomrule
\end{tabular}

    \label{tab:trainHumanConf}
\end{table*}



\section{Interfaces Included in \texttt{HILL MixE Suite}}

We display sample pages of the interfaces created and used in this work, which we release as part of \texttt{HILL MixE Suite}. Interfaces for Section 3 are shown in Figs.~\ref{fig:clickNextInterface} and \ref{fig:synthDataSelectInterface}; the interface used in Section 4 is depicted in Fig.~\ref{fig:inferCoeffInterface}.

 \begin{figure}[h!]
 \begin{center}
 \includegraphics[width=0.95\linewidth]{figures/clickToCreateInterface.png}
  \caption{Construct interface where participants press arrow keys to select $\tilde{x}.$}
    \label{fig:clickNextInterface}
 \end{center}
 \end{figure}

\begin{figure}[h!]
 \begin{center}
 \includegraphics[width=0.95\linewidth]{figures/elic_select.png}
  \caption{Interface for the selection of a given $\lambda_g$ from a set of possible mixed images.}
    \label{fig:synthDataSelectInterface}
 \end{center}
 \end{figure}

 \begin{figure}[h!]
 \begin{center}
 \includegraphics[width=0.95\linewidth]{figures/elic_interface.png}
   \caption{Interface for inferring the \textit{mixup} generative label parameter and providing confidence in such inference.}
    \label{fig:inferCoeffInterface}
 \end{center}
 \end{figure}

%  \begin{figure}[h!]
%  \begin{center}
%  \includegraphics[width=0.8\linewidth]{figures/uncertain_elic.png}
%   \caption{Alternate, soft label elicitation for synthetic mixed images, based on the interface of \citep{selfCiteSoftLabel}.}
%     \label{fig:softLabelInterface}
%  \end{center}
%  \end{figure}



\section{Alternative Synthetic Example Category Composition Elicitation} 

Given human participants are uncertain about the underlying mixing coefficient in a number of cases, we consider whether the category composition typically used in \textit{mixup} -- i.e., placing mass only on the labels of the images used to form the synthetic combined sample -- is reasonable. As demonstrated in the main text and in Fig.~\ref{fig:elicSoftLabelsMore}, a synthetic \textit{mixup} image may look like something else entirely.
%As demonstrated in Fig \ref{fig:elicSoftLabels}, the synthetic \textit{mixup} image may look like something else entirely. 

We therefore consider a follow-up small-scale human elicitation study wherein we relax the \textit{mixup} assumption that the label mixing function must output a label constructed only from the two classes used to form the mixed image -- and instead collect $\tilde{y}$ \textit{directly} by showing the mixed image %(\ie, image) 
to human annotators in the form of soft labels. This provides a comparison to the previous human-annotated endpoint label mixing coefficients, and can further inspire useful designs for the label mixing policy. 

\begin{figure*}[h!]
 \begin{center}
 \includegraphics[width=0.98\linewidth]{LaTeX/figures/more_soft_labels.png}
   \caption{Additional example soft labels elicited from individuals. Original \textit{mixup} label for each associated image is shown in red; the soft label elicited from humans (averaged over two individuals) is shown in blue. The left and center examples involve substantial discrepancies between human percepts and the label which would be used in \textit{mixup}; the rightmost image highlights that some percepts do match the underlying mixing components (even without being informed of the underlying classes). Examples are deliberately chosen to illustrate the range of soft labels elicited; all examples are included in \texttt{H-Mix}.}
    \label{fig:elicSoftLabelsMore}
 \end{center}
 \end{figure*}

\subsection{Study Design} We recruit $N=8$ participants again from Prolific \citep{palan2018prolific}, yielding soft labels over a total of $100$ mixed images. Each participant saw 25 mixed images; each mixed image of the $100$ was seen by two participants. The images are drawn from the same set of stimuli created in Section 4; however, here, we only show images with a mixing coefficient $\in \{0.25, 0.5, 0.75 \}$. Participants are told that images are formed by combining other images, and are asked to provide what they think others would see in the image. Participants are asked to specify what others would view as the most probable category with an associated percentage (on a scale of 0--100), an optional second most probable category with a probability, and any categories that would be perceived as definitely not in the image. We again employ the third-person viewpoint framing borrowed from \citet{efficientElic}. We rely on the soft label elicitation interface proposed in~\citet{selfCiteSoftLabel} and modify the instructions to be better suited to combinations of images. Following \citeauthor{selfCiteSoftLabel}, we construct ``Top 2 Clamp'' labels with a redistribution factor of 0.1, which controls how we spread mass over any categories still leftover as ``possible'' once accounting for those ruled out as definitely not possible.

%; an example screen is shown in Fig. \ref{fig:softLabelInterface}. 

\subsection{Analyzing Elicited Soft Labels for Synthetic Images}

We explore the correspondence between the elicited category compositions of the mixed images with the labels that would be used to generate the mixed image (as would be used in traditional \textit{mixup}; i.e., placing mass only on two categories). While participants did tend to place probability mass on the generating endpoints that correlated with the mixing coefficient used (Pearson $r = 0.52$), interestingly, we find that participants report thinking that 38.3\% ($\pm$0.6\%) of the probability mass of a label should be placed on \textit{different} classes from those which are used to create the image. This is remarkable and suggests that mixed images \textit{do not} consistently look like the labels used to create them, corroborating similar trends found by \citet{gruber2018perceptual} wherein humans endorse categories which are not present in the image. Hence, alternative labelings even beyond the kind we explore in the main text may be preferred which are more aligned with human percepts. Examples of such labeled mixed images are shown in Fig.~\ref{fig:elicSoftLabelsMore} and the main text.

\paragraph{\textit{Takeaways}} The typical two-category labels used in \textit{mixup} do \textit{not} consistently match human perception. 
We find that human annotators often assign probabilities to alternate classes when asked to label a mixed image. This suggests that the pursuit of aligning synthetic data labeling to match human perception, at least for the synthetic data constructor used in \textit{mixup}, warrants the design of alternative label mixing functions $g_\text{rich}$ which yield richer label distributions over a broader range of categories. 

% \section{Generalizing Relabeling}

% So far, we have focused on varying the labels of a pre-supposed augmenting set of mixed images; however, the set was comparatively small (2070 images) and therefore does not directly mimic the \textit{mixup} learning paradigm. In practice, \textit{mixup} is typically applied over the entire dataset; that is, on each batch, a new mixing coefficient is sampled, resulting in often entirely new images being generated per batch. It is infeasible to consider recruiting human participants to relabel every such image. Automated human-aligned labeling policies are therefore worth considering. We argue that our data offers a prime starting point to explore such questions.

% We offer a preliminary alternative label mixing policy based on the human data we have collected in \texttt{H-Mix}. Inspired by the non-linearities we observe at a category level, we use \texttt{scipy.curve\_fit} to fit a logistic function per category pair. For each batch, we swap in our label mixing policy to map from the sampled generating mixing coefficient to an approximately more human-perceptually aligned coefficient. Such fits only account for humans' relabelings, not their confidence. Accounting for human confidence in automated label policies is a ripe direction for future work. 

% \subsection{Setup} We follow the same ensembling and evaluation methodology laid out in Section 5.1, but now run traditional \textit{mixup} following \citep{mixup} where generating mixing coefficients are sampled from a $Beta(1,1)$ distribution (i.e., uniform on $(0,1)$). 

% \subsection{Results} We observe (see Table \ref{tab:autoLabel}) a striking parity in performance across models. These data highlight that constructing more human-aligned data simulators does not necessarily harm downstream performance and perhaps could be beneficial. Note, we are only looking at performance on a small set of possible metrics, and a relatively small set of held-out data ($3,000$ examples). It is quite feasible that training on more human-aligned data-generating policies could induce functional fits that are preferable to stakeholders even if we see no objective improvement along particular performance measures. We recommend such studies for future work. 

% % \begin{table}[]
% %     \centering
% % \begin{tabular}{llll}
% % \toprule
% %  Label Policy   & CE             & FGSM      & Calib   \\
% % \midrule
% %  \textit{mixup}           & 1.155$\pm$0.08 & 7.458$\pm$2.4  & 0.099$\pm$0.01      \\
% %   Human-Fits (Ours) & \textbf{1.152$\pm$0.08} & \textbf{7.411$\pm$2.32} & \textbf{0.097$\pm$0.01}      \\
% % \bottomrule
% % \end{tabular}

% \begin{table}[]
%     \centering
%         \caption{Training with mixing policies fitted per category pair, compared against full \textit{mixup}.}
% \begin{tabular}{llll}
% \toprule
%  Label Policy   & CE             & FGSM      & Calib   \\
% \midrule
%  \textit{mixup}           & \textbf{1.15$\pm$0.08} & 7.46$\pm$2.40  & \textbf{0.10$\pm$0.01}      \\
%   Human-Fits (Ours) & 1.16$\pm$0.08 & \textbf{7.32$\pm$2.27} & \textbf{0.10$\pm$0.01}      \\
% \bottomrule
% \end{tabular}


%     \label{tab:autoLabel}
% \end{table}


% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 


% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\bibliography{collins_256-supp}

\end{document}
