% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 
%% In your camera-ready you should use the 'accepted' parameter. This shows the authors and how an accepted paper will look like. The footer is 'Acccepted for X'. In the final version, the proceedings chairs will add the page numbers for PMLR and the final footer will be 'Proceedings of X'.
%
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Example macro kept from the template: typesets argument 3, then the optional
% separator (argument 1, default `-`), then argument 2.
\newcommand{\swap}[3][-]{%
  #3#1#2%
}

% ****** our packages
\usepackage{xcolor}
% \usepackage[colorlinks=true,allcolors=blue]{hyperref}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{mathtools}
% \usepackage{multibib}
% \newcites{APX}{Appendix}
% Theorem-like environments. None of the \newtheorem calls below shares a
% counter with another, so each environment is numbered independently.
\newtheorem{theorem}{Theorem}
% Print the theorem counter as a capital letter (A, B, ...) rather than a number.
\renewcommand*{\thetheorem}{\Alph{theorem}}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{fact}{Fact}

% Operators; the starred form places subscripts underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% Relation symbols. They are declared with \mathrel (not as operators) so that
% TeX applies relation spacing on both sides; no manual `\ ` padding is needed,
% and no text size command (\large is invalid in math mode) is used.
\newcommand*{\indep}{\mathrel{\perp\!\!\!\perp}} % double-perp symbol
\newcommand*{\defn}{\mathrel{\overset{\mathrm{def}}{=}}} % "equals by definition"

% Proof environment whose heading reads "Proof Sketch": \proofname is redefined
% inside the environment's group, then the standard proof environment is used.
\newenvironment{proofsk}%
  {\renewcommand{\proofname}{Proof Sketch}\proof}%
  {\endproof}
  
% things for xr
\usepackage{xr}
\makeatletter
% \addFileDependency{<file name with extension>}
% Writes "(<file>)" to the terminal/log, passes <file> to the kernel's
% \@addtofilelist, and prints "No file <file>." if the file does not exist.
% NOTE(review): presumably this lets log-scanning build tools (e.g. latexmk)
% pick up the dependency -- confirm against the build setup.
% Every body line ends in `%` so the macro contributes no stray spaces wherever
% it is expanded.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name without extension>}
% Calls xr's \externaldocument on <base name> and registers both
% <base name>.tex and <base name>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
\myexternaldocument{sicilia_277}
% end things for xr

\title{PAC-Bayesian Domain Adaptation Bounds for Multiclass Learners \\ (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% 
% Important: in case of equal contributions, we strongly recommend to NOT show it in this part of the paper, but rather describe it in the appropriate section at the end of the paper "Author Contribution", where you have more space to describe how each author contributed.
%
% Add authors
% Remember to use the order convention "First/Given name" "Last/Family name", e.g. John Smith, Hanako Yamada, Marco Rossi, Wei Zhang
\author[1]{\href{mailto:<anthonysicilia@pitt.edu>?Subject=Your UAI 2022 paper}{Anthony Sicilia}{}}
\author[2]{\href{mailto:<kaa139@pitt.edu>?Subject=Your UAI 2022 paper}{Katherine Atwell}{}}
\author[1,2]{\href{mailto:<malihe@pitt.edu>?Subject=Your UAI 2022 paper}{Malihe Alikhani}{}}
\author[3]{\href{mailto:<seongjae@yonsei.ac.kr>?Subject=Your UAI 2022 paper}{Seong Jae Hwang}{}}
% Add affiliations after the authors
\affil[1]{%
    Intelligent Systems Program\\
    University of Pittsburgh\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Department of Computer Science\\
    University of Pittsburgh\\
    Pittsburgh, Pennsylvania, USA
}
\affil[3]{%
    Department of Artificial Intelligence\\ Yonsei University\\
    Seoul, South Korea
  }
  
\begin{document}
\onecolumn
\maketitle
\appendix
\setcounter{equation}{24}
\setcounter{figure}{4}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}
\label{sec:proofs}
\subsection{Theorem~\ref{thm:ben2010theory}}
\begin{proof}
This is Thm. 2 of \citet{apx_ben2010theory} with added bound on $\mathbf{R}_S(h) - \mathbf{R}_\mathbb{S}(h)$ by standard uniform convergence arguments; e.g., Ch. 28.1 of \citet{apx_shalev2014understanding}. Boole's Inequality is used to combine bounds. 
\end{proof}
\subsection{Theorem~\ref{thm:germain2020pac} (Theorem~7 of Germain et al. [2020])}
\label{sec:germain2020pac}
\begin{theorem}\label{thm:germain2020pac}
\citep{apx_germain2020pac} Let $\mathcal{Y}$ be binary, $\mathbb{P}$ any distribution over $\mathcal{H}$, and $\omega > 0$. For all $\delta > 0$, w.p. at least $1-\delta$, for all distributions $\mathbb{Q}$ over $\mathcal{H}$,
\begin{equation}\small
\begin{split}
    & \mathbf{R}_\mathbb{T}(\mathbb{Q}) \leq \omega' ( \mathbf{R}_S(\mathbb{Q}) + |\mathrm{d}_S(\mathbb{Q}) - \mathrm{d}_T(\mathbb{Q})| )
    + |\mathrm{e}_\mathbb{S}(\mathbb{Q}) - \mathrm{e}_\mathbb{T}(\mathbb{Q})| + 2\omega \tfrac{\mathrm{KL}(\mathbb{Q} \mid \mid \mathbb{P})  - \ln ( \delta / 3) }{m\omega'} + 2(\omega' - 1)
\end{split}
\end{equation}
where $\omega' = 2\omega / (1 - \exp(-2\omega))$ and for $H_i \sim (\mathbb{Q})_i$, $(X,Y) \sim \mathbb{S}$ we have
\begin{equation}\small
\begin{split}
    & \mathrm{e}_\mathbb{S}(\mathbb{Q}) \defn \mathbf{E}[(1-\mathbf{1}_{\{H_1(X)\}}\{Y\}) (1-\mathbf{1}_{\{H_2(X)\}}\{Y\})], \\
    & \mathrm{d}_{\mathbb{S}}(\mathbb{Q}) \defn \mathbf{E} [1 - \mathbf{1}_{\{H_1(X)\}}\{H_2(X)\}].
\end{split}
\end{equation}
\end{theorem}
In comparison to Thm.~\ref{thm:ben2010theory}, the absolute difference in disagreement $\mathrm{d}$ is most similar to the $\mathcal{H}\Delta\mathcal{H}$-divergence and the absolute difference in joint-error $\mathrm{e}$ is most similar to the adaptability $\lambda$ \citep{apx_germain2020pac}. For this reason, in our discussion in Section~\ref{sec:background}, we refer to the former as the ``divergence'' and the latter as the ``adaptability''.
\begin{proof}
As noted, this is a simplification of Thm.~7 of \citet{apx_germain2020pac}. We set $\omega = a$ in the original notation and use the fact that $\omega / (1 - \exp(-\omega))$ is increasing for $\omega > 0$. \end{proof}
\subsection{Theorem~\ref{thm:pb-bound}}
Before diving into the proof, we set up some helpful notation and Lemmas.
\subsubsection{Notation}
Frequently in our proofs, we use the \textit{error gap}, defined for any distributions $\mathbb{S}, \mathbb{T}$ and hypothesis $h$
\begin{equation}\label{eqn:error_gap}
    \Delta_h(\mathbb{S}, \mathbb{T}) \defn \lvert \mathbf{R}_\mathbb{S}(h) - \mathbf{R}_\mathbb{T}(h) \rvert.
\end{equation}
By the identification in Eq.~\eqref{eqn:sample_pmf}, we observe that $\Delta_h(S, T)$ is also well-defined for any random samples $S$ and $T$. Also, using the usual definition of the Gibbs risk, $\Delta_\mathbb{Q}(\mathbb{S}, \mathbb{T})$ is well-defined for any distribution $\mathbb{Q}$ over a hypothesis space $\mathcal{H}$. Occasionally, we also use two-subscripts on the error-gap $\Delta$. The intended meaning is intuitive:
\begin{equation}
    \Delta_{q, p}(\mathbb{S}, \mathbb{T}) \defn \lvert \mathbf{R}_\mathbb{S}(q) - \mathbf{R}_\mathbb{T}(p) \rvert.
\end{equation}
This notation will be especially useful in proofs since $\Delta_{q, p}(\mathbb{S}, \mathbb{T})$ obeys a triangle-inequality with respect to the subscripts and arguments. Further, any bound on $\Delta_\mathbb{Q}(S, \mathbb{T})$ trivially yields a PAC-Bayesian adaptation bound for the Gibbs predictor $\mathbb{Q}$ by definition of the absolute value.

As another short-hand in proofs, we frequently use the following more evocative expressions for the indicator function:
\begin{equation}
    1[a = b] \defn \mathbf{1}_{\{a\}}\{b\}; \qquad 1[a \neq b] \defn  1 - \mathbf{1}_{\{a\}}\{b\}.
\end{equation}
Now, we can proceed with the employed Lemmas.
\subsubsection{Lemmas}
In this section, we build to the proof of Theorem~\ref{thm:pb-bound}. These results consist of most of the ``real'' work in proving the result. They range in degree of novelty and we provide some exposition on this point here. Lemma~\ref{lem:multi-class-triangle-eq} is an adaptation of the triangle-inequality for 01-loss \citep{apx_crammer2007learning, apx_ben2007analysis} to the multiclass setting. Similarly, Lemma~\ref{lem:ben-david} is an adaptation of the main inequality of \citet{apx_ben2010theory} to the multiclass setting. The former requires some work to verify the logic, while our overall strategy for the latter is similar to the binary case. Next, Lemma~\ref{lem:simple-da-bendavid} uses the identification in Eq.~\eqref{eqn:sample_pmf} to apply Lemma~\ref{lem:ben-david} to the random samples $S$ and $T$. While it is a simple insight, it is extremely important, since it enables us to introduce the sample-dependent adaptability $\tilde{\lambda}$. The next result, Lemma~\ref{lem:maurer}, is well-known in PAC-Bayes. Meanwhile, the final result, Lemma~\ref{lem:stoch_ben_david}, is a new result which allows us to apply Lemma~\ref{lem:simple-da-bendavid} to Gibbs predictors. When broken down in this manner, as is our intention, the individual pieces that build to our bound may appear simple. Still, it is important to remember that PAC-Bayesian bounds have never previously been combined with multiclass variants of the results of \citet{apx_ben2007analysis, apx_ben2010theory}. After some trial and error, we've found our primary innovations -- the use of sample-dependent adaptability, along with Lemma~\ref{lem:stoch_ben_david} -- are vital to introducing the desired non-uniform notion of sample complexity. In any case, we now proceed by stating and proving each of the discussed Lemmas.
\begin{lemma}
\label{lem:multi-class-triangle-eq}
For any $(h, h') \in \mathcal{H}^2$ and any $(x,y) \in \mathcal{X} \times \mathcal{Y}$,
\begin{equation}\label{eqn:h_dis_y}
    1[h(x) \neq y] \leq 1[h(x) \neq h'(x)] + 1[h'(x) \neq y]
\end{equation}
and 
\begin{equation}\label{eqn:h_dis_h}
    1[h(x) \neq h'(x)] \leq 1[h'(x) \neq y] + 1[y \neq h(x)].
\end{equation}
\end{lemma}
\begin{proof}
We begin with Eq.~\eqref{eqn:h_dis_y}. We use proof by exhaustion. If $h(x) = y$, then the LHS is 0 and the RHS will always be non-negative so the equation is true. If $h(x) \neq y$ and $h(x) \neq h'(x)$, then the equation evaluates to $1 \leq 1 + c$  for $c \geq 0$ which is true. If $h(x) \neq y$ and $h(x) = h'(x)$, then $h'(x) \neq y$, and $1 \leq 1$ which is true. This concludes the argument.

Next, we consider Eq.~\eqref{eqn:h_dis_h}. Again, we use proof by exhaustion. If $h(x) =  h'(x)$, the LHS is 0. If $h(x) \neq h'(x)$ and $h(x) = y$, we have $h'(x) \neq y$ and the equation evaluates to $1 \leq 1$ which is true. If $h(x) \neq h'(x)$ and $h(x) \neq y$, it evaluates to $1 \leq 1 + c$ for $c \geq 0$ which is true and concludes the argument.
\end{proof}
Note, one observation is that the function $\tilde{d}(y, y') \defn 1[y' \neq y]$ for any arguments $y, y' \in \mathcal{Y}$ is identical to a well-known function called the trivial metric or the discrete metric. As implied by the name, the tuple $(\mathcal{Y}, \tilde{d})$ forms a \textit{metric space}, and subsequently, Lemma~\ref{lem:multi-class-triangle-eq} above is a simple consequence of this fact. Nonetheless, we maintain the proof above to keep our discussion relatively self-contained.  
\begin{lemma}
\label{lem:ben-david}
For any distributions $\mathbb{D}_1$ and $\mathbb{D}_2$ over $\mathcal{X} \times \mathcal{Y}$, for any $h \in \mathcal{H}$
\begin{equation}
    \mathbf{R}_{\mathbb{D}_1}(h) \leq \mathbf{R}_{\mathbb{D}_2}(h) + \mathbf{d}_{\mathcal{C}_h}((\mathbb{D}_1)_X, (\mathbb{D}_2)_X) + \min_{\eta \in \mathcal{H}} \Big \{\mathbf{R}_{\mathbb{D}_1}(\eta) + \mathbf{R}_{\mathbb{D}_2}(\eta) \Big \}
\end{equation}
where $\mathcal{C}_h = \mathcal{H}\Delta\mathcal{H}$ or $\mathcal{C}_h = h\Delta\mathcal{H}$ and $(\mathbb{D}_i)_X$ is the $\mathcal{X}$-marginal of $\mathbb{D}_i$.\footnote{In a formal sense, $(\mathbb{D}_i)_X$ is the pushforward distribution $\mathbb{D}_i \circ \pi^{-1}$ of the projection $\pi : \mathcal{X} \times \mathcal{Y} \to \mathcal{X}$ defined by $\pi(x,y) = x$.}
\end{lemma}
\begin{proof}
Let $\mathbb{D}_1$, $\mathbb{D}_2$, and $h$ be as assumed.

Recall by Lemma~\ref{lem:multi-class-triangle-eq} Eq.~\eqref{eqn:h_dis_y}, for any $h'$ in $\mathcal{H}$ and any $(x,y) \in \mathcal{X} \times \mathcal{Y}$
\begin{equation}
    1[h(x) \neq y] \leq 1[h(x) \neq h'(x)] + 1[h'(x) \neq y].
\end{equation}
Then, by monotonicity and linearity of the expectation, for any choice of $h'$,
\begin{equation}
\begin{split}
\mathbf{R}_{\mathbb{D}_1}(h) & \leq \mathbf{E}[1[h(X_1) \neq h'(X_1)]] + \mathbf{R}_{\mathbb{D}_1}(h'); \qquad X_1 \sim (\mathbb{D}_1)_X \\
& \leq \mathbf{E}[1[h(X_2) \neq h'(X_2)]] + \mathbf{R}_{\mathbb{D}_1}(h') + \xi; \qquad X_2 \sim (\mathbb{D}_2)_X
\end{split}
\end{equation}
where
\begin{equation}
\begin{split}
    \xi & = \big \lvert \mathbf{E}[1[h(X_2) \neq h'(X_2)]] - \mathbf{E}[1[h(X_1) \neq h'(X_1)]] \big \rvert \\
    & \leq \mathbf{d}_{\mathcal{C}_h}((\mathbb{D}_1)_X, (\mathbb{D}_2)_X) \qquad \text{(by definition of supremum, for either choice of } \mathcal{C}_h).
\end{split}
\end{equation}
Alternatively, by Lemma~\ref{lem:multi-class-triangle-eq} Eq.~\eqref{eqn:h_dis_h}, for any choice of $h',x, y$,
\begin{equation}
    1[h(x) \neq h'(x)] \leq 1[h'(x) \neq y] + 1[y \neq h(x)].
\end{equation}
Using monotonicity and linearity of the expectation as before, we have
\begin{equation}
   \mathbf{E}[1[h(X_2) \neq h'(X_2)]] \leq \mathbf{R}_{\mathbb{D}_2}(h') + \mathbf{R}_{\mathbb{D}_2}(h); \qquad X_2 \sim (\mathbb{D}_2)_X.
\end{equation}
As the above holds for any $h' \in \mathcal{H}$, select $h'$ to be a minimizer of the quantity $\mathbf{R}_{\mathbb{D}_1}(h') + \mathbf{R}_{\mathbb{D}_2}(h')$.

This yields the desired result.
\end{proof}
\begin{lemma}
\label{lem:simple-da-bendavid}
Almost surely, w.r.t samples $S$ and $T$,
\begin{equation}\small
    \forall h \in \mathcal{H} \ : \ \Delta_h(S, T) \leq \tilde{\lambda} + \mathbf{d}_{\mathcal{C}_h}(S_X, T_X)
\end{equation}
where $\tilde{\lambda} \defn \min_{h \in \mathcal{H}} \mathbf{R}_S(h) + \mathbf{R}_T(h)$ and the bound holds for both $\mathcal{C}_h = \mathcal{H}\Delta\mathcal{H}$ and $\mathcal{C}_h = h\Delta\mathcal{H}$.
\end{lemma}
\begin{proof}
The statement asserts the following holds with probability 1 according to the random draws of $S$ and $T$:
\begin{equation}
    \forall h \in \mathcal{H} \ : \ \Delta_h(S, T) \leq \tilde{\lambda} + \mathbf{d}_{\mathcal{C}_h}(S_X, T_X)
\end{equation}
It is sufficient to show the statement holds for any realization of $S$ and $T$. Recall, for any realization, $S$ and $T$ themselves define distributions by the identification in Eq.~\eqref{eqn:sample_pmf}. So, Lemma~\ref{lem:ben-david} may be applied. Doing so twice and interchanging the roles of $S$ and $T$ gives
\begin{equation}
    \forall h \in \mathcal{H} \ : \ \mathbf{R}_S(h) - \mathbf{R}_T(h) \leq \tilde{\lambda} + \mathbf{d}_{\mathcal{C}_h}(S_X, T_X) \qquad \text{and} \qquad \mathbf{R}_T(h) - \mathbf{R}_S(h) \leq \tilde{\lambda} + \mathbf{d}_{\mathcal{C}_h}(S_X, T_X).
\end{equation}
So, the absolute difference between $\mathbf{R}_S(h)$ and $\mathbf{R}_T(h)$ is also bounded and we have our result.
\end{proof}
\begin{lemma} \citep{apx_maurer2004note}
\label{lem:maurer}
For any distribution $\mathbb{P}$ over $\mathcal{H}$, for any $\delta > 0$,
\begin{equation}
\mathbf{Pr} \Big ( \forall \ \mathbb{Q} \ : \ \Delta_{\mathbb{Q}}(T, \mathbb{T}) \leq \sqrt{\tfrac{\mathrm{KL}(\mathbb{Q} \mid \mid \mathbb{P}) + \ln \sqrt{4m} - \ln ( \delta) }{2m}}  \ \Big ) \geq 1 - \delta.
\end{equation}
\end{lemma}
\begin{proof}
This is the result of \citet{apx_maurer2004note} given below
\begin{equation}
    \mathbf{Pr} \Bigg ( \forall \ \mathbb{Q} \ : \ \mathrm{kl}( \mathbf{R}_T(\mathbb{Q}) \mid \mid \mathbf{R}_\mathbb{T}(\mathbb{Q}) ) \leq \frac{\mathrm{KL}(\mathbb{Q} \mid \mid \mathbb{P}) - \ln \delta + \ln \sqrt{4m} }{m} \ \Bigg ) \geq 1 - \delta,
\end{equation}
where the ``little'' $\mathrm{kl}$ is the KL-divergence between Bernoulli distributions parameterized by its arguments. The above bound implies the stated result by application of Pinsker's Inequality.
\end{proof}
\begin{lemma}
\label{lem:stoch_ben_david}
For any distribution $\mathbb{Q}$, almost surely w.r.t samples $S$ and $T$,
\begin{equation}
   \Delta_\mathbb{Q}(S, T) \leq \tilde{\lambda} + \mathbf{E}_H[\mathbf{d}_{\mathcal{C}_H}(S_X, T_X)]
\end{equation}
where $\tilde{\lambda}$ and $\mathcal{C}_H$ are defined as in Lemma~\ref{lem:simple-da-bendavid}.
\end{lemma}
\begin{proof}
We apply Lemma~\ref{lem:simple-da-bendavid}. By Jensen's Inequality, monotonicity of $\mathbf{E}$, and linearity of $\mathbf{E}$, we have
\begin{equation}
   \Delta_\mathbb{Q}(S, T) \leq \mathbf{E}_H[\Delta_H(S, T)] \leq \tilde{\lambda} + \mathbf{E}_H[\mathbf{d}_{\mathcal{C}_H}(S_X, T_X)]
\end{equation}
almost surely. In more detail, for any realization of $S$ and $T$,
\begin{align*}
\Delta_\mathbb{Q}(S, T) & =  \big \lvert \mathbf{R}_S(\mathbb{Q}) - \mathbf{R}_T(\mathbb{Q}) \big \rvert & \\
& = \big \lvert \mathbf{E}[\mathbf{R}_S(H)] - \mathbf{E}[\mathbf{R}_T(H)] \big \rvert & (H \sim \mathbb{Q}, \ S \ \text{fixed}, \ T \ \text{fixed}) \\
& = \Big \lvert \mathbf{E} \Big [\mathbf{R}_S(H) - \mathbf{R}_T(H) \Big] \Big \rvert & \text{(Linearity of }\mathbf{E}) \\
& \leq \mathbf{E} \big [\Delta_H(S,T) \big ] & \text{(Jensen's Inequality)}\\
& \leq \mathbf{E} \big [ \tilde{\lambda} + \mathbf{d}_{\mathcal{C}_H}(S_X, T_X) \big ] & \text{(Lemma~\ref{lem:simple-da-bendavid} and monotonicity of }\mathbf{E}) \\
& = \tilde{\lambda} + \mathbf{E}[\mathbf{d}_{\mathcal{C}_H}(S_X, T_X)] & \text{(Linearity of }\mathbf{E}).
\end{align*}
\end{proof}
\subsubsection{Proof}
We give the final proof of Theorem~\ref{thm:pb-bound} below. Admittedly, it is a bit underwhelming, since most of the work has gone into the Lemmas above. The remaining component we rely on is our notation for the error-gap $\Delta$. By design, this notation exhibits a triangle-inequality.
\begin{proof}
Observe,
\begin{equation}\label{eqn:pattern-use-2}
    \Delta_{\mathbb{Q}}(S, \mathbb{T}) \leq \Delta_{\mathbb{Q}}(S, T) + \Delta_{\mathbb{Q}}(T, \mathbb{T}).
\end{equation}
% In more details, this is true through the following manipulation
% \begin{equation}
%      \Delta_{Q, \mathbb{Q}}(S, \mathbb{T}) = \lvert \mathbf{R}_Q(S) - \mathbf{R}_\mathbb{Q}(\mathbb{T})\rvert = \lvert \mathbf{R}_S(Q) -\mathbf{R}_T(Q) + \mathbf{R}_T(Q) - \mathbf{R}_\mathbb{T}(\mathbb{Q})\rvert
% \end{equation}
% followed by an application of the triangle inequality.
To bound the former, we use Lemma~\ref{lem:stoch_ben_david}. To bound the latter, we use Lemma~\ref{lem:maurer}. We use Boole's Inequality to combine to the desired result. 
\end{proof}
\subsection{Theorem~\ref{thm:mid_div_red2erm}}
As noted in the main text, we employ the overall strategy of \citet{apx_ben2010theory}. The main distinction in our result below is the removal of any symmetry assumption on $\mathcal{H}$.
\begin{proof}
As before, we show the statement holds for any realization of $S_X$ and $T_X$. 

Let $\mathcal{C} = \mathcal{H}\Delta\mathcal{H}$ and expand the divergence as below
\begin{equation}
\begin{split}
    & \mathbf{d}_\mathcal{C}(S_X, T_X) = \max_{\varphi \in \mathcal{H} \Delta \mathcal{H}} \big \lvert \mathbf{E}[\varphi(X)] - \mathbf{E}[\varphi(\tilde{X})]\big \rvert = \max_{\varphi \in \mathcal{H} \Delta \mathcal{H}} \big \lvert \mathbf{Pr}(\varphi(X) = 1) - \mathbf{Pr}(\varphi(\tilde{X}) = 1)\big \rvert
\end{split}
\end{equation}
where $X \sim S_X$, $\tilde{X} \sim T_X$. Note, we substitute $\max$ for $\sup$ because both $S_X$ and $T_X$ are finitely supported, and thus, some $\varphi \in \mathcal{C}$ does achieve the maximum. Then, we have
\begin{equation}
\label{eqn:dis_divergence_optim}
\begin{split}
& \max_{\varphi \in \mathcal{H} \Delta \mathcal{H}} \big \lvert \mathbf{Pr}(\varphi(X) = 1) - \mathbf{Pr}(\varphi(\tilde{X}) = 1)\big \rvert \\
& = \max_{\varphi \in \mathcal{H} \Delta \mathcal{H}} \max \begin{rcases}
    \begin{dcases}
       \mathbf{Pr}(\varphi(X) = 1) - \mathbf{Pr}(\varphi(\tilde{X}) = 1), \\
      \mathbf{Pr}(\varphi(\tilde{X}) = 1) - \mathbf{Pr}(\varphi(X)=1)
    \end{dcases}
  \end{rcases} \\
& = \max_{\varphi \in \mathcal{H} \Delta \mathcal{H}} \max \begin{rcases}
    \begin{dcases}
       1 - \mathbf{Pr}(\varphi(X) =  0) - \mathbf{Pr}(\varphi(\tilde{X}) = 1), \\
      1 - \mathbf{Pr}(\varphi(\tilde{X}) =  0) - \mathbf{Pr}(\varphi(X) = 1)
    \end{dcases}
  \end{rcases} \\
& = \max \begin{rcases}
    \begin{dcases}
       1 - \min_ {\varphi \in \mathcal{H} \Delta \mathcal{H}} \Big \{ \mathbf{Pr}(\varphi(X) =  0) +  \mathbf{Pr}(\varphi(\tilde{X}) = 1) \Big \}, \\
      1 - \min_ {\varphi \in \mathcal{H} \Delta \mathcal{H}} \Big \{ \mathbf{Pr}(\varphi(\tilde{X}) =  0) + \mathbf{Pr}(\varphi(X) = 1) \Big \}
    \end{dcases}
  \end{rcases}. \\
\end{split}
\end{equation}
The first equality follows by definition of absolute value, the second by the law of complements, and the last because consecutive applications of the $\max$ operation may be interchanged. Taking $P,Q,U,V$ as assumed, the result follows by the definition of risk; i.e., Eq.~\eqref{eqn:risk}.
\end{proof}
\subsection{Theorem~\ref{thm:surrogate_loss}}
As far as we are aware, Theorem~\ref{thm:surrogate_loss} is the first proposal for approximation of ERM over the class $\mathcal{H}\Delta\mathcal{H}$ when $\mathcal{H}$ has multiclass output. Our strategy is to identify an appropriate score-based surrogate expression for any $\varphi \in \mathcal{S}\Delta\mathcal{S}$; i.e., one which is positive where $\varphi$ returns 1 and negative otherwise. Upon doing so, we can use standard techniques for giving smooth upperbounds to the 01-loss.
\begin{proof}
Let $x \in \mathcal{X}$, $\mathbf{f}, \mathbf{g} \in \mathcal{F}$ and suppose $\mathbf{f}(x)$ and $\mathbf{g}(x)$ have no repeated entries. Recall, for any two sets of non-negative numbers $S_1$ and $S_2$ the following equality holds\footnote{Suppose not. Then, WLOG $\max \{a \cdot b\} = d \cdot e > (\max S_1) \cdot (\max S_2)$ for some $d \neq \max S_1$ or some $e \neq \max S_2$. But, we also have $d \cdot e \leq d \cdot \max S_2 \leq (\max S_1) \cdot (\max S_2)$, a contradiction.}
\begin{equation}
    \max \{a \cdot b \mid a \in S_1, b \in S_2\} = (\max S_1) \cdot (\max S_2).
\end{equation}
From this and the fact that $\tau$ is non-negative and order-preserving, we know $\mathbf{A}_{ii} \geq \mathbf{A}_{jk}$ for some $i \in [C]$ and all $(j,k) \in [C]^2$ if and only if 
\begin{equation}
i = \argmax_{\ell \in [C]} \mathbf{f}_\ell(x) = \argmax_{\ell \in [C]} \mathbf{g}_\ell(x).
\end{equation}
Notice, ties are impossible due to the assumed uniqueness of the scores. So, by this same logic, we observe
\begin{equation}
\begin{split}
    & \argmax_{\ell \in [C]} \mathbf{f}_\ell(x) \neq \argmax_{\ell \in [C]} \mathbf{g}_\ell(x) \\
    \mathrm{iff} \quad & \forall \ i \in [C], \ \exists \ (j, k) \in [C]^2 \ : \mathbf{A}_{ii} < \mathbf{A}_{jk} \\
    \mathrm{iff} \quad & \max_{i \in [C]} \mathbf{A}_{ii} < \max_{(j,k) \in [C]^2} \mathbf{A}_{jk} \\
    \mathrm{iff} \quad & 0 < \max_{(j,k) \in [C]^2} \mathbf{A}_{jk} - \max_{i \in [C]} \mathbf{A}_{ii} = z(x)
\end{split}
\end{equation}
So, under the current assumptions, the score $z(x)$ is positive if and only if $\hat{y} = 1 - \mathbf{1}_{\{\Psi_\mathbf{f}(x)\}}\{\Psi_\mathbf{g}(x)\} = 1$. Using this fact, it is easy to verify $\mathcal{L}(z(x),y) \geq 1[\hat{y} \neq y]$ for each case $(\hat{y}, y) \in \{(0,0), (0,1), (1,0), (1,1)\}$. The loss $\mathcal{L}$ is actually a standard surrogate -- i.e., the cross-entropy -- multiplied by a constant factor as in \citet{apx_dziugaite2017computing} to turn it into a proper upperbound on the 01-loss. The main novelty here comes from defining $z(x)$ to be positive whenever $\hat{y}$ is. % Note, by itself, $\mathcal{L}(z, y)$ is a standard log loss multiplied by a constant factor to upperbound the 01-loss as in \citet{apx_dziugaite2017computing}.

Notice, the inequality holds on all but a set of measure 0, according to $\mathbb{D}$. Thus, monotonicity of $\mathbf{E}$ gives the result.
\end{proof}
\subsection{Theorem~\ref{thm:mdp_div_red2erm}}
\label{sec:mdp_div_red2erm}
As noted in the main text, Theorem~\ref{thm:mdp_div_red2erm} is conceptually similar to a result -- in the binary case -- given by \citet{apx_kuroki2019unsupervised}. Unfortunately, their strategy does not simply extend to the multiclass case: there is a loss of precision due to the increased degrees of freedom in multiclass classification. As a result, we observe the need to add additional constraints on the labeling function for the classification problem. Specifically, we introduce the class $\Upsilon$ for use in our reduction. Careful attention is paid to show the constrained labeling function can be independent of the classifier we wish to learn $\varphi \in \mathcal{H}$, which enables our appeal to a simple heuristic that is also independent of $\varphi$. Otherwise, in simpler formulations, this dependence produces a more complicated minimization problem.
\begin{proof}
We show the statement holds for any realization of $S_X$ and $T_X$. 
 
Let $h \in \mathcal{H}$ be arbitrary and let $\mathcal{C} = h\Delta\mathcal{H}$. We proceed by expanding the divergence:
\begin{equation}\label{eqn:mdp_expansion}
\begin{split}
& \mathbf{d}_{\mathcal{C}}(S_X, T_X) = \max_{\nu \in h \Delta \mathcal{H}} \big \lvert \mathbf{Pr}(\nu(X) = 1) - \mathbf{Pr}(\nu(\tilde{X}) = 1)\big \rvert \\
& = \max \begin{rcases}
    \begin{dcases}
       1 - \min_ {\nu \in h \Delta \mathcal{H}} \Big \{ \mathbf{Pr}(\nu(X) =  0) +  \mathbf{Pr}(\nu(\tilde{X}) = 1) \Big \}, \\
      1 - \min_ {\nu \in h \Delta \mathcal{H}} \Big \{ \mathbf{Pr}(\nu(\tilde{X}) =  0) + \mathbf{Pr}(\nu(X) = 1) \Big \}
    \end{dcases}
  \end{rcases} \\
  & = \max \begin{rcases}
    \begin{dcases}
       1 - \min_ {\varphi \in \mathcal{H}} \Big \{ \mathbf{Pr}(h(X) = \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})) \Big \}, \\
      1 - \min_ {\varphi \in \mathcal{H}} \Big \{ \mathbf{Pr}(h(\tilde{X}) =  \varphi(\tilde{X})) + \mathbf{Pr}(h(X) \neq \varphi(X)) \Big \}
    \end{dcases}
  \end{rcases}
\end{split}
\end{equation}
where $X \sim S_X$ and $\tilde{X} \sim T_X$. The first and second lines follow from an identical expansion as in the proof of Theorem~\ref{thm:mid_div_red2erm}. The last follows by definition of $h \Delta \mathcal{H}$.

Next, we observe
\begin{equation}
\label{eqn:impercision_introduced}
    \forall \varphi \in \mathcal{H}, \ \forall \bar{h} \in \Upsilon \ : \ \mathbf{Pr}(h(X) = \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})) \leq \mathbf{Pr}(\bar{h}(X) \neq \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})).
\end{equation}
The inequality follows by the monotonicity of probability and the fact 
\begin{equation}
\{x \mid h(x) = \varphi(x)\} \subseteq \{x \mid \bar{h}(x) \neq \varphi(x)\} \qquad\text{(by definition of }\bar{h}).
\end{equation}
Meanwhile, setting 
\begin{equation}
    \bar{h}^*_\varphi(x) \defn \begin{cases}
    \varphi(x), & \text{if} \ \varphi(x) \neq h(x) \\
    \max \{\ell \in [C] \mid \ell \neq \varphi(x)\}, & \text{else}
    \end{cases}
\end{equation}
implies $\bar{h}^*_\varphi \in \Upsilon$ and $\{x \mid h(x) = \varphi(x)\} = \{x \mid \bar{h}^*_\varphi(x) \neq \varphi(x)\}$. So, we also have
\begin{equation}
\label{eqn:impercision_resolved}
    \forall \varphi \in \mathcal{H} \ : \ \mathbf{Pr}(h(X) = \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X}))  = \mathbf{Pr}(\bar{h}^*_\varphi(X) \neq \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})).
\end{equation}
Considering that $\bar{h}^*_\varphi \in \Upsilon$, Eq.~\eqref{eqn:impercision_introduced} and Eq.~\eqref{eqn:impercision_resolved} in combination tell us 
\begin{equation}
\label{eqn:mdp_erm_equality-1}
    \forall \varphi \in \mathcal{H} \ : \ \mathbf{Pr}(h(X) = \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X}))  = \min\nolimits_{\bar{h} \in \Upsilon} \Big \{ \mathbf{Pr}(\bar{h}(X) \neq \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})) \Big \}.
\end{equation}
To see this, it's easiest to use the definition of a set's $\min$ element as that which attains the greatest lower bound; i.e., the $\inf$ or infimum. Then, Eq.~\eqref{eqn:impercision_introduced} implies the $\min$ upperbounds the LHS of Eq.~\eqref{eqn:mdp_erm_equality-1}, and Eq.~\eqref{eqn:impercision_resolved} implies the $\min$ lowerbounds the LHS of Eq.~\eqref{eqn:mdp_erm_equality-1}. In combination, these bounds prove equality.
%since the lowerbound in Eq.~\eqref{eqn:impercision_introduced} is always attained by the upperbound, for some $\bar{h} \in \Upsilon$.

Note, an identical argument also gives,
\begin{equation}
\label{eqn:mdp_erm_equality-2}
    \forall \varphi \in \mathcal{H} \ : \ \mathbf{Pr}(h(\tilde{X}) =  \varphi(\tilde{X})) + \mathbf{Pr}(h(X) \neq \varphi(X)) = \min\nolimits_{\bar{h} \in \Upsilon} \Big \{ \mathbf{Pr}(\bar{h}(\tilde{X}) \neq \varphi(\tilde{X})) + \mathbf{Pr}(h(X) \neq \varphi(X)) \Big \}.
\end{equation}
To continue, we apply Eq.~\eqref{eqn:mdp_erm_equality-1} and Eq.~\eqref{eqn:mdp_erm_equality-2} to Eq.~\eqref{eqn:mdp_expansion}.
%, recalling the $\max$ operation and the $\min$ operation preserve point-wise inequalities.\footnote{If $\forall x \ f(x) \leq g(x)$, then $\sup f \leq \sup g$ and $\inf f \leq \inf g$. In the case of the supremum, this is true because $\sup g$ is an upperbound for $f$ and $\sup f$ (by definition) is the least upperbound. A similar argument proves the case of the infimum.} 
Specifically, we have
\begin{equation}
\begin{split}
    & \max \begin{rcases}
    \begin{dcases}
       1 - \min_ {\varphi \in \mathcal{H}} \Big \{ \mathbf{Pr}(h(X) = \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})) \Big \}, \\
      1 - \min_ {\varphi \in \mathcal{H}} \Big \{ \mathbf{Pr}(h(\tilde{X}) =  \varphi(\tilde{X})) + \mathbf{Pr}(h(X) \neq \varphi(X)) \Big \}
    \end{dcases}
  \end{rcases} \\
  & = \max \begin{rcases}
    \begin{dcases}
       1 - \underset{\bar{h} \in \Upsilon}{\min_ {\varphi \in \mathcal{H},}} \Big \{ \mathbf{Pr}(\bar{h}(X) \neq \varphi(X)) +  \mathbf{Pr}(h(\tilde{X}) \neq \varphi(\tilde{X})) \Big \}, \\
      1 - \underset{\bar{h} \in \Upsilon}{\min_ {\varphi \in \mathcal{H},}} \Big \{ \mathbf{Pr}(\bar{h}(\tilde{X}) \neq \varphi(\tilde{X})) + \mathbf{Pr}(h(X) \neq \varphi(X)) \Big \}
    \end{dcases}
  \end{rcases}.
\end{split}
\end{equation}
Taking $P, Q, U, V$ as assumed, the desired result follows by the definition of risk in Eq.~\eqref{eqn:risk}.
\end{proof}
\subsection{Theorem~\ref{thm:pb-bound-efficient}}
As noted in the main text, this result introduces a deterministic reference to avoid costly Monte-Carlo estimation. It is the consequence of a series of triangle-inequalities and some of the Lemmas discussed in the proof of Thm.~\ref{thm:pb-bound}.
\begin{proof}
Observe, for any $h_*$,
\begin{equation}
\begin{split}\label{eqn:pattern-use-3}
    \Delta_{\mathbb{Q}}(S, \mathbb{T}) & \leq \Delta_{\mathbb{Q}, h_*}(S, T) + \Delta_{h_*,\mathbb{Q}}(T, \mathbb{T}) \\
    & \leq \Delta_{h_*}(S, T) + \Delta_{\mathbb{Q}, h_*}(S, S) + \Delta_{\mathbb{Q}}(T, \mathbb{T}) + \Delta_{\mathbb{Q}, h_*}(T, T) \\
    & \leq \rho + \Delta_{h_*}(S, T) + \Delta_{\mathbb{Q}}(T, \mathbb{T})
\end{split}
\end{equation}
Use Lemma~\ref{lem:simple-da-bendavid} and Lemma~\ref{lem:maurer}, respectively, to bound the latter two terms. Application of Boole's Inequality and selection of $h_* = \mu$ gives the result.
\end{proof}
\subsection{Corollary~\ref{cor:pb-bound-efficient}}
Conceptually, this result relies on the same proof-technique as Theorem~\ref{thm:pb-bound-efficient}, but the proof is still a bit more technically involved than a typical ``Corollary'' because it requires the measure-theoretic notion of a pushforward. We consider pushforwards of empirical distributions, which are finitely supported, so there is no need to discuss issues of measurability.
\begin{proof}
Following the proof of Theorem~\ref{thm:pb-bound-efficient}, we have $\Delta_{\mathbb{Q}}(S, \mathbb{T}) \leq \rho + \Delta_{\mu}(S, T) + \Delta_{\mathbb{Q}}(T, \mathbb{T})$. Now, recalling $\mu$ is the composition of a classifier $c_\mu$ and a feature extractor $f_\mu$, we have
\begin{equation}\label{eqn:rewrite}
    \Delta_\mu(S, T) = \Delta_{c_\mu}(S \circ f_\mu^{-1}, T \circ f_\mu^{-1})
\end{equation}
where we abuse notation and write $\mathbb{D} \circ g^{-1}$ for the pushforward of a distribution $\mathbb{D}$ on $\mathcal{X}\times\mathcal{Y}$ by the function $\Phi_g(x,y) = (g(x), y)$. In detail, setting $\ell(h, (x,y)) = 1[h(x) \neq y]$ and assuming $f_\mu : \mathcal{X} \to \mathcal{Z}$ and $c_\mu : \mathcal{Z} \to \mathcal{Y}$, Eq.~\eqref{eqn:rewrite} follows because
\begin{equation}\label{eqn:pf-expl}
    \mathbf{R}_\mathbb{D}(\mu) = \int_{\mathcal{X} \times \mathcal{Y}} \ell(c_\mu \circ f_\mu, v)\mathbb{D}(\mathrm{d}v) = \int \ell(c_\mu, \Phi_{f_\mu}(v))\mathbb{D}(\mathrm{d}v) = \int_{\mathcal{Z} \times\mathcal{Y}} \ell(c_\mu, w)\mathbb{D} \circ f_\mu^{-1}(\mathrm{d}w) = \mathbf{R}_{\mathbb{D} \circ f_\mu^{-1}}(c_\mu)
\end{equation}
for any distribution $\mathbb{D}$ over $\mathcal{X} \times \mathcal{Y}$. After applying the equality in Eq.~\eqref{eqn:rewrite}, we can conclude our argument as in the proof of Theorem~\ref{thm:pb-bound} using Lemma~\ref{lem:simple-da-bendavid}. Although, it should be noted the adaptation problem has changed slightly, since we now consider the hypothesis space $\mathcal{W} = \{c_h \mid h \in \mathcal{H}\} \subseteq \mathcal{Y}^\mathcal{Z}$, the source distribution $\mathbb{S} \circ f_\mu^{-1}$ over $\mathcal{Z} \times \mathcal{Y}$, and the target distribution $\mathbb{T} \circ f_\mu^{-1}$ over $\mathcal{Z} \times \mathcal{Y}$. Of course, Lemma~\ref{lem:simple-da-bendavid} still applies in this case, so this does not present an issue.  

After this, to arrive at the result in the main text, we simplify terms to remove any discussion of pushforward distributions. For any risks, this is accomplished by reversing the steps in Eq.~\eqref{eqn:pf-expl}. For any divergences, a similar equality holds and can be applied. In particular, for any $\mathcal{Q} \subseteq \mathcal{Y}^\mathcal{Z}$, any function $p : \mathcal{X} \to \mathcal{Z}$, and any distributions $\mathbb{S}$ and $\mathbb{T}$ over $\mathcal{X} \times \mathcal{Y}$, we use the expansion below:
\begin{equation}
\begin{split}
    \mathbf{d}_\mathcal{Q}((\mathbb{S} \circ p^{-1})_Z, (\mathbb{T} \circ p^{-1})_Z) & = \sup_{q \in \mathcal{Q}} \lvert \mathbf{E}_{Z \sim (\mathbb{S} \circ p^{-1})_Z} [q(Z)] - \mathbf{E}_{Z \sim (\mathbb{T} \circ p^{-1})_Z} [q(Z)]\rvert \\
    & = \sup_{q \in \mathcal{Q}} \lvert \mathbf{E}_{X \sim \mathbb{S}_X} [(q \circ p) (X)] - \mathbf{E}_{X \sim \mathbb{T}_X} [(q \circ p) (X)] \rvert \qquad (\text{similar to Eq.~\eqref{eqn:pf-expl}})\\
    & = \sup_{r \in \mathcal{Q} \circ p^{-1}} \lvert \mathbf{E}_{X \sim \mathbb{S}_X} [r (X)] - \mathbf{E}_{X \sim \mathbb{T}_X} [r (X)] \rvert \qquad (\mathcal{Q} \circ p^{-1} \defn \{q \circ p \mid q \in \mathcal{Q}\})\\
    & = \mathbf{d}_{\mathcal{Q} \circ p^{-1}}(\mathbb{S}_X, \mathbb{T}_X).
\end{split}
\end{equation}
Taking $\mathcal{Q} = \{1 - \mathbf{1}_{\{c(\cdot)\}}\{c'(\cdot)\} \mid (c,c') \in \mathcal{W} \times \mathcal{W}\}$ and $p = f_\mu$, we end up with $\mathcal{Q} \circ p^{-1} = [\mathcal{H}\Delta\mathcal{H}]_\mu$ as defined in the main text. Likewise, taking $\mathcal{Q} = \{1 - \mathbf{1}_{\{c_\mu(\cdot)\}}\{c'(\cdot)\} \mid c' \in \mathcal{W}\}$ and $p = f_\mu$, we end up with $\mathcal{Q} \circ p^{-1} = [\mu\Delta\mathcal{H}]_\mu$.
\end{proof}

% \subsection{De-Randomizing Traditional PAC-Bayes Bounds}
% \begin{theorem}
% \label{thm:derandom}
% For any $\mathbb{P}$ over $\mathcal{H}$, any $\delta > 0$, w.p. at least $1-\delta$, for all $\mathbb{Q}$ over $\mathcal{H}$ s.t. $\mathbb{Q}$ is $\rho$-flat on $T$
% \begin{equation}
%     \Delta_{\mu}(\mathbb{T}, T) \leq 2\rho + \sqrt{\tfrac{\mathrm{KL}(\mathbb{Q}\mid \mid \mathbb{P}) + \ln \sqrt{4m} - \ln \delta }{2m}}
% \end{equation}
% \end{theorem}
% \begin{proof}
% Observe,
% \begin{equation}
% \begin{split}
%     \Delta_{\mu}(\mathbb{T}, T) \leq \Delta_{\mu,\mathbb{Q}}(\mathbb{T}, \mathbb{T}) + \Delta_{\mu,\mathbb{Q}}(T, T) \leq \Delta_{\mu,\mathbb{Q}}(\mathbb{T}, \mathbb{T}) + \rho.
% \end{split}
% \end{equation}
% To proceed, we bound the first summand
% \begin{equation}
% \begin{split}
%     \Delta_{\mu,\mathbb{Q}}(\mathbb{T}, \mathbb{T}) & = \lvert \mathbf{R}_{\mathbb{T}}(\mu) - \mathbf{E}[\mathbf{R}_{\mathbb{T}}(H) ] \rvert \qquad H \sim \mathbb{Q} \\
%     & \leq \mathbf{E} [\lvert \mathbf{R}_{\mathbb{T}}(\mu) - \mathbf{R}_{\mathbb{T}}(H) \rvert] \qquad \text{(Jensen's Inequality)} \\
%     & = \mathbf{E} [\lvert \mathbf{E}\{ 1[\mu(X) \neq Y] - 1[H(X) \neq Y] \mid H \} \rvert] \qquad \text{(Linearity and Tower Rule)} \\
%     & \leq \mathbf{E}[\mathbf{E}\{\lvert 1[\mu(X) \neq Y] - 1[H(X) \neq Y] \rvert \mid H \} ] \qquad\text{(Jensen's Inequality)} \\
%     & \leq ??
% \end{split}
% \end{equation}
% \end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Extended Related Works}
\label{sec:ext_related}
Here, we give an extended version of the related works (Section~\ref{sec:related}). First, we discuss theoretical adaptation work. We compartmentalize relevant contributions based on some key-terms common to adaptation bounds. Following this, we discuss related works in PAC-Bayes, where we give a more in-depth history of these bounds.

\paragraph{Divergence} Many bounds use a modified, or generalized, divergence term. \citet{apx_mansour2009domain} define divergence for \textit{any} loss function (i.e., in addition to the 01-loss we consider). 
% To more correctly approximate divergence in DA algorithms, \citet{apx_zhang2019bridging} build on this work to incorporate model-dependent divergence in a margin-loss bound. Independently, \citet{apx_kuroki2019unsupervised} also give a model-dependent divergence for a bound on 01-loss. 
With some restrictions on hypothesis space, \citet{apx_redko2017theoretical} show a Wasserstein metric may be used to bound error. \citet{apx_shen2018wasserstein} extend this to more general settings. As noted by \citet{apx_redko2020ASO}, bounds based on Wasserstein metric imply bounds based on MMD \citep{apx_gretton2012kernel} due to a general relationship between the two. \citet{apx_johansson2019support} give another bound based on an integral probability metric. Note, none of these works consider approximation of divergences used to bound 01-loss in multiclass settings. In this regard, the closest work to ours is \citet{apx_zhang2019bridging} who approximate a divergence used to bound a multiclass \textit{margin} loss, which in turn, bounds the 01-loss we consider. As noted, the primary difference between our work and the work of \citet{apx_zhang2019bridging} is the use of uniform sample-complexity in the latter. Possibly, bounds in the latter could be extended to PAC-Bayesian contexts as well, but our choice of divergences allows us to work directly with 01-loss and avoid any loosening of the bound via the margin penalty.  

\paragraph{Adaptability} Besides requiring a small adaptability term, some theoretical DA works consider other possible assumptions. For example, a covariate shift assumption can be made: the marginal feature distributions disagree, but the feature-conditional label distributions are identical. This assumption is useful, for example, in designing model-selection algorithms \citep{apx_sugiyama2007covariate, apx_you2019towards}, but \citet{apx_david2010impossibility} show this assumption (on its own) is \textit{not} enough for the general DA problem. Another frequent assumption is label-shift: the marginal label distributions disagree, but the label-conditional feature distributions remain the same. As mentioned, \citet{apx_zhao2019learning} show failure-cases in this context, while \citet{apx_lipton2018detecting} propose techniques for detecting and correcting shift in this case. Similarly, \citet{apx_tachet2020domain} propose \textit{generalized} label-shift and motivate new algorithms in this context. The DA problem can also be modeled through causal graphs \citep{apx_zhang2015multi, apx_magliacane2018domain} and some extensions to DA consider a meta-distribution over targets \citep{apx_blanchard2021domain, apx_albuquerque2019adversarial, apx_deng2020representation}. Notably, most assumptions are untestable in practice, but not many works consider this. As far as we are aware, ours is the first work to use a sample-dependent adaptability term, which improves estimation in empirical study.

%\paragraph{Our Work} In light of this, one of the primary contributions of our work is to modify the adaptability term so it is easy to estimate in controlled environments (i.e., with access to target labels). This allows us not only to study the adaptability term itself, but also to lowerbound error when approximating divergence. As noted, we use this lowerbound to empirically study techniques a novel, non-trivial extension of an approximation scheme proposed by \citet{apx_ben2010theory} in the binary case. We also study a slight generalization of an approximation scheme for a model-dependent divergence \citep{apx_kuroki2019unsupervised, apx_zhang2019bridging}.

\paragraph{PAC-Bayes} 
% Here, we cover relevant tools in PAC-Bayes and finish with a discussion of PAC-Bayes in DA. 
%We give an (incomplete) overview of the framework, focusing primarily on the tools we employ and other uses in DA. 
For completeness, besides what is discussed here, readers are directed to the work of \citet{apx_catoni2007pac}, \citet{apx_mcallester2013pac}, \citet{apx_germain2009pac, apx_germain2015risk}, and the primer by \citet{apx_guedj2019primer}. While PAC-Bayes is often attributed to \citet{apx_mcallester1999some} with early ideas by \citet{apx_shawe1997pac}, the particular bound we use is due to \citet{apx_maurer2004note}. A similar result was first shown by \citet{apx_langford2001bounds} for 01-loss. In experiments, we use data-dependent priors, perhaps first conceptualized by \citet{apx_ambroladze2007tighter, apx_parrado2012pac}. Besides the previously discussed work of Germain et al., PAC-Bayes has also been used in theories for transfer learning \citep{apx_li2007bayesian, apx_mcnamara2017risk}. As mentioned, our bounds are the first PAC-Bayesian multiclass adaptation bounds.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experimental Details}
\label{sec:exp_details}
\subsection{Datasets and Models}
\label{sec:datasets_and_models}
As noted in the main text, we consider a collection of common adaptation datasets from both computer vision and NLP. Each dataset consists of a number of component \textit{domains} which are themselves distinct datasets that all share a common label space. In this way, we can simulate transfer of some model from one domain to another. The datasets and models we consider are as follows:
\begin{enumerate}
     \item \textbf{Digits}: Digits consists of a collection of digit classification datasets including: USPS \citep{apx_uspsdataset}, MNIST \citep{apx_lecun-mnisthandwrittendigit-2010}, and SVHN \citep{apx_netzer2011reading}. We use only the training sets. The number of images in each is about 7K, 60K, and 70K, respectively. We select $\mathcal{X}$ to be the space of 28$\times$28 grayscale images (i.e., the original feature space for MNIST). For USPS and SVHN, this is accomplished through image transformation. The label space $\mathcal{Y}$ consists of the digits 0-9. As far as we are aware, this collection was first used by \citet{apx_ganin2015unsupervised}. For this task, we consider $\mathcal{H}$ to be a space of CNNs of a fixed 4-layer architecture.
     \item \textbf{PACS}: PACS is an image-classification, domain generalization dataset where each domain has a different style. It was proposed by \citet{apx_li2017deeper_PACS} to be a more challenging task compared to existing generalization datasets. The domains consist of images in the style of: Photo, Art Painting, Cartoon, or Sketch. There are about 10K total labeled images with some slight imbalance in the likelihood of each style. The label space $\mathcal{Y}$ consists of 7 common object categories: dog, elephant, giraffe, guitar, horse, house, and person. The feature space $\mathcal{X}$ is selected to be the space of real-vectors of dimension $2048$; i.e., $\mathbb{R}^{2048}$. To map to the feature space from an image, we use the hidden-layer output of an image passed through a pre-trained ResNet-50 \citep{apx_he2016deep}. For this task, we consider $\mathcal{H}$ to be either the space of linear classifiers or the space of 4-layer FCNs of a fixed architecture (fully-connected networks).
     \item \textbf{Office-Home}: Office-Home was originally proposed by \citet{apx_venkateswara2017deep}, but we use the smaller preprocessed version given by \citet{apx_zhou2020deep}. The dataset is similar to PACS. It also contains 4 different styles as its component domains across about 15K total images: Art, Clipart, Product, and Real-World. Unlike PACS, it has a much larger number of classes. In particular, the label space $\mathcal{Y}$ contains 65 categories of different daily objects. Like PACS, we use the outputs of ResNet-50 to map to the real vector space $\mathcal{X}$. We let $\mathcal{H}$ be either a linear model or a 4-layer FCN as before.
     \item \textbf{Amazon Reviews}: Amazon Reviews is a text-classification dataset introduced by \citet{apx_blitzer2007biographies}. We use the Books, DVD, Electronics, and Kitchen domains preprocessed as in Blitzer et al. This totals about 4000 reviews which are labeled as having positive or negative sentiment. The feature space $\mathcal{X}$ is the space $\mathbb{N}^{4096}$; i.e., the space of bag-of-word representations. For each review, the non-zero vector components correspond to counts for words found within the review. Implicitly, this limits our vocabulary to the 4095 most frequent words and leaves one special token for out-of-vocabulary words. As noted, the label space $\mathcal{Y}$ is a binary space whose elements denote the sentiment of the review. We let $\mathcal{H}$ be either a linear model or a 4-layer FCN as before.
     \item \textbf{Discourse A (PDTB Labels)}: The Penn Discourse Treebank (PDTB) \citep{apx_prasad2008penn} is an NLP dataset containing a subset of Wall Street Journal articles from the Penn Treebank \citep{apx_marcus1993building} which are tagged with shallow discourse coherence relations (i.e., relations that hold only between the argument pairs and do not have any hierarchy or graph structure). These coherence relations can be explicitly signaled by discourse \emph{connectives} such as \emph{and}, \emph{so}, and \emph{but}, or could require the insertion of an \emph{implicit} connective. In this paper, we focus on the task of implicit discourse sense classification which is the most difficult task for discourse parsers. To form a DA dataset, we also used implicit relations from two parallel corpora: the TED-MDB \citep{apx_zeyrek2020ted} which contains tagged TED talks and the BioDRB \citep{apx_ramesh2010identifying} which contains tagged scientific articles. These three mentioned datasets form our component domains. Our feature space $\mathcal{X}$ is selected to be the space of real-vectors of dimension 728. Within this space, we try three different feature representations for the discourse relations.\footnote{We only experiment \textit{within} each representation type and do not attempt to transfer \textit{across} different representations.} In the first two cases, the feature is made up of the argument pairs which have been concatenated and encoded using a BERT model \citep{apx_devlin2018bert}. We use either the pooled output or the average of the hidden states. In the last case, we use Sentence-BERT \citep{apx_reimers2019sentence} to encode our features. Our label space $\mathcal{Y}$ consists of the 4 level 1 discourse sense classes contained in the Penn Discourse Treebank. We let $\mathcal{H}$ be either a linear model or a 4-layer FCN as before.
     \item \textbf{Discourse B (GUM Labels)}: The GUM corpus \citep{apx_zeldes2017gum} contains text documents from 8 different genres: Academic, Biography, Fiction, Interview, News, Reddit, How-To, and Travel. These genres form our component domains. Documents within the corpus are annotated using the discourse framework of Rhetorical Structure Theory \citep{apx_mann1987rhetorical} in which discourse coherence relations are organized in a hierarchical tree structure. The sense hierarchy used for the GUM corpus is similar to that of the RST Discourse Treebank \citep{apx_carlson2003building}. In order to focus on coherence relations only between two argument pairs (without the additional hierarchical structure), we removed all relations where one or both nodes was not a leaf node. To form the label space, we mapped the twenty GUM labels to the conventional RST discourse treebank top-level labels where only three GUM labels did not have an existing mapping encoded in the RST. We mapped these three in the following manner, following \citet{apx_braud2017cross}: \emph{preparation} to BACKGROUND, \emph{justify} and \emph{motivation} to EXPLANATION, and \emph{solutionhood} to TOPIC-COMMENT. Given this mapping, our final label space $\mathcal{Y}$ consists of the 13 different RST discourse sense classes that were mapped to by the GUM corpus classes. Our features are encoded the same way as the PDTB features. We let $\mathcal{H}$ be either a linear model or a 4-layer FCN as before.
\end{enumerate}

\paragraph{Random Data Splits}
To simulate variability due to sampling and also to consider more mild forms of dataset shift, we split each component domain within each dataset into two disjoint sets of (roughly) equal size. So, for a dataset with 4 component domains, the new number of component domains will be 8. Some of these component domains \textit{should} now follow a fairly similar distribution; i.e., splits coming from the same original component. This process is done randomly and all adaptation scenarios (see Section~\ref{sec:adaptation_pairs}) test 3 different seeds for this split.

\subsection{Model Training for Divergence Approximation, Adaptability Upperbounds, and Simple Algorithm (SA)}\label{sec:simple-training}
We train a number of deterministic models throughout our experimentation; e.g., for divergence approximation, to compute risks for the ranking task, and to compute upperbounds on $\lambda$. To avoid individual parameter selection for more than 12,000 models all trained in our experimentation, we use the optimization parameters given below in most cases. For Gibbs predictors, we use a slightly modified technique which is also discussed below. For the \textbf{DANN} algorithm, we use different parameters which were more carefully selected (details discussed in Section~\ref{sec:dann_details}). While this ``one size fits all'' approach is arguably simplistic, we found these settings worked well for a majority of cases in our preliminary experiments. For divergence approximation and upperbounds on adaptability this is visible in the main text results. In Appendix~\ref{sec:sanity_check}, we also report statistics on the transfer (target) error of some hypotheses trained to minimize error on the source sample (i.e., the Simple Algorithm \textbf{SA}). These provide a sanity check that our simple optimization procedure is indeed selecting non-trivial hypotheses in a majority of cases. Notably, the point of this work is to study global trends rather than to achieve optimal performance on any one dataset. The ``one size fits all'' approach we take is reflective of this; it allows us to use our limited computational resources to study \textit{more} datasets and models, rather than do rigorous parameter search on just a few.

\paragraph{Optimization Parameters} All models are trained using SGD on an NLL loss with momentum set to $0.9$. The NLL loss is sometimes weighted to correctly replicate the importance of multiple risks (e.g., when minimizing a sum of two risks). For example, if we have the objective $\min_h \mathbf{R}_S(h)+ \mathbf{R}_T(h)$ and $S$ has more samples than $T$, the NLL loss will weight examples in $T$ higher to give them equal importance during optimization, as described by the objective.  We start training with a learning rate of $1 \times 10^{-2}$ for 100 epochs. Then, we train for another 50 epochs using a learning rate of $1 \times 10^{-3}$. If a model ever achieves a training error lower than $5 \times 10^{-4}$, we terminate training. In all cases, we use a batch size of $250$.

\paragraph{Gibbs Predictors} To learn a Gibbs predictor (stochastic model) $\mathbb{Q}$ we need to use a slightly different approach. In all cases, $\mathbb{Q}$ will be a multivariate normal distribution with diagonal covariance and we will minimize Gibbs risk on a source sample $S$ with intention to transfer to a target sample $T$. We use PAC-Bayes-by-Backprop (PBB) to learn the parameters of our normal distribution. PBB is an SGD-based technique proposed by \citet{apx_perez2021tighter} to learn stochastic models that optimize PAC-Bayes bounds. The approach requires specification of a particular PAC-Bayes bound to use as the objective and a particular distribution $\mathbb{P}$ to use as the prior. For the former, we use the variational bound proposed by \citet{apx_dziugaite2021role}. For the latter, we use a multivariate normal distribution: the mean is a (trained) deterministic model (i.e., its parameter vector) and the covariance matrix is $\sigma \mathbf{I}$ where $\sigma = 0.01$ and $\mathbf{I}$ is the identity matrix. We train the deterministic model to minimize the error on $S$ using the same optimization parameters discussed previously. Note, this may seem taboo to one familiar with PAC-Bayes, since the prior $\mathbb{P}$ is typically required to be independent of the data used in the bound. Contrary to this, in our setting, it is \textit{perfectly valid} to select $\mathbb{P}$ based on the data in (only) $S$. This is clear in the proofs of Theorem~\ref{thm:pb-bound} and Theorem~\ref{thm:pb-bound-efficient} because the prior $\mathbb{P}$ is only used to bound the generalization gap between $T$ and $\mathbb{T}$. Thus, this choice is reflective of a realistic scenario where one wishes to compute a PAC-Bayes bound with $\mathbb{Q}$. The approach we describe essentially corresponds to the idea of using a data-dependent prior (see Section~\ref{sec:ext_related}). 
The prior $\mathbb{P}$ is learned using data that is not used in any part of the bound which depends on $\mathbb{P}$. For optimization parameters of PBB not discussed here (e.g., learning rate), we default to the previously discussed choices.

\subsection{DANN Model Training}
\label{sec:dann_details}
For \textbf{Digits}, we study a PAC-Bayes variant of the invariant feature learning algorithm \textbf{DANN} (Domain Adversarial Neural Network) proposed by \citet{apx_ganin2015unsupervised}. The output of our variation is a Gibbs predictor $\mathbb{Q}$, so we again employ PBB as discussed above in Section~\ref{sec:simple-training}. While many parameters are similar to those used above -- including the prior and the PAC-Bayes bound --, we highlight some differences here. Most notably, we re-weight the KL divergence in the PAC-Bayes bound by a dampening factor to reduce its regularizing impact during training. For example, if the KL divergence was 46K and the dampening factor was 0.1, the effective KL divergence during training is 4.6K. We explore a range of different dampening factors to get a breadth of different ``complexities'' for interpretation; i.e., this variability produces the movement along the horizontal axes of Figure~\ref{fig:dann-all}. In our experiments, we let the dampening factor range in the set $\{0.1, 0.05, 0.01, 0\}$. We did not use any dampening factor to recover the original PAC-Bayes bound (i.e., 1) because we found this setting to be too restrictive in preliminary experiments. Due to the increased training time involved in this parameter sweep, we down-sampled the \textbf{Digits} dataset discussed above so that neither $S$ nor $T$ have more than 5K examples. We selected the learning rate by manual inspection, varying the learning rate (and number of epochs accordingly) until we did not observe frequent gradient explosion / vanishing. We ended up using an initial learning rate of $1 \times 10^{-3}$ for 112 epochs (75\% of 150) and $5 \times 10^{-4}$ for the remaining 38 epochs. We also reduced the batch size to 128, but most other parameters remained the same. It is important to note that the instability we experienced (i.e., related to gradient explosion / vanishing) is somewhat common when training adversarial methods such as DANN. 
As an additional measure to combat this issue, we slowly eased in the adversarial loss by weighting it using the parameter $\beta_p = 2 / (1 + \exp(-10p))$ where $p$ is the progress ratio of the current epoch in training; e.g., $p = 0.1$ corresponds to epoch 15 out of 150. This approach was first proposed by \citet{apx_ganin2015unsupervised} for the same purpose. Besides the adversarial loss, we also multiply the KL divergence dampening factor by this weight to ease in the regularization component as well. As a final measure, we used multiple restarts with a new neural network initialization (up to 25 attempts), which proved to be the most effective measure. Among about 32K statistics computed during these experiments, only 21 statistics were unable to be computed due to instability. These consisted of restricted model-independent divergences (i.e., using class $[\mathcal{H}\Delta\mathcal{H}]_\mu$) and were ignored in plots. Roughly 5\% of the data points still had ``extreme'' values that did not match any other trend, so we removed these in Figure~\ref{fig:dann-all} to help with visual interpretation. 
\subsection{Adaptation Pairs}
\label{sec:adaptation_pairs}
We now discuss the different adaptation scenarios we consider. Instances of each scenario produce the collection of $(S,T)$ pairs we consider in our histograms in the main text. Recall, we randomly split each component domain into two halves (see Section~\ref{sec:datasets_and_models}). This will be important for understanding our adaptation scenarios. 
\paragraph{Single-Source} For all datasets except \textbf{Discourse B}, we consider a single-source adaptation scenario: each component domain in a dataset is paired with each distinct component domain. So, for a dataset with 8 components, this forms 64 $(S,T)$ pairs. For example, one pair might take $S$ to be the first random half of SVHN and $T$ to be the first random half of USPS. Another pair might take $S$ to be the first random half of SVHN and $T$ to be the \textit{second} random half of SVHN. So, as we see from this example, this implies that components derived from the same domain (i.e., through our random splitting procedure) will be paired. Note, these should follow a fairly similar (or identical) distribution. This is purposeful and provides a number of instances of \textbf{within-distribution} shift. These milder forms of shift allow us to test a broader range of realistic scenarios. The random splitting procedure also allows us to test variability in the outcome of a transfer task due to sampling; e.g., the first and second random half of SVHN will both be paired with every other dataset. 
\paragraph{Multi-Source} We also consider multi-source scenarios for all datasets except \textbf{Discourse A}. In these cases, we group all but one component of the dataset into a single pooled sample. The single component which was left out is chosen to be the target. So, for a dataset with 4 components, this forms 4 $(S,T)$ pairs. For example, for PACS, one pair might take $S$ to be the union of Art, Cartoon, and Sketch while $T$ consists of only Photo. \textbf{Importantly}, we only use \textit{one} of the random splits from each component type; e.g., for PACS, although we split photo into two disjoint sets, we only use one of these two sets. Otherwise, in every $(S,T)$ pair, $S$ would contain some data coming from a similar distribution as $T$. Informally speaking, this is likely to weaken the adaptation difficulty. Note, the adaptation bounds we give implicitly cover multi-source contexts, since we can view the single source $\mathbb{S}$ as a mixture distribution.
\paragraph{Digits-Specific Scenarios} The \textbf{Digits} dataset is particularly interesting because the feature space $\mathcal{X}$ is well-understood by humans. Thus, we can use our experience to design some natural distribution shifts. In particular, we consider the case where $S$ is some component of \textbf{Digits} and $T$ is the same sample except every image is randomly rotated up to $360^\circ$. We also consider the case where $S$ is some component of \textbf{Digits} and $T$ is the same sample except every image is blurred with random white noise. We can also consider a very unnatural shift. In particular, we consider transfer to randomly generated data. Here, $S$ is some component of \textbf{Digits} (as before) and every image in $T$ is a $28 \times 28$ grid of randomly generated pixels which is assigned a random label. For these scenarios, we use the entirety of the components in the \textbf{Digits} sets without doing any random splitting.

\subsection{Details for Figure~\ref{fig:lambda}}\label{sec:exp_details_ada}
Each datum in Figure~\ref{fig:lambda} corresponds to an upperbound for one of the adaptation pairs described in Section~\ref{sec:adaptation_pairs} using one of the compatible hypothesis spaces described in Section~\ref{sec:datasets_and_models}. We describe the process for computing each type of upperbound below. In all cases, we compute the upperbound using 3 different random seeds; i.e., this will affect things like the model-training and subsequently the final bound. We report the smallest upperbound of these seeds. This is logical since the smallest upperbound is still a valid upperbound. In case of $\lambda$, this is actually overly optimistic since the confidence parameter should be changed to account for all 3 bounds.  
% The final adaptability term (i.e., proposed by Germain et al.) is model-dependent, so it too will change based on the seed. For consistency in presentation, we also report the smallest upperbound in this case, but remark, this biases our results for this term. In a more correct experimental setup, we expect this term to actually be larger. Thus, our interpretation of the results is unaffected by this induced bias on the adaptability term of Germain et al.

\paragraph{Upperbound for $\tilde{\lambda}$} This bound is computed just as described in the main text. For each adaptation pair $(S,T)$ and each hypothesis space $\mathcal{H}$ which is compatible with $S$ and $T$, we train a model to minimize the summed risks on $S$ and $T$ using the approach described in Section~\ref{sec:simple-training}. If this approach returns the hypothesis $h$, we report $\mathbf{R}_S(h) + \mathbf{R}_T(h)$. As noted in the main text, this is a valid upperbound for $\tilde{\lambda}$.

\paragraph{Upperbound for $\lambda$}
 For each adaptation pair $(S,T)$ and each hypothesis space $\mathcal{H}$ which is compatible with $S$ and $T$, we randomly split $S$ and $T$ using an 80/20 train/test split. Denote train splits for $S$ and $T$ by $S_\mathrm{tr}$ and $T_\mathrm{tr}$, respectively. Denote the test splits for $S$ and $T$ by $S_\mathrm{ho}$ and $T_\mathrm{ho}$, respectively. We train a model to minimize summed risks on $S_\mathrm{tr}$ and  $T_\mathrm{tr}$ using the approach described in Section~\ref{sec:simple-training}. If this approach returns the hypothesis $h$, we then report the quantity
 \begin{equation}
     \mathbf{R}_{S_\mathrm{ho}}(h) + \mathbf{R}_{T_\mathrm{ho}}(h) + \sqrt{\ln (4 / \delta) / (2m)} + \sqrt{\ln (4 / \delta) / (2n)} 
 \end{equation}
where $m = |T_\mathrm{ho}|$, $n = |S_\mathrm{ho}|$, and $\delta = 0.05$. This is a valid upperbound for $\lambda$ which holds (i.e., prior to observing data) with probability $1 - \delta$. It is easily derived using Hoeffding's Inequality to bound both $\Delta_h(S_\mathrm{ho}, \mathbb{S})$ and $\Delta_h(T_\mathrm{ho}, \mathbb{T})$, and then, using Boole's Inequality to combine the bounds.
\subsection{Details for Table~\ref{tab:divapprox}}\label{sec:exp_details_divapprox}
Each datum used to compute the correlations in Table~\ref{tab:divapprox} corresponds to a divergence approximation for an $(S,T,h)$ triple. The datasets $S$ and $T$ are given by one of the adaptation pairs described in Section~\ref{sec:adaptation_pairs}, and conceptually, we pick $h$ to be the hypothesis whose error we would like to bound in the simulated transfer from $S$ to $T$. Specifically, for each adaptation pair $(S,T)$ and each compatible hypothesis space $\mathcal{H}$, we select $h$ using \textbf{SA}. We then approximate either the $\mathcal{H}\Delta\mathcal{H}$- or $h\Delta\mathcal{H}$-divergence using the (appropriate) technique described in Section~\ref{sec:method_approx}. The Spearman rank correlation we report compares the $\mathcal{H}\Delta\mathcal{H}$- and the $h\Delta\mathcal{H}$-divergence to the error-gap $\Delta_h(S,T)$ defined in Eq.~\eqref{eqn:error_gap} over all adaptation pairs $(S,T)$ in Section~\ref{sec:adaptation_pairs}, all compatible hypothesis spaces $\mathcal{H}$ discussed in Section~\ref{sec:datasets_and_models}, and all 3 seeds.
\subsection{Details for Estimation of Flatness}\label{sec:exp_details_flatness}
Each datum discussed in the main text paragraph \textbf{Do Flat Regions Transfer?} corresponds to an estimate for an $(S,T,\mathbb{Q})$ triple. The datasets $S$ and $T$ are given by one of the adaptation pairs described above, and as before, we pick $\mathbb{Q}$ to be the Gibbs predictor whose error we would like to bound in the simulated transfer from $S$ to $T$ (i.e., using \textbf{SA}). We then estimate as described in the main text. For each $(S,T)$ pair and compatible hypothesis space $\mathcal{H}$, we repeat this procedure with 3 seeds to control for variability in the selection of $\mathbb{Q}$. We report all seeds in the histogram in Figure~\ref{fig:rho-hist}. Also, the mean and standard deviation reported in the main text are computed using all data in the histogram.
\subsection{Details for Figure~\ref{fig:dann-all}}
Each datum in Figure~\ref{fig:dann-all} corresponds to statistics computed for an $(S, T, \mathbb{Q})$ triple. As noted, $S$ and $T$ are restricted to be samples from the \textbf{Digits} dataset. Further, we only consider out-of-distribution adaptation scenarios as indicated in each subfigure title. The Gibbs predictor $\mathbb{Q}$ is selected using the training details discussed in Section~\ref{sec:dann_details} and the statistics are reported as described in Thm.~\ref{thm:pb-bound-efficient} and Cor.~\ref{cor:pb-bound-efficient}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Additional Experimental Results}
\label{sec:ext_results}
\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{figures/hued_groups_lambda.png}
    \caption{Boxplots for upperbounds on $\tilde{\lambda}$ for individual datasets. The appendage \texttt{\_m} denotes the \textbf{multi-source} setup, otherwise it is \textbf{single-source}. For digits, we prepend \texttt{r\_}, \texttt{n\_}, or \texttt{f\_} to denote transfer to rotated, noisy, or randomly generated (fake) data as discussed in Section~\ref{sec:adaptation_pairs}. As may be inferred, \texttt{pdtb} corresponds to \textbf{Discourse A} and \texttt{gum} corresponds to \textbf{Discourse B}. Appendages for these indicate the type of BERT features used.}
    \label{fig:grouped_ubs}
\end{figure}
\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{figures/hued_groups_transfer_error.png}
    \caption{Dataset names are as in Figure~\ref{fig:grouped_ubs}. Hues correspond to within-distribution \textbf{WD} and out-of-distribution \textbf{OOD} based on whether the target $T$ is drawn from the same component domain as the source $S$. Notice, \textbf{WD} is not available for multi-source setups and certain Digits setups. This is simply a property of these adaptation scenarios. In these cases, we still report the \textbf{OOD} error. The appendage \texttt{-r} denotes the hypothesis is randomly initialized and not trained. These experiments provide a point of reference for comparison.}
    \label{fig:grouped_transfer_error}
\end{figure}

\subsection{Upperbounds on Adaptability (Dataset Specific)}\label{sec:adaptability_boxp}
We also found it interesting to consider our sample-dependent adaptability in a more problem-specific context. This reveals to us that \textbf{PACS} and \textbf{Office-Home} have the larger upperbounds of computer vision datasets. It also reveals that most upperbounds above about $0.3$ are due to NLP datasets. Informally speaking, this is sensible as the NLP tasks we consider have higher uncertainty in the labeling functions. Results are shown in Figure~\ref{fig:grouped_ubs}. We use the same aggregate data as Figure~\ref{fig:lambda}.

\subsection{Transfer Error of Trained and Random Hypotheses}\label{sec:sanity_check}
In Figure~\ref{fig:grouped_transfer_error}, we show transfer error (i.e., the error on $T$) of hypotheses trained using \textbf{SA}. These are precisely the hypotheses used to compute the error-gap $\Delta_h(S,T)$ when reporting Spearman rank correlation. As noted, we use a standardized optimization procedure to forego parameter selection on the more than 12,000 models we train. Thus, we are not interested in optimal performance in any case. Instead, Figure~\ref{fig:grouped_transfer_error} primarily serves as a sanity check to make sure the hypotheses we use are somewhat reflective of those which might be used in a practical scenario. That is, we would like to confirm that these hypotheses have learned something non-trivial (at least on the source domain). To illustrate this, for all datasets, we also report the error of randomly initialized hypotheses which have not been trained. This provides a point of reference. It is easy to see our trained hypotheses are typically far more effective than the untrained random initializations.

For the discourse datasets with PDTB labels, we observe the error of the random initializations is somewhat harder to interpret. For this reason we compare to a related work. In particular, \citet{apx_kishimoto-etal-2020-adapting} achieve an error rate of $\approx 0.38$ on a comparable (within-distribution) discourse sense classification task. We observe our within-distribution PDTB results (Discourse A) are frequently better than this.

\subsection{Comparison to Germain et al.}
\label{sec:comp2germain}
% \begin{figure}
%     \centering
%     \includegraphics[width=.5\columnwidth]{figures/rho.png}
%     \caption{Estimates of $\rho$ as described in main text and Section~\ref{sec:exp_details_flatness}. Each datum describes unique $(S, T, \mathbb{Q})$. Visualization confirms that $\rho$ is very often small as discussed in the main text.}
%     \label{fig:rho-hist}
% \end{figure}
\begin{figure}
    \centering
    \includegraphics[width=.7\columnwidth]{figures/adaptability.png}
    \caption{\small Upperbounds for sample-dependent (left), sample-independent (center), and binary PAC-Bayes variant (right) of adaptability. Each datum describes unique $(S, T, \mathcal{H})$.}
    \label{fig:g_lambda}
\end{figure}
% \textbf{Comparison to Germain et al.}\hspace{.5em}
While we know, theoretically, the PAC-Bayes bound of \citet{apx_germain2020pac} is not valid for the multiclass setting, we also study this question empirically. In Figure~\ref{fig:g_lambda}, we present a sample-dependent variation of the adaptability proposed by Germain et al.\footnote{Some proof-techniques we employ in Thm.~\ref{thm:pb-bound} can be applied to derive a sample-dependent variation of Thm.~\ref{thm:germain2020pac}, instead.} because of our previous (positive) results on sample-dependence. Even with this upgrade, the adaptability of Germain et al.\ is not able to capture the same useful information as our multiclass sample-dependent adaptability. Further, conducting a similar experiment as in Table~\ref{tab:divapprox}, we find the divergence term of Germain et al.\ has low rank correlation (\textbf{0.27} on all data). These empirical results confirm the hypothesis of \citet{apx_germain2020pac} that their PAC-Bayesian theory of adaptation in binary settings is not easily extended to the multiclass setting. This is especially true in comparison to the positive outcomes observed under our theory. The experimental details for these results are provided below.
\paragraph{Upperbound for Adaptability of \citet{apx_germain2020pac}}
The histogram in Figure~\ref{fig:g_lambda} shows upperbounds on a sample-dependent variation of the adaptability of \citet{apx_germain2020pac} given in Thm.~\ref{thm:germain2020pac}. We compute the upperbounds using the same setup as described in Appendix~\ref{sec:exp_details_ada}. Because we do not have full access to $\mathbb{Q}$, we instead estimate this term with a finite sample $Q = (H_i)_i \sim \mathbb{Q}^k$.
Using Linearity of $\mathbf{E}$ and Hoeffding's Inequality, we have the following bound on Germain et al.'s adaptability (our sample-dependent variant) with i.i.d. sample $(H_{i,1}, H_{i,2})_i \sim (\mathbb{Q} \times \mathbb{Q})^k$
\begin{equation}\small\label{eqn:lambda_rho_ub}
\begin{split}
     \Bigg \lvert \ k^{-1}\sum\nolimits_i \underset{{X,Y}}{\mathbf{E}}[1[H_{i,1}(X) \neq Y] \cdot 1[H_{i,2}(X) \neq Y]] - k^{-1}\sum\nolimits_i \underset{{\tilde{X},\tilde{Y}}}{\mathbf{E}}[1[H_{i,1}(\tilde{X}) \neq \tilde{Y}] \cdot 1[H_{i,2}(\tilde{X}) \neq \tilde{Y}]] \ \Bigg \rvert
     + \sqrt{\frac{2 \ln (2 / \delta)}{k}}.
 \end{split}
\end{equation}
Here, we pick $\delta = 0.05$ as before and use $k = 100$. This gives a valid bound which holds with probability $1 - \delta$ (i.e., prior to seeing the samples from $\mathbb{Q}$). We select the Gibbs predictor $\mathbb{Q}$ using \textbf{SA} as before.
\paragraph{Divergence of \citet{apx_germain2020pac}} We also approximate the divergence of Germain et al. to compare to the $\mathcal{H}\Delta\mathcal{H}$- and $h\Delta\mathcal{H}$-divergence in terms of model selection. As noted, the comparison is made through Spearman rank correlation with error-gap $\Delta$ using the same experimental setup as in Appendix~\ref{sec:exp_details_divapprox}. Since we are not aware of an analytic solution for this divergence (in case of neural networks), we approximate the divergence term of Germain et al. using a random sample $Q \sim \mathbb{Q}^k$ with $k = 100$. Here, $\mathbb{Q}$ is a distribution over $\mathcal{H}$ selected, again, using \textbf{SA}. We do this for each adaptation pair $(S,T)$ and each compatible hypothesis space $\mathcal{H}$. The final reported correlation compares the approximated divergence to $\Delta_Q(S,T)$ over all adaptation pairs, all compatible hypothesis spaces, and all 3 seeds.
\subsection{Additional DANN Results}\label{sec:ext_dann_res}
\begin{figure}
    \centering
    \includegraphics[width=.7\columnwidth]{figures/rho-2.png}
    \caption{Estimates for $\rho$ while using \textbf{DANN} for various choices of the prior variance parameter $\sigma$. Solid line shows median, while scatter shows 95\% or more of data. Each datum describes unique $(S, T, \mathbb{Q})$. As a function of complexity, we expect $\rho$ to be smaller for more complex solutions. For example, in the formula for $\mathrm{KL}$-divergence between Gaussian distributions, similarly concentrated distributions will have high $\mathrm{KL}$-divergence as their variances decrease (i.e., holding all else constant). Sensibly, smaller variance gives more concentrated $\mathbb{Q}$, which helps to ensure small $\rho$ as well. This relationship (between $\rho$ and complexity) is observed in the above and is similar to our findings in the main text on adaptability after \textbf{DANN}.}
    \label{fig:rho-dann}
\end{figure}
\begin{figure}
    \centering
    \includegraphics[width=.7\columnwidth]{figures/err.png}
    \caption{Estimates of target error $\mathbf{R}_T(\mathbb{Q})$ (after \textbf{DANN}) compared to prior error $\mathbf{R}_T(\mathbb{P})$ (before \textbf{DANN}, i.e. using \textbf{SA}). Solid line shows median, while scatter shows 95\% or more of data. Each datum describes unique $(S, T, \mathbb{Q})$. More complex solutions achieve lower error as expected. \textbf{DANN} is effective at reducing target error in many cases.}
    \label{fig:err}
\end{figure}
In Figure~\ref{fig:rho-dann}, we show the effect of \textbf{DANN} on $\rho$. This confirms our takeaway in the main text that, as a function of sample complexity, $\rho$ behaves like adaptability. Also, we see that increasing the prior variance makes flatness less likely, since this indirectly controls the variance of $\mathbb{Q}$, which is regularized to be similar to the prior via PBB (see Section~\ref{sec:dann_details}). Intuitively, it is easier to find small flat-minima than very large flat-minima. In Figure~\ref{fig:err}, we show the target (transfer) error of solutions trained using \textbf{DANN} compared to solutions trained using \textbf{SA}. The error rates may appear unusually high to familiar readers (e.g., compared to \citet{apx_ganin2015unsupervised}), but this is likely a result of the down-sampling we do to save training time (see Section~\ref{sec:dann_details}). Since \textbf{DANN} is a more sophisticated adaptation algorithm, we expect it to learn more about the target than \textbf{SA}, and indeed, it does in many contexts. Thus, this result also serves to validate our empirical setup for applying \textbf{DANN}. Lastly, as before, we can interpret the target error as a function of sample complexity: unconstrained solutions are able to achieve lower error than constrained solutions. Due to the higher complexity, these solutions may not generalize well.

\subsection{Detailed Visualization of Data in Table~\ref{tab:divapprox}}
In some instances, we observe poor correlation of the proposed divergence terms with the error-gap. For example, in Table~\ref{tab:divapprox}, poor correlation is observed on the \textbf{Digits} dataset. Poor correlation is also observed on the \textbf{PACS+OH} dataset for the model-dependent divergence. To study and understand these errors in detail, we visualize heatmaps (i.e., 2d histograms) in Figure~\ref{fig:heatmaps}. Histogram counts illustrate counts of the individual data pairs used to compute correlation in Table~\ref{tab:divapprox}; i.e., between a particular approximation of divergence and the corresponding error-gap. Please see Section~\ref{sec:exp_details_divapprox} for details on the data used for Table~\ref{tab:divapprox}.

Results show the poor performance on the \textbf{Digits} dataset is likely due to insensitivity of the divergence approximation to changes in data sample and hypothesis. In particular, there is significant concentration of the divergence approximations near 1. In the case of the model-independent divergence, we also observe some artificially low approximations (i.e., near 0) compared to the error-gap; this illustrates poor approximation. On the \textbf{PACS+OH} dataset, the poor performance of the model-dependent divergence is best explained by comparing to the data-points for the model-independent divergence. While the model-dependent divergence is more variable and sensitive to data/model changes as one would expect, we see a high density of anti-correlated measurements on the \textbf{PACS+OH} data. Specifically, there is a cluster of cases where the divergence is near 1 with absolute change in error about 0.4 and another cluster of cases where the divergence is only about 0.8 with absolute change in error higher at 0.6. Aptly, the divergence does not perform well at ranking in this case.

These more nuanced results speak to the conservative nature of bounds (and their contained terms), in general. In particular, upperbounds are subject to ``false positives''---in which the actual bound is high, but the quantity controlled is low; e.g., 1 is a valid bound on 1 and so are 2, 10, 50, and 1000. While this undesirable property impacts the \textbf{Digits} and \textbf{PACS+OH} cases, it is also worth mentioning that the divergences perform well in many other cases. Depending on the application, a conservative measure of performance change may actually be desirable.

\begin{figure}
    \centering
    \includegraphics[width=\columnwidth]{figures/heatmaps.png}
    \caption{Heatmap (i.e., 2d histogram) showing counts for data used to compute correlations in Table~\ref{tab:divapprox}. \textbf{images} corresponds to the \textbf{PACS+OH} dataset.}
    \label{fig:heatmaps}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliography{sicilia_277}
\end{document}
