% \documentclass[accepted]{uai2025}
% \documentclass{uai2025} % for initial submission
% \documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
\documentclass[accepted,mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{pifont}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{accents}
\usepackage{subfigure}
\usepackage{hyperref}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{subcaption}
\usepackage{amsthm}
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage[textsize=tiny]{todonotes}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage[]{authblk}
% \newcommand{\theHalgorithm}{\arabic{algorithm}}
% \renewcommand{\thealgorithm}{}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\input{math_commands}

% \newcommand{\gD}{\gD}
% \newcommand{\gS}{\gS}
% \newcommand{\gT}{\gT}
\newcommand{\disloss}{\ell_{\textrm{dis}}}
\newcommand{\logloss}{\ell_{\textrm{logistic}}}
\newcommand{\hdh}{\ensuremath{\gH\Delta\gH}}
\newcommand{\disdis}{\textsc{Dis}$^2$\xspace}
\newcommand{\odd}{\textsc{ODD}\xspace}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\begin{document}

\title{ODD: Overlap-aware Estimation of Model Performance under Distribution Shift}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<amishr24@jhu.edu>?Subject=ODD paper}{Aayush Mishra}}
\author[1]{Anqi Liu}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science, \protect\\
    Johns Hopkins University,
    Baltimore, Maryland, USA
     \protect\\
    \texttt{\{amishr24, aliu.cs\}@jhu.edu}
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
\maketitle

\begin{abstract}
Reliable and accurate estimation of the error of an ML model in unseen test domains is an important problem for safe intelligent systems. Prior work uses \textit{disagreement discrepancy} (\disdis) to derive practical error bounds under distribution shifts. It optimizes for a maximally disagreeing classifier on the target domain to bound the error of a given source classifier. Although this approach offers a reliable and competitively accurate estimate of the target error, we identify a problem in this approach which causes the disagreement discrepancy objective to compete in the overlapping region between source and target domains.
% , resulting in a looser bound. 
% As the true labeling function is not chosen adversarially with respect to the source classifier, 
With an intuitive assumption that the target disagreement should be no more than the source disagreement in the overlapping region due to high enough support, we devise Overlap-aware Disagreement Discrepancy (\odd). 
% Maximizing \odd only requires disagreement in the non-overlapping target domain, removing the competition. 
Our \odd-based bound uses domain-classifiers to estimate domain-overlap and better predicts target performance than \disdis. We conduct experiments on a wide array of benchmarks to show that our method improves the overall performance-estimation error while remaining valid and reliable. Our code and results are available on \href{https://github.com/aamixsh/odd}{GitHub}.
\end{abstract}

\begin{figure}[t]
    \centering
    % \begin{subfigure}{0.5\textwidth}
    %     \includegraphics[width = \linewidth]{figs/dis2.pdf}
    % \end{subfigure}
    % \hfill
    % \begin{subfigure}{0.5\textwidth}
    %     \includegraphics[width = \linewidth]{figs/odd.pdf}
    % \end{subfigure}
    \includegraphics[width=\linewidth]{figs/fig1.png}
    % \includegraphics[width = 0.49\linewidth]{figs/dis2.pdf}
    % \hfill
    % \includegraphics[width = 0.49\linewidth]{figs/odd.pdf}
    % \caption{Given source $\gS$ and target $\gT$ domains with $p_\gS = \mathcal{N}(-3, 2)$ and $p_\gT = \mathcal{N}(3, 2)$ respectively, we illustrate the difference between our method (\odd) and previous work (\disdis). With \disdis, on maximizing the disagreement discrepancy $\Delta(h, h')$ to find a critic $\Bar{h}$, there is competition in the overlapping region resulting in instability during optimization. Instead, \odd uses the practical insight that the true $y^*$ (and hence the critic) should agree with the source classifier $h$. This allows us to maximize the modified $\Delta(h, \Bar{h}, \alpha)$ which reduces the size of competition region resulting in better optimization and tighter risk estimate. Notice that increasing $\alpha$ beyond $\alpha_0$ will make \odd and \disdis equivalent.}
    \caption{Given a source domain $\gS$, and a classifier $\hat{h}$ trained in this domain, we aim to predict the performance of $\hat{h}$ in a target domain $\gT$ \textit{without labels}. ~\citet{rosenfeld2023almost} learn a worst-case \textit{critic} $\Bar{h}$, which maximally disagrees with $\hat{h}$ in $\gT$ while agreeing with $\hat{h}$ in $\gS$ (\disdis), to bound this performance. But this optimization leads to a competition in the overlapping region. \odd discounts disagreement in the overlapping region making $\Bar{h}$ agree with $\hat{h}$ in this region. We show that \odd tightens the gap between true and predicted performance compared to \disdis, while remaining reliable.}
    \label{fig:fig1}
\end{figure}

\section{Introduction}
\label{sec:intro}
The ability of machine learning models to know when they do not know, is an important characteristic to make them safe for deployment, especially in high-stakes applications like the medical domain. Modern neural networks have an incredible capacity to learn complex functions but they are prone to catastrophic errors on inputs outside of their training distributions. Hence, it is crucial to be able to predict the performance of these models on distributionally shifted test domains.

Many works have attempted to address this problem of test performance prediction~\citep{bendavid2007analysis, bendavid2010theory, mansour2009domain, ben2010impossibility}. These methods provide uniform convergence bounds which are often vacuous in practice because we are interested in bounding the error of a single hypothesis. Other recent works attempt to use unlabeled test samples to make point-wise (per hypothesis) prediction of performance bounds in neural networks~\citep{lu2023predicting, baek2022agreement, garg2022leveraging, guillory2021predicting}. Although these methods achieve low performance prediction error on average, they lack reliability and often overestimate the performance, especially under large distribution shift (when reliability is most important). To address these challenges,~\citet{rosenfeld2023almost} proposed Disagreement Discrepancy (\disdis), which provides almost provable performance bounds using unlabeled test samples. It 
% takes a more optimistic view of the source classifier, and 
trains a \textit{critic} that minimizes its disagreement with a given classifier on the source distribution while maximizing its disagreement on the target domain under a sufficiently expressive hypothesis class. This allows \disdis to assume a worst-case shift in the target domain while remaining faithful to the trained classifier in the source domain. This approach gives non-vacuous bounds that almost always remain valid. 

In this work, we find that \disdis can be further improved using domain-overlap awareness. The agreement with source and disagreement with target creates a tension in the region of domain overlap, resulting in unstable optimization, which leads to more pessimistic critics than necessary. 
% We claim that disagreement in the overlapping region can be discounted.
We are motivated by the intuition that if a classifier was trained on labeled samples from the target domain, its support in the overlapping region would be similar to what the source trained classifier had. Hence, we propose to discount the disagreement of the critic with target overlapping samples through a new training objective. 
% it would have support similar to that of the source domain in the overlapping region. 
% Hence it should not be expected to make adversarially more mistakes than the source trained classifier in this region. 
Leveraging domain classifiers for estimate domain-overlap, our Overlap-aware Disagreement Discrepancy (\odd) removes the instability of optimization in the overlapping region and results in practically tighter performance bounds than \disdis. We visually illustrate our method in~\autoref{fig:fig1}. In summary:
\begin{itemize} 
    \item We derive a general version of the \disdis bound using the notion of ideal joint hypothesis, and show that theoretical performance bounds obtained using \odd are as tight as those obtained using \disdis.
    \item We design an \odd-based disagreement loss to find more \textit{optimistic critics} for practically estimating the performance bounds. 
    % having lower disagreement discrepancy.
    \item On several real datasets and training method combinations, \odd-based bounds are tighter than \disdis-based bounds, and still maintain reliable coverage. 
\end{itemize}

\section{The general \disdis bound}
\label{sec:background}

\paragraph{Setup:} We follow the notation setup used by ~\citet{rosenfeld2023almost}. Let $\gS, \gT$ denote the source and target distributions, respectively, over labeled inputs $(x,y) \in \gX \times \gY$, and let $\hat{\gS}$, $\hat\gT$ denote the empirically observed sets (with cardinality $n_S$ and $n_T$) of samples from these distributions. \textit{In the target domain $\gT$, we observe only the covariates and not the labels.} We let $p_{\gS}$ and $p_{\gT}$ denote the marginal distributions of covariates in the source and target domains. We consider classifiers $h:\gX \to \R^{|\gY|}$ which output a vector of logits ($h(x)_y$ denotes the logits of class $y$ in this vector), and let $\hat{h}$ denote the particular classifier (trained on $\hat{\gS}$) under study for which we want to bound the error in the target domain. We use $\gH$ to denote a hypothesis class of such classifiers. 
% Occasionally, where clear from context, we use $h(x)$ to refer to the argmax logit, i.e. the predicted class. We treat these classifiers as deterministic throughout, though our analysis can easily be extended to probabilistic classifiers and labels. 
For a domain $\gD$ on $\gX$, let $\epsilon_{\gD}(h, h') := \E_{x \sim p_\gD}[\mathbf{1}\{\argmax_y h(x)_y \neq \argmax_y h'(x)_y\}]$ denote the one-hot disagreement between any classifiers $h$ and $h'$ on $\gD$. 

\textbf{The labeling function:} Let $y^*_\gS$ and $y^*_\gT$ denote the true labeling functions in the source and target domains, respectively. \citet{rosenfeld2023almost} assume a fixed $y^*$ to represent the true labeling function for all domains. In contrast, we use the notion of ideal joint hypothesis \citep{bendavid2010theory} to analyze \textit{adaptability} in the general case.

\begin{definition}
\label{def:ideal}
The ideal joint hypothesis is defined as:
\begin{align}
\label{eq:ideal}
    y^* := \argmin_{h \in \gH} \epsilon_{\gS}(h, y^*_\gS) + \epsilon_{\gT}(h, y^*_\gT)
\end{align}
\end{definition}
with the corresponding joint risk defined as $\lambda := \epsilon_{\gS}(y^*) + \epsilon_{\gT}(y^*)$. For brevity, we overload $\epsilon_{\gD}(h)$ to mean $\epsilon_{\gD}(h, y^*_\gD)$, i.e. the 0-1 error of classifier $h$ on distribution $\gD$. If $\lambda$ is high, no $h$ trained on the source domain can be expected to perform well on the target domain (low adaptability). 

% As we do not have access to the target labels, calculating $\lambda$ is not possible. Therefore, we will then restrict our analysis to the practical setting with a common $y^*$ across domains. 

% Given an $h$ trained to approximate $y^*_\gS$, a corresponding $y^*_\gT$ can be chosen adversarially such that $y^*_\gT(x) \neq h(x)$ $\forall$ $x$. Therefore, in the general case, the maximum target risk estimate is always $1$. But, a practically useful risk estimate can be calculated if the shifted labeling functions belong to a restricted hypothesis class. 

Now, \citet{rosenfeld2023almost} define \textit{disagreement discrepancy} (\disdis) as follows.

\begin{definition}
    \disdis $\Delta(h, h')$ is the disagreement between any $h$ and $h'$ on $\gT$ minus their disagreement on $\gS$:
    \begin{align}
    \label{eq:dis2}
        \Delta(h, h') := \epsilon_{\gT}(h, h') - \epsilon_{\gS}(h, h').
    \end{align}
\end{definition}

% By maximizing the disagreement discrepancy between the given classifier ($h$) and a \textit{critic} ($h'$), one can approximate the worst-case behavior of the hypothesis in the target domain. 
This immediately implies the following lemma:

\begin{lemma}
\label{lemma:error-expansion}
    For any classifier $h$,\\
    $\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h) + \Delta(h, y^*) + \lambda$.
\end{lemma}
% \begin{proof}
%     \begin{align*}
%         \epsilon_{\gT}(h) &\leq \epsilon_{\gT}(h, y^*) + \epsilon_{\gT}(y^*) \quad \text{[Triangle Inequality]}\\
%         &= \epsilon_{\gS}(h, y^*) + (\epsilon_{\gT}(h, y^*) - \epsilon_{\gS}(h, y^*)) + \epsilon_{\gT}(y^*)\\
%         &\leq \epsilon_{\gS}(h) + \Delta(h, y^*) + (\epsilon_{\gT}(y^*) + \epsilon_{\gS}(y^*)) \\
%         &= \epsilon_{\gS}(h) +\Delta(h, y^*) + \lambda
%         % &= \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*) + \lambda\\
%         % &= \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*) + \lambda\\
%     \end{align*}
% \end{proof}

The proof can be found in~\autoref{app:proofs}. This gives us a method to bound the target risk in terms of source risk and the discrepancy term $\Delta$. However, as $y^*$ is unknown, we can optimize for an alternate critic $\Bar{h}$ which would act as a proxy for the worst-case $y^*$. For this, ~\citet{rosenfeld2023almost} assume the following:
\begin{assumption}
    \label{ass:dis2concave}
    For a critic $\Bar{h}\in\gH$ which maximizes a concave surrogate to the empirical \disdis, $\Delta(\hat{h}, y^*) \leq \Delta(\hat{h}, \Bar{h})$.
    % and $\underbar{\Delta}(\hat{h}, y^*, \alpha) \leq \underbar{\Delta}(\hat{h}, h', \alpha)$.
\end{assumption}
This assumption is based on the practical observation that $y^*$ is not chosen adversarially with respect to $\hat{h}$. So it is reasonable that there exists another function $h^* \in \gH$ having higher disagreement discrepancy than $y^*$.
% $\Delta(\hat{h}, h^*, \alpha)$ than $\Delta(\hat{h}, y^*, \alpha)$. 
$\Bar{h}$ is a concave surrogate to $h^*$ that we can find empirically (which approaches $h^*$ with increasing sample size). Finally, we have the error bound:
\begin{theorem}[\disdis Bound]
    \label{thm:dis2bound}
    Under~\autoref{ass:dis2concave}, 
    % and an $\alpha$ satisfying~\autoref{ass:overlap}, 
    with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, \Bar{h}) + \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}} + \lambda.
        % &\quad + \underbar{\Delta}(h, h', \alpha) + \lambda.
    \end{align*}
\end{theorem}

Here, $\epsilon_\hat\gS$ and $\hat{\Delta}$ denote the empirical versions of their corresponding population terms. The proof can be found in~\autoref{app:proofs}. Even with a suitable $\Bar{h}$, calculating $\lambda$ still requires access to target labels, which we do not have. Hence, we follow~\citet{rosenfeld2023almost} and fix a common $y^*$ across domains (as it happens in most practical cases). When $y^* = y^*_\gS = y^*_\gT$, $\lambda = 0$ and the above bound collapses exactly to their bound. In the next section, we make \disdis more effective with overlap awareness.



% In the next section, we first improve \disdis and derive how it can be used to calculate tight bounds on target risk.

\section{\odd: Overlap-aware \disdis}
\label{sec:odd}

\textbf{The problem with disagreement discrepancy:} Maximizing \disdis with respect to a given classifier $\hat{h}$ loosely translates to finding a \textit{critic} $\Bar{h}$ which maximally disagrees with $\hat{h}$ in the target domain (first term) while maximally agreeing with $\hat{h}$ in the source domain (second term). 
% This allows for calculating a bound on the risk of $\hat{h}$ in the target domain, as the true $y^*$ can only disagree with $\hat{h}$ as much as this worst case critic. 
This works as expected in the case when the source and target domains are disjoint. However, there is often an overlap between source and target domains in practice. Maximizing \disdis creates a competition between the two terms in the region of \textit{domain overlap}. In this region, the maximizing hypothesis has to both agree and disagree with $\hat{h}$. Hence, by definition, the \disdis objective is unstable in the overlapping region, where both agreement and disagreement can result in the same discrepancy value. This presents a problem during practical optimization and can result in over-conservative critics which disagree with $\hat{h}$ more than necessary. 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{figs/overlap.pdf}
    % \includegraphics[width = 0.49\linewidth]{figs/dis2.pdf}
    % \hfill
    % \includegraphics[width = 0.49\linewidth]{figs/odd.pdf}
    \caption{For source ($\gS$) and target ($\gT$) domains with $p_\gS = \mathcal{N}(-3, 2)$ and $p_\gT = \mathcal{N}(3, 2)$ respectively, the domain overlapping region using $\alpha = 0.02$ is shown.}
    \label{fig:overlap}
\end{figure}

\textbf{Domain overlap:} To tackle this problem, we first define domain overlap using a parameter $\alpha$, which quantifies the minimum density requirement in 
% the domain of interest for an element of the other domain to be considered inside the overlapping region.
both domains of interest for an element to be considered inside the overlapping region. 

\begin{definition}
    \label{def:overlap} The \textit{domain overlap} set between $\gS$ and $\gT$ 
    % are defined as $\gT_\alpha = \{x \in \gT: p_\gS(x) > \alpha, p_\gT(x) > 0\}$ and $\gS_\alpha = \{x \in \gS: p_\gT(x) > \alpha, p_\gS(x) > 0\}$  
    is defined as $\gD_\alpha = \{x \in \gX: p_\gS(x) > \alpha, p_\gT(x) > \alpha\}$ for some $\alpha > 0$.
\end{definition}
Probability distributions (like the normal distribution), may have non-zero support everywhere, and $\alpha = 0$ makes the entire domain space part of the overlapping region. Practically, an overlap is meaningful only in a region with reasonably high support in domains of interest (see~\autoref{fig:overlap}). 

Now, let $\gK$ denote $[\mathbf{1}\{\argmax_y h(x)_y \neq \argmax_y h'(x)_y\}]$. Then, $\epsilon_{\gD}(h, h') := \E_{x \sim p_\gD}\gK = \int_{x \in \gD}p_\gD(x)\gK dx$. This explicit definition of expectation as an integral allows us to think of the density $p_\gD$ and domain of application $\gD$ independently. If the domain $\gD$ is split into two subsets, say $\gA$ and $\gB$ such that $\gA \cap \gB = \phi$ and $\gA \cup \gB = \gD$, then $\epsilon_{\gD}(h, h') = \int_{x \in \gA}p_\gD(x)\gK dx + \int_{x \in \gB}p_\gD(x)\gK dx$.
We use this separability under the same density as a property to define $\epsilon^\gD_\gA(h, h') = \int_{x \in \gA}p_\gD(x)\gK dx$. Using this definition, we have:
\begin{align*}
    % \epsilon_{\gT}(h, h') := \underbrace{\epsilon^\gT_{\gT_\alpha}(h, h')}_\text{Overlap Disagreement} + \underbrace{\epsilon^\gT_{\gT \setminus \gT_\alpha}(h, h')}_\text{Non-Overlap Disagreement}
    \epsilon_{\gT}(h, h') := \underbrace{\epsilon^\gT_{\gD_\alpha}(h, h')}_\text{Overlap Disagreement} + \underbrace{\epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h')}_\text{Non-Overlap Disagreement}
\end{align*}

This breakdown of disagreement in the overlapping and non-overlapping portions allows us rethink the \disdis objective.

\textbf{Overlap-aware Disagreement Discrepancy (\odd):}
As $\hat{h}$ is trained to minimize the risk in $\gS$, its disagreement with $y^*$ (=$y^*_\gS$) in regions with high support is expected to be low. This is why the \disdis objective aims to minimize the disagreement between the critic $\Bar{h}$ and $\hat{h}$ in the source domain. However, \disdis also tries to maximize the disagreement in $\gD_\alpha$, which is a part of the same high source support region. We address this issue by making \disdis overlap aware. 

\begin{definition}
\label{def:odd}
    The \textit{\textbf{O}verlap-aware \textbf{D}isagreement \textbf{D}iscrepancy (\odd)}
    % $\Delta(h, h', \alpha)$ 
    is defined as the disagreement between any $h$ and $h'$ in the \textit{non-overlapping} region of $\gT$ minus their disagreement in the \textit{non-overlapping} region of $\gS$:
    % \setminus D_\alpha$ 
    % minus their disagreement on $\gS$ in the \textit{non-overlapping} region:
    \begin{align}
    \label{eq:odd}
        \Delta(h, h', \alpha) &:= 
        \epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h') - \epsilon^\gS_{\gS \setminus \gD_\alpha}(h, h').
        % \Delta(h, h') - \epsilon^\gT_{\gD_\alpha}(h, h')
    \end{align}
\end{definition}

We also have the corresponding \textit{overlap discrepancy}:

\begin{definition}
\label{def:overlap_dis}
$\underbar{\Delta}(h, h', \alpha) = \epsilon^\gT_{\gD_\alpha}(h, h') - \epsilon^\gS_{\gD_\alpha}(h, h')$.
% $\Delta^{\gD_\alpha}(h) = \epsilon^\gT_{\gD_\alpha}(h, y^*_\gT) - \epsilon^\gS_{\gD_\alpha}(h, y^*_\gS)$.
\end{definition}

% \textcolor{red}{try to find better notation. also repeat h bar again.}

Note that, $\Delta(h, h') (\text{\disdis}) = \Delta(h, h', \alpha) (\text{\odd}) + \underbar{\Delta}(h, h', \alpha)$. 
% \implies \underbar{\Delta}(h, h', \alpha) = 0$, and $\Delta(h, h') (\disdis) = \Delta(h, h', \alpha) (\odd)$. Similarly, for $\alpha = 0$, $\underbar{\Delta}(h, h', \alpha) = \Delta(h, h')$. In both cases, Therefore, high $\alpha$ values imply empty/small overlap sets resulting in lower $\underbar{\Delta}$. Low $\alpha$ values may consider points with very low densities (source or target) inside the overlap, resulting in potentially high $\underbar{\Delta}$. 
With our separation of overlapping and non-overlapping discrepancies, we can rewrite~\autoref{thm:dis2bound} as:

\begin{theorem}
    \label{thm:splitbound}
    Under~\autoref{ass:dis2concave}, 
    with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, \Bar{h}, \alpha) + \underbar{\hat\Delta}(\hat{h}, \Bar{h}, \alpha) + \\
        &\quad\sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}} + \lambda.
        % &\quad + \underbar{\Delta}(h, h', \alpha) + \lambda.
    \end{align*}
\end{theorem}

Remember, $\bar{h}$ is the critic from~\autoref{ass:dis2concave}. This is the most general form of our analysis, where terms with $(\hspace{1pt}\hat{}\hspace{1pt})$ denote the empirical counterparts of their corresponding population terms. As we increase $\alpha$, we decrease the size of $\gD_\alpha$. Starting from $\alpha = 0$ when $ \gD_\alpha = \gS \cup \gT$, until we reach some high $\alpha = \alpha_0$ for which $\gD_\alpha = \phi$. The notion of overlap becomes interesting for some intermediate $\alpha$ values, for which samples from either domain behave similarly on learning algorithms. Next, we discuss why the $\underbar{\Delta}$ term is expected to be small due to domain overlap.

\textbf{Overlap Discrepancy between $\hat{h}$ and $y^*$:} Of all $h\in\gH$, we usually only care to examine ones which are trained to achieve low source risk, i.e. $\hat{h}$ typically has low disagreement with $y^*$ in regions of high support. For a reasonably chosen $\gD_\alpha$, we have similarly high support from the target domain. A classifier trained with ground truth target labels should be expected to have similar disagreement in this region. In fact, there is no reason to expect $\epsilon^\gT_{\gD_\alpha}(\hat{h}, y^*)$ to be any higher than $\epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*)$. Hence, we make the following assumption:

% Given that both domains have reasonably high support in this region, we make the following assumption:

\begin{assumption}
\label{ass:overlap}
    Target disagreement between $\hat{h}$ and $y^*$ is bounded by the source disagreement in the region of domain overlap, i.e., $\epsilon^\gT_{\gD_\alpha}(\hat{h}, y^*) \leq \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*)$.
\end{assumption}

In our experiments with randomly generated datasets(~\autoref{fig:synthetic_main}), we show how the overlap discrepancy $(\epsilon^\gT_{\gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*))$ consistently remains close to zero, supporting our intuition. With~\autoref{ass:overlap} and a common $y^*$ across domains we can derive a practically tighter bound by maximizing \odd. We change~\autoref{ass:dis2concave} to apply only in the non-overlapping region:


% This assumption relies on the practical observation that real-world shifts are often 

% The range of $\underbar{\Delta}$ depends on how similar are $y^*_\gS$ and $y^*_\gT$ in the overlapping region. If they The magnitude of the terms $\epsilon^\gT_{\gD_\alpha}(h, y^*)$ and $\epsilon^\gS_{\gD_\alpha}(h, y^*)$ are deter
% In the extreme cases: when $h = y^*_\gS$, $\underbar{\Delta} = \epsilon^\gT_{\gD_\alpha}(y^*_\gS, y^*)$, and when $h = y^*_\gT$, $\underbar{\Delta} = -\epsilon^\gS_{\gD_\alpha}(y^*_\gS, y^*)$,
% , compared to the source domain. 

% Given that both domains have reasonably high support in a suitably chosen overlapping region, we expect any $h$ to disagree with $y^*$ \textit{similarly} in both domains. Hence, we make the following assumption:

% \begin{assumption}
% \label{ass:overlap}
%     Overlap Discrepancy $\Delta^{\gD_\alpha}(h, y^*) = 0$ for a suitable $0 < \alpha < \alpha_0$.
% \end{assumption}

% Next, we define Overlap-aware disagreement discrepancy.

% From equations~\ref{eq:dis2} and~\ref{eq:odd}, we have $\Delta(h, h') = \Delta(h, h', \alpha) + \underbar{\Delta}(h, h', \alpha)$.
% With \odd, we have the following lemma. 

%%%%%% PREVIOUS PROOF BELOW

% \epsilon_{\gT}(h) &= \epsilon_{\gS}(h) + \epsilon_{\gT}(h) - \epsilon_{\gS}(h)\\
% &= \epsilon_{\gS}(h) + \Delta(h, y^*)\\
% &= \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*)\\
% &\leq \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha).




% \epsilon_{\gT}(h) = \epsilon^\gT_{\gT \setminus \gD_\alpha}(h) + \epsilon^\gT_{\gD_\alpha}(h)\\
% \leq \epsilon^\gT_{\gT \setminus \gD_\alpha}(h) + \epsilon^\gS_{\gD_\alpha}(h)\\
% = \left(\epsilon^\gS_{\gS \setminus \gD_\alpha}(h) + \epsilon^\gS_{\gD_\alpha}(h)\right) + \left(\epsilon^\gT_{\gT \setminus \gD_\alpha}(h) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(h)\right)\\
% = \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*) \quad \left(\text{from Assumption \ref{ass:overlap}}\right)\\
% \leq \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha). \quad \left(\Delta^{\gD_\alpha}(h, y^*) \leq 0\right)

% Therefore, high $\alpha$ values may be useless as they result in empty/small overlap sets. A low $\alpha$ may consider points with very low densities (source or target) inside the overlap. This large overlap set may violate our assumption as the true disagreement may be unclear in low-support regions.  
% with very different source and target support, again rendering them of little use. 
% For some $0 < \alpha < \alpha_0$, $\gD_\alpha$ captures appropriate regions of overlap for which this condition is reasonable. Practically, we expect $\Delta^{\gD_\alpha}(h, y^*) \approx 0$ because there is no reason to believe that source disagreement is any more than target disagreement in the overlapping region. We conducted experiments on randomly generated datasets by measuring $\Delta^{\gD_\alpha}(h, h')$ (the critic) and found it to be $\approx 0$ (see~\autoref{fig:synthetic_main}), suggesting that this is the case.


% As the overlapping region has support in both domains, it is reasonable to expect the learned classifier to agree with the target distribution at least as much as it agrees with the source distribution.
% It is reasonable to assume that the expected disagreement of $\hat{h}$ with $y^*$ in the domain overlap set is no more than the disagreement in  
% In most practical scenarios (as we will see in ~\autoref{sec:exps}),
% particularly under the assumption of covariate shift (the true $y^*$ remains fixed), 
% it is reasonable to expect the true $y^*$ to agree with the given classifier in the overlapping region. This is because, \textbf{we don't expect samples in the overlapping region to be adversarially more noisy} than other regions. Each sample in the source domain contributes to the characteristics of the learnt $h$ and it is reasonable to expect an agreement with the true $y^*$ in the region with source support. This removes the competition between the two terms in the region of overlapping support, making the selection of the critic more stable and results in tighter risk bounds. To derive this bound, we first rewrite $\epsilon_{\gD}(h, h')$.  Now, we define domain overlap.
% Clearly, the target domain $\gT$ can be split into $\gD_\alpha$ and $\gT \setminus \gD_\alpha$. We let $\epsilon^\gT_{\gD_\alpha}(h, h')$ denote the disagreement in the overlapping region of interest under the target distribution. 
% as an improvement over \disdis. 
% With 
% Although we don't use this practically, 
%%%%%%%%%%%%%% Previously
% \begin{definition}
%     The domain \textit{\textbf{O}verlap-aware \textbf{D}isagreement \textbf{D}iscrepancy (\odd)}
%     % $\Delta(h, h', \alpha)$ 
%     is defined as:
%     % the disagreement between $h$ and $h'$ on $\gT \setminus D_\alpha$ minus their disagreement on $\gS$:
%     \begin{align}
%     \label{eq:odd}
%         \Delta(h, h', \alpha) &:= 
%         % \epsilon_{\gT \setminus D_\alpha}(h, h') - \epsilon_{\gS}(h, h').
%         \Delta(h, h') - \epsilon^\gT_{\gD_\alpha}(h, h')
%     \end{align}
% \end{definition}
% \begin{definition}
%     The \textit{\textbf{O}verlap-aware \textbf{D}isagreement \textbf{D}iscrepancy (\odd)}
%     % $\Delta(h, h', \alpha)$ 
%     is defined as the disagreement between $h$ and $h'$ in the \textit{non-overlapping} region of $\gT$ minus their disagreement in the \textit{non-overlapping} region of $\gS$:
%     % \setminus D_\alpha$ 
%     % minus their disagreement on $\gS$ in the \textit{non-overlapping} region:
%     \begin{align}
%     \label{eq:odd}
%         \Delta(h, h', \alpha) &:= 
%         \epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h') - \epsilon^\gS_{\gS \setminus \gD_\alpha}(h, h').
%         % \Delta(h, h') - \epsilon^\gT_{\gD_\alpha}(h, h')
%     \end{align}
% \end{definition}

% Let $\Delta^{\gD_\alpha}(h, h') = \epsilon^\gT_{\gD_\alpha}(h, h') - \epsilon^\gS_{\gD_\alpha}(h, h')$. Then, we have the following lemma.
% \begin{lemma}
% \label{lemma:odd_expansion}
%    $\Delta(h, h', \alpha) = \Delta(h, h') - \Delta^{\gD_\alpha}(h, h')$.
% \end{lemma}
% \begin{proof}
%     By definition, $\epsilon_{\gT}(h) = \epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h') + \epsilon^\gT_{\gD_\alpha}(h, h')$. Hence,
%     \begin{align*}
%         \Delta(h, h', \alpha) = \epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h') - \epsilon^\gS_{\gS \setminus \gD_\alpha}(h, h')\\
%         = \left(\epsilon^\gT_{\gT \setminus \gD_\alpha}(h, h') +  \epsilon^\gT_{\gD_\alpha}(h, h')\right) - \\\left(\epsilon^\gS_{\gS \setminus \gD_\alpha}(h, h') +  \epsilon^\gS_{\gD_\alpha}(h, h')\right) - \\\left(\epsilon^\gT_{\gD_\alpha}(h, h') - \epsilon^\gS_{\gD_\alpha}(h, h')\right)\\
%         = \Delta(h, h') - \Delta^{\gD_\alpha}(h, h').
%     \end{align*}
% \end{proof}
% \begin{assumption}
% \label{ass:overlap}
%     The disagreement between $h$ and $y^*$ in the overlapping target domain is no more than the disagreement between them in the overlapping source domain, i.e., $\epsilon^\gT_{\gD_\alpha}(h, y^*) \leq \epsilon^\gS_{\gD_\alpha}(h, y^*)$.
% \end{assumption}
% This is a practical assumption. As the overlapping region has support in both domains, it is reasonable to expect the learned classifier to agree with the target distribution at least as much as it agrees with the source distribution.
% \begin{lemma}
% \label{lemma:error-expansion}
%     For any classifier $h$, \\$\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha)$.
% \end{lemma}
% \begin{proof}
%     \begin{align*}
%         \epsilon_{\gT}(h) = \epsilon_{\gS}(h) + \left(\epsilon_{\gT}(h) - \epsilon_{\gS}(h)\right)\\
%         = \epsilon_{\gS}(h) + \Delta(h, h')\\
%         = \epsilon_{\gS}(h) + \Delta(h, h', \alpha) + \Delta^{\gD_\alpha}(h, h') \quad \left(\text{from Lemma \ref{lemma:odd_expansion}}\right)\\
%         \leq \epsilon_{\gS}(h) + \Delta(h, h', \alpha). \quad \left(\Delta^{\gD_\alpha}(h, h') \leq 0\right)
%     \end{align*}
% \end{proof}

% From here, we follow \citet{rosenfeld2023almost} to get a bound on the target disagreement based on \odd. 

\begin{assumption}
    \label{ass:oddconcave}
    For an $\Bar{h}\in\gH$ which maximizes a concave surrogate to the empirical \odd, $\Delta(\hat{h}, y^*, \alpha) \leq \Delta(\hat{h}, \Bar{h}, \alpha)$.
\end{assumption}
Hence,~\autoref{thm:splitbound} is simplified to: 

\fbox{\parbox{\linewidth}{\begin{theorem}[\odd Bound]
    \label{thm:oddbound}
    Under~\autoref{ass:oddconcave} and~\autoref{ass:overlap},
    with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, \Bar{h}, \alpha) +
        \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}}
    \end{align*}
\end{theorem}
}}

We discuss this derivation in~\autoref{app:proofs}. Note that this bound does not theoretically improve upon the \disdis based bound from~\citep{rosenfeld2023almost} for a given critic $\Bar{h}$. However, optimizing for \odd allows for the selection of a better critic, as we will see in the next section, which agrees more with $\hat{h}$ in the overlapping source domain. This better critic results in the \odd term being lower than the original \disdis term, making the bound practically tighter.


% $\underbar{\Delta}(\hat{h}, \Bar{h}, \alpha) = $

% As $y^*$ is unknown, we can optimize for an alternate critic $h'$ which acts as a proxy for the worst-case $y^*$. For this, we follow~\citet{rosenfeld2023almost} and assume the following:
% \begin{assumption}
%     \label{ass:concave}
%     For a critic $h'\in\gH$ which maximizes a concave surrogate to the empirical \odd, we assume $\Delta(\hat{h}, y^*, \alpha) \leq \Delta(\hat{h}, h', \alpha)$.
%     % and $\underbar{\Delta}(\hat{h}, y^*, \alpha) \leq \underbar{\Delta}(\hat{h}, h', \alpha)$.
% \end{assumption}
% This assumption is based on the practical observation that $y^*$ is not chosen adversarially with respect to $\hat{h}$. So it is reasonable that there exists another function $h^* \in \gH$ having higher disagreement discrepancy.
% % $\Delta(\hat{h}, h^*, \alpha)$ than $\Delta(\hat{h}, y^*, \alpha)$. 
% $h'\in\gH$ is a concave surrogate to $h^*$ that we can find empirically (which approaches $h^*$ with increasing sample size). Finally, we have our bound:
% \begin{theorem}[Disagreement Bound]
%     \label{thm:oddbound}
%     Under~\autoref{ass:concave}, 
%     % and an $\alpha$ satisfying~\autoref{ass:overlap}, 
%     with probability $\geq 1-\delta$,
%     \begin{align*}
%         \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, h', \alpha) + \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}}\\
%         &\quad + \underbar{\Delta}(h, h', \alpha) + \lambda.
%     \end{align*}
% \end{theorem}

\section{Maximizing \odd}
\label{sec:algo}
\textbf{Competition in the overlapping region:} The \textit{disagreement logistic loss} of a classifier $h$ on a labelled sample $(x, y)$ is described in \citep{rosenfeld2023almost} as:

\resizebox{\linewidth}{!}{%
\begin{minipage}{\linewidth}
\begin{align*}
    \disloss(h, x, y) := \frac{1}{\log 2} \log\left( 1 + \exp\left( h(x)_y
    - \frac{1}{|\gY|-1} \sum_{\hat y\neq y} h(x)_{\hat y} \right)\right).
\end{align*}
\end{minipage}%
}

$\disloss$ is shown to be convex in $h(x)$ and to upper bound the 0-1 disagreement loss. To maximize the empirical disagreement discrepancy, a combined loss on source and target samples is used:
\begin{align*}
    \hat\gL_\Delta(\Bar{h}) := &\ \frac{1}{|\hat{\gS}|} \sum_{x\in\hat{\gS}} \logloss(\Bar{h}, x, \hat{h}(x)) \\
    &+ \frac{1}{|\hat\gT|} \sum_{x\in\hat\gT} \disloss(\Bar{h}, x, \hat{h}(x)).
\end{align*}
% \resizebox{\linewidth}{!}{%
% \begin{minipage}{\linewidth}
% \begin{align*}
%     \hat\gL_\Delta(\Bar{h}) := \frac{1}{|\hat{\gS}|} \sum_{x\in\hat{\gS}} \logloss(\Bar{h}, x, \hat{h}(x)) + \frac{1}{|\hat\gT|} \sum_{x\in\hat\gT} \disloss(\Bar{h}, x, \hat{h}(x)).
% \end{align*}
% \end{minipage}%
% }

Here, $\logloss := -\frac{1}{\log |\gY|} \log \textrm{softmax}(h(x))_y$ is the standard log-loss. This is where we see the competition: for points that lie in the overlapping region, one would contribute by reducing $\logloss$, making $\Bar{h}$ agree with $\hat{h}$ while the other would contribute by reducing $\disloss$, making $\Bar{h}$ disagree with $\hat{h}$. In practice, this typically results in an $\Bar{h}$ that agrees with $\hat{h}$ a little less than it could have in the overlapping region. We illustrate this phenomena in~\autoref{fig:synthetic_main} and show how our proposed method improves on it, which we describe next. 

\textbf{Mitigating the competition:} We propose to improve this loss by discounting the disagreement of target samples in the overlapping region, i.e.,
\begin{align*}
    \hat\gL_{\Delta}(\Bar{h}, \alpha) := &\ \frac{1}{|\hat{\gS}|} \sum_{x\in\hat{\gS}} \logloss(\Bar{h}, x, \hat{h}(x)) \\
    &+ \frac{1}{|\hat\gT|} \sum_{x\in\hat\gT} \mathbf{1}\left\{x \notin (\hat{\gD_\alpha})\right\}*\disloss(\Bar{h}, x, \hat{h}(x)).
\end{align*}

Here, $\mathbf{1}\left\{x \notin (\hat{\gD_\alpha})\right\}$ is an indicator function which only selects samples that are outside the overlapping set. For samples in the overlapping set, the $\disloss$ term is discounted to zero, encouraging the maximizing $\Bar{h}$ to not disagree with $\hat{h}$ in this region. Note that this loss still maximizes the empirical \odd, as the disagreement in the non-overlapping target region is still maximized and the disagreement in the non-overlapping source region is still minimized. 

\textbf{How to select $\gD_\alpha$?} The ideal way to find the overlap set $\gD_\alpha$ is to find the densities $p_\gS$ and $p_\gT$, and tune for the smallest $\alpha$ on a held-out validation set which satisfies~\autoref{ass:overlap}. However, density estimation is typically difficult with high-dimensional data and accessing a held-out set may be challenging in many domains where data is scarce (where reliable performance estimation is the most essential). We propose to do it with a simple method of domain classification. If there is overlap between the source and target domains, a domain classifier should have difficulty in classifying samples from this overlapping region. As we already know domain labels, we use the domain classifier's prediction as a proxy to quantify the extent of overlap. 

Suppose a domain classifier $d: \gX \rightarrow \{0, 1\}$ is trained to classify all source samples with the label $0$ and all target samples with the label $1$. It produces logits $\{d(x)_0, d(x)_1\}$ for all samples and taking the $\argmax$ gives us the classified domain. We can replace the indicator function $\mathbf{1}\left\{x \notin (\hat{\gD_\alpha})\right\}$ with $\mathbf{1}\left\{\argmax(d(x)) \neq 1)\right\}$ to find target domain samples which are being classified as source samples, and hence have an estimate of the set $\gD_\alpha$. Note that this assumes our domain classifier model is accurate.  
% i.e., all target samples in the actual $\gD_\alpha$ have $d(x)_0 > 0.5$.
In practice, we can convert this problem of hard selection of the set $\gD_\alpha$ into a soft version, by simply weighting each sample in the target domain by its probability of being classified in the target domain. Let $\{s(x)_0, s(x)_1\}$ denote the soft-max of $\{d(x)_0, d(x)_1\}$. Then, the indicator function $\mathbf{1}\left\{x \notin (\hat{\gD_\alpha})\right\}$ can be replaced with $s(x)_1$. Although the soft-max probabilities are not guaranteed to be calibrated without the use of a held-out calibration set, this soft version makes the discounting of overlapping target samples smooth and it works well in practice. Hence, we use the following loss to maximize \odd while making $\Bar{h}$ agree with $\hat{h}$ in the overlapping region:
\begin{align*}
    \hat\gL_{\Delta}(\Bar{h}, \alpha) := &\ \frac{1}{|\hat{\gS}|} \sum_{x\in\hat{\gS}} \logloss(\Bar{h}, x, \hat{h}(x)) \\
    &+ \frac{1}{|\hat\gT|} \sum_{x\in\hat\gT} s(x)_1 *\disloss(\Bar{h}, x, \hat{h}(x)).
\end{align*}
We describe our method step by step in Algorithm~\ref{algo:odd_weight}.


\textbf{Why not also discount source overlapping samples?} 
% The reason we do not discount the $\logloss$ term for overlapping source samples is that w
We want $\Bar{h}$ to agree with $\hat{h}$ in this region, which includes both source and target samples. Discounting loss from both overlapping source samples and overlapping target samples would lead to a problem similar to that of $\hat\gL_\Delta(\Bar{h})$, where instead of competition, the optimization would be indifferent to all the samples in the overlapping region. In~\autoref{app:expdetails}, we conduct experiments to validate this and discuss how this may lead to worse critics.

\begin{algorithm}[tb]
\caption{Finding the critic $\Bar{h}$ for a given $\hat{h}$}
\label{algo:odd_weight}
\begin{algorithmic}[0]
    \State {\bfseries Input:} $\hat{h}, \hat{\gS} = \{(x^\gS_i, y^*_i)\}_{i=1}^{n_\gS}, \hat{\gT} = \{(x^\gT_i)\}_{i=1}^{n_\gT}$
    \State $d \gets \texttt{train}(\{(x^\gS_i, 0)\}_{i=1}^{n_\gS} \cup \{(x^\gT_i, 1)\}_{i=1}^{n_\gT})$ \\\Comment{Train domain classifier.}
    \State $s \gets \texttt{softmax}(d)$
    \State $p^{\hat{h}} \gets \argmax \hat{h}$ \Comment{Returns the $\hat{h}$ predicted label.}
    \State $\Bar{h} \gets \hat{h}$ \Comment{Initialize critic with $\hat{h}$ parameters.}
    \State $\Theta \in \Bar{h} \gets \text{require grad}$ \Comment{Choose parameters to learn.}
    \While{not converged}
        \State $\gS_{\text{loss}} \gets \texttt{mean}(\{\logloss((x^\gS_i, p^{\hat{h}}(x^\gS_i)))\}_{i=1}^{n_\gS})$
        \State $\gT_{\text{loss}} \gets \texttt{mean}(\{s(x^\gT_i)_1 * \disloss((x^\gT_i, p^{\hat{h}}(x^\gT_i)))\}_{i=1}^{n_\gT})$
        \State $\Theta'_t \gets \Theta'_{t-1} - \frac{\partial (\gS_{\text{loss}} + \gT_{\text{loss}})}{\partial \Theta}$ \Comment{Update $\Bar{h}$ parameters.}
    \EndWhile
    \State return $\Bar{h}$
\end{algorithmic}
\end{algorithm}

\section{Experimental Results}
\label{sec:exp}

\subsection{Synthetic data experiments}
\label{subsec:synthetic_exps}
\textbf{Dataset:} To visually illustrate the benefit of our overlap-aware disagreement discrepancy, we conduct experiments on 2-D synthetic data. First, we sample source and target domains from 2 randomly initialized Gaussian distributions with a randomly sampled overlap factor, which determines how close their means are. Then, we initialize a complex decision surface $y^*$ which determines the labels for each sample (with some noise). One sample dataset can be seen in~\autoref{fig:synthetic}. Then, we train a model $\hat{h}$ on the source data. Finally, we use Algorithm~\ref{algo:odd_weight} to find $\Bar{h}$ to obtain a bound on the target performance (accuracy). For comparison and analysis, we also use \disdis to obtain the same bound. The details of the dataset generation can be found in~\autoref{app:expdetails}.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.85\linewidth]{figs/synthetic_domains.pdf}
    \caption{Sample 2-D synthetic dataset used for analysis. With a complex $y^*$, the model learned on source data can make errors in the target domain, but the overlapping region has high agreement.}
    \label{fig:synthetic}
\end{figure}

\textbf{Methodology:} We randomly sample an overlap rate $100$ times between $[0, 1]$ and generate a new dataset for each (around $2000$ training samples and $1250$ validation samples in each domain). We repeat the experiment $40$ times, effectively creating $4000$ unique datasets, to smoothen any finite sampling issues. We use a small $3-$layer, $16-$neurons wide MLP to train the $\hat{h}, \Bar{h}$ and $d$. Finally, we distribute the $4000$ overlap rates into $20$ equal width bins to average our findings across multiple datasets. Instead of predicting the error bound (as in~\autoref{thm:oddbound}), we predict the accuracy bound to follow the convention in~\citet{rosenfeld2023almost}. Hence, a tighter bound implies a larger lower bound in accuracy. 


\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{figs/synthetic_analysis1.pdf}\\
    \includegraphics[width=\linewidth]{figs/synthetic_analysis2.pdf}
    \caption{\textbf{TOP:} As the overlap factor increases, \odd becomes increasingly smaller than \disdis (yellow shaded region), resulting in a tighter bound (purple shaded region) that is closer to the target accuracy. 
    % As \odd allows for more agreement in the overlapping region, $\Bar{h}$ agrees more optimistically with $\hat{h}$ resulting in the improvement. 
    \textbf{BOTTOM:} 
    We decompose the discrepancy into individual source and target agreements. 
    Due to the competition in the overlapping region, maximizing \disdis makes $\Bar{h}$ start disagreeing with $\hat{h}$ in the source domain, while maximizing \odd maintains agreement. \odd also allows for much higher target agreement in high overlap cases. Both shown with green shaded region. This results in a less conservative $\Bar{h}$. Moreover, we find that $\Delta(\hat{h}, \Bar{h})\approx\Delta(\hat{h}, \Bar{h}, \alpha)$ for both \odd and \disdis based critics (difference $< 0.01\%$ making plot-lines coincide in this figure), as predicted by our theory.}
    % \textcolor{red}{discuss the dis2 vs odd discrepancies a bit more.}}}
    \label{fig:synthetic_main}
\end{figure}


% \textcolor{red}{make it structured with points}
\textbf{Findings---why \odd is better than \disdis:}
In~\autoref{fig:synthetic_main}, we show the relationship between various metrics (discrepancy, target accuracy, predicted target accuracy (the bound) and individual domain agreements) with the overlap rate (for both \odd and \disdis). The target performance prediction is calculated as $\epsilon_\gS(\hat{h}, y^*) - \Delta(\hat{h}, \Bar{h}) - \varepsilon$ for \disdis, and $\epsilon_\gS(\hat{h}, y^*) - \Delta(\hat{h}, \Bar{h}, \alpha) - \varepsilon$ for \odd ($\varepsilon$ is the concentration term in~\autoref{thm:oddbound}). We find that:
\begin{itemize}
    \item As the overlap increases, \disdis-based critics start disagreeing with $\hat{h}$ in the source domain, while \odd-based critics maintain their source agreement.
    \item Even with high overlap, \disdis-based critics disagree substantially with $\hat{h}$ in the target domain, making them overly pessimistic. \odd fixes this by increasing target agreement in high overlap regions. 
    \item For both \disdis and \odd based critics, overall discrepancy $\Delta(\hat{h}, \bar{h})$ is approximately equal to non-overlapping discrepancy $\Delta(\hat{h}, \Bar{h}, \alpha)$, validating our assumption and theory in~\autoref{sec:odd}. This shows that \textit{\odd improves the bound by selecting a better critic}, not by reducing the disagreement for a given critic. 
    % It is only through better selection of $\Bar{h}$ does \odd based bound performs better than \disdis.
\end{itemize}

\textit{\odd consistently improves over \disdis by choosing a more optimistic $\Bar{h}$ that agrees more with $\hat{h}$ in the overlapping region, while remaining valid and reliable.}
It makes sure that the critic is chosen as optimistically as possible based on the overlap, without compromising disagreement in the unseen target region.
% As the overlap rate increases (the means of source and target Gaussians are closer), \disdis based source agreement starts dropping while \odd based source agreement is maintained. This is the key factor, which in turn increases target agreement, and leads to a smaller discrepancy value. 
% , implying a tighter bound with \odd. Moreover, we see that $\Delta(\hat{h}, \Bar{h})\approx\Delta(\hat{h}, \Bar{h}, \alpha)$ for the same $\Bar{h}$ (for both \disdis and \odd). This implies that \textit{\odd improves the bound by selecting a better critic}, not by reducing the disagreement for a given critic. 

\begin{table*}[t]
    \begin{adjustbox}{width=0.68\linewidth,center}
    \centering
    \tabcolsep=0.12cm
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{lccccccccc}
    \toprule
    {} && \multicolumn{2}{c}{MAE $(\downarrow)$} && \multicolumn{2}{c}{Coverage $(\uparrow)$} && \multicolumn{2}{c}{Overest. $(\downarrow)$} \\
    \multicolumn{1}{r}{\textbf{DA?}} &&          \ding{55} & \ding{51} &&             \ding{55} & \ding{51} &&               \ding{55} & \ding{51} \\
    Prediction Method             &&                    &           &&                      &           &&                         &           \\
    \midrule
    AC \citep{guo2017calibration}                            &&            0.1086 &    0.1091 &&                0.0989 &    0.0333 &&                  0.1187 &    0.1123 \\
    DoC \citep{guillory2021predicting}                           &&             0.1052 &    0.1083 &&                0.1648 &    0.0167 &&                  0.1230 &    0.1095 \\
    ATC NE \citep{garg2022leveraging}                        &&             0.0663 &    0.0830 &&                0.2857 &    0.2000 &&                  0.0820 &    0.1007 \\
    COT \citep{lu2023predicting}                           &&             0.0695 &    0.0808 &&                0.2528 &    0.1833 &&                  0.0858 &    0.0967 \\
    \midrule
    \disdis~\citep{rosenfeld2023almost} && & && & && &\\ 
     \quad Using Features            &&             0.2841 &    0.1886 &&                1.0000 &    1.0000 &&                  0.0000 &    0.0000 \\
     \quad Using Logits              &&             0.1525 &    0.0881 &&                0.9890 &    0.7500 &&                  0.0171 &    0.0497 \\
    \quad Using Logits w/o $\delta$ term &&             0.0996 &    0.0937 &&                0.6813 &    0.2833 &&                  0.0779 &    0.0956 \\
    \midrule
    \odd && & && & && &\\ 
     \quad Using Features            &&             0.2538 &    0.1657 &&                0.9890 &    0.9333 &&                  0.0314 &    0.0318 \\
     \quad Using Logits              &&             0.1190 &    0.0739 &&                0.9341 &    0.7333 &&                 0.0290 &    0.0738 \\
    \quad Using Logits w/o $\delta$ term &&             0.0943 &    0.1005 &&                0.4945 &    0.2000 &&                  0.0803 &    0.1079 \\
    \bottomrule

    \end{tabular}
    \end{adjustbox}
    % \vspace{7pt}
    \caption{Comparing the \odd bound with \disdis bound and other prior methods for predicting accuracy. DA denotes if the representations were learned via a domain-adversarial algorithm (DANN, CDANN). MAE: mean absolute error, Coverage: fraction of predictions correctly bounding the true error, Overest.: MAE among shifts whose accuracy is overestimated. \odd improves on MAE in comparison to \disdis with some loss in coverage, but maintains a much higher lead in coverage over prior methods.}
    \label{tab:main}
\end{table*}

\subsection{Real data experiments}
\label{subsec:real_exps}
\textbf{Datasets:} As we aim to achieve an improvement of \disdis~\citep{rosenfeld2023almost}, we follow their setup and conduct experiments across all vision benchmark datasets used in their study for a fair comparison between \disdis and \odd. These include four BREEDs datasets ~\citep{santurkar2020breeds}:
{Entity13}, {Entity30}, {Nonliving26}, and {Living17}; {FMoW}~\citep{christie2018functional}
% and Camelyon \citep{bandi2018detection}
from {WILDS}~\citep{wilds2021}; Officehome~\citep{venkateswara2017deep}; {Visda}~\citep{peng2018syn2real, visda2017}; CIFAR10, CIFAR100~\citep{krizhevsky2009learning}; and Domainet~\citep{peng2019moment}. In addition, we conduct additional experiments on a language dataset: CivilComments~\citep{borkan2019nuanced} for breadth of applicable domains. These datasets contain multiple domains and include a wide variety of subpopulation and natural distribution shifts. We present some details about these datasets and their shift variations, and complete details about the new CivilComments dataset in~\autoref{app:expdetails}. Comprehensive details can be found in the \disdis paper.

\textbf{Methods and baselines:} Models are trained with ERM and Unsupervised Domain Adaptation methods (which help improve target performance with unlabeled target data) like FixMatch~\citep{sohn2020fixmatch}, DANN~\citep{ganin2016domain}, CDAN~\citep{long2018conditional}, and
BN-adapt~\citep{li2016revisiting}). Although our paper is positioned as an improvement to \disdis, we also present comparisons with other baselines like Average Confidence (AC)~\citep{guo2017calibration}, Difference of Confidences (DoC)~\citep{guillory2021predicting}, Average Thresholded Confidence (ATC)~\citep{garg2022leveraging}, and Confidence Optimal Transport (COT)~\citep{lu2023predicting}, for completeness. All methods are calibrated with temperature scaling~\citep{guo2017calibration} using source validation data. As our general setup is identical, we follow the implementation details in the \disdis paper. We list experimental details on training the domain classifiers in ~\autoref{app:expdetails}. For the correction term in~\autoref{thm:oddbound}, we use a small $\delta=0.01$ (same as \disdis) in all our experiments. 

\textbf{Bound calculation strategies:} As pointed out by~\citet{rosenfeld2023almost}, the value of the error bound is expected to decrease if the critic is searched for in a restricted hypothesis class. Their real-data experiments are performed on the deep features of the inputs, as classifiers trained on these features have been shown to have the capacity to generalize under distribution shifts~\citep{rosenfeld2022domain, kirichenko2022last}. This makes $\hat{h}$ belong to the linear hypothesis class. Even logits of the source classifier may contain information necessary for this task as prior work suggests that deep network representations have small effective ranks~\citep{arora2018optimization, huh2022lowrank, pezeshki2021gradient}. Therefore, we follow their setup to calculate our bounds using both: the deep feature space, and the logit space. Our domain classifier (used for measuring domain overlap) is also trained on the same space (details in~\autoref{app:expdetails}). Lastly, the $\delta$ term in~\autoref{thm:oddbound} may make the bound too conservative in practice, so we also calculate the bound without it for a complete comparison with \disdis.

\textbf{Evaluation metrics:} We evaluate all methods on mean absolute error (MAE), which measures how close the predicted performance is to the actual performance. In addition, we report coverage, which is the fraction of accuracy predictions that are valid ($\leq$ true accuracies). This measures the reliability of the method. We also report the MAE among predictions which are invalid, to get a sense of how bad the over-estimates are, when they do happen. 

\textbf{Domain-adversarially learnt representations:}
\label{para:da} Domain adversarial representation learning methods like DANN and CDANN, regularize deep representations to be indifferent between source and target domains, to make classifiers robust to distribution shifts. This regularization may cause representations to lose domain specific features resulting in a higher degree of overlap between source and target domain features. A higher degree of overlap typically means a more lenient bound using our \odd method, as it suggests that the two domains are similar (see~\autoref{fig:synthetic_main}). Therefore, when the method produces a classifier which actually has low target error, the \odd-based bound will be tighter. However, when the actual target error is high, it means that the representations have lost important information from the target domain and the high overlap hurts, causing the \odd-based bound to overestimate accuracy. Therefore, we present our findings in two categories: \textbf{DA?} \ding{51}: representing the case when the representations were learnt using a domain adversarial algorithm, and \textbf{DA?} \ding{55} otherwise.

\textbf{Results:} We report our comprehensive evaluation metrics in~\autoref{tab:main}. We find that \textit{\odd-based bounds are more accurate than \disdis-based bounds} in general; and significantly improve the estimate in many cases (especially with non-DA methods). Moreover, this improved MAE does not come at a huge cost of coverage, where the \odd-based bound maintains a much higher coverage compared to other baselines. We discuss non-DA results in~\autoref{app:expdetails}.

\textbf{Improvements in Valid vs Invalid predictions:}
In~\autoref{tab:invalidity}, we segregate the predictions on the basis of validity to investigate failure cases. In most cases, both \disdis and \odd based bounds are valid and, the \odd-based bound is significantly more tight compared to \disdis. In a small number of cases, \odd overshoots the target accuracy while \disdis remains valid. Even in these cases, the MAE of \odd is better than \disdis, meaning it only slightly overestimates the performance of $\hat{h}$. And in a few cases, \odd achieves a valid bound when \disdis does not. We conducted a paired Student’s t-test on the distributions of (target accuracy $-$ predicted lower bound) for \odd and \disdis (for non-DA algorithms). Out of 94 total predictions, 89 were valid (predicted lower bound was actually lower than target accuracy) for both \odd and \disdis. To remove any influence of invalid predictions, we show the t-statistic for both valid and all predictions cases:

\textit{Valid predictions}: t-statistic: \textbf{-5.47}, p-value: 4.14e-07\\
\textit{All predictions}: t-statistic: \textbf{-5.62}, p-value: 2.02e-07

As evident, we achieve a high t-statistic with a very low p-value, giving strong evidence against the null-hypothesis (i.e., no significant difference between the two methods). 

\textbf{Variation due to degree of overlap:} We plot point-wise bound estimates (using logits) for all non-DA trained $\hat{h}$ (~\autoref{fig:real_logits}) for more analysis. 
% highlight cases where \odd significantly improves over \disdis. Overall, we find that overlap-awareness improves the practical utility of \disdis. 
When actual target accuracy is low (left part of the plot), the overlap between source and target domains is likely low. Hence, we see little improvement over \disdis in this region. When actual target accuracy is high (right part of the plot), domain overlap is likely also high. Therefore, we see a lot more improvement in this region. Around the middle is when overlap estimate is likely the hardest. This is where \odd sometimes overestimates the overlap and the target accuracy (only slightly) as a result.


\begin{table}[t]
    \begin{adjustbox}{width=0.95\linewidth,center}
    \centering
    \tabcolsep=0.12cm
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{lccccccccc}
    \toprule
    {MAE $(\downarrow)$} && \multicolumn{2}{c}{\textbf{DA:} \ding{55}} && \multicolumn{2}{c}{\textbf{DA:} \ding{51}}\\
    \multicolumn{1}{r}{} &&          \disdis & \odd &&          \disdis & \odd\\
    Prediction set             &&                    & && &          \\
    \midrule
     \disdis invalid, \odd valid            &&             0.0171 &   \textbf{0.0029}  && 0.0489 &    \textbf{0.0032}\\
     \disdis valid, \odd invalid              &&             0.0540 &  \textbf{0.0248}   &&       0.0325       &    \textbf{0.0215}\\
    Both valid &&             0.1611 &   \textbf{0.1234}  &&             0.1058 &    \textbf{0.0772}\\
    Both invalid &&  0.0000 & 0.0000 &&  \textbf{0.0498} & 0.0822 \\
    % \midrule
    % \odd && & \\ 
    %  \quad \disdis invalid, \odd valid            &&             0.0029 &    0.0032\\
    %  \quad \disdis valid, \odd invalid              &&             0.0248 &    0.0215\\
    % \quad both valid &&             0.1234 &    0.0772\\
    % \quad both invalid &&  0.0000 & 0.0822 \\
    \bottomrule

    \end{tabular}
    \end{adjustbox}
    % \vspace{7pt}
    \caption{Comparing MAE of \disdis and \odd conditioned on coverage. When \odd-based bound is invalid, it typically overestimates by a small amount. In comparison, \disdis typically has a larger MAE when it is invalid. In the most popular case, when both are valid, \odd consistently outperforms \disdis.}
    \label{tab:invalidity}
\end{table}

\section{Discussion}
\label{sec:discuss}
While \odd shows consistent improvement in prediction accuracies, we discuss some potential limitations and improvements to our method.

\textbf{Estimating overlap:} We find using a small domain-classifier's softmax probabilities as weights to discount the \disdis contribution of overlapping target samples effective in improving the bound estimation accuracy. However, this approach does not guarantee a good quantification of overlap and is affected by factors like the training hyper-parameters of the domain-classifier (underfitting/overfitting) and the representations on which the classifier is being trained (like DA vs non-DA). A true overlap-aware bound would require accurate density estimation of the source and target densities, which is challenging for high-dimensional data. Nonetheless, our positive results with the CivilComments dataset using BERT embeddings paint a promising picture for the future with increasingly better representation learning.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{figs/real_logits_False.pdf}
    \caption{Our \odd-based bound (using logits, \textbf{DA?} \ding{55}) improves the \disdis bound on multiple dataset-method combinations, achieving an improved average MAE. Improvement is shown with \textcolor{blue}{$\uparrow$} (deterioration with \textcolor{red}{$\downarrow$}). Notice that our bound, like \disdis, still almost always maintains its reliability, unlike other baseline methods.}
    \label{fig:real_logits}
\end{figure}

\begin{figure*}[t]
    \centering
    \includegraphics[width = 0.7\linewidth]{figs/dis2_odd_variants.pdf}
    \includegraphics[width = 0.25\linewidth]{figs/dis2_odd_vary_delta.pdf}
    \caption{\textbf{(First 3)}: Comparing \disdis and \odd on estimated bound based on features and logits vs true accuracy. ``w/o $\delta$'' drops the $\delta$ term of~\autoref{thm:oddbound}. \textbf{(4$^{\text{th}}$)} Observed bound violation rate vs. desired probability $\delta$ from~\autoref{thm:oddbound} based on logits. The observed violation rate for both \disdis and \odd remains under the allowed violation probability for most $\delta$'s, but \odd overshoots it for a small set.}
    \label{fig:coverage}
    % \vspace{-8pt}
\end{figure*}

\textbf{When does \odd fail?} In~\autoref{fig:coverage}, we compare the estimated bound and the true accuracy on a variety of non-DA representations in both the feature and logit space. We find that both \odd and \disdis always remain valid in the case of feature space, and almost always valid in the case of logit space. In both cases, \odd improves on \disdis with a more accurate prediction of the bound. However, without the correction $\delta$ term in~\autoref{thm:oddbound}, both methods overestimate the accuracy many times. We observe that for a small range of $\delta$ values, \odd marginally overshoots the allowed probability of violations. We suspect that this could be caused by an over-estimation of the overlap between the two domains in some cases, making the critic agree more with $\hat{h}$ than it should have. Better overlap estimation should be able to mitigate this issue, which we defer to future work.

\textbf{Sample Complexity:}
 Our theory does not include sample complexity analysis of training the evaluated classifier. We assume that the classifier is given to us and we focus on estimating the target performance with finite samples. We restricted our search for the critic in the linear hypothesis class which has a uniform sample complexity, but sample complexity for linear classifiers can still be measured and analyzed in a non-uniform way. 
In our analysis, following~\citet{rosenfeld2023almost}, we repeated our experiments 30 times to dilute variance (for each dataset, the critic used in bound calculation was chosen from a set of 30 critics learnt on the same data, based on max discrepancy). 
Regarding analyzing the variance and how it can in turn affect our discrepancy measure and the final bound with a non-uniform sample complexity, we refer to future work.
% In the future, we aim to theoretically analyze the sample complexity of our solutions in a non-uniform way like recent work~\citep{sicilia2022pac}.


\section{Related Work}
\label{sec:related}

\paragraph{Generalization bounds under distribution shift:} Finding generalization bounds under distribution shift is a classic machine learning problem with a long history of development~\citep{redko2019advances}. Early works using $\gH$ and $\gH\Delta\gH$ divergence~\citep{bendavid2007analysis, mansour2009domain, bendavid2010theory} established domain adaptation bounds through uniform convergence. These works inspired many efforts in making classifiers robust to distribution shifts~\citep{zhang2019shiftinvar,ganin2016domain, long2018conditional,rahimian2019distributionally, sagawa2019distributionally,arjovsky2019invariant}. In particular, a major line of work leverages representation invariance across domains for domain adaptation and generalization \citep{johansson2019support,zhao2019learning}
% , which leads to generalization bounds under distribution shift that incorporate the representation function used in representation learning. 
Theoretically, the $\gH\Delta\gH$ divergence motivates further improvement on the definition of divergence to measure the difference between domains, with respect to a particular function class \citep{zhang2019bridging}. Application to various types of distribution shift \citep{awasthi2023theory} and new methods for the computation of the discrepancy \citep{kuroki2019unsupervised} also followed. 
In addition, PAC-Bayesian analysis has also been applied to this problem \citep{germain2013pac, germain2016new} which was recently extended to provide non-uniform sample complexity analysis~\citep{sicilia2022pac}. However, due to the general difficulty of estimating the divergence, it is usually hard to directly estimate the target performance following theoretical bounds.  \citep{rosenfeld2023almost} developed \disdis as a theoretically sound but practical method to bound the target risk of a given source classifier, which is the setting of focus in this work. 

Another line of work aims to bound the generalization risk of models (espeically neural networks) by measuring and using the complexity of these models~\citep{bartlett2017spectrally, dziugaite2017computing, zhou2018non}. By evaluating the complexity, these works estimate the expressive capacity of the models providing insights into their worst-case behavior, but often lead to loose bounds. 


\textbf{Predicting error in a target domain:} Other works focus on predicting the error of a trained classifier/network on unlabeled samples from a target domain. They either provide instance-level estimates using ensembles or data augmentations~\citep{chen2021detecting, deng2021labels} or an overall domain-level error using empirical observations~\citep{baek2022agreement}, domain-invariant representations~\citep{chuang2020estimating}, average thresholded confidence~\citep{garg2022leveraging} or difference of confidences~\citep{guillory2021predicting}. Some of these methods require labeled target samples for calibration, which makes their applicability limited.
In our work, we mainly focus our effort on improving \disdis~\citep{rosenfeld2023almost} as it presents a strong practical method to give reliable performance bounds estimates. Reliability is a major concern especially when the actual target performance is low.
% , where some of the prior works make prohibitive overestimates. 
With this work, we push to improve the prediction accuracy of \disdis without compromising reliability.



\section{Conclusion}
\label{sec:conclusion}

In this work, we identified a potential for improvement in recent work which uses disagreement discrepancy (\disdis) to calculate practical bounds on the performance of a source trained classifier in unseen target domains. \disdis suffers from a competition between source and target samples for agreement and disagreement respectively in the overlapping region, which may results in a suboptimal critic due to unstable optimization. We mitigate this issue by using overlap-aware disagreement discrepancy (\odd). By restricting the disagreement in the non-overlapping target domains, we eliminate the instability in the overlapping region, and derive a new bound on the basis of \odd. We train a domain classifier to discriminate between source and target samples. 
% and leverage its prediction uncertainty as a expect it to make mistakes on samples in the overlapping region. 
We use the softmax probabilities of this domain classifier to act as weights to discount the disagreement of target samples in the overlapping region. This makes the critic agree with source samples in the overlapping region and improves performance bounds as a result, without much compromise in the reliability of the bound. 
% Our code and results are available on \href{https://github.com/aamixsh/odd_anon}{github}.

In the future, we hope to further improve the overlap estimation step. Moreover, in the era of Large Language Models (LLMs), where the input/output space is highly complex, we aim to extend our method to develop generic definitions of disagreement and overlap beyond classification settings; and provide performance estimation for LLMs. 

% \paragraph{Relevance to Foundation Models:}
% The code used in this work and the results produced are located at \href{https://anonymous.4open.science/r/odd_anon-C048}{this} anonymous link.
% To illustrate this phenomena, we use the finite sample case from Figure~\ref{fig:fig1} and show how nearby points in the overlapping domain. 




% Notice that $\Delta(h, h', \alpha) \leq \Delta(h, h')$, as there is an additional negative term which depends on the size of $D_\alpha$. In general, increasing $\alpha$ shrinks the size of the overlap set ($D_\alpha$). For each given $\gS$ and $\gT$, this overlap set will become the empty set $\phi$ for some $\alpha = \alpha_0$, i.e. no point in the domain space has both source and target densities greater than $\alpha_0$. At this point, $D_\alpha = \phi$ and $\Delta(h, h', \alpha) = \Delta(h, h')$.

% \begin{lemma}
% \label{lemma:error-expansion}
%     For any classifier $h$, \\$\epsilon_{\gT}(h) = \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \epsilon^\gT_{\gD_\alpha}(h, y^*)$.
% \end{lemma}
% \begin{proof}
%     By definition, $\epsilon_{\gT}(h) = \epsilon_{\gS}(h) + \left( \epsilon_{\gT}(h) - \epsilon_{\gS}(h) - \epsilon^\gT_{\gD_\alpha}(h, y^*) \right) + \epsilon^\gT_{\gD_\alpha}(h, y^*) = \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \epsilon^\gT_{\gD_\alpha}(h, y^*)$.
% \end{proof}

% A hypothesis $h^* = \argmax_{h\in\gH} \Delta(\hat{h}, h, \alpha)$ that maximizes \odd satisfies $$


% \begin{contributions} 
%     Aayush Mishra came up with the idea, ran the experiments and wrote the paper.
%     Anqi Liu supervised the project, provided technical insights and helped edit the paper.
% \end{contributions}

\begin{acknowledgements} 
We thank the reviewers for their insightful feedback. We also benefit from the highly reproducible work by~\citet{rosenfeld2023almost} in our experiments.
Both authors are partially supported by a seed grant from JHU Institute of Assured Autonomy (IAA). AL is also partially supported by an Amazon Research Award.
\end{acknowledgements}

% References
\bibliography{odd}

\newpage

\onecolumn

\title{ODD: Overlap-aware Estimation of Model Performance under Distribution Shift\\(Supplementary Material)}
\maketitle

\appendix
\section{Proofs}
\label{app:proofs}

\begin{lemma}
\label{lemma:error-expansion_app}
    For any classifier $h$, \\
    $\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h) + \Delta(h, y^*) + \lambda$.
\end{lemma}
\begin{proof}
    \begin{align*}
        \epsilon_{\gT}(h) &\leq \epsilon_{\gT}(h, y^*) + \epsilon_{\gT}(y^*) \quad \text{[Triangle Inequality]}\\
        &= \epsilon_{\gS}(h, y^*) + (\epsilon_{\gT}(h, y^*) - \epsilon_{\gS}(h, y^*)) + \epsilon_{\gT}(y^*)\\
        &\leq \epsilon_{\gS}(h) + \Delta(h, y^*) + (\epsilon_{\gT}(y^*) + \epsilon_{\gS}(y^*)) \\
        &= \epsilon_{\gS}(h) +\Delta(h, y^*) + \lambda
        % &= \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*) + \lambda\\
        % &= \epsilon_{\gS}(h) + \Delta(h, y^*, \alpha) + \Delta^{\gD_\alpha}(h, y^*) + \lambda\\
    \end{align*}
\end{proof}

\begin{theorem}[\disdis Bound]
    \label{thm:dis2bound_app}
    Under~\autoref{ass:dis2concave}, with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat{\Delta}(\hat{h}, \Bar{h}) + \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}} + \lambda.
    \end{align*}
\end{theorem}
\begin{proof}
    We follow~\cite{rosenfeld2023almost} with our notion of \textit{adaptability} through the ideal joint hypothesis $y^*$. \\
    From~\autoref{lemma:error-expansion_app} and~\autoref{ass:dis2concave}, we have, $\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h) +\Delta(h, \Bar{h}) + \lambda$.

    To upper bound the first two terms using empirical estimates, we define the following random variables:
    \begin{align*}
        r_{\gS, i} = 
        \begin{cases}
        0,& \Bar{h}(x_i) = y^\gS_i,\\
        \frac{1}{n_S},& \Bar{h}(x_i) = \hat{h}(x_i) \neq y^\gS_i,\\
        \frac{-1}{n_S},& \Bar{h}(x_i) \neq \hat{h}(x_i) = y^\gS_i , 
        \end{cases}
        \qquad
        r_{\gT, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}
    \end{align*}
    % By construction, the sum of all of these variables is precisely $\epsilon_{\hat{\gS}}(\hat{h}, y^*) + \epsilon_{\hat\gT}(\hat{h}, \Bar{h}) - \epsilon_{\hat{\gS}}(\hat{h}, \Bar{h})$ (note these are the empirical terms). Further, 
    % Here, $y^\gS_i$ is the source ground truth label for the $i^\text{th}$ sample (according to $y^*_\gS$), while $y^\gS_i$ is the label according to $y^*$. \\
    Here, $y^\gS_i$ is the source ground truth label for the $i^\text{th}$ sample (according to $y^*_\gS$).
    Consider the following sums:
    \begin{align*}
        \sum_{\gS} r_{\gS,i} = \frac{1}{n_\gS}\sum_{\gS}[\mathbf{1}\{\hat{h}(x_i) \neq y^\gS_i\} - \mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon_{\hat{\gS}}(\hat{h}, y^*_\gS) - \epsilon_{\hat{\gS}}(\hat{h}, \Bar{h}),\\
        \sum_{\gT} r_{\gT, i} =\frac{1}{n_\gT}\sum_{\gT}[\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon_{\hat{\gT}}(\hat{h}, \Bar{h}).
    \end{align*}
    Their sum $\epsilon_{\hat{\gS}}(\hat{h}, y^*_\gS) - \epsilon_{\hat{\gS}}(\hat{h}, \Bar{h}) + \epsilon_{\hat{\gT}}(\hat{h}, \Bar{h}) = \epsilon_{\hat{\gS}}(\hat{h}) + \hat{\Delta}(\hat{h}, \Bar{h})$ gives us the empirical estimate of the first two terms. The sum of their corresponding population terms: 
    \begin{align*}
        \E_\gS\left[ \sum_{\gS} r_{\gS,i} \right] &= \epsilon_{\gS}(\hat{h}) - \epsilon_{\gS}(\hat{h}, \Bar{h}),\\
        \E_\gT\left[ \sum_{\gT} r_{\gT, i} \right] &= \epsilon_{\gT}(\hat{h}, \Bar{h}),
    \end{align*}
    and $\lambda$ gives us the bound: $\epsilon_{\gT}(h) = \epsilon_{\gS}(\hat{h}) + \Delta(\hat{h}, \Bar{h}) + \lambda$.
    Applying Hoeffding's inequality: the probability that the expectation exceeds their sum by $t$ is no more than $\exp\left(-\frac{2t^2}{n_S \left(\frac{2}{n_S}\right)^2 + n_T \left(\frac{1}{n_T}\right)^2}\right)$ and solving for $t$ completes the proof.
\end{proof}

Note that the $\lambda$ term is incalculable in our setting (without access to target labels), so this bound only provides a qualitative relationship to the target risk. In practice, if $\lambda$ is estimated through held out target samples, the adjustment due to finite sampling (through Hoeffding's inequality) will also change accordingly.

\begin{theorem}
    \label{thm:splitbound_app}
    Under~\autoref{ass:dis2concave},
    with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, \Bar{h}, \alpha) + \underbar{\hat\Delta}(h, \Bar{h}, \alpha) + \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}} + \lambda.
    \end{align*}
\end{theorem}

\begin{proof}
    As $\Delta(h, h') = \Delta(h, h', \alpha) + \underbar{\Delta}(h, h', \alpha)$ for any $h, h'$ and $\alpha$, the random variables defined in the proof for~\autoref{thm:dis2bound_app} can be split in two mutually exclusive and exhaustive sets (overlapping and non-overlapping regions). The rest of the proof follows similarly.
\end{proof}

% \autoref{ass:oddconcave} is directly implied by~\autoref{ass:overlap} and~\autoref{ass:dis2concave}. We re-write it here before discussing why.

% \begin{assumption}
%     \label{ass:oddconcave_app}
%     For an $\Bar{h}\in\gH$ which maximizes a concave surrogate to the empirical \odd, $\Delta(\hat{h}, y^*, \alpha) \leq \Delta(\hat{h}, \Bar{h}, \alpha)$.
% \end{assumption}
% \begin{proof}
%      % As $\Delta(h, h') = \Delta(h, h', \alpha) + \underbar{\Delta}(h, h', \alpha)$ and $\underbar{\Delta}(h, h', \alpha) \leq 0$ (from~\autoref{ass:overlap}), we have:
%      \begin{align*}
%          \Delta(\hat{h}, y^*) &\leq \Delta(\hat{h}, \Bar{h})\\
%          \implies \\
%          \Delta(\hat{h}, y^*, \alpha) + \underbar{\Delta}(\hat{h}, y^*, \alpha) &\leq  
%      \end{align*}
% \end{proof} 

\begin{theorem}[\odd Bound]
    \label{thm:oddbound_app}
    Under~\autoref{ass:oddconcave} and~\autoref{ass:overlap},
    with probability $\geq 1-\delta$,
    \begin{align*}
        \epsilon_{\gT}(\hat{h}) &\leq \epsilon_{\hat{\gS}}(\hat{h}) + \hat\Delta(\hat{h}, \Bar{h}, \alpha) +
        \sqrt{\frac{(n_S + 4n_T) \log \frac{1}{\delta}}{2 n_S n_T}}
    \end{align*}
\end{theorem}


\begin{proof}
    With the common $y^*$, $\lambda$ goes to zero. Hence,~\autoref{lemma:error-expansion_app} implies:
    $\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h) +\Delta(h, \Bar{h})$.
    
    % We follow the proof from~\cite{rosenfeld2023almost} with our new notion of overlap aware discrepancy, and consideration of \textit{adaptability} through the ideal joint hypothesis $y^*$. \\
    % We know, $\epsilon_{\gT}(h) \leq \epsilon_{\gS}(h, y^*) + (\epsilon_{\gT}(h, y^*) - \epsilon_{\gS}(h, y^*)) + \epsilon_{\gT}(y^*)$ from~\autoref{lemma:error-expansion}.
    
    % From~\autoref{ass:concave} and~\autoref{lemma:error-expansion}, we have\\
    % % and ~\autoref{ass:overlap}, 
    % $\epsilon_{\gT}(\hat{h}) \leq \epsilon_{\gS}(\hat{h}) + \Delta(\hat{h}, \Bar{h}, \alpha) + \Delta^{\gD_\alpha}(\hat{h}, y^*) + \lambda = \epsilon_{\gS}(\hat{h}, y^*_\gS) + \epsilon^{\gT}_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^{\gS}_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h}) + \boxed{\Delta^{\gD_\alpha}(\hat{h}, y^*) + \lambda$}. 
    
    % \fbox{\parbox{\textwidth}{\paragraph{NOTE:} The last two terms depend on $y^*$, the ideal joint hypothesis, which can not be practically calculated without access to target domain labels. As mentioned in the main text, these inestimable terms go to zero under many real-world conditions, but we keep them as-is in our analysis to paint a complete general picture.}}

    % We now define the foll
    % To upper bound the first three terms using empirical estimates, we define the 
    We now define the following random variables:
    \begin{align*}
        r_{\gS \setminus \gD_\alpha,i} = 
        \begin{cases}
        0,& \Bar{h}(x_i) = y^*_i,\\
        \frac{1}{n_S},& \Bar{h}(x_i) = \hat{h}(x_i) \neq y^*_i,\\
        \frac{-1}{n_S},& \Bar{h}(x_i) \neq \hat{h}(x_i) = y^*_i , 
        \end{cases}
        \qquad
        % r_{\gT \setminus \gD_\alpha, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}.
        r^\gS_{\gD_\alpha,i} = 
        \begin{cases}
        0,& \Bar{h}(x_i) = y^*_i,\\
        \frac{1}{n_S},& \Bar{h}(x_i) = \hat{h}(x_i) \neq y^*_i,\\
        \frac{-1}{n_S},& \Bar{h}(x_i) \neq \hat{h}(x_i) = y^*_i, 
        \end{cases}
    \end{align*}
    \begin{align*}
        r_{\gT \setminus \gD_\alpha, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}
        \qquad
        r^\gT_{\gD_\alpha, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}
    \end{align*}
    % By construction, the sum of all of these variables is precisely $\epsilon_{\hat{\gS}}(\hat{h}, y^*) + \epsilon_{\hat\gT}(\hat{h}, \Bar{h}) - \epsilon_{\hat{\gS}}(\hat{h}, \Bar{h})$ (note these are the empirical terms). Further, 
    % Here, $y^\gS_i$ is the source ground truth label for the $i^\text{th}$ sample (according to $y^*_\gS$), while $y^*_i$ is the label according to $y^*$. \\
    % Here, $y^*_i$ is the ideal joint hypothesis label for the $i^\text{th}$ sample.
    Now we have the following empirical sums:
    \begin{align*}
        \sum_{\gS \setminus \gD_\alpha} r_{\gS \setminus \gD_\alpha,i} = \frac{1}{n_\gS}\sum_{\gS \setminus \gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq y^*_i\} - \mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gS_{\widehat{\gS \setminus \gD_\alpha}}(\hat{h}, y^*) - \epsilon^\gS_{\widehat{\gS \setminus \gD_\alpha}}(\hat{h}, \Bar{h}),\\
        \sum_{\gT \setminus \gD_\alpha} r_{\gT \setminus \gD_\alpha, i} =\frac{1}{n_\gT}\sum_{\gT \setminus \gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gT_{\widehat{\gT \setminus \gD_\alpha}}(\hat{h}, \Bar{h}),\\
        \sum_{\gD_\alpha} r^\gS_{\gD_\alpha,i} = \frac{1}{n_\gS}\sum_{\gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq y_i\} - \mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gS_{\hat{\gD_\alpha}}(\hat{h}, y^*) - \epsilon^\gS_{\hat{\gD_\alpha}}(\hat{h}, \Bar{h}),\\
        \sum_{\gD_\alpha} r^\gT_{\gD_\alpha, i} =\frac{1}{n_\gT}\sum_{\gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gT_{\hat{\gD_\alpha}}(\hat{h}, \Bar{h}),
    \end{align*}
    % represent the empirical measure of source are target disagreement in the non-overlapping region, target disagreement in the non-overlapping region, source disagreement in the overlapping region and target disagreement in the overlapping region, respectively. 
    % to give an empirical estimate to the corresponding population terms.
    % $\epsilon_{\gS}(\hat{h}, y^*_\gS) + \epsilon^{\gT}_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^{\gS}_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h})$. 
    and their corresponding population terms: 
    % Therefore, the sum of their expectations: 
    \begin{align*}
        \E_\gS\left[ \sum_{\gS \setminus \gD_\alpha} r_{\gS \setminus \gD_\alpha,i} \right] &= \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h}),\\
        \E_\gT\left[ \sum_{\gT \setminus \gD_\alpha} r_{\gT \setminus \gD_\alpha, i} \right] &= \epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}),\\
        \E_\gS\left[ \sum_{\gD_\alpha} r^\gS_{\gD_\alpha,i} \right] &= \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}),\\
        \E_\gT\left[ \sum_{\gD_\alpha} r^\gT_{\gD_\alpha, i} \right] &= \epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}).
    \end{align*}
    The sum of these terms
    % gives the population terms
    \begin{align*}
        &= \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h})\\
        &= (\epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) + \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*)) + (\epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h})) + (\epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}))\\
        &= \epsilon_{\gS}(\hat{h}, y^*) + \Delta(\hat{h}, \Bar{h}, \alpha) + (\epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}))\\
        &\leq \epsilon_{\gS}(\hat{h}, y^*_\gS) + \Delta(\hat{h}, \Bar{h}, \alpha) + \underbar{\Delta}(\hat{h}, \Bar{h}, \alpha)\\
        &\leq \epsilon_{\gS}(\hat{h}, y^*_\gS) + \Delta(\hat{h}, \Bar{h}, \alpha) \qquad \left(\underbar{\Delta}(\hat{h}, \Bar{h}, \alpha) \leq 0 \text{ from~\autoref{ass:overlap}.}\right)
        % + \epsilon^\gS_{\gD_\alpha}(y^*, y^*_\gS)
        % \qquad \left(\epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}) \leq 0 \text{ from~\autoref{ass:overlap}.}\right)
    \end{align*}
    Now, applying Hoeffding's inequality completes the proof.
    
    
    % To upper bound these terms, we define the following random variables for source and target samples, respectively.
    % \begin{align*}
    %     r_{\gS \setminus \gD_\alpha,i} = 
    %     \begin{cases}
    %     0,& \Bar{h}(x_i) = y_i,\\
    %     \frac{1}{n_S},& \Bar{h}(x_i) = \hat{h}(x_i) \neq y_i,\\
    %     \frac{-1}{n_S},& \Bar{h}(x_i) \neq \hat{h}(x_i) = y_i , 
    %     \end{cases}
    %     \qquad
    %     r^\gS_{\gD_\alpha,i} = 
    %     \begin{cases}
    %     0,& \Bar{h}(x_i) = y_i,\\
    %     \frac{1}{n_S},& \Bar{h}(x_i) = \hat{h}(x_i) \neq y_i,\\
    %     \frac{-1}{n_S},& \Bar{h}(x_i) \neq \hat{h}(x_i) = y_i , 
    %     \end{cases}
    % \end{align*}
    % \begin{align*}
    %     r_{\gT \setminus \gD_\alpha, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}
    %     \qquad
    %     r^\gT_{\gD_\alpha, i} = \frac{\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}}{n_T}
    % \end{align*}
    % % By construction, the sum of all of these variables is precisely $\epsilon_{\hat{\gS}}(\hat{h}, y^*) + \epsilon_{\hat\gT}(\hat{h}, \Bar{h}) - \epsilon_{\hat{\gS}}(\hat{h}, \Bar{h})$ (note these are the empirical terms). Further, 
    % Observe that
    % \begin{align*}
    %     \sum_{\gS \setminus \gD_\alpha} r_{\gS \setminus \gD_\alpha,i} = \frac{1}{n_\gS}\sum_{\gS \setminus \gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq y_i\} - \mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gS_{\widehat{\gS \setminus \gD_\alpha}}(\hat{h}, y^*) - \epsilon^\gS_{\widehat{\gS \setminus \gD_\alpha}}(\hat{h}, \Bar{h}),\\
    %     \sum_{\gT \setminus \gD_\alpha} r_{\gT \setminus \gD_\alpha, i} =\frac{1}{n_\gT}\sum_{\gT \setminus \gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gT_{\widehat{\gT \setminus \gD_\alpha}}(\hat{h}, \Bar{h}),\\
    %     \sum_{\gD_\alpha} r^\gS_{\gD_\alpha,i} = \frac{1}{n_\gS}\sum_{\gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq y_i\} - \mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gS_{\hat{\gD_\alpha}}(\hat{h}, y^*) - \epsilon^\gS_{\hat{\gD_\alpha}}(\hat{h}, \Bar{h}),\\
    %     \sum_{\gD_\alpha} r^\gT_{\gD_\alpha, i} =\frac{1}{n_\gT}\sum_{\gD_\alpha}[\mathbf{1}\{\hat{h}(x_i) \neq \Bar{h}(x_i)\}] &= \epsilon^\gT_{\hat{\gD_\alpha}}(\hat{h}, \Bar{h}),
    % \end{align*}

    % represent the empirical measure of source disagreement in the non-overlapping region, target disagreement in the non-overlapping region, source disagreement in the overlapping region and target disagreement in the overlapping region, respectively. Therefore, the sum of their expectations: 

    
    % \begin{align*}
    %     \E_\gS\left[ \sum_{\gS \setminus \gD_\alpha} r_{\gS \setminus \gD_\alpha,i} \right] &= \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h}),\\
    %     \E_\gT\left[ \sum_{\gT \setminus \gD_\alpha} r_{\gT \setminus \gD_\alpha, i} \right] &= \epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}),\\
    %     \E_\gS\left[ \sum_{\gD_\alpha} r^\gS_{\gD_\alpha,i} \right] &= \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}),\\
    %     \E_\gT\left[ \sum_{\gD_\alpha} r^\gT_{\gD_\alpha, i} \right] &= \epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}),
    % \end{align*}
    % gives the population terms
    % \begin{align*}
    %     &= \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}) + \epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h})\\
    %     &= (\epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, y^*) + \epsilon^\gS_{\gD_\alpha}(\hat{h}, y^*)) + (\epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h})) + (\epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}))\\
    %     &\leq \epsilon_{\gS}(\hat{h}, y^*) + (\epsilon^\gT_{\gT \setminus \gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gS \setminus \gD_\alpha}(\hat{h}, \Bar{h})) \qquad \left(\epsilon^\gT_{\gD_\alpha}(\hat{h}, \Bar{h}) - \epsilon^\gS_{\gD_\alpha}(\hat{h}, \Bar{h}) \leq 0 \text{ from~\autoref{ass:overlap}.}\right)
    % \end{align*}
    % Applying Hoeffding's inequality: the probability that the expectation exceeds their sum by $t$ is no more than $\exp\left(-\frac{2t^2}{n_S \left(\frac{2}{n_S}\right)^2 + n_T \left(\frac{1}{n_T}\right)^2}\right)$ and solving for $t$ completes the proof.
\end{proof}

\paragraph{Remark:}~\autoref{ass:overlap} bounds the target risk only in terms of non-overlapping disagreement discrepancy. It may be counter-intuitive to think that target disagreement is bounded by source disagreement. However, we have shown that $\Delta(\hat{h}, \bar{h}, \alpha)$ accounts for nearly all of $\Delta(\hat{h}, \bar{h})$ (\autoref{fig:synthetic_main}) for $\bar{h}$ found through either \disdis or \odd. This implies that even if the target disagreement exceeds source disagreement, it doesn't do it by a lot. Therefore, even without~\autoref{ass:overlap}, the $\underbar{\Delta}$ term from~\autoref{thm:splitbound} should be expected to be negligible and inconsequential in most practical cases.

\section{Experiment Details}
\label{app:expdetails}

\subsection{Dataset details}
\paragraph{Synthetic Datasets:}
The Gaussians are randomly initialized with varying means and covariance matrices. The target gaussian is brought close to the source gaussian by translating its mean using $\mu_\gT \gets \mu_\gT + (\mu_\gS - \mu_\gT) * \text{overlap factor}$. For a point $X = (x_1, x_2)$, its class label ($y^*$) is decided using a complex function:  
\[
y^*(X) =
\begin{cases} 
  0, & \text{if } x_1 \leq \cos(a\sin(bx_2) + ce^{dx_2} + \frac{x_2^2 + 2x_2 - 5}{2}) + \epsilon \\
  1, & \text{otherwise}
\end{cases}
\]
where, $a, b, c, d$ are chosen randomly from a small range and $\epsilon$ is small random noise to make the decision boundary noisy. This allowed for simple as well as extremely complex decision surfaces, making for a suitable test bed for our method and its comparison with \disdis. We show 5 more samples of these datasets in~\autoref{fig:synth_samples}.


\begin{figure}[h]
    \centering
    \includegraphics[width=0.19\linewidth]{figs/synthetic_domains_1.pdf}
    \includegraphics[width=0.19\linewidth]{figs/synthetic_domains_2.pdf}
    \includegraphics[width=0.19\linewidth]{figs/synthetic_domains_3.pdf}
    \includegraphics[width=0.19\linewidth]{figs/synthetic_domains_4.pdf}
    \includegraphics[width=0.19\linewidth]{figs/synthetic_domains_6.pdf}
    \caption{Variations in synthetic datasets.}
    \label{fig:synth_samples}
\end{figure}

\paragraph{Real Datasets:}
We use the datasets provided by~\citet{rosenfeld2023almost} on their \href{https://github.com/erosenfeld/disagree_discrep/tree/master}{github repository}. This includes CIFAR10  [4 shift variations], CIFAR100 [4 shift variations], DomainNet [3 shift variations], Entity13 [3 shift variations], Entity30 [3 shift variations], FMoW [2 shift variations], Living17 [3 shift variations], NonLiving26 [3 shift variations], OfficeHome [3 shift variations] and Visda [2 shift variations]. All datasets were trained with different deep neural network backbones as described in the \disdis paper (Appendix A) with ERM, DANN, CDANN, BN-adapt and FixMatch methods. The deep representations of the trained networks are provided directly for download making replication seamless. 

\paragraph{CivilComments:} We used CivilComments~\cite{borkan2019nuanced} to test if our method improves over \disdis in natural language domain, which it had not been tested on before. This dataset contains sentences labeled \textit{toxic} or not. We created source and target domains using subpopulation shift in this dataset. We filtered all rows with the \textit{black} label True into one set (target) and the rest in another (source), and evenly balanced the dataset to avoid class imbalance issues in evaluation. Train Set: $\sim$ 27k samples in each class, Validation Set: $\sim$ 4.5k samples in each class.  We collected the BERT~\citep{devlin2018bert} embeddings of all sentences in the source and target domain and trained a linear model on the source domain, which achieved a source validation accuracy of $\sim75\%$. The source model achieved a target validation accuracy of $\sim65\%$, which would be the target to predict by our bound. We found that the \disdis-based bound predicted an accuracy of $\sim58\%$, while \odd improved it to $\sim63\%$.

\subsection{More experimental details}
\paragraph{Training domain classifiers:}  
For each experiment: (dataset, shift, training method) combination, we train a small 3-layer MLP, with number of neurons $=$ the dimensionality of the representation. We train the network on a balanced training set consisting of equal number of samples from both source and target domains. We randomly subsample the majority domain to make both classes balanced to not skew the classifier towards one of the classes. We train the classifier with Adam Optimizer, with learning rate $=10^{-4}$, until convergence (difference in loss $< 10^{-4}$ for $10$ epochs).


\subsection{More experimental results}

\paragraph{Results on non-DA algorithms:} It is noticeable that the MAE for domain-adversarial algorithm based representations is much better than the other. We plot the point-wise bound estimates (using logits) for all DA-method predictions in the appendix~\autoref{fig:real_logits_DA} and find that like \disdis, \odd overestimates accuracy in many cases. This is expected as explained in~\autoref{para:da}, but our method still maintains a significantly higher coverage compared to baselines.

% In~\autoref{fig:real_logits_DA}, we show the pointwise predictions of target accuracy on domain-adversarial algorithm trained representations (logits). We see that like \disdis, it overestimates the target accuracy many times resulting in low coverage.

\begin{figure}[b]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/real_logits_True.pdf}
    \caption{Like \disdis, \textbf{DA?}\ding{55} based predictions overestimate accuracy in many cases but still by a smaller margin compared to other baselines.}
    \label{fig:real_logits_DA}
\end{figure}

% \subsection{Further Analysis}

\paragraph{Discounting loss of overlapping source samples:}We ran an additional experiment on the real datasets where we weighted the source samples in addition to the target samples while optimizing for the critic. We found no significant difference in MAE under this setting (MAE without source weighting: 0.1224, MAE with source weighting: 0.1244). This is probably because we are searching in the simple space of linear models, and ignoring a few samples does not impact the selection of the critic much on average. 

\paragraph{Measuring quality of overlap estimation:} To measure the quality of overlap estimation, we split the source and target data (of a few studied datasets) into train and dev sets, and early stopped the domain classifier training if the dev set loss did not fall for 10 iterations in a row. As we do not have access to true densities, the dev set performance acts as a proxy for the quality of overlap estimation.  We tried multiple sized MLPs and found that the mean dev set accuracy across datasets was almost the same (apart from the linear classifier, which probably underfits in some cases). For details, see~\autoref{tab:dcab} where we also measure the expected calibration error (ECE)~\citep{guo2017calibration} of the model. Although we could improve ODD-based bounds by tuning the classifier’s hyperparameters for each individual dataset, we used a single setting (3 layer deep, num\_features wide network) for all experiments to reflect the robustness of our approach. 
% We also ablated with smaller (2-layer) and deeper (4-layer) networks with varying width, and found negligible differences in performance (see~\autoref{tab:dcablation}). We trained linear domain classifiers as well, but they did not perform as well, probably due to under-fitting. 

\begin{table}[h]
\centering
\caption{We find that training domain classifiers on representations is fairly robust to the model architecture.}
\resizebox{0.8\textwidth}{!}{%
\begin{tabular}{ccccccc}
\textbf{num\_layers} & \textbf{width}     & \textbf{Accuracy (mean)} & \textbf{Accuracy (std)} & \textbf{ECE (mean)} & \textbf{ECE (std)} & \textbf{Final MAE} \\
1                    & -                  & 0.66                     & 0.15                    & 0.09                & 0.08               & 0.1234             \\
2                    & num\_features      & 0.71                     & 0.16                    & 0.11                & 0.09               & 0.1269             \\
2                    & num\_features // 2 & 0.69                     & 0.17                    & 0.10                & 0.09               & 0.1236             \\
3                    & num\_features      & 0.71                     & 0.15                    & 0.15                & 0.10               & 0.1223             \\
3                    & num\_features // 2 & 0.70                     & 0.16                    & 0.14                & 0.09               & 0.1282             \\
4                    & num\_features      & 0.70                     & 0.16                    & 0.19                & 0.12               & 0.1229             \\
4                    & num\_features // 2 & 0.69                     & 0.16                    & 0.18                & 0.11               & 0.1255            
\end{tabular}%
}
\label{tab:dcab}
\end{table}




% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%         \toprule % from booktabs package
%         \bfseries Dataset & \bfseries Result\\
%         \midrule % from booktabs package
%         Data1 & 0.12345\\
%         Data2 & 0.67890\\
%         Data3 & 0.54321\\
%         Data4 & 0.09876\\
%         \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%     F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


\end{document}
