\section{Proofs of Formal Results} \label{app:proofs}
\subsection{Proof of Error Guarantees} \label{app:proofs_guarantees}
% \paragraph{Assumptions are required for guarantees on novel class detection.} It is quite clear that if $\Plabel{0}, \Plabel{1}$ can be any two distributions then we are unable to provide guarantees on identification of the novel subgroup. To emphasize this necessity of distributional assumptions, let us consider a toy scenario. Assume our distributions are over $3$ states, where $\Psource = [1-\epsilon, \epsilon, 0]$ and $\Plabel{0} = [0, 1-\epsilon, \epsilon], \Plabel{1} = [0, \epsilon, 1-\epsilon]$ for some small $\epsilon > 0$, and $\alpha=0.5$. On the other hand consider taking $\tilde{P}_{\gT, 0} = [0, \epsilon, 1-\epsilon], \tilde{P}_{\gT, 1} = [0, 1-\epsilon, \epsilon]$ with the same $\Psource, \alpha$. This means $\Ptarget = \alpha\Plabel{0} + (1-\alpha)\Plabel{1} = \alpha\tilde{P}_{\gT, 0} + (1-\alpha)\tilde{P}_{\gT, 1}$, hence a PU-learning algorithm that receives $\Psource, \Ptarget$ gets the same input in both scenarios. On the other hand, it is easy to see that any hypothesis $h$ that achieves small error when the true distributions are $\Plabel{0}, \Plabel{1}$ (e.g. that returns $1$ for the third state and $0$ for the others, achieving $R_{\gT}(h) = \epsilon$), thus solving the problem with small error. On the other hand, if we switch the label in the target distribution and let $\Plabel{0} = [0, \epsilon, 1-\epsilon], \Plabel{1} = [0, 1-\epsilon, \epsilon]$, then the same hypothesis achieves error $1-\epsilon$. It easy to see that for any hypothesis that achieves low error on the first problem, will obtain high error for the second. Show that switching the roles of the $Y=0$ and $Y=1$ leads to the same input to the learner, and hence the problem is unidentifiable\ldots By the nature of novelties, if $\Plabel{0}$ is allowed to introduce unexpected events w.r.t $\Psource$ it is impossible to tell whether we should associate these events with the novel class or not. 
The bounds we derive in the sequel include a quantity that accounts for the possibility that events which are rare under $\Psource$ become likely under $\Plabel{0}$, meaning the bounds are non-vacuous only if this quantity is bounded.

% \paragraph{Assumptions that appear in previous works.} Let us shortly review the assumptions that have been proposed in the literature to enable PU-learning under nominal distribution shift. \citet{gerych2022recovering} assume \emph{separability}, where there is a hypothesis $h^*\in{\gH}$ achieving zero risk w.r.t both source and target, $R_{\gS}(h^*) = 0, R_{\gT}(h^*) = 0$, and also the supports of $\Psource, \Plabel{0}$ are identical. Our method will also guarantee a risk that approaches $0$ as the sample size approaches infinity under these conditions. The main difference is that we will analyze bounds where these assumptions do not hold perfectly, and provide generalization bounds that provide guarantees for learning with finite samples. Whereas most previously proposed algorithms are based on some form of density ratio estimation, our derivation leads to a different algorithm based on constrained learning.

% The distributional assumptions we rely on in this work, which are reminiscent of the aforementioned separability assumption, impose rather weak restrictions on the relation between $\Psource$ and $\Plabel{0}$. Other works make stronger distributional assumptions that are tailored for more specific cases

% The reason this is necessary for identification of $\alpha$ is that if we have $\Plabel{1} = \gamma\Psource + (1-\gamma)Q$ for $\gamma > 0$ then both $\Ptarget = \alpha \Plabel{1} + (1-\alpha)\Psource$ and $\Ptarget = \alpha(1-\tilde{\gamma})(\frac{1-\gamma}{1-\tilde{\gamma}}Q + \frac{\gamma-\tilde{\gamma}}{1-\tilde{\gamma}}\Psource) + (1 - \alpha + \alpha\tilde{\gamma})\Psource$ 
% are valid decompositions of $\Ptarget$ for any $0 < \tilde{\gamma} < \gamma$, but they yield different novelty distributions and different mixture proportions ($\alpha$ for the first, and $\alpha(1-\tilde{\gamma})$ for the second).

We recall the notion of distance defined in the main paper, inspired by the $\gH$-divergence in the domain adaptation literature \citep{kifer2004detecting, bendavid2010adaptation},
% \begin{align*}
%     \hdiv
% \end{align*}
\begin{align*}
d_{\gH, \beta}&\left( P \| Q \right) = \\
&\sup_{g\in{\gH}: P\left[I(g)\right] \leq \beta}{2 \Big| P\left[I(g)\right] - Q\left[I(g)\right] \Big|}.
\end{align*}
Let us prove the first part of \cref{thm:main_result}:
% \upperbound*
\begin{restatable}{lemma}{upperbound}
For a novelty detection problem as in \Cref{def:prob_setting}, let $h\in{\gH}$ and denote $\alpha(h) = \E_{\Ptarget}{[h(\rvx)]}$, while $\beta(h) = \E_{\Psource}{[h(\rvx)]}$. Define,
\begin{align*}
\bar{R}^{l_{01}}_\gT(h) = [\alpha &- \alpha(h)] + \\
&2\cdot(1-\alpha)\left[\beta(h) + d_{\gH, \beta(h)}\left( P_{\gS} \| \Plabel{0} \right)\right].
\end{align*}
Then we have that $R^{l_{01}}_\gT(h) \leq \bar{R}^{l_{01}}_\gT(h)$.
\label{lem:err_bound}
\end{restatable}
\begin{proof}
We decompose the error as follows:
\begin{align}
R^{l_{01}}_\gT(h) &= (1-\alpha)\cdot \E_{\rvx\sim \Plabel{0}}{[h(\rvx)]} + \alpha\cdot \E_{\rvx\sim \Plabel{1}}[1-h(\rvx)] \nonumber \\
&= (1-\alpha)\cdot\E_{\rvx\sim \Plabel{0}}{[h(\rvx)]} + \alpha \cdot \left(1 - \E_{\rvx\sim \Plabel{1}}[h(\rvx)] \right) \nonumber \\
&= \alpha - \E_{\rvx\sim (1-\alpha)\Plabel{0} + \alpha\Plabel{1}}{[h(\rvx)]} + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim \Plabel{0}}{[h(\rvx)]} \nonumber \\
&= \alpha - \E_{\rvx\sim \Ptarget}{[h(\rvx)]} + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim \Plabel{0}}{[h(\rvx)]} \label{eq:expanded_risk}\\ %+ \alpha\left(1 - \E_{\rvx\sim \Plabel{1}}[h(\rvx)] \right) \\
&=  \alpha - \E_{\rvx\sim \Ptarget}{[h(\rvx)]} + 2\cdot(1-\alpha)\cdot\left[\E_{\rvx\sim \Psource}{[h(\rvx)]} + \E_{\rvx\sim \Plabel{0}}{[h(\rvx)]} - \E_{\rvx\sim \Psource}{[h(\rvx)]} \right] \nonumber \\
&\leq  \alpha - \E_{\rvx\sim \Ptarget}{[h(\rvx)]} + 2\cdot(1-\alpha)\cdot \E_{\rvx\sim \Psource}{[h(\rvx)]} + 2\cdot(1-\alpha)\cdot\left|\Plabel{0}\left( h(\rvx)=1 \right) - \Psource\left( h(\rvx)=1 \right) \right| \nonumber \\
&= \alpha - \alpha(h) + 2\cdot(1-\alpha)\beta(h) + 2\cdot(1-\alpha)\cdot\left|\Plabel{0}\left( h(\rvx)=1 \right) - \Psource\left( h(\rvx)=1 \right) \right| \nonumber \\
&\leq \alpha - \alpha(h) + 2\cdot(1-\alpha)\left[ \beta(h) + d_{\gH, \beta(h)}(\Psource \| \Plabel{0}) \right] = \bar{R}^{l_{01}}_\gT(h). \nonumber
% \\
% &= (1-\alpha)\Plabel{0}(h(X)=1) + \alpha\Plabel{1}(h(X) = 0) \\
% & = (1-\alpha)\Plabel{0}(h(X)=1) + \alpha\left( 1 - \Plabel{1}(h(X) = 1) \right) \\
% % & = \alpha - \Ptarget(h(X)=1) + 2\Ptarget(h(X) = 1, Y = 0) \\
% & = \alpha - \Ptarget(h(X)=1) + 2\cdot(1-\alpha)\Plabel{0}(h(X=1)) \\%\Ptarget(h(X) = 1, Y = 0) - 2\Psource(h(X)=1) + 2P_A(h(X)=1) \\
% & = \alpha - \alpha(h) + 2\cdot(1-\alpha)\left[\Psource(h(X=1)) + \Plabel{0}(h(X=1)) - \Psource(h(X=1)) \right] \\%\Ptarget(h(X) = 1, Y = 0) - 2\Psource(h(X)=1) + 2P_A(h(X)=1) \\
% & \leq \alpha - \alpha(h) + 2\cdot(1-\alpha)\left[ \beta(h) + d_{\gH, \beta(h)}(\Psource \| \Plabel{0})\right]
\end{align}
\end{proof}
With this inequality in hand, we can now prove \cref{prop:seperability}.
\begin{proposition*}
Assume separability holds, which postulates that $\Plabel{0}(B) > 0 \Rightarrow \Psource(B) > 0$ for any measurable subset $B$ w.r.t both distributions.\footnote{Separability also assumes $\exists h^*\in{\gH}$ such that $R^{l_{01}}_{\gT}(h^*)=0$, but to prove \cref{prop:seperability} we do not require this.} Then Scarcity-of-Unicorns (\Cref{assum:unicorn_bound}) holds with $\beta, \varepsilon_{\text{shift}}$ set to $0$.
\end{proposition*}
\begin{proof}
Let $\gB$ denote the measurable subsets w.r.t both $\Psource$ and $\Plabel{0}$ and define,
\begin{align*}
d_{1, \beta}\left( \Psource \| \Plabel{0} \right) = \sup_{B\in{\gB}: \Psource(B) \leq \beta}{2 \Big| \Psource(B) - \Plabel{0}(B) \Big|}.
\end{align*}
Taking for any $g\in{\gH}$ the subset of inputs where it equals $1$, $I(g)\subseteq{\gX}$, and $I(\gH) = \{I(g) ~:~ g\in{\gH} \}$, we see that $I(\gH)\subseteq{\gB}$ and hence we have 
\begin{align} \label{eq:temp_prop41}
    d_{\gH, \beta}\left( \Psource \| \Plabel{0} \right) \leq d_{1, \beta}\left( \Psource \| \Plabel{0} \right),
\end{align} for any $\beta\geq 0$. Under separability we have that $d_{1, 0}\left( \Psource \| \Plabel{0} \right) = 0$, since if $\Plabel{0}(\tilde{B}) > 0$ for some $\tilde{B}\in{\gB}$ then we must also have $\Psource(\tilde{B}) > 0$ and then $\tilde{B}\notin \{B\in{\gB} : \Psource(B) \leq 0 \}$. This means that for any $\tilde{B}\in \{B\in{\gB} : \Psource(B) \leq 0 \}$ we must have $\Plabel{0}(\tilde{B}) = 0$ and hence $d_{1,0}\left( \Psource \| \Plabel{0} \right) = 0$. The claim is proved by combining this with \cref{eq:temp_prop41} at $\beta = 0$, to obtain $d_{\gH, 0}\left( \Psource \| \Plabel{0} \right) \leq d_{1, 0}\left( \Psource \| \Plabel{0} \right) = 0$, and since the divergence is non-negative it must equal $0$, meaning Scarcity-of-Unicorns holds with $\beta, \varepsilon_{\text{shift}}$ set to $0$.
% Separability also assumes that $\beta(h^*)=0$ for some $h^*\in{\gH}$, hence we get:
% \begin{align*}
% d_{\gH, \beta(h^*)}\left( \Psource \| \Plabel{0} \right) = d_{\gH, 0}\left( \Psource \| \Plabel{0} \right) \leq d_{1, 0}\left( \Psource \| \Plabel{0} \right) = 0.
% \end{align*}
% This means that $d_{\gH, \beta(h^*)}\left( \Psource \| \Plabel{0} \right) = 0$ and \cref{assum:unicorn_bound} holds with $\beta=0$ and $\varepsilon_{\text{shift}}=0$. It is easy to see that since $R^{l_{01}}_{\gT}(h^*)=0$ we must have $\alpha(h^*) = \alpha, \beta(h^*)=0$ and hence $\bar{R}^{l_{01}}_{\gT} = \alpha - \alpha(h^*) + 2\cdot(1-\alpha)[\beta(h^*) + d_{\gH, \beta(h^*)}(\Psource \| \Plabel{0})] = 0$, which proves our claim.
\end{proof}
Next, let us restate the proposed learning rule:
\begin{align*}
    &\max_{h\in{\gH}}{\hat{\alpha}(h)} \\
    &\text{s.t. } \hat{\beta}(h) \leq \beta
\end{align*}

%%%% START COMMENT
%%%%
\begin{comment}
The error bound on the solution to \cref{eq:precision_at_recall} follows easily from \cref{lem:err_bound}.
\precrecallbound*
\begin{proof}
    Since we assume that $\beta > \beta(h^*)$, the hypothesis $h^*$ is a feasible solution to \cref{eq:precision_at_recall} and it holds for an optimal solution $\hat{h}$ to the problem that $\alpha(\hat{h}) \geq \alpha(h^*)$. Using \cref{eq:expanded_risk}, we can write the gap in risks $R^{l_{01}}_{\gT}(\hat{h}) - R^{l_{01}}_{\gT}(h^*)$ as follows,
    \begin{align*}
        R^{l_{01}}_{\gT}(\hat{h}) - R^{l_{01}}_{\gT}(h^*) &= \E_{\rvx\sim \Ptarget}{\left[ h^*(\rvx) - \hat{h}(\rvx) \right]} + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \\
        &= \alpha(h^*) - \alpha(\hat{h}) + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \\
        &\leq \alpha(h^*) - \alpha(\hat{h}) + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)\right]} \\
        &\leq 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)\right]} \\
        &= 2\cdot(1-\alpha)\cdot\left(\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)\right]} - \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)\right]} + \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)\right]} \right) \\
        &= 2\cdot(1-\alpha)\cdot\left(\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)\right]} - \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)\right]} + \beta(\hat{h}) \right) \\
        &\leq 2\cdot(1-\alpha)\cdot\left(d_{\gH, \beta}(\Psource \| \Plabel{0}) + \beta \right) \leq 2\cdot(1-\alpha)\cdot\left(\varepsilon_{\text{shift}} + \beta \right).
    \end{align*}
    The first inequality holds because the False Positive rate of any hypothesis, including $h^*$, is positive. The second one uses $\alpha(\hat{h}) \geq \alpha(h^*)$, and the last ones rely on the definition of $d_{\gH, \beta}(\Psource \| \Plabel{0})$ and our assumptions on $\beta(\hat{h})$ and the divergence.
\end{proof}
\end{comment}
%%%%
%%%% END COMMENT
We derive generalization bounds for solutions to the empirical version of this problem. Recall that the Rademacher complexity of $\gH$ with respect to $n$ samples from distribution $P$ is denoted by $R_{n, P}(\gH) = \E_{P^n}{\left[ \frac{1}{n}\E_{\boldsymbol{\sigma}}{\left[\sup_{h\in{\gH}}\sum_{i}{\sigma_i h(\rvx_i)}  \right]} \right]}$. The following statement gives the statistical guarantee we require for our result.
\begin{lemma} \label{lem:reacll_finite_sample}
Let $\gH$ be a hypothesis class with Rademacher complexities $R_{n_{\gS},\Psource}(\gH)$ and $R_{n_{\gT},\Ptarget}(\gH)$ respectively, and $\hat{h}$ a solution to the empirical estimate of \cref{eq:precision_at_recall}, with $\beta \geq \beta(h^*) + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}}$. Then with probability at least $1-4\delta$ we have that simultaneously,
\begin{align}
\alpha(\hat{h}) &\geq \alpha(h^*) - R_{n_{\gT}, \Ptarget}(\gH) - \sqrt{\frac{2\ln(1/\delta)}{n_{\gT}}}, \\
\beta(\hat{h}) &\leq \beta + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}}. \label{eq:lem1_part2}
\end{align}
\end{lemma}
\begin{proof}
    From standard Rademacher bounds on the risk of classifiers in a hypothesis class (e.g. \citet[Theorem~5]{bartlett2002rademacher}), we have that with probability $1-2\delta$:
    \begin{align*}
        \left| \hat{\beta}(h) - \beta(h)\right| \leq \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}} \quad \forall h\in{\gH}.
    \end{align*}
    Therefore \cref{eq:lem1_part2} holds since $\hat{\beta}(\hat{h}) \leq \beta$. Also, from the lower bound on $\beta$ assumed in our lemma statement, all classifiers with False Positive Rate no larger than $\beta(h^*)$ will be in the feasible set of \cref{eq:precision_at_recall}. This follows from the above inequality since for all $h\in{\gH}$ where $\beta(h) \leq \beta(h^*)$ it holds that
    \begin{align*}
    \hat{\beta}(h) \leq \; & \beta(h) + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}} \\
    \Rightarrow \hat{\beta}(h) \leq \; & \beta(h^*) + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}}.
    \end{align*}
    Specifically, this means that with probability at least $1-2\delta$, $h^*$ is a feasible solution to \cref{eq:precision_at_recall} and taking $\hat{h}$ that is optimal for \cref{eq:precision_at_recall}, we can gather that $\hat{\alpha}(\hat{h}) \geq \hat{\alpha}(h^*)$. For the second part of the proof, we use the same inequality as before to obtain that with probability at least $1-2\delta$,
    \begin{align} \label{eq:rad_on_recall}
        |\hat{\alpha}(h) - \alpha(h)| \leq \frac{R_{n_{\gT}, \Ptarget}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gT}}} \quad \forall h\in{\gH}.
    \end{align}
    Then we take a union bound to conclude that with probability at least $1-4\delta$,
    \begin{align*}
        \alpha(h^*) - \alpha(\hat{h}) &= \alpha(h^*) - \hat{\alpha}(h^*) + \hat{\alpha}(h^*) - \hat{\alpha}(\hat{h}) + \hat{\alpha}(\hat{h}) - \alpha(\hat{h}) \\
        &\leq \alpha(h^*) - \hat{\alpha}(h^*) + \hat{\alpha}(\hat{h}) - \alpha(\hat{h}) \\
        &\leq R_{n_{\gT}, \Ptarget}(\gH) + \sqrt{\frac{2\ln(1/\delta)}{n_{\gT}}}.
    \end{align*}
    The first inequality follows from our previous conclusion that $\hat{\alpha}(\hat{h}) \geq \hat{\alpha}(h^*)$ and the second from \cref{eq:rad_on_recall}.
\end{proof}
With the concentration properties in hand, recall that we assume $d_{\gH, \beta}(\Psource \| \Plabel{0}) \leq \varepsilon_{\text{shift}}$ for some fixed $\beta, \varepsilon_{\text{shift}} \geq 0$, and let us combine this with the previous claims to bound the error as required for the second part of \cref{thm:main_result}.

\begin{lemma} \label{eq:empirical_bound}
    Let $h^*\in{\gH}$ be a minimizer of $R^{l_{01}}_{\gT}(h)$ and assume $\hat{h}$ solves \cref{eq:precision_at_recall} with $\beta \geq \beta(h^*) + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}}$, then with probability at least $1-4\delta$ it holds that 
    \begin{align*}
        R^{l_{01}}_{\gT}(\hat{h}) &\leq R^{l_{01}}_{\gT}(h^*) + 4\varepsilon_{\text{shift}} + 2(\beta-\beta(h^*)) \nonumber \\
    & + R_{n_{\gS}, \Psource}(\gH) + R_{n_{\gT}, \Ptarget}(\gH) \nonumber \\
    & + \sqrt{2\ln(1/\delta)}\left[ n_{\gS}^{-\frac{1}{2}} + n_{\gT}^{-\frac{1}{2}} \right].
    \end{align*}
\end{lemma}
\begin{proof}
    Let us assume that the inequalities in \cref{lem:reacll_finite_sample} hold, which occurs with probability at least $1-4\delta$. We write down the gap in risks between the hypotheses $\hat{h}$ and $h^*$, while using these inequalities:
    \begin{align*}
        R^{l_{01}}_{\gT}(\hat{h}) - R^{l_{01}}_{\gT}(h^*) &= \E_{\rvx\sim \Ptarget}{\left[ h^*(\rvx) - \hat{h}(\rvx) \right]} + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \\
        &= \alpha(h^*) - \alpha(\hat{h}) + 2\cdot(1-\alpha)\cdot\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \\
        &= \alpha(h^*) - \alpha(\hat{h}) + 2\cdot(1-\alpha)\cdot\Big[\E_{\rvx\sim{\Plabel{0}}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} - \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \\
        &\qquad \qquad \qquad \qquad \qquad \qquad \qquad + \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \Big] \\
        &= \alpha(h^*) - \alpha(\hat{h}) + 2\cdot(1-\alpha) \cdot \Big[\E_{\rvx\sim{\Plabel{0}}}\left[\hat{h}(\rvx)\right] - \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)\right]}\\
        &\qquad \qquad \qquad \qquad \qquad \qquad \qquad + \E_{\rvx\sim{\Psource}}{\left[h^*(\rvx)\right]} - \E_{\rvx\sim{\Plabel{0}}}{\left[h^*(\rvx)\right]} \\
        &\qquad \qquad \qquad \qquad \qquad \qquad \qquad + \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \Big] \\
        &\leq R_{n_{\gT}, \Ptarget}(\gH) + \sqrt{\frac{2\ln (1/\delta)}{n_{\gT}}} + 2\cdot(1-\alpha) \cdot \Big[\E_{\rvx\sim{\Plabel{0}}}\left[\hat{h}(\rvx)\right] - \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)\right]}\\
        &\qquad \qquad \qquad \qquad \qquad \qquad \qquad \qquad \qquad + \E_{\rvx\sim{\Psource}}{\left[h^*(\rvx)\right]} - \E_{\rvx\sim{\Plabel{0}}}{\left[h^*(\rvx)\right]} \\
        &\qquad \qquad \qquad \qquad \qquad \qquad \qquad \qquad \qquad + \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \Big] \\
        &\leq R_{n_{\gT}, \Ptarget}(\gH) + \sqrt{\frac{2\ln (1/\delta)}{n_{\gT}}} + 2\cdot(1-\alpha) \cdot \left[2\varepsilon_{\text{shift}} + \E_{\rvx\sim{\Psource}}{\left[\hat{h}(\rvx)-h^*(\rvx)\right]} \right] \\
        &\leq R_{n_{\gT}, \Ptarget}(\gH) + \sqrt{\frac{2\ln (1/\delta)}{n_{\gT}}} + 2\cdot(1-\alpha) \cdot \left[2\varepsilon_{\text{shift}} + \beta-\beta(h^*) + \frac{R_{n_{\gS}, \Psource}(\gH)}{2} + \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}} \right].
        % &\leq R_{n_t, \Ptarget}(\gH) + 
        % R_{n_{\gS}, \Psource}(\gH) + \sqrt{\frac{2\ln (1/\delta)}{n_{\gT}}} +  \sqrt{\frac{\ln(1/\delta)}{2n_{\gS}}} + 2\cdot(1-\alpha) \cdot \left[2\epsilon_{\text{shift}} + \beta-\beta(h^*) \Big] \right]
    \end{align*}
    The first and third inequalities are obtained by applying \cref{lem:reacll_finite_sample}, the second holds due to \cref{assum:unicorn_bound}. It is easy to see that the above expression lower bounds the one in our claim since $\alpha\in{[0, 1]}$ and hence our proof is concluded.
\end{proof}
The theorem in the main paper follows directly from the statements we proved above.
\begin{proof}[Proof of \cref{thm:main_result}]
The first part of the theorem follows directly from \cref{lem:err_bound}, while the second is a direct consequence of \cref{eq:empirical_bound}.
\end{proof}

\textbf{Possible Extension of Results.} We note that one clear gap in our results is that they apply to $l_{01}$ instead of other surrogate losses that we use in practice. This is also a gap in the work of \citet{bendavid2010adaptation} on domain adaptation and it is a result of using the $d_{\gH}$ divergence. Hence a possible path to generalize our result is to use other divergences in the proof of \cref{lem:err_bound}, e.g. like that used in \citet{DBLP:conf/colt/MansourMR09} to extend the results of \citet{bendavid2010adaptation}. The other component for proving \cref{thm:main_result}, namely the proof of \cref{lem:reacll_finite_sample}, does not depend explicitly on $l_{01}$ and can be extended using standard arguments on Rademacher complexity.

\subsection{Sufficient and Necessary Conditions for Learning and Mixture Proportion Estimation} \label{sec:identifiability}
We complete proofs of claims made in the main paper with simple proofs for the necessity and sufficiency of assumptions in detecting classes under distribution shift. Our first claim was impossibility of learning when no distributional assumptions are made.
\begin{proposition*}
    Let $\gA$ be a learning algorithm for the task of OOD novel category detection. There are distributions $\Psource, \Plabel{0}, \Plabel{1}$ such that $\exists h^*\in{\gH}$ for which $R^{l_{01}}_{\gT}(h^*)=0$, while $\E_{S_{\gS}, S_{\gT}}\left[ R^{l_{01}}_{\gT}(\gA(S_{\gS}, S_{\gT})) \right] \geq 0.5$.
\end{proposition*}
\begin{proof}
    Let $\alpha=0.5$, and $P, Q, D$ distributions such that for some hypothesis class $\gH$ there is $h^*\in{\gH}$ for which $\E_{Q}[h^*(\rvx)] = 0, \E_{D}[h^*(\rvx)] = 1$.
    % are $h^*, g^*, f^*\in{\gH}$ that perfectly separate each pair of distributions. That is $\E_{Q}[h^*(\rvx)] = 0, \E_{D}[h^*(\rvx)] = 1$, while $\E_{P}[g^*(\rvx)] = 0, \E_{D}[g^*(\rvx)] = 1$ and $f^*$ separates $P$ and $D$ in the same manner. 
    Consider two problems where in one $\Psource = P, \Plabel{0}=Q, \Plabel{1}=D$, and in the other $\tilde{P}_{\gS} = P, \tilde{P}_{\gT, 0}=D, \tilde{P}_{\gT, 1}=Q$. 
    That is, the roles of $D$ and $Q$ are switched between the two problems. 
    Notice that the target distributions $\Ptarget$ and $\tilde{P}_{\gT}$ are the same in both problems since $\Ptarget=0.5 \Plabel{0} + 0.5 \Plabel{1} = 0.5 \tilde{P}_{\gT, 0} + 0.5 \tilde{P}_{\gT, 1} = \tilde{P}_{\gT}$. Hence training data for a learning algorithm $\gA$ are drawn from the same distribution. Yet if we denote the risk w.r.t the second problem by $\tilde{R}^{l_{01}}_{\gT}:\gH\rightarrow [0,1]$ then for any $h\in{\gH}$ if $R^{l_{01}}_{\gT}(h)=\varepsilon$ it holds that $\tilde{R}^{l_{01}}_{\gT}(h) = 1-\varepsilon$. Hence a learning algorithm $\gA$ that achieves expected error smaller than $0.5$ on one problem will incur expected error larger than $0.5$ in the other, which proves the statement.
\end{proof}

Note that \citet{blanchard2010semi} prove that irreducibility is required for identification of $\alpha$ and for learning under the SCAR assumption (i.e. $\Psource=\Plabel{0}$). Irreducibility states that $\max{\left\{\gamma\in{[0,1]} ~:~ \Plabel{1} = \gamma\Psource + (1-\gamma)Q \text{ for some } Q\in \Delta \right\}} = 0$, where $\Delta$ is the set of all distributions over the measurable space $\gX$. While the statement we prove above is much simpler and says we cannot learn unless something is known about the target distribution, it does not follow from their proof.
When irreducibility does not hold, then there is also no $h^*\in{\gH}$ with $R_{\gT}^{l_{01}}(h^*)=0$ since $\Plabel{1}$ is a mixture with a non-zero component of $\Psource$, and thus cannot be perfectly separated from $\Plabel{0}=\Psource$. Our statement demands the existence of $h^*$ that achieves loss $0$, and thus the conditions for the two statements are different.

The last remaining claim made in the paper that has not been proven above is that, under separability and given perfect knowledge of $\Psource$ and $\Ptarget$, the mixture proportion $\alpha$ can be recovered.
\begin{lemma}
Assume the novel class detection problem satisfies \textbf{(No-Overlap)}: there exists a measurable subset $B_{sep}\in \gB$ such that $\Psource(B_{sep}) = 1$, $\Plabel{0}(B_{sep}) = 1$ and $\Plabel{1}(B_{sep}) = 0$. Then $\alpha$ is identifiable.
\end{lemma}
\begin{proof}
Define the set of distributions over $\gX$ that fully overlaps with $\Psource$, that is $\gP(\Psource) = \{ P\in\Delta : P(B) > 0 \Rightarrow \Psource(B) > 0 \; \forall B\in{\gB}\}$ where $\gB$ is the set of all measurable subsets of $\gX$. Let us define the following principle for approximating $\alpha$:
\begin{align} \label{eq:optimization_principle}
    \hat{\alpha} = \min{\left\{\gamma\in{[0,1]} ~:~ \Ptarget = (1-\gamma)P + \gamma Q \text{ for some } P\in{\gP(\Psource)} \text{ and some distribution } Q\right\}}.
\end{align}
Given the ground truth distributions $\Plabel{0}, \Plabel{1}$, we know that $\Ptarget = (1-\alpha)\Plabel{0} + \alpha \Plabel{1}$, and hence $\Ptarget(B_{sep}) = (1-\alpha) \Plabel{0}(B_{sep}) + \alpha \Plabel{1}(B_{sep}) = 1-\alpha$. Clearly, taking $\gamma=\alpha$, $P = \Plabel{0}$ and $Q = \Plabel{1}$ gives a feasible solution to the right hand side of \Cref{eq:optimization_principle}. Now assume that there exists some feasible solution with $\gamma < \alpha$, $P\in{\gP(\Psource)}$ and a distribution $Q$. Since $\Psource(B_{sep}) = 1$, the complement of $B_{sep}$ has zero mass under $\Psource$ and therefore also under any $P\in{\gP(\Psource)}$, so that $P(B_{sep}) = 1$. Then $\Ptarget(B_{sep}) \geq (1-\gamma)P(B_{sep}) = 1-\gamma > 1-\alpha$, which contradicts our conclusion that $\Ptarget(B_{sep}) = 1-\alpha$ must hold. Hence $\alpha$ is identifiable and given by the solution to \Cref{eq:optimization_principle}.
\end{proof}
Having proven all claims made in the main paper, we turn to a short supplementary discussion on the divergence $d_{\gH, \beta}(P \| Q)$, which we used in our assumptions and which corresponds to the frequency that rare events in $P$ take in distribution $Q$.

\subsection{Further discussion on \texorpdfstring{$d_{\gH, \beta}(P \| Q)$}{dHb}} \label{sec:disc_diverge}
In the domain adaptation literature \citep{bendavid2010adaptation, kifer2004detecting}, the $\gH$-divergence defined as
\begin{align*}
    d_{\gH}(P, Q) = 2\sup_{g\in{\gH}}{ \left| P\left[ I(g) \right] -  Q\left[ I(g) \right] \right|},
\end{align*}
is used for two reasons. As in our use of $d_{\gH, \beta}(P \| Q)$, the term $d_{\gH}(\Psource, \Ptarget)$ is included in an upper bound on error w.r.t a target distribution. While $d_{\gH}(\Psource, \Ptarget)$ can be estimated from data, and therefore one can optimize the resulting upper bound w.r.t $\gH$, this is not true in our case. Unfortunately calculation of $d_{\gH, \beta}(\Psource \| \Plabel{0})$ requires a sample from $\Plabel{0}$, and to obtain an upper bound we require an assumption about the magnitude of the divergence. The second reason that the $\gH$-divergence is used in domain adaptation is that it provides a much tighter bound than the one based on standard divergences between distributions, e.g. in our case it is an alternative to $d_{1, \beta}\left( P \| Q \right) = \sup_{B\in{\gB}: P(B) \leq \beta}{2 \Big| P(B) - Q(B) \Big|}$, taken w.r.t measurable subsets $\gB$ under the two distributions. This indeed tightens our bounds by weakening the assumption required in \cref{assum:unicorn_bound}, though it has no practical implication on the algorithm we use.

It is worth noting that if we obtain samples from distributions $P$ and $Q$ then $d_{\gH, \beta}(P \| Q)$ can be estimated efficiently by solving a rate-constrained classification problem. This can be helpful in case we wish to reason about $d_{\gH, \beta}(\Psource \| \Plabel{0})$ in a data-driven manner. For instance, say $\Psource$ is a distribution over EHRs in one hospital, and we have a dataset from another hospital with corresponding distribution $Q$ where we do not think that novel groups have emerged. If we are willing to assume that in our target distribution $\Ptarget = \alpha\Plabel{1} + (1-\alpha)\Plabel{0}$, it holds that $d_{\gH, \beta}(\Psource \| \Plabel{0})$ does not exceed $d_{\gH, \beta}(\Psource \| Q)$, then we can get an upper bound on the divergence we are interested in by estimating $d_{\gH, \beta}(\Psource \| Q)$ from data. The following lemma tells us this can be done by solving a rate-constrained Empirical Risk Minimization problem.
\begin{lemma}
    Let $S_{P}, S_{Q}$ be i.i.d sampled datasets of size $n$ from $P, Q$ respectively, $\gH$ a symmetric hypothesis class (i.e. that $1-h \in {\gH}$ for any $h\in{\gH}$), and $d_{\gH, \beta}(\hat{P} \| \hat{Q})$ the empirical estimate of $d_{\gH, \beta}(P \| Q)$ (i.e. where we replace $P, Q$ with empirical distributions defined by a uniform distribution over the examples in the datasets). Then we have that:
    \begin{align*}
        d_{\gH, \beta}(\hat{P} \| \hat{Q}) = 2\left( 1 - \min_{h\in{\gH}: n^{-1}\sum_{\rvx\in{S_{P}}}{h(\rvx)} \leq \beta}{\left[ \frac{1}{n}\sum_{\rvx:h(\rvx)=1}{I[\rvx\in{S_P}]} + \frac{1}{n}\sum_{\rvx:h(\rvx)=0}{I[\rvx\in{S_Q}]} \right]}\right).
    \end{align*}
\end{lemma}
\begin{proof}
    Denoting by $\hat{P}, \hat{Q}$ the empirical distributions corresponding to $S_P$ and $S_Q$, we will follow the proof of \citet[Lemma~2]{bendavid2010adaptation} to show that for any $h\in{\gH}$,
    \begin{align}\label{eq:empirical_div_est}
         \hat{Q}[I(h)] - \hat{P}[I(h)] = 1 - \left[ \frac{1}{n}\sum_{\rvx: h(\rvx)=1}{I[\rvx\in{S_{P}}]} + \frac{1}{n}\sum_{\rvx: h(\rvx)=0}{I[\rvx\in{S_{Q}}]} \right].
    \end{align}
    Once this is shown, we get the result in the statement by maximizing w.r.t $h\in{\gH}: \hat{\beta}(h) \leq \beta$, since $\hat{\beta}(h)= n^{-1}\sum_{\rvx\in{S_P}}{h(\rvx)}$ by definition. The absolute value on the left hand side of the above equation, which appears in the definition of $d_{\gH, \beta}(P \| Q)$ is obtained from the symmetry of $\gH$. Now for completeness let us give the proof of the required equality. We start by taking,
    \begin{align*}
        1 = \frac{1}{2n}\sum_{\rvx:h(\rvx)=0}{\big(I[\rvx\in{S_P}] + I[\rvx\in{S_Q}]\big)} + \frac{1}{2n}\sum_{\rvx:h(\rvx)=1}{\big(I[\rvx\in{S_P}] + I[\rvx\in{S_Q}]\big)},
    \end{align*}
    and plugging-in to the right hand side of \cref{eq:empirical_div_est} we get:
    \begin{align*}
        1 -& \left[ \frac{1}{n}\sum_{\rvx: h(\rvx)=1}{I[\rvx\in{S_{P}}]} + \frac{1}{n}\sum_{\rvx: h(\rvx)=0}{I[\rvx\in{S_{Q}}]} \right]\\
        =&\frac{1}{2n}\sum_{\rvx:h(\rvx)=0}{\big(I[\rvx\in{S_P}] - I[\rvx\in{S_Q}]\big)} + \frac{1}{2n}\sum_{\rvx:h(\rvx)=1}{\big(I[\rvx\in{S_Q}] - I[\rvx\in{S_P}]\big)} \\
        =&\frac{1}{2}(1-\hat{P}[I(h)] - 1 + \hat{Q}[I(h)]) + \frac{1}{2}\left(\hat{Q}[I(h)] - \hat{P}[I(h)]\right) \\
        =&\hat{Q}[I(h)] - \hat{P}[I(h)].
    \end{align*}
\end{proof}
The lemma tells us that the divergence can be estimated with rate-constrained optimization, and using similar techniques to the ones used in other constrained learning works \citep{donini2018empirical, chamon2022constrained} and in \cref{thm:main_result}, we can obtain generalization bounds for estimation of $d_{\gH,\beta}(P \| Q)$ from a finite sample.

% Let us develop a bound on the generalization error of hypotheses in our class $\gH$. We will define a notion of distance $d_{\gH, \beta}$, inspired by those defined in the domain adaptation literature \citep{bendavid2010adaptation}.
% Our distance will measure the extent to which rare events in $P_A$ can become likely under $P_0$. Given distributions $D_1, D_2$ and a threshold $\beta > 0$ used to define an event being ``rare", we denote\footnote{It is worth noting that the notion of this distance w.r.t measurable subsets $\gB$ under the two distributions can also be defined
% $d_{1, \beta}\left( D_1 \| D_2 \right) = \sup_{B\in{\gB}: D_1(B) \leq \beta}{2 \Big| D_1(B) - D_2(B) \Big|}$ and is perhaps more intuitive to consider before the one defined with $\gH$.}
% \begin{align*}
% d_{\gH, \beta}\left( D_1 \| D_2 \right) = \sup_{h\in{\gH}: D_1(h(\rvx=1)) \leq \beta}{2 \Big| D_1(h(\rvx=1)) - D_2(h(\rvx=1)) \Big|}.
% \end{align*}
% To bound the generalization error $R^{l_{01}}_\gT (h) = \E_{x,y \sim P_B}{\left[ h(x) \neq y\right]}$,
% denote the recall of hypothesis $h\in{\gH}$ w.r.t the dataset labels $\hat{Y}$ as $\alpha(h)$ (i.e. the proportion of samples $\rvx \sim P_B$ such that $h(x) = 1$),
% and its false-positive rate (i.e. the error of $h$ over $P_A$, $R^{l_{01}}_{\gS}(h) = \E_{x \sim P_A}{\left[ h(x) \right]}$, which is simply proportion of samples $\rvx \sim P_A$ such that $h(x) = 1$) by $\beta(h) := R^{l_{01}}_{\gS}(h)$.
% \begin{lemma} For a biased PU-learning problem it holds that,
% \begin{align*}
% R^{l_{01}}_\gT(h) \leq \left[\alpha - \alpha(h)\right] + (1-\alpha)\left[\beta(h) + d_{\gH, \beta(h)}\left( P_{\gS} \| P_0 \right)\right].
% \end{align*}
% \end{lemma}
% \begin{proof}
% We decompose the error as follows:
% \begin{align*}
% R^{l_{01}}_\gT(h) & = \Ptarget(h(X) = 0, Y = 1) + \Ptarget(h(X) = 1, Y = 0) \\
% & = \Ptarget(Y = 1) - \Ptarget(h(X)=1, Y=1) + \Ptarget(h(X) = 1, Y = 0) \\
% & = \Ptarget(Y = 1) - \Ptarget(h(X)=1) + 2\Ptarget(h(X) = 1, Y = 0) \\
% & = \Ptarget(Y = 1) - \Ptarget(h(X)=1) + 2\Ptarget(h(X) = 1, Y = 0) - 2\Psource(h(X)=1) + 2P_A(h(X)=1) \\
% & \leq \alpha - \alpha(h) + 2(1-\alpha)\left[ R^{l_{01}}_{\gS}(h) + d_{\gH, \beta(h)}(P_A \| P_0)\right]
% \end{align*}
% \end{proof}
% Both the explicit error term and the bound cannot be calculated directly from observed data, since the true label is unobserved. However, the quantities in the bound that involve $h$ are observed, namely $\alpha(h)$ and $\beta(h)$. It cannot be evaluated from observed data due to the quantity $d_{\gH, \beta(h)}(P_A \| P_0)$, but we can reason about this quantity with simple assumptions. For instance by assuming an upper bound on $P_0(\tilde{h}=1) / P_A(\tilde{h}=1)$ for all $\tilde{h}\in{\gH}$, or if we have access to a contaminated sample from $P_0 + \tilde{\alpha}P_1$ for some small $\tilde{\alpha}$. Such a contaminated sample may be accessible after we have identified a subset of examples that we are certain about their novelty.

% \begin{proof}
% The first part of the proposition simply states that the distribution $P_1$ has to be irreducible w.r.t each distribution in $\gP$. This is an immediate consequence of irreducibility being a necessary condition for identification in standard PU-learning \citep{blanchard2010semi}. To show this, assume that for some $P\in{\gP}$ we have that $P_1(\rvx) > 0$ for all $\rvx\in{\mathrm{supp}(P)}$ and that $\alpha$ is the mixture proportion. Then consider the case where $P_0=P_A=P$. For some small enough $\beta > 0$, we have that $P_B = \alpha P_1 + (1-\alpha) P = (\alpha + \beta)\left[(\alpha + \beta)^{-1}\cdot(\alpha P_1 + \beta P)\right] + (1 -\alpha -\beta)P$. This means that setting $P'_1 = (\alpha + \beta)^{-1}\cdot(\alpha P_1 + \beta P)$ with mixture proportion $\alpha + \beta$ is another possible solution to the detection problem and it is unidentifiable.

% For the second part of the claim, assume that $P\in\mathrm{relint}(\gP)$ (which means that $\exists Q \in{\gP} s.t. Q(\rvx) > 0 \Rightarrow P(\rvx) > 0$). Now let $Q\in{gP}$ be some other distribution in the uncertainty set, then $P_B = \alpha P_1 + (1-\alpha)P = P_1  \beta P - (1-\beta)Q $
% \end{proof}

% \subsection{Constrained Optimization Results}
% Assume that $h\in{\gH}$ achieves recall $\alpha > 0$ with precision $1-\epsilon$, and take $d_{\gH\Delta\gH}(P_0, P_A)$:
% For hypothesis class $\gH$