
\section{Datasets description}


\textbf{\textit{\texttt{Adult income} dataset} \citep{lichman2013UCI}}: This dataset comprises $48,842$ examples with demographic information. The task is to predict whether an individual's income exceeds \$50k annually. We chose the protected attribute to be binarized gender.


\textbf{\textit{\texttt{Compas} dataset} \citep{lichman2013UCI}}: This dataset, which was released by \citet{angwin2022machine}, comprises $5,278$ records related to juvenile felonies. It includes details such as marital status, ethnicity, age, prior criminal history, and the severity of the current arrest charges. In our analysis, we use binarized gender as the sensitive attribute. In line with established conventions \citep{corbett2017algorithmic, anahideh2021fair}, we adopt a two-year violent recidivism record as the ground truth for assessing recidivism.

\textbf{\textit{\texttt{Drug consumption} dataset \citep{fehrman2017factor}}}: 
%This dataset comes from the
%UCI machine learning repository \cite{dua2019UCI}, and 
This dataset consists of $1,885$ entries containing information about individuals, where each entry includes five demographic characteristics (such as Age, binarized Gender, or Education), seven measurements related to personality traits (such as Nscore indicating neuroticism and Ascore representing agreeableness), and 18 descriptors detailing the subject's most recent consumption of a specific substance (like Cannabis). We chose the task of predicting whether an individual consumed Cannabis in the last year and chose the protected attribute to be (binarized) Gender. 

\textbf{\textit{\texttt{German Credit} dataset \citep{hofmann1994statlog}}}: The German Credit dataset classifies people as good or bad credit risks using the profile and history of $1,000$ clients. We set the binarized gender as the sensitive attribute.


\textbf{\textit{\texttt{Community and Crime} dataset \citep{redmond2002data}}}: 
The Community and Crime dataset consists of $1,902$ instances of crimes with $128$ attributes related to the crime and the corresponding community. It uses `violent crimes' as the target variable and `percentage of non-white' as the protected attribute. The target variable is binarized to categorize communities as high or low crime based on a threshold of $500$. The protected attribute is also binarized, separating communities whose percentage of non-white residents is below $20\%$ from the remaining ones.\loose

\textbf{\textit{\texttt{Bank} dataset \citep{moro2014data}}}:  The task is to predict whether the client has subscribed to a term deposit service based on $11,162$ data points with features such as marital status and age. We use whether the client has tertiary education as the sensitive attribute.

\textbf{\textit{\texttt{Synthetic} dataset}}: We created the synthetic dataset in the following manner. It is depicted in \Cref{fig:synthetic}. The dataset consists of two dimensions, and data for group $0$ is generated by randomly sampling $10,000$ data points from a Gaussian distribution with a mean of $(0, 0)$, while group $1$ comprises $100$ data points sampled from a Gaussian distribution with a mean of $(10, 10)$. For group $0$ (and group $1$), labels are assigned a value of $1$ if the x-coordinate (or y-coordinate) of the data point is greater than $0$, and $0$ otherwise. This ensures that each group is linearly separable, but their combination is not.
\begin{figure}
    \centering
    \includegraphics[width=0.45\textwidth]{new_final_plots/synthetic_data.png}
    \caption{Synthetic dataset}
    \label{fig:synthetic}
\end{figure}

\section{Performance of baseline algorithms with different pre-trained dataset sizes}
We report the results of the sweeps over the size of the pretrain dataset in \Cref{fig:drug_panda,fig:bank_panda,fig:german_panda,fig:adult_panda,fig:compas_panda,fig:crime_panda,fig:drug_fal,fig:bank_fal,fig:german_fal,fig:adult_fal,fig:compas_fal,fig:crime_fal}.
Due to its large computational cost, we compared the performance of \Panda for two sizes of pretrain datasets. 



\iftoggle{arxiv}{

\begin{figure*}
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/drugPANDA.png}
    \caption{Performance on \texttt{Drug Consumption}}
    \label{fig:drug_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/bankPANDA.png}
    \caption{Performance on \texttt{Bank}}
    \label{fig:bank_panda}
\end{minipage}
\\
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/germanPANDA.png}
    \caption{Performance on \texttt{German Credit}}
    \label{fig:german_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/adultPANDA.png}
    \caption{Performance on \texttt{Adult Income}}
    \label{fig:adult_panda}
\end{minipage}
\\
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/compasPANDA.png}
    \caption{Performance on \texttt{Compas}}
    \label{fig:compas_panda}
\end{minipage}
\hfill
%\begin{figure}[t]
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/crimePANDA.png}
    \caption{Performance on \texttt{Community and Crime}}
    \label{fig:crime_panda}
\end{minipage}
\end{figure*}




\begin{figure*}
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/drugFAL.png}
    \caption{Performance on \texttt{Drug Consumption}}
    \label{fig:drug_fal}
\end{minipage}
\hfill
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/bankFAL.png}
    \caption{Performance on \texttt{Bank}}
    \label{fig:bank_fal}
\end{minipage}
\\
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/germanFAL.png}
    \caption{Performance on \texttt{German Credit}}
    \label{fig:german_fal}
\end{minipage}
\hfill
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/adultFAL.png}
    \caption{Performance on \texttt{Adult Income}}
    \label{fig:adult_fal}
\end{minipage}
\\
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/compasFAL.png}
    \caption{Performance on \texttt{Compas}}
    \label{fig:compas_fal}
\end{minipage}
\hfill
%\begin{figure}[t]
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/crimeFAL.png}
    \caption{Performance on \texttt{Community and Crime}}
    \label{fig:crime_fal}
\end{minipage}
\end{figure*}


\begin{figure*}
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/adulteoPANDA.png}
    \caption{Performance on \texttt{Adult Income} for Equalized Odds}
    \label{fig:adulteo_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.5\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/adulteoFAL.png}
    \caption{Performance on \texttt{Adult Income} for Equalized Odds}
    \label{fig:adulteo_fal}
\end{minipage}
\end{figure*}
}{

\begin{figure*}
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/drugPANDA.png}
    \caption{Performance on \texttt{Drug Consumption}}
    \label{fig:drug_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/bankPANDA.png}
    \caption{Performance on \texttt{Bank}}
    \label{fig:bank_panda}
\end{minipage}
\\
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/germanPANDA.png}
    \caption{Performance on \texttt{German Credit}}
    \label{fig:german_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/adultPANDA.png}
    \caption{Performance on \texttt{Adult Income}}
    \label{fig:adult_panda}
\end{minipage}
\\
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/compasPANDA.png}
    \caption{Performance on \texttt{Compas}}
    \label{fig:compas_panda}
\end{minipage}
\hfill
%\begin{figure}[t]
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/crimePANDA.png}
    \caption{Performance on \texttt{Community and Crime}}
    \label{fig:crime_panda}
\end{minipage}
\end{figure*}





\begin{figure*}
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/drugFAL.png}
    \caption{Performance on \texttt{Drug Consumption}}
    \label{fig:drug_fal}
\end{minipage}
\hfill
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/bankFAL.png}
    \caption{Performance on \texttt{Bank}}
    \label{fig:bank_fal}
\end{minipage}
\\
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/germanFAL.png}
    \caption{Performance on \texttt{German Credit}}
    \label{fig:german_fal}
\end{minipage}
\hfill
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/adultFAL.png}
    \caption{Performance on \texttt{Adult Income}}
    \label{fig:adult_fal}
\end{minipage}
\\
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/compasFAL.png}
    \caption{Performance on \texttt{Compas}}
    \label{fig:compas_fal}
\end{minipage}
\hfill
%\begin{figure}[t]
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/crimeFAL.png}
    \caption{Performance on \texttt{Community and Crime}}
    \label{fig:crime_fal}
\end{minipage}
\end{figure*}


\begin{figure*}
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepPanda/adulteoPANDA.png}
    \caption{Performance on \texttt{Adult Income} for Equalized Odds}
    \label{fig:adulteo_panda}
\end{minipage}
\hfill
\begin{minipage}[c]{0.45\linewidth}
    \centering
    \includegraphics[width=\textwidth]{new_final_plots/sweepFAL/adulteoFAL.png}
    \caption{Performance on \texttt{Adult Income} for Equalized Odds}
    \label{fig:adulteo_fal}
\end{minipage}
\end{figure*}


}


\section{Theoretical results - proof of \Cref{cor:fairness_est_simple}}
\subsection{Full theorem}
We have the following result.
\begin{theorem}\label{thm:fairness_est}
Let the train set be  $\mc{D} = \{(x_1, a_1, y_1), \ldots,(x_n, a_n, y_n)\}$. If $\mc{D}\sim \nu$, then it holds with probability $1-\delta$ that:
\begin{align*}
    &|L^{\rm EO}_\nu(h) - \widehat{L}^{\rm EO}_\mc{D}(h)| \leq C_{0,0}+C_{0,1}+C_{1,0}+C_{1,1},\\
    &|L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_\mc{D}(h)| \leq C_{0,1}+C_{1,1},\\
    &|L^{\rm FP}_\nu(h) - \widehat{L}^{\rm FP}_\mc{D}(h)| \leq C_{0,0}+C_{1,0},
\end{align*}
with confidence terms 
\iftoggle{arxiv}{
\begin{align*}
    &C_{j, k} =\left( \widehat{p}_{j,k} +  \sqrt{2\widehat{\V}^{(1)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}\right)\times\frac{\sqrt{2\widehat{\V}^{(2)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = k,  a_i = j \}\right)^2} + \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}} 
\end{align*}
}{
\begin{align*}
    &C_{j, k} =\left( \widehat{p}_{j,k} +  \sqrt{2\widehat{\V}^{(1)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}\right)\times\\
    &\qquad\qquad\qquad\qquad\times\frac{\sqrt{2\widehat{\V}^{(2)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = k,  a_i = j \}\right)^2}
    \\
    &\qquad\qquad
    + \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}} 
\end{align*}
}
% \awcomment{maybe state in words what $j$ and $k$ correspond to, eg $j$ is the protected attribute}
for label $k\in\{0, 1\}$ and protected attribute $j\in\{0, 1\}$, where $\widehat{p}_{j,k}=\frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1,y_i = k , a_i = j \}$ and the empirical variances are defined as
\iftoggle{arxiv}{
\begin{align*}
&\widehat{\V}^{(1)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{h(x_\ell) = 1, y_\ell = k, a_\ell = j \} - \1\{h(x_{\ell'}) = 1, y_{\ell'} = k, a_{\ell'} = j \} )^2,\\
&\widehat{\V}^{(2)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{y_\ell = k, a_\ell = j \} - \1\{y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}}{
\begin{align*}
&\widehat{\V}^{(1)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{h(x_\ell) = 1, y_\ell = k, a_\ell = j \} \\
&\qquad\qquad\qquad - \1\{h(x_{\ell'}) = 1, y_{\ell'} = k, a_{\ell'} = j \} )^2,\\
&\widehat{\V}^{(2)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{y_\ell = k, a_\ell = j \}\\
&\qquad\qquad\qquad - \1\{y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}
}
\end{theorem}
This theorem provides a confidence bound on the concentration rate of the empirical fairness violation. 
% \awcomment{is it actually anytime? anytime would imply that it holds for all values of $n$ simultaneously, but I think it just holds for a fixed $n$?} 
\begin{proof}
Let us start by proving the statement for $\rm TPRP$. Recall 
\iftoggle{arxiv}{
\begin{align*}
    &L^{\rm TP}_\nu(h) = \Bigg | \frac{P_{(x, a, y)\sim\nu}(h(x)=1,a=0, y=1)}{P_{(x, a, y)\sim\nu}(a=0, y=1)} - \frac{P_{(x, a, y)\sim\nu}(h(x)=1,a=1, y=1)}{P_{(x, a, y)\sim\nu}(a=1, y=1)} \Bigg |\\
    &\widehat{L}^{\rm TP}_\mc{D}(h) = \Bigg | \frac{\sum_{i=1}^{n}\1\{h(x_i)=1,y_i=1,a_i=0\}}{\sum_{i=1}^{n}\1\{y_i=1,a_i=0\}} - \frac{\sum_{i=1}^{n}\1\{h(x_i)=1,y_i=1,a_i=1\}}{\sum_{i=1}^{n}\1\{y_i=1,a_i=1\}} \Bigg |.
\end{align*}
}{
\begin{align*}
    &L^{\rm TP}_\nu(h) = \Bigg | \frac{P_{(x, a, y)\sim\nu}(h(x)=1,a=0, y=1)}{P_{(x, a, y)\sim\nu}(a=0, y=1)} \\
    &\qquad\qquad\qquad- \frac{P_{(x, a, y)\sim\nu}(h(x)=1,a=1, y=1)}{P_{(x, a, y)\sim\nu}(a=1, y=1)} \Bigg |\\
    &\widehat{L}^{\rm TP}_\mc{D}(h) = \Bigg | \frac{\sum_{i=1}^{n}\1\{h(x_i)=1,y_i=1,a_i=0\}}{\sum_{i=1}^{n}\1\{y_i=1,a_i=0\}} \\
    &\qquad\qquad\qquad- \frac{\sum_{i=1}^{n}\1\{h(x_i)=1,y_i=1,a_i=1\}}{\sum_{i=1}^{n}\1\{y_i=1,a_i=1\}} \Bigg |.
\end{align*}
}
and write these in short form as
\begin{align*}
&L^{\rm TP}_\nu(h) = | \text{num}_0/ \text{den}_0 -  \text{num}_1 / \text{den}_1|,\\
&\widehat{L}^{\rm TP}_\mc{D}(h) = | \widehat{\text{num}}_0/ \widehat{\text{den}}_0 -  \widehat{\text{num}}_1 / \widehat{\text{den}}_1|,
\end{align*}
where, for protected attribute $j\in\{0, 1\}$,
\begin{align*}
&\text{num}_j = P_{(x, a, y)\sim\nu}(h(x)=1,a=j, y=1)\\
&\widehat{\text{num}}_j = \frac{1}{n}\sum_{i=1}^{n}\1\{h(x_i)=1,y_i=1,a_i=j\}\\
&\text{den}_j = P_{(x, a, y)\sim\nu}(a=j, y=1)\\
&\widehat{\text{den}}_j = \frac{1}{n}\sum_{i=1}^{n}\1\{y_i=1,a_i=j\}.
\end{align*}
Applying Bernstein's concentration bound it holds that for $j\in\{0,1\}$ with probability at least $1-\delta$
\iftoggle{arxiv}{
\begin{align*}
    |\widehat{\text{num}}_j - \text{num}_j| 
    &= \Bigg|\frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1, y_i = 1, a_i = j \} - \P_{(x, a, y)\sim\nu}(h(x) = 1, y = 1, a = j )\Bigg|\\
    &\leq \sqrt{2\widehat{\V}^{(1)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} =: \alpha^{(\text{num})}_j,
\end{align*}
}{
\begin{align*}
    |\widehat{\text{num}}_j - \text{num}_j| 
    &= \Bigg|\frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1, y_i = 1, a_i = j \} \\
    &\qquad\qquad- \P_{(x, a, y)\sim\nu}(h(x) = 1, y = 1, a = j )\Bigg|\\
    &\leq \sqrt{2\widehat{\V}^{(1)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} =: \alpha^{(\text{num})}_j,
\end{align*}
}
where we defined 
\iftoggle{arxiv}{
\begin{align*}
&\widehat{\V}^{(1)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{h(x_\ell) = 1, y_\ell = k, a_\ell = j \} - \1\{h(x_{\ell'}) = 1, y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}
}{
\begin{align*}
&\widehat{\V}^{(1)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{h(x_\ell) = 1, y_\ell = k, a_\ell = j \} \\
&\qquad\qquad\qquad\qquad- \1\{h(x_{\ell'}) = 1, y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}
}
Also applying Bernstein's concentration bound it holds that for $j\in\{0,1\}$ with probability at least $1-\delta$
\iftoggle{arxiv}{
\begin{align*}
    |\widehat{\text{den}}_j - \text{den}_j| 
    &= \Bigg|\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \} - \P_{(x, a, y)\sim\nu}(y = 1, a = j )\Bigg|\\
    &\leq \sqrt{2\widehat{\V}^{(2)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} =: \alpha^{(\text{den})}_j,
\end{align*}
}{
\begin{align*}
    |\widehat{\text{den}}_j - \text{den}_j| 
    &= \Bigg|\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \} -\\
    &\qquad\qquad\qquad\qquad \P_{(x, a, y)\sim\nu}(y = 1, a = j )\Bigg|\\
    &\leq \sqrt{2\widehat{\V}^{(2)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} =: \alpha^{(\text{den})}_j,
\end{align*}
}
where we defined 
\iftoggle{arxiv}{
\begin{align*}
&\widehat{\V}^{(2)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{y_\ell = k, a_\ell = j \} - \1\{y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}
}{
\begin{align*}
&\widehat{\V}^{(2)}_{j,k} = \frac{1}{n(n-1)}\sum_{1 \leq \ell < \ell' \leq n} (\1\{y_\ell = k, a_\ell = j \} \\
&\qquad\qquad\qquad\qquad- \1\{y_{\ell'} = k, a_{\ell'} = j \} )^2.
\end{align*}
}
Then, as soon as $\alpha^{(\text{den})}_j\leq \widehat{\text{den}}_j/2$ for both $j=0$ and $j=1$, the following inequality holds:
\begin{align*}
    \left|\frac{1}{\widehat{\text{den}}_j} - \frac{1}{\text{den}_j}\right| \leq \frac{\alpha^{(\text{den})}_j}{\widehat{\text{den}}_j^2},
\end{align*}
so that for $j\in\{0, 1\}$, we have
\begin{align*}
    \left|\frac{\widehat{\text{num}}_j}{\widehat{\text{den}}_j} - \frac{\text{num}_j}{\text{den}_j}\right| 
    &\!=\! \left|\frac{\widehat{\text{num}}_j}{\widehat{\text{den}}_j} - \frac{\text{num}_j}{\widehat{\text{den}}_j} + \frac{\text{num}_j}{\widehat{\text{den}}_j} - \frac{\text{num}_j}{\text{den}_j}\right| \\
    &\!\leq \!\left|\frac{\widehat{\text{num}}_j}{\widehat{\text{den}}_j} \!-\! \frac{\text{num}_j}{\widehat{\text{den}}_j}\right| \!+\! \left|\frac{\text{num}_j}{\widehat{\text{den}}_j} \!-\! \frac{\text{num}_j}{\text{den}_j}\right| \\
    &\!\leq \! \frac{\alpha^{(\text{num})}_j}{\widehat{\text{den}}_j} + \frac{\text{num}_j \alpha^{(\text{den})}_j}{\widehat{\text{den}}_j^2}\\
    &\!\leq \! \frac{\alpha^{(\text{num})}_j}{\widehat{\text{den}}_j} + \frac{(\alpha^{(\text{num})}_j + \widehat{\text{num}}_j) \alpha^{(\text{den})}_j}{\widehat{\text{den}}_j^2}.
\end{align*}
Note that $C_{j, 1}$ is exactly the last upper bound above, 
\iftoggle{arxiv}{
\begin{align*}
    &C_{j, 1} =\left( \widehat{p}_{j,1} +  \sqrt{2\widehat{\V}^{(1)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} \right)\times\frac{\sqrt{2\widehat{\V}^{(2)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1,  a_i = j \}\right)^2} + \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, 1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}} 
\end{align*}
}{
\begin{align*}
    &C_{j, 1} =\left( \widehat{p}_{j,1} +  \sqrt{2\widehat{\V}^{(1)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} \right)\times\\
    &\qquad\qquad\qquad\qquad\times\frac{\sqrt{2\widehat{\V}^{(2)}_{j,1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1,  a_i = j \}\right)^2}\\
    &\qquad\qquad+ \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, 1} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}} 
\end{align*}
}
where $\widehat{p}_{j,1}=\frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1,y_i = 1 , a_i = j \}$. Putting it together
\iftoggle{arxiv}{
\begin{align*}
    |L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_\mc{D}(h) | 
    &= | | \text{num}_0/ \text{den}_0 -  \text{num}_1 / \text{den}_1|  - | \widehat{\text{num}}_0/ \widehat{\text{den}}_0 - \widehat{\text{num}}_1 / \widehat{\text{den}}_1| |,\\
    &\leq | \text{num}_0/ \text{den}_0 -  \text{num}_1 / \text{den}_1  -  \widehat{\text{num}}_0/ \widehat{\text{den}}_0 + \widehat{\text{num}}_1 / \widehat{\text{den}}_1| ,\\
    &\leq | \text{num}_0/ \text{den}_0 - \widehat{\text{num}}_0 / \widehat{\text{den}}_0| +| \widehat{\text{num}}_1/ \widehat{\text{den}}_1  -  \text{num}_1 / \text{den}_1 |,\\
    &\leq C_{0, 1} + C_{1, 1}.
\end{align*}
}{
\begin{align*}
    |L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_\mc{D}(h) | 
    &= | | \text{num}_0/ \text{den}_0 -  \text{num}_1 / \text{den}_1| \\
    &\qquad- | \widehat{\text{num}}_0/ \widehat{\text{den}}_0 - \widehat{\text{num}}_1 / \widehat{\text{den}}_1| |,\\
    &\leq | \text{num}_0/ \text{den}_0 -  \text{num}_1 / \text{den}_1 \\
    &\qquad-  \widehat{\text{num}}_0/ \widehat{\text{den}}_0 + \widehat{\text{num}}_1 / \widehat{\text{den}}_1| ,\\
    &\leq | \text{num}_0/ \text{den}_0 - \widehat{\text{num}}_0 / \widehat{\text{den}}_0|\\
    &\qquad+| \widehat{\text{num}}_1/ \widehat{\text{den}}_1  -  \text{num}_1 / \text{den}_1 |,\\
    &\leq C_{0, 1} + C_{1, 1}.
\end{align*}
}
which is the conclusion for TPRP.

As $\widehat{L}^{\rm FP}_\mc{D}(h)$ was defined as the empirical estimate of the FPRP violation by conditioning on $\1\{y_i=0\}$ (instead of $\1\{y_i=1\}$ for TPRP), the proof of the concentration bound for FPRP is analogous to the one for TPRP, with $\1\{y_i=0\}$ replacing $\1\{y_i=1\}$ throughout.

We defined the empirical estimate of the EO violation as the maximum of the empirical estimate of the TPRP violation and the empirical estimate of the FPRP violation, $\widehat{L}^{\rm EO}_\mc{D}(h) = \max\{\widehat{L}^{\rm TP}_\mc{D}(h), \widehat{L}^{\rm FP}_\mc{D}(h)\}$, so it holds that
$$\widehat{L}^{\rm EO}_\mc{D}(h) \leq \widehat{L}^{\rm TP}_\mc{D}(h) + \widehat{L}^{\rm FP}_\mc{D}(h),$$
which immediately leads to the conclusion of \Cref{thm:fairness_est}.
\end{proof}


\subsection{Proof of \Cref{cor:fairness_est_simple}}
We first state the full result that leads to the statement of \Cref{cor:fairness_est_simple}. 
\begin{proposition}\label{cor:fairness_est_simple_full}
    Let the train set be  $\mc{D} = \{(x_1, a_1, y_1), \ldots,(x_n, a_n, y_n)\}$. If $\mc{D}\sim \nu$, then it holds with probability $1-\delta$ that:
\iftoggle{arxiv}{
\begin{align*}
    &|L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_D(h)| \leq 2\max_{j\in\{0, 1\}} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right)  + \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right)^2\Bigg\},\\
    &|L^{\rm EO}_\nu(h) - \widehat{L}^{\rm EO}_D(h)| \leq 4\max_{0\leq j, k \leq 1} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right) + \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2\Bigg\}.
\end{align*}
}{
\begin{align*}
    &|L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_D(h)| \leq\\
    &\qquad\qquad 2\max_{j\in\{0, 1\}} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right) \\
    &\qquad\qquad\qquad\qquad+ \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right)^2\Bigg\},\\
    &|L^{\rm EO}_\nu(h) - \widehat{L}^{\rm EO}_D(h)| \leq \\
    &\qquad\qquad 4\max_{0\leq j, k \leq 1} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right) \\
    &\qquad\qquad\qquad\qquad+ \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2\Bigg\}.
\end{align*}
    }
\end{proposition}

\begin{proof}[Proof of \Cref{cor:fairness_est_simple} and \Cref{cor:fairness_est_simple_full}]
We use \Cref{thm:fairness_est} and for label $k\in\{0, 1\}$ and protected attribute $j\in\{0, 1\}$ we bound $C_{j, k}$.

We first have that the empirical variances are such that $\widehat{\V}^{(1)}_{j,k} \leq 1$ and $\widehat{\V}^{(2)}_{j,k} \leq 1$. Also, 
\iftoggle{arxiv}{
\begin{align*}
\widehat{p}_{j,k} 
&= \frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1,y_i = k , a_i = j \}\leq \frac{1}{n}\sum_{i=1}^n \1\{y_i = k , a_i = j \}.
\end{align*}
}{
\begin{align*}
\widehat{p}_{j,k} 
&= \frac{1}{n}\sum_{i=1}^n \1\{h(x_i) = 1,y_i = k , a_i = j \} \\
&\leq \frac{1}{n}\sum_{i=1}^n \1\{y_i = k , a_i = j \}.
\end{align*}
}
Thus, we can bound
\iftoggle{arxiv}{
\begin{align*}
    &C_{j, k} = \left( \widehat{p}_{j,k} +  \sqrt{2\widehat{\V}^{(1)}_{j,k} \frac{\log(2/\delta)}{n}}  + \frac{\log(2/\delta)}{n}\right)\times \frac{\sqrt{2\widehat{\V}^{(2)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = k,  a_i = j \}\right)^2} + \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, k} \frac{\log(2/\delta)}{n}}  + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}} \\
    &\qquad\leq 2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right) + \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2.
\end{align*}
}{
\begin{align*}
    &C_{j, k} = \left( \widehat{p}_{j,k} +  \sqrt{2\widehat{\V}^{(1)}_{j,k} \frac{\log(2/\delta)}{n}}  + \frac{\log(2/\delta)}{n}\right)\times\\
    &\qquad\qquad\qquad\qquad\times\frac{\sqrt{2\widehat{\V}^{(2)}_{j,k} \frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n}}{\left(\frac{1}{n}\sum_{i=1}^n \1\{y_i = k,  a_i = j \}\right)^2}\\
    &\qquad\qquad+ \frac{ \sqrt{2\widehat{\V}^{(1)}_{j, k} \frac{\log(2/\delta)}{n}}  + \frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}} \\
    &\qquad\leq 2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right) \\
    &\qquad\qquad+ \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2.
\end{align*}
}
With that result, we conclude for TPRP that
\iftoggle{arxiv}{
\begin{align*}
    |L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_D(h)| &\leq C_{0, 1} + C_{1, 1} \\
    &\leq 2\max_{j\in\{0, 1\}} C_{j, 1} \\
    &\leq 2\max_{j\in\{0, 1\}} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right) + \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right)^2\Bigg\}\\
    &= 4\max_{j\in\{0, 1\}} \frac{\sqrt{2\frac{\log(2/\delta)}{n}}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}+\mc{O}\left(\frac{1}{n}\right).
\end{align*}
}{
\begin{align*}
    &|L^{\rm TP}_\nu(h) - \widehat{L}^{\rm TP}_D(h)| \\
    &\qquad\leq C_{0, 1} + C_{1, 1} \\
    &\qquad\leq 2\max_{j\in\{0, 1\}} C_{j, 1} \\
    &\qquad\leq 2\max_{j\in\{0, 1\}} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right) \\
    &\qquad\qquad+ \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}\right)^2\Bigg\}\\
    &\qquad= 4\max_{j\in\{0, 1\}} \frac{\sqrt{2\frac{\log(2/\delta)}{n}}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = 1, a_i = j \}}+\mc{O}\left(\frac{1}{n}\right).
\end{align*}
}

Analogous bounds yield the conclusion for EO:
\iftoggle{arxiv}{
\begin{align*}
    |L^{\rm EO}_\nu(h) - \widehat{L}^{\rm EO}_D(h)| &\leq C_{0, 0} + C_{1, 0} + C_{0, 1} + C_{1, 1} \\
    &\leq 4\max_{0\leq j, k \leq 1} C_{j, k} \\
    &\leq 4\max_{0\leq j, k \leq 1} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)  + \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2 \Bigg\}\\
    &=8\max_{0\leq j, k \leq 1} \frac{\sqrt{2\frac{\log(2/\delta)}{n}}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}+\mc{O}\left(\frac{1}{n}\right).
\end{align*}
}{
\begin{align*}
    &|L^{\rm EO}_\nu(h) - \widehat{L}^{\rm EO}_D(h)| \\
    &\qquad\leq C_{0, 0} + C_{1, 0} + C_{0, 1} + C_{1, 1} \\
    &\qquad\leq 4\max_{0\leq j, k \leq 1} C_{j, k} \\
    &\qquad\leq 4\max_{0\leq j, k \leq 1} \Bigg\{2\left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + \frac{\log(2/\delta)}{n} }{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right) \\
    &\qquad\qquad+ \left(\frac{\sqrt{2\frac{\log(2/\delta)}{n}} + 2\frac{\log(2/\delta)}{n}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}\right)^2 \Bigg\}\\
    &\qquad=8\max_{0\leq j, k \leq 1} \frac{\sqrt{2\frac{\log(2/\delta)}{n}}}{\frac{1}{n}\sum_{i=1}^n \1\{y_i = k, a_i = j \}}+\mc{O}\left(\frac{1}{n}\right).
\end{align*}
}
\end{proof}

