\section{Comment}


\begin{comment}
\begin{example}
Let $X = [-1, 1]$ be a domain and 
$F : X \rightarrow \mathbb{R}$ be 
the set of logistic functions 
$f_w(x) = \frac{1}{1 + \exp(-w x)}$ 
with $w \in \mathbb{R}$. For any 
$\alpha/\beta = 1$, then $\xi = \ldots$. 
\end{example}



The following theorem shows an example 
where the coefficient is bounded.  

\begin{theorem}
Let $h_w(x) = w \cdot x$ be a linear 
function defined on $[-B, B] \subset 
\mathbb{R}$ for some $B > 0$. 
Let $H =\{ h_w ; w \geq 0 \}$ be a 
function class. 
If $B > \alpha$, then $\xi = 1$. 
\end{theorem}

\begin{proof}
Our proof strategy is as follows. We first 
derive the $\Delta_{\alpha,\beta}(h_w)$ for 
any $h_w$ and show it is non-decreasing 
with respect to $w$. Based on 
this, we can show the probability in (\ref{def:coef}) 
equals the following probability 
\begin{equation}
\mathbb{P}_{*} := \Pr \{ |h_{w_{*}}(x) - h_{w_{*}}(x')| 
> \beta, d(x,x') \leq \alpha\},     
\end{equation}
where $h_{w_*}$ is the model achieving 
$\Delta_{\alpha,\beta}(h_{w_*}) = r$. Hence
\begin{equation}
\xi = \sup_{r > 0} \mathbb{P}_{*} / r 
= \sup_{r > 0} r / r = 1. 
\end{equation}
Now we show the detailed proof. 
For conciseness, we will write $h$ for $h_w$, 
keeping in mind that each $h$ is associated 
with a $w$. Also, let $\&$ denote logical `AND' 
and define event 
\begin{equation}
I_{\alpha}^{\beta}(x,x';h) 
:= d(x,x') \leq \alpha\ \&\ 
|h(x) - h(x')| > \beta. 
\end{equation}
We have $\Delta_{\alpha,\beta}(h) = 
\Pr\{I_{\alpha}^{\beta}(x,x';h)\}$. 

\textit{Step 1: Characterize $\Delta_{\alpha,\beta}(h)$ 
for any $h \in H$}. 

Fix any $h$. Consider two cases. 

(i) If $\alpha w < \beta$, simple geometric analysis 
shows that event $I_{\alpha}^{\beta}(x,x';h)$ is always false 
and thus $\Pr\{I_{\alpha}^{\beta}(x,x';h)\} = 0$. 

(ii) If $\alpha w \geq \beta$ (which implies $w \neq 0$), 
then $\alpha \geq \beta / w$. In this case, we can properly 
partition the domain and have 
\begin{align}
\label{arg:xi_splitdomain}
\begin{split}
& {\Pr}_{x, x'} \{ I_{\alpha}^{\beta}(x,x';h) \}\\[.5em]
& = \mathbb{E}_{x' \in [-B, B]} \left[\, {\Pr}_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \}\, \right]\\[.5em]
& = 2\ \mathbb{E}_{x' \in [0, B]} \left[\, {\Pr}_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \}\, \right]\\[.5em] 
& = 2 \int_{x' \in [0, B - \alpha]} {\Pr}_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \} \cdot p(x') \\ 
& \quad + 2 \int_{x' \in (B - \alpha, B - \frac{\beta}{w}]} 
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \} \cdot p(x') \\ 
& \quad + 2 \int_{x' \in (B - \frac{\beta}{w}, B]} 
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \} \cdot p(x'),  
\end{split}
\end{align}
where ${\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \}$ is 
the probability defined for $x$ with a fixed $x'$. In 
(\ref{arg:xi_splitdomain}), the first equality is by the 
independence assumption, and the second equality is by the
observation that ${\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \}$ 
is symmetric on $[-B, B]$ (which will become more 
clear in later analysis). Note that $p(x') = \frac{1}{2 B}$ 
by the uniform assumption. 

Now we study 
${\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \}$ 
in each integral. 

(ii.a) If $x' \in [0, B - \alpha]$, we can show 
\begin{equation}
\label{eq:xi_1stintegral}
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \} 
= 1 - \frac{\beta}{w \alpha}, 
\end{equation}
which implies the first integral is 
$(1 - \frac{\beta}{w \alpha}) (1 - \frac{\alpha}{B})$. 
Note that $\alpha w \geq \beta$ guarantees 
the RHS of (\ref{eq:xi_1stintegral}) is non-negative 
and thus the first integral is non-negative. 

To verify (\ref{eq:xi_1stintegral}), 
let us first fix $x' = 0$ and identify the 
set of $x$ in $[-\alpha, \alpha]$ that makes event 
$I_{\alpha}^{\beta}(x,x';h)$ true. This case 
is illustrated in Figure \ref{fig:xi_demo}. 
We see all targeted $x$ fall in 
$[\beta/w, \alpha]$ and (by symmetry) 
in $[-\alpha, -\beta/w]$. This implies 
\begin{equation}
\label{eq:xi_1stprobability}
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h)\} 
= \frac{2 \cdot (\alpha -\beta/w)}{2 \cdot \alpha}
= 1 - \frac{\beta}{\alpha w}. 
\end{equation}
Since $h$ is linear, the above analysis 
and result apply to all $x ' \in [0, B - \alpha]$ (by 
shifting $x'$). This verifies (\ref{eq:xi_1stintegral}). 

\begin{figure}[t!]
     \centering
     \includegraphics[width=.4\textwidth]{figure/Xi_Demo_v2.PNG}
     \caption{}
     \label{fig:xi_demo}
\end{figure}

(ii.b) If $x' \in (B - \alpha, B - \frac{\beta}{w}]$, 
we can show 
\begin{equation}
\label{eq:xi_2ndintegral}
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h) \} 
= 1 - \frac{\beta}{w (B - x')},  
\end{equation}
and thus the second integral is 
$\frac{1}{B} \left(\alpha - \frac{\beta}{w} 
\left( 1 - \ln \frac{\beta}{w \alpha}\right)\right)$. 
Note the domain of $x'$ guarantees the RHS 
of (\ref{eq:xi_2ndintegral}) is non-negative 
and thus the second integral is non-negative. 

We can verify (\ref{eq:xi_2ndintegral}) based 
on Figure \ref{fig:xi_demo}, with an additional shift 
of the origin to $x'$ and an additional constraint 
that $x' \leq B$. We can then show all targeted $x$ 
fall in $[x' + \frac{\beta}{w}, B]$, which is shorter 
than interval $[\beta/w, \alpha]$ in the figure. Thus 
\begin{equation}
\label{eq:xi_2ndprobability}
{\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h)\} 
= \frac{2 (B - x' - \beta/w)}{2 (B - x')}. 
\end{equation}
This verifies (\ref{eq:xi_2ndintegral}). 

(ii.c) If $x' \in (B - \frac{\beta}{w}, B]$, we 
can show no $(x',x)$ makes event 
$I_{\alpha}^{\beta}(x,x';h)$ true and thus 
${\Pr}_{x} \{ I_{\alpha}^{\beta}(x,x';h)\} = 0$. 

Now, plugging results of cases (ii.a), (ii.b) and 
(ii.c) back to (\ref{arg:xi_splitdomain}), and 
combining results of cases (ii) and (i), we have  
\begin{equation}
\label{eq:xi_deltafinal}
\Delta_{\alpha,\beta}(h) = 
\left\{ \begin{array}{ll}
0, & \text{ if } w < \frac{\beta}{\alpha} \\[.5em]
1 - \frac{\beta}{\alpha w} + 
\frac{\beta}{ B w} \ln \frac{\beta}{\alpha w} 
, & \text{ if } w \geq \frac{\beta}{\alpha}
\end{array}
\right.. 
\end{equation}


\textit{Step 2: Show $\Delta_{\alpha,\beta}(h)$ 
is non-decreasing w.r.t. $w$}.

All we need to show is $\Delta_{\alpha,\beta}(h)$ 
is non-negative and non-decreasing when $w \geq \frac{\beta}{\alpha}$. The first property is 
guaranteed since all integrals in (\ref{arg:xi_splitdomain}) are non-negative. 
Take derivative
\begin{equation}
\frac{\partial \Delta_{\alpha,\beta}(h)}{ 
\partial w} =  \frac{\beta\, (B + \alpha (
\ln \frac{\alpha w}{\beta} - 1))}{w^2\, \alpha\, B}.  
\end{equation}
Since $w \geq \frac{\beta}{\alpha}$, we have 
$\ln (\alpha w / \beta) \geq 0$, which implies 
the derivative is bigger than zero whenever 
$B > \alpha$. This verifies the second property 
and thus $\Delta_{\alpha,\beta}(h)$ is non-decreasing. 

\textit{Step 3: Equivalent Probability}.

Let $h_* = w_* \cdot x$ be the model satisfying 
$\Delta_{\alpha,\beta}(h_{*}) = r$.\footnote{It 
is not hard to show $h_*$ exists for every $r < 1$ 
based on (\ref{eq:xi_deltafinal}).}  
Results of Step 1 and Step 2 tell us 
that $\mathcal{B}_{\alpha,\beta}(r)$ 
is the set of linear models satisfying 
$w \leq w_{*}$. Based on this, we can 
derive the following equivalence. 
\begin{equation}
\label{eq:xi_equivalence}
{\Pr}_{x,x'} \{ \mathcal{C}_{\alpha,\beta} 
(\mathcal{B}_{\alpha,\beta}(r)) \} =     
{\Pr}_{x, x'} \{ I_{\alpha}^{\beta}(x,x';h_*) \}. 
\end{equation}

To verify (\ref{eq:xi_equivalence}), we first show 
every $(x,x') \in  \mathcal{C}_{\alpha,\beta} 
(\mathcal{B}_{\alpha,\beta}(r))$ makes 
event $I_{\alpha}^{\beta}(x,x';h_{*})$ 
true. As illustrated in Figure \ref{fig:xi_demo2} 
where we fix $x' = 0$, for any $(x,x')$ satisfying $d(x,x') 
\leq \alpha$, if there exists an $h$ with $w \leq w_*$ such 
that $|h(x) - h(x')| > \beta$, then 
$|h_{*}(x) - h_{*}(x')| \geq |h(x) - h(x')| 
> \beta$. On the other hand, every $(x,x')$ that makes 
event $I_{\alpha}^{\beta}(x,x';h_{*})$ 
true is also in $\mathcal{C}_{\alpha,\beta} 
(\mathcal{B}_{\alpha,\beta}(r))$ due to the existence 
of $h_{*}$. 

\begin{figure}[t!]
     \centering
     \includegraphics[width=.4\textwidth]{figure/Xi_Demo2.PNG}
     \caption{}
     \label{fig:xi_demo2}
\end{figure}

\textit{Step 4: Putting all together}.

By the equivalence in (\ref{eq:xi_equivalence}) 
and the definition of $h_*$, we have 
\begin{equation}
\xi = \sup_{r > 0} 
\frac{{\Pr} \{ I_{\alpha}^{\beta}(x,x';h_*) \}}{r} 
= \sup_{r > 0} \frac{r}{r} = 1. 
\end{equation}
This proves the theorem. 
\end{proof}


The definition in (19) is very problematic. First, you are mixing the empirical estimate (6) with a probability; you need to pick one, not both. Second, sometimes you vary only $x$ and sometimes both $x$ and $x'$; again, which one do you really want? This is confusing, and I have no idea how you want to fix them.





\begin{comment}

Further, define 
\begin{equation}
\mathcal{B}_{\alpha,\beta}(r) 
= \{ h; \Delta_{\alpha,\beta} (h) \leq r\}, 
\end{equation}
and quantity 
\begin{equation}
\label{def:coef}
\xi = \sup_{r > 0} \frac{\Pr \{ 
(x,x') \in \mathcal{C}_{\alpha,\beta} 
(\mathcal{B}_{\alpha,\beta}(r)) \}}{r}.
\end{equation}
\end{comment}
In this section, we derive a sample complexity 
of the active metric fair learner. 
Recall the intuition of the proposed active 
learner is to label instances whose predicted 
labels are dissimilar to their neighbors. 
We model this intuition by the following set. 
\begin{align}
\label{eq:empdelta}
\begin{split}
\mathcal{C}_{\alpha,\beta}(H) 
= \{& (x,x') \in X \times X; \exists h \in H, \\ 
& d(x,x') \leq \alpha,\ |h(x) - h(x')| > \beta\}.  
\end{split}
\end{align}

\begin{comment}
Define the neighborhood of $x$ as 
\begin{equation}
N_{\alpha}(x) = \{ x' \in X;\ 
d(x,x') \leq \alpha \}, 
\end{equation}
which is the set of instances similar to $x$.  


 \begin{align}
\begin{split}
\mathcal{C}_{\alpha,\beta}(H) 
= \{x; \exists h \in H, \exists x' 
\in N_{\alpha}(x), |h(x) - h(x')| > \beta\}.  
\end{split}
\end{align}
\begin{equation}
\mathcal{B}_{\alpha,\beta}(r) = \{ h; \Delta_{\alpha,\beta} 
(h) \leq r\}, 
\end{equation}
and quantity 
\begin{equation}
\label{def:coef}
\xi = \sup_{r > 0} \frac{\Pr \{ 
x \in \mathcal{C}_{\alpha,\beta} 
(\mathcal{B}_{\alpha,\beta}(r)) \}}{r}.
\end{equation}
\end{comment}


\begin{comment}
By definition, for any $h \in V_{t+1}$, we have 
\begin{equation}
\Delta_{\alpha, \beta}(h) \leq 
\Pr\{ x \in \mathcal{C}_{\alpha,\beta}(V_{t+1})\}. 
\end{equation}
\end{comment}


\end{comment}






\begin{comment}
\begin{figure}[t!]
     \centering
     \begin{subfigure}[t]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/RMSE_Lambda_Insurance.png}
         \caption{}
         \label{fig:lambda_rmse}
     \end{subfigure}\\ 
     \begin{subfigure}[t]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/Delta_Lambda_Insurance.png}
         \caption{}
         \label{fig:lambda_delta}
     \end{subfigure}
    \caption{Performance of GMFL versus $\lambda$ on Insurance}
    \label{fig:lambda}
\end{figure}


\begin{figure}[t!]
     \centering
     \includegraphics[width=.4\textwidth]{figure/Delta_DataSize_Insurance.png}
     \caption{Performance of GMFL versus Training Set Size}
     \label{fig:size_delta}
\end{figure}
\end{comment}


\begin{comment}
\begin{figure}[t!]
     \centering
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/RMSE_insurance.png}
         \caption{RMSE}
     \end{subfigure}
     \\ 
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/Delta_insurance.png}
         \caption{$\Delta$}
     \end{subfigure}
    \caption{Performance on 
    the Health Cost Data Set}
    \label{fig:compas}
\end{figure}

\begin{figure}[t!]
     \centering
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/RMSE_life.png}
         \caption{RMSE}
     \end{subfigure}
     \\ 
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/Delta_life.png}
         \caption{$\Delta$}
     \end{subfigure}
    \caption{Performance of Linear Approximate Metric-Fair Model}
    \label{fig:crime}
\end{figure}


\begin{figure}[t!]
     \centering
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/RMSE_covid.png}
         \caption{RMSE}
     \end{subfigure}
     \\ 
     \begin{subfigure}[b]{.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/Delta_covid.png}
         \caption{$\Delta$}
     \end{subfigure}
    \caption{Performance on the COVID Data Set}
    \label{fig:crime}
\end{figure}
\end{comment}


\begin{comment}
\begin{figure}[t!]
     \centering
     \begin{subfigure}[b]{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/insurance_active_L_delta.png}
         \caption{$\Delta_{\alpha,\beta}(h)$ on Insurance}
     \end{subfigure} 
     \begin{subfigure}[b]{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/insurance_active_L_RMSE.png}
         \caption{$\Delta_{\alpha,\beta}(h)$ on Life}
     \end{subfigure}
    \caption{Performance of the Linear Approximate Metric-Fair Model}
    \label{fig:delta}
\end{figure}
\end{comment}


We first show the generalized 
metric fair learner (GMFL) in Algorithm \ref{alg:optPAMFL} 
is effective, meaning it can effectively reduce 
$\Delta_{\alpha,\beta}(h)$ without significantly 
reducing model accuracy. To verify this, we run 
GMFL on the initial training set and testing set 
when varying $\lambda$ in \{0, 1e-3, 1e-2, 1e-1, 
1, 1e1, 1e2, 1e3\}. Results on the medical cost data 
set are shown in Figure \ref{fig:lambda}. We see as $\lambda$ increases, the $\Delta_{\alpha,\beta}(h)$ 
values on both training and testing sets are reduced. This 
suggests GMFL is effective and its achieved fairness 
is generalizable. We also see RMSE decreases as $\lambda$ 
increases, which implies GMFL can also help to reduce overfitting. 

In Figure \ref{fig:lambda_delta}, one may notice there is some 
gap between the training and testing $\Delta_{\alpha,\beta}(h)$. 
This is because the initial training set is too small. 
As we increase the size in Figure \ref{fig:size_delta}, 
we see the gap is reduced. 
(In this figure, $\lambda$ is fixed to 10 and $p$ is 
the feature dimension). This further verifies the 
fairness achieved by GMFL is generalizable. 



Recall our key hypothesis is that the proposed active 
query strategy AMFL allows the generalized metric-fair 
learner in Algorithm \ref{alg:optPAMFL} to improve 
fairness efficiently. Since we are the first to propose 
active query 


We verify this hypothesis 
by comparing AMFL with two baseline query strategies. 
One is random query, which queries labels for randomly 
selected data. The other one is the state-of-the-art 
iGS \cite{wu2019active}, which queries labels 
for data with the most uncertain input and output. 
The $(\alpha,\beta,\varepsilon)$-metric fair 
learner in Algorithm \ref{alg:optPAMFL} is 
the base model for all strategies. Our hypothesis is AMFL 
allows the learner to improve fairness more 
efficiently than other methods while maintaining 
comparable accuracy improvement rate. 

The regularization coefficient 
$\lambda$ of the passive metric-fair learner is 
chosen among \{1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3\} 
that achieves the lowest RMSE based on the initial 
training set across three data sets. As a result, 
we set $\lambda$ to 1. For metric fairness, 
we set $(\alpha,\beta)$ to (2, 0.1) on the 
medical cost set, (10, 0.2) on the life 
expectancy set, and (1.5, 0.001) on the covid 
death rate set -- in practice, these parameters 
should be configured based on application needs; 
here we set them around their median values in 
the data sets to verify efficacy of the 
proposed active learner. All active learning 
methods query label for one instance (and 
update the model) in each iteration. For QBC, we 
follow the configuration in [X] and set the size 
of committee to 5 and the size of bootstrap sample 
to the training data size divided by the committee size. 



\begin{comment}

\begin{lemma}
Let $F: X \times X \rightarrow \mathbb{R}$ be the 
set of functions induced from $\tau_{\beta}^t$ 
and defined as $\forall f \in F$, 
\begin{equation}
f(a,b) = \tau_{\beta}^t (|h(a) - h(b)|).     
\end{equation}
Then 
\begin{equation}
\mathcal{R}_m(F) \leq 8t \cdot \mathcal{R}_{m}(H).     
\end{equation}
\end{lemma}
\begin{proof}
Let $G : X \times X \rightarrow \mathbb{R}$ 
be the set of functions induced from $h$ 
and defined as $\forall g \in G$, 
$g(a,b) = h(a) - h(b)$. Let 
${abs}$ be the absolute function. Then 
\begin{equation}
f(a,b) = \tau_{\beta}^t \circ abs \circ g(a,b).    
\end{equation}
We first show $\mathcal{R}_{m}(F) \leq 4t \cdot \mathcal{R}_{m}(G)$. 
This is true because 
\begin{align}
\label{eq:lem_Rad_1}
\begin{split}
\mathcal{R}_{m}(F) 
& = \mathcal{R}_{m}(\tau_{\beta}^t \circ abs \circ G)\\[.5em]
& \leq 2t \cdot \mathcal{R}_{m}(abs \circ G)\\[.5em]
& \leq 4t \cdot \mathcal{R}_{m}(G),  
\end{split}
\end{align}
where both inequalities are by the property of 
Rademacher complexity for the composite function with 
Lipschitz condition \cite[Theorem 12]{bartlett2002rademacher}. 

We then show $\mathcal{R}_{m}(G) \leq 2 \cdot 
\mathcal{R}_m(H)$. This is true because 
\begin{align}
\label{eq:lem_Rad_2}
\begin{split}
\mathcal{R}_{m}(G) 
& = \mathbb{E}_{(a,b)} \mathbb{E}_{\sigma}
\sup_{g} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i g(a_i,b_i)\\ 
& = \mathbb{E}_{(a,b)} \mathbb{E}_{\sigma} 
\sup_{g} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i [h(a_i) - h(b_i)]\\ 
& \leq \mathbb{E}_{(a,b)} \mathbb{E}_{\sigma} 
\sup_{g} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i h(a_i) \\ 
& \quad + \mathbb{E}_{(a,b)} 
\mathbb{E}_{\sigma} \sup_{g} \frac{1}{m} 
\sum_{i = 1}^m \sigma_i h(b_i)\\ 
& = 2 \cdot \mathbb{E}_{(a,b)} 
\mathbb{E}_{\sigma} \sup_{g} \frac{1}{m} 
\sum_{i = 1}^m \sigma_i h(x_i) \\
& = 2 \cdot \mathcal{R}_m(H),  
\end{split}
\end{align}
where the third equality is based on the fact 
that $\sigma_i$ is uniform in \{-1, 1\} so 
the expectation w.r.t. $\sigma_i$ is the 
same as the expectation w.r.t. $-\sigma_i$. 

Combining (\ref{eq:lem_Rad_1}) and 
(\ref{eq:lem_Rad_2}) proves the lemma. 
\end{proof}

\end{comment}



\begin{comment}
\begin{algorithm}[h!]
\setstretch{1.1}
\caption{The Ideal 
$(\alpha,\beta,\varepsilon)$-Metric Fair Learner}
\begin{algorithmic}[1]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\renewcommand{\algorithmicloop}{\textbf{Loop:}}
\REQUIRE 
A training set $S_{n}$ and 
hyper-parameters $\alpha, \beta, \lambda$. 
\STATE learn a model $h$ by solving 
\begin{align}
\begin{split}
\label{eq:optPAMFL}
\min_{h \in H}\ \frac{1}{n} 
\sum_{i=1}^n (h(x_i) - y_i)^2 
+ \lambda \Delta_{\alpha,\beta}(h).
\end{split}
\end{align}
\ENSURE model $h$. 
\end{algorithmic} 
\label{alg:optPAMFL}
\end{algorithm}



Define set $Z = \{ (a,b) \in X \times 
X \}$. Let $z_i = (a_i, b_i) \in Z$. 
Define 
\begin{equation}
g^{+}_h(z_i) =  \mathbb{I} \{ d(a_i, b_i) \leq 
\alpha,\ |h(a_i) - h(b_i)| > \beta \}
\end{equation}
and 
\begin{equation}
g_h(z_i) =  \mathbb{I} \{ |h(a_i) - h(b_i)| > \beta \}
\end{equation}
 
 
Let $S$ be a $m$-sized sample of $Z$. 
The empirical Rademacher complexity of 
function class $G^+ = \{ g_h^+ \}$ is 
\begin{equation}
\hat{\mathcal{R}}_{m}(G^+) = \mathbb{E}_{\sigma}\ 
\sup_{g_h^+ \in G^+} \frac{1}{m}  \sum_{z_i \in S} \sigma_i g_h^+(z_i). 
\end{equation}
Our goal is to connect 
$\hat{\mathcal{R}}_m(G^+)$ and $\hat{\mathcal{R}}_n(H)$. 

Partition $Z$ into the following two subsets. 
\begin{equation}
Z_{1} = \{ (a, b) \in Z; d(a,b) \leq \alpha\}, 
\end{equation}
and 
\begin{equation}
Z_{2} = \{ (a, b) \in Z; d(a,b) > \alpha\}.
\end{equation}
Let $S_i = S \cap Z_i$ and $m_i = |S_i|$. 
Assume $m_1 > 0$. 
\begin{align}
\begin{split}
& \hat{\mathcal{R}}_m(G^+ \mid S) \\ 
& = \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m}  \sum_{z_i \in S}
\sigma_i g^+_h(z_i) \\
& = \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m}  \sum_{z_i \in S_1 
\cup S_2} \sigma_i g^+_h(z_i) \\
& = \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m}  \sum_{z_i \in S_1} 
\sigma_i g^+_h(z_i)  
\\ & \quad + \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m}  \sum_{z_i \in S_2} 
\sigma_i g^+_h(z_i)\\
& = \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m}  \sum_{z_i \in S_1} 
\sigma_i g_h(z_i)\\
& \leq \mathbb{E}_{\sigma}\ 
\sup_{g_h} \frac{1}{m_1}  \sum_{z_i \in S_1} 
\sigma_i g_h(z_i)\\
& \textcolor{red}{\text{\ldots get rid of 
absolute and indicator} }\\
& \leq \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{z_i \in S_1} 
\sigma_i (h(a_i) - h(b_i))\\ 
& \leq \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{a_i \in S_1^{(1)}} 
\sigma_i h(a_i) \\ 
& \quad + \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1} \sum_{b_i \in S_1^{(2)}} 
\sigma_i h(b_i),  
\end{split}
\end{align}
where $S_1^{(j)}$ is the set of the $j$-th elements of the pairs in $S_1$. 
Define 
\begin{equation}
X_1 = \{ x \in X; \exists x' \in X, d(x,x') \leq \alpha \}.
\end{equation}

The Rademacher complexity is 
\begin{align}
\begin{split}
& \mathcal{R}_m(G^+ \mid S) \\ 
& = \mathbb{E}_{S} \hat{\mathcal{R}}_m(G^+ \mid S)\\  
& \leq \mathbb{E}_{S}  \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{a_i \in S_1^{(1)}} 
\sigma_i h(a_i) \\ 
& \quad + \mathbb{E}_{S} \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1} \sum_{b_i \in S_1^{(2)}} 
\sigma_i h(b_i) \\
& = \mathbb{E}_{S_1}  \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{a_i \in S_1^{(1)}} 
\sigma_i h(a_i) \\ 
& \quad + \mathbb{E}_{S_1} \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1} \sum_{b_i \in S_1^{(2)}} 
\sigma_i h(b_i) \\
& = \mathbb{E}_{X_1}  \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{a_i \in S_1^{(1)}} 
\sigma_i h(a_i) \\ 
& \quad + \mathbb{E}_{X_1} \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1} \sum_{b_i \in S_1^{(2)}} 
\sigma_i h(b_i) \\ 
& = 2 \cdot \mathbb{E}_{X_1}  \mathbb{E}_{\sigma}\ 
\sup_{h} \frac{1}{m_1}  \sum_{a_i \in S_1^{(1)}} 
\sigma_i h(a_i) \\ 
& = 2 \cdot \mathcal{R}_{m_1}(H).  
\end{split}    
\end{align}


Note that in principle our constraint requires 
the estimate to be zero. This is achievable 
and will facilitate later theoretical analysis. 
In practice, if this constraint conflicts with 
the objective, one can balance them by solving 
\begin{align}
\begin{split}
\label{eq:optPAMFL}
\min_{h \in H}\ \frac{1}{n} 
\sum_{i = 1}^n (h(x_i) - y_i)^2 
+ \lambda \cdot \Delta_{\alpha,\beta}(h; S_n),  
\end{split}
\end{align}
with hyper-parameter $\lambda$. 

\end{comment}

\begin{comment}
Let there be a population $X$ equipped with 
a metric $d$ and a distribution $D$. 
Let $H$ be a set of hypotheses mapping 
from $X$ to $\mathbb{R}$. For any model 
$h \in H$ and constants $\alpha, \beta$, define 
\begin{equation}
\label{eq:delta}
\Delta_{\alpha,\beta}(h) = \Pr_{x, x' \sim D} 
\{ d(x,x') \leq \alpha,\,  |h(x) - h(x')| \geq \beta \}. 
\end{equation} 

Intuitively, $\Delta_{\alpha,\beta}(h)$ is the 
probability a model gives dissimilar predictions 
on similar instances. Apparently, this probability 
should be small if the model wants to be considered 
fair. We formalize this intuition by the following 
definition. 

\begin{definition}
We say model $h$ is 
$(\alpha,\beta,\varepsilon)$-metric fair if 
\begin{equation}
\Delta_{\alpha,\beta}(h) \leq \varepsilon, 
\end{equation}
for some $\varepsilon>0$. 
\end{definition}
\end{comment}


\begin{comment}
\begin{algorithm}[t!]
\caption{Passive MF Learning}
\begin{algorithmic}[1]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\renewcommand{\algorithmicloop}{\textbf{Loop:}}
\REQUIRE 
a labeled data set $S_n$, a hypothesis class $H$, 
and metric-fairness parameters $\alpha, \beta$. 
\STATE learn a model $h \in H$ by solving 

\ENSURE model $h$. 
\end{algorithmic} 
\label{alg:passiveAMFL}
\end{algorithm}
\end{comment}
\begin{comment}
Let $Z_{\alpha} = \{ (a,b) \in X \times X 
\mid d(a,b) \leq \alpha \}$. 
Let $S$ be a set of $m$ instances sampled 
i.i.d. from $Z_{\alpha}$.
\end{comment}

\begin{comment}

be 
a threshold function with threshold $\beta$. Then 
\begin{equation}
\label{thm:genearlization_tauT}
\tau_{\beta+\frac{1}{t}}(z) 
\leq \tau_{\beta}^t(z) 
\leq \tau_{\beta}(z),     
\end{equation}
where $\tau_{\beta}^t$ is the piecewise 
function in Lemma \ref{lem:tool_generalization}. 



We first transform the analysis on 
$L_{n} \times L_{n}$ to the analysis on $S$; 

Then, we can first show $\Delta_{\alpha,\beta}(h; L_n)$ is lower bounded by $\frac{1}{m} \sum_{(a,b) 
\in S_m} \tau_{\beta}^t (|h(a) - h (b)|)$ based on 
elementary arguments and (\ref{thm:genearlization_tauT}). We can then 
show this bound is further lower bounded by 
$\mathbb{E} \tau_{\beta}^t (|h(a) - h (b)|) $ 
plus a term of $\mathcal{R}_{m}(F)$ by 
standard generalization arguments e.g. 
\cite[Theorem 3.3]{mohri2018foundations}. 
Next we show the expectation is lower bounded 
by $\Delta_{\alpha,\beta+\frac{1}{t}}$ 
show the expectation is lower bounded 
by $\mathbb{E} \tau_{\beta} (|h(a) - h (b)|) $


Define two functions 
\begin{equation}
\tau_{\beta}(z) = \left\{ 
\begin{array}{ll}
1, & \text{ if } z > \beta \\[.5em]
0, & \text{ if } z \leq \beta 
\end{array}
\right.,  
\end{equation}
and 
\begin{equation}
\tau_{\beta}^{t}(z) = \left\{ 
\begin{array}{ll}
1, & \text{ if } z > \beta + \frac{1}{t} \\[.5em]
t (z - \beta), & \text{ if } \beta 
< z \leq \beta + \frac{1}{t} \\[.5em] 
0, & \text{ if } z \leq \beta 
\end{array}
\right.. 
\end{equation}
We have 
\begin{equation}
\label{eq:taufunction}
\tau_{\beta+\frac{1}{t}}(z) 
\leq \tau_{\beta}^t(z) 
\leq \tau_{\beta}(z). 
\end{equation}

Let $S$ be a sample of $X \times X$ 
with $m$ elements, and $S_\alpha$ be the 
subset of $S$ with all $(a,b) \in S_{\alpha}$ 
satisfying $d(a,b) \leq \alpha$. Suppose 
the size of $S_{\alpha}$ is $m_{\alpha}$. Then 
\begin{align}
\begin{split}
& \frac{1}{m} \sum_{(a,b) \in S} 
\mathbb{I}\{|h(a) - h (b)| > \beta,\ 
d(a,b) \leq \alpha\}\\[.5em]
& = \frac{m_{\alpha}}{m} \cdot 
\frac{1}{m_{\alpha}} \sum_{(a,b) \in S_{\alpha}} 
\tau_{\beta}(|h(a) - h (b)|).  
\end{split}
\end{align}

Let $F: X \times X \rightarrow \mathbb{R}$ be the 
set of functions induced from $\tau_{\beta}^t$ 
and defined as $\forall f \in F$, 
$f(a,b) = \tau_{\beta}^t (|h(a) - h(b)|)$. 
By the standard generalization bound e.g. 
\cite[Theorem 3.3]{mohri2018foundations}
and (\ref{eq:taufunction}) and assumption 
(A1), we have 
\begin{align}
\label{eq:generalization}
\begin{split}
& \frac{1}{m_{\alpha}} \sum_{(a,b) \in S_{\alpha}} 
\tau_{\beta} (|h(a) - h (b)|) \\ 
& \geq \frac{1}{m_{\alpha}} \sum_{(a,b) 
\in S_{\alpha}} \tau_{\beta}^t (|h(a) - h (b)|) \\ 
& \geq \mathbb{E} \left[ \tau_{\beta}^t (|h(a) - h (b)|) 
\mid d(a,b) \leq \alpha \right] \\ 
& \qquad - 2 \mathcal{R}_m(F) - \sqrt{\frac{\log \frac{1}{\delta}}{2m}}\\
& \geq \mathbb{E} \left[ \tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha \right] \\ 
& \qquad - 16 t \mathcal{R}_m(H) 
- \sqrt{\frac{\log \frac{1}{\delta}}{2m}} \\ 
& \geq \mathbb{E} \left[ \tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha \right] \\ 
& \qquad - \frac{1}{\sqrt{m}} 
\left( 16t \cdot C + \sqrt{\frac{1}{2} 
\log \frac{1}{\delta}} \right),  
\end{split}
\end{align}
for some constant $C$. 

Setting the LHS of (\ref{eq:generalization}) 
to zero implies $\mathbb{E} \left[ \tau_{\beta +
\frac{1}{t}} (|h(a) - h (b)|) \mid d(a,b) 
\leq \alpha \right] \leq \frac{1}{\sqrt{m}} 
\left( 16t \cdot C + \sqrt{\frac{1}{2} 
\log \frac{1}{\delta}} \right)$. Further 
upper bounding the RHS of this inequality 
by $\varepsilon$ and solving for $m$ 
proves the theorem. 

\end{comment}


%where $\beta' = \beta + \frac{1}{t}$ and $\Delta_{m_{\alpha}}$ is a function of the Rademacher complexity of $H$ w.r.t. size sample $m_{\alpha}$ and conditioned on $d(a,b) \leq \alpha$ (to be derived from standard arguments plus Rademacher complexity properties and holds with probability at least $1 - \delta$. Need an additional Lemma.). 

\begin{comment}
\begin{align}
\begin{split}
& \frac{1}{m} \sum_{(a,b) \in S} 
\mathbb{I}\{|h(a) - h (b)| > \beta,\ 
d(a,b) \leq \alpha\}\\[.5em]
& = \frac{1}{m} \sum_{(a,b) \in S_{\alpha}} 
\tau_{\beta}(|h(a) - h (b)|)\\[.5em]
& \geq \frac{1}{m} \sum_{(a,b) \in S_{\alpha}} 
\tau_{\beta}^t (|h(a) - h (b)|)\\[.5em]
& \geq \frac{m_{\alpha}}{m} \cdot 
\frac{1}{m_{\alpha}} \sum_{(a,b) \in 
S_{\alpha}} \tau_{\beta}^t (|h(a) - h (b)|)\\[.5em]
& \geq \frac{m_{\alpha}}{m} \cdot \left( 
\mathbb{E} \left[ \tau_{\beta}^t (|h(a) - h (b)|) 
\mid d(a,b) \leq \alpha \right] 
- \Delta_{m_{\alpha}} \right)\\[.5em]
& \geq \frac{m_{\alpha}}{m} \cdot \left( 
\mathbb{E} \left[ \tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha \right] 
- \Delta_{m_{\alpha}} \right) \\[.5em]  
& = \frac{m_{\alpha}}{m} \cdot \left( 
\mathbb{E} \left[ \tau_{\beta'} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha \right] 
- \Delta_{m_{\alpha}} \right)\\[.5em]
& = \frac{m_{\alpha}}{m} \cdot \left( 
\Pr \{ |h(a) - h (b)| > \beta' \mid d(a,b) \leq \alpha\}
- \Delta_{m_{\alpha}} \right), 
\end{split}
\end{align}

Our algorithm will return a model with the LHS = 0. 
The above analysis implies this model has 
\begin{align}
\begin{split}
\Pr \{ |h(a) - h (b)| > \beta' \mid d(a,b) \leq \alpha\} 
\leq \Delta_{m_{\alpha}}. 
\end{split}
\end{align}
Setting $\Delta_{m_{\alpha}} = \varepsilon$ 
and solving for $m_{\alpha}$ proves the theorem. 
\end{comment}



Consider labeling $m$ instances in 
round $q+1$. First, note that all 
labeled instances fall in 
$\mathcal{C}_{\alpha,\beta}(V_q)$ 
and thus will add to $S_q$ at least 
$m$ pairs of $(x,x')$ 
satisfying $d(x,x') \leq \alpha$.
Then, by Theorem \ref{thm:generalization} 
and setting $t = 1/\beta$, if 
$m \geq \frac{1}{4\xi^2} 
\left( 32 c/\beta + \sqrt{\frac{1}{2}
\log \frac{1}{\delta'}}\right)$,
with probability at least $1 - \delta'$, 
any $h \in V_{q+1}$ satisfies 
\begin{equation}
\label{eq:thm_proof_deltapassive}
\Delta_{\alpha,\beta}(h) \leq 1/(2 \xi).     
\end{equation}

Let $\&$ be logic `AND' and define event 
\begin{equation}
I_{\alpha}^{\beta}(x,x';h) 
:= d(x,x') \leq \alpha\ \&\ 
|h(x) - h(x')| > \beta. 
\end{equation}
Then, with probability at least $1 - \delta'$, 
any $h \in V_{q+1}$ satisfies 
\begin{align}
\label{thm2_proof_boundbyxi}
\begin{split}
& \Pr \{ I_{\alpha}^{\beta}(x,x';h) \}\\[.5em] 
& = \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\,  
(x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}\\
& \quad + \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\, 
(x,x') \notin 
\mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em] 
& = \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\, (x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em]
& = \Pr \{I_{\alpha}^{\beta}(x,x';h)  \mid 
(x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}\\
& \quad \cdot \Pr \{ (x,x') \in
\mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em]
& \leq \frac{\Pr\{ (x,x') \in
\mathcal{C}_{\alpha,\beta}(V_q) \}}{2 \xi},  
\end{split}
\end{align}
where the second equality is by the fact that 
$\Pr \{ I_{\alpha}^{\beta}(x,x';h)\ \&\ (x,x') 
\notin \mathcal{C}_{\alpha,\beta}(V_{q})\} 
\leq \Pr \{ I_{\alpha}^{\beta}(x,x';h)\ \&\ (x,x') 
\notin \mathcal{C}_{\alpha,\beta}(V_{q+1})\} = 0$, 
and the inequality is by  (\ref{eq:thm_proof_deltapassive})
conditioned on an additional fact that all labeled instances 
fall in $\mathcal{C}_{\alpha,\beta}(V_{q})$. 
For conciseness, we will write $\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}$ for $\Pr\{ (x,x') \in \mathcal{C}_{\alpha,\beta}(V_q) \}$. 
 
Result in (\ref{thm2_proof_boundbyxi}) 
implies $V_{q+1} \subseteq  \mathcal{B}_{\alpha,\beta}\left(\frac{\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}}{2 
\xi}\right)$ and  
\begin{align}
\begin{split}
& \Pr\{ \mathcal{C}_{\alpha,\beta}(V_{q+1})\} \\[.5em]
& \leq \Pr\left\{ \mathcal{C}_{\alpha,\beta} \left( \mathcal{B}_{\alpha,\beta}\left(\frac{\Pr\{ 
\mathcal{C}_{\alpha,\beta}(V_q) \}}{
2 \xi}\right)\right)\right\}\\[.5em] 
& \leq \xi \cdot \frac{\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}}{2 \xi}\\[.5em]
& = \frac{\Pr\{\mathcal{C}_{\alpha,\beta}(V_q) \}}{2},
\end{split}
\end{align}
where the first inequality follows from the inclusion above 
and the second inequality is by the definition 
of $\xi$. This result means that 
$\Pr\{\mathcal{C}_{\alpha,\beta}(V_q) \}$ 
is at least halved after each round of labeling. 
Therefore, after $Q := \log_2 \frac{1}{\varepsilon}$ 
rounds of labeling, 
\begin{equation}
\Delta_{\alpha, \beta}(h) \leq \Pr\{ 
\mathcal{C}_{\alpha,\beta}(V_{Q}) \} 
\leq \varepsilon, 
\end{equation}
with probability at least $1 - Q \delta'$ (by a union bound over the $Q$ rounds), 
where the left inequality is by definition. 
By then, the total number of labeled 
instances is $\log_{2}\frac{1}{\varepsilon} \cdot 
4\xi^2 \left( 32 c / \beta 
+ \sqrt{\frac{1}{2} \log \frac{1}{\delta'}}\right)$. 
Setting $\delta = Q \delta'$ and plugging 
$\delta' = \delta/Q$ in completes the proof. 

\begin{proof}
We can apply the same proof strategy and show
$\Delta_{\alpha,\beta}(h)=\frac{\alpha(1-\alpha^2)}{1-2\alpha}\geq\alpha(1+\alpha)$. (The probability that two points are within distance $\alpha$ of each other is $(1-2\alpha)2\alpha+2\alpha(\alpha+\frac{\alpha}{2})=1-\alpha^2$; the probability that $z$ lands in a given interval of length $\alpha$ within its whole range of length $1-2\alpha$ is $\alpha/(1-2\alpha)$.) Moreover, $\mathcal{C}_{\alpha,\beta}(\mathcal{B}_{\alpha,\beta}(r))=[z-\frac{\alpha}{2},z]\times[z,z+\frac{\alpha}{2}]\cup[z,z+\frac{\alpha}{2}]\times[z-\frac{\alpha}{2},z]$, so $\Pr[\mathcal{C}_{\alpha,\beta}(\mathcal{B}_{\alpha,\beta}(r))]=\frac{\alpha^2}{2}\leq \frac{r}{2}$ (because $\alpha+\alpha^2\leq r$ implies $\alpha^2\leq r$). Hence $\xi\leq \frac{1}{2}$.
\end{proof}



\begin{proof}
Since $\mathbb{I}_{x \geq t} \leq \frac{x}{t}$ for any 
$x \geq 0$ and $t > 0$, we have 
\begin{align}
\begin{split}
& \mathbb{I} \{ d(x_i, x_j) \leq \alpha, 
\,  |h(x_i) - h(x_j)| \geq \beta \} \\[.5em] 
& = \mathbb{I} \{ d(x_i, x_j) \leq \alpha\} \cdot 
\mathbb{I} \{ |h(x_i) - h(x_j)|^2 \geq \beta^2 
\} \\[.5em] 
& \leq \mathbb{I} \{ d(x_i, x_j) \leq \alpha \} 
\cdot |h(x_i) - h(x_j)|^2 / \beta^2 \\[.5em] 
& = M_{ij} \cdot |h(x_i) - h(x_j)|^2 / \beta^2.   
\end{split}
\end{align}
Plugging this back into (\ref{eq:empdelta}) 
proves the lemma. 
\end{proof}


\begin{comment}
 which queries labels for a standard learner to improve group 
fairness of classification 
model. This paper considers a completely different 
setting, where we query labels for a metric-fair learner 
to improve individual fairness of regression model. 
In addition, we derive the first $O(\log \frac{1}{\varepsilon})$ sample complexity for a model to achieve $(\alpha,\beta,\varepsilon)$-metric fairness.  

Individual fairness was initially formalized 
as the Lipschitz condition of a prediction model 
\cite{dwork2012fairness}, and later relaxed 
to a probabilistic and almost Lipschitz condition 
called approximate metric-fairness with 
generalization guarantee \cite{yona2018probably}. 
There are many studies addressing 
different aspects of individual fairness 
such as how to design a metric 
\cite{ilvento2020metric,mukherjee2020two},
how to achieve individual fairness with limited 
resource \cite{kim2018fairness,bechavod2020metric}  
or how to combine it with group fairness 
\cite{zemel2013learning,sharifi2019average}. 
The sample efficiency for achieving individual 
fairness in passive learning is studied in \cite{balashankar2019fairness,shabat2020sample}.


Two fairness notions have been extensively studied: 
group fairness and individual fairness \cite{dwork2012fairness}. 
The former requires model prediction 
to have minimum disparity across different groups, 
while the latter requires model prediction 
to be similar on similar individuals. 
This paper focuses on individual fairness. 

Model fairness at the individual level is first formalized 
as the Lipschitz condition of the model \cite{dwork2012fairness}. 
Due to several practical challenges, \cite{yona2018probably} 
relaxes this formalization to a probabilistic Lipschitz condition  
and names it \textit{metric-fairness}. Our study further 
relaxes the latter formalization to a probabilistic continuous condition 
which we name $(\alpha,\beta,\varepsilon)$-metric fairness. 

We notice that most fairness studies are based on passive  
learning, where labels of training data are randomly queried. 
Our study is based on active learning, where labels are 
actively queried for efficiently improving metric fairness. 
\end{comment}


\begin{comment}
\begin{theorem}
\label{thm:generalization}
Let $H$ be a hypothesis class with 
$\mathcal{R}_m(H) \in O(1/\sqrt{m})$. 
Fix any $\alpha, \beta, t > 0$. Then, 
any $h \in H$ satisfying 
$\Delta_{\alpha,\beta}(h; S) = 0$  
on an i.i.d. sample $S$ of $\{(x,x') 
\in X \times X; d(x,x') \leq \alpha \}$ 
also satisfies $\Delta_{\alpha,\beta + 1/t}(h) 
\leq \varepsilon$ with probability 
at least $1 - \delta$ if $
|S| \geq \frac{1}{\varepsilon^2}
\left( 16 t c + \sqrt{\frac{1}{2}
\log \frac{1}{\delta}}\right)$, 
where $c$ is the constant inherited 
from $O(1/\sqrt{m})$. 
\end{theorem}
\end{comment}



\begin{comment}
If $x' \in (B - \frac{\beta}{w}, B]$, then 
$h(B) - h(x') = w(B-x') < \beta$. This 
implies $\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) 
\mid x' \} = 0$ and thus the third integral 
is zero. 

If $x' \in (B - \alpha, B - \frac{\beta}{w}]$, 
by similar analysis we can show 
\begin{equation}
\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} 
= 1 - \frac{\beta}{w (B - x')}. 
\end{equation}
Then, the second integral is 
$\frac{1}{B} \left(\alpha - \frac{\beta}{w} 
\left( 1 - \ln \frac{\beta}{w \alpha}\right)\right)$. 

our goal is to identify the set 
of $(x,x')$ satisfying $d(x,x') \leq \alpha$ and 
$|f(x)-f(x')|>\beta$. We will refer to this set 
as $E_{\alpha,\beta}$. 

Let us first fix one element $x' = 0$ and identify 
the set of $x$ such that $(x,0) \in E_{\alpha,\beta}$. 
This is illustrated in Figure \ref{fig:xi_demo}. 

Consider two cases. If $f(\alpha) \leq \beta$, then 
no $(x,0)$ is in $E_{\alpha,\beta}$. Since $f$ is linear, 
this argument applies to other $x'$ so no $(x,x')$ 
is in $E_{\alpha,\beta}$, which implies 
$\Delta_{\alpha,\beta}(h) = 0$. 


If $f(\alpha) > \beta$, then geometric analysis 
shows that $(x,0) \in E_{\alpha,\beta}$ for all 
$x \in (f^{-1}(\beta), \alpha]$ and 
$x \in [-\alpha, f^{-1}(\beta))$. This implies 
that conditioned on $x' = 0$, we have if $w \geq 0$, 
\begin{align}
\label{eq:conditionalbound}
\begin{split}
& \Pr_{x} \{ |h(x) - h(x')| > \beta \mid 
d(x,x') \leq \alpha \} \\[.5em] 
& =  \frac{2 \cdot (\alpha - f^{-1}(\beta))}{2 \cdot \alpha}
=  \frac{2 \cdot (\alpha - \frac{\beta}{w})}{2 \cdot \alpha}
= 1 - \frac{\beta}{\alpha w}. 
\end{split}
\end{align}
Note the probability is non-zero only if 
$w \geq 1$. By symmetric argument, if 
$w < 0$, then 
\begin{align}
\label{eq:conditionalbound2}
\begin{split}
& \Pr_{x} \{ |h(x) - h(x')| > \beta \mid 
d(x,x') \leq \alpha \} = 1 + \frac{1}{w}. 
\end{split}
\end{align}
And the probability is non-zero only 
if $w \leq -1$. 
In the following, we will write $1 \pm 1/w$ for 
convenience. 


Since $f$ is linear, the above result applies to 
all $x' \in [-B + \alpha, B - \alpha]$. For 
$x' \in (B - \alpha, B]$, consider two sub-cases. 
If $f(B) - f(x') \leq \beta$, which implies 
$x' \in [B - \beta/w, B]$, then the above 
probability is zero. 
If $x' \in (B - \alpha, B - \beta/w)$, we can show 
the above probability is no greater than $1 - w$. 
\textcolor{red}{(possibly flawed)} 
To see this, for any fixed $x'$, first expand the 
domain to $[-B - \varepsilon, B + \varepsilon]$ 
by any proper $\varepsilon > 0$ so the new domain 
contains $x' + \alpha$. Then, we can apply 
(\ref{eq:conditionalbound}) to obtain the probability 
$\frac{2 \cdot \alpha \pm \beta / w}{ 2 \cdot \alpha}$. 
Finally, shrink the new domain back to the original 
domain, in which case the probability becomes  
\begin{equation}
\frac{2 \cdot \alpha \pm \beta / w - \varepsilon}{ 
2 \cdot (\alpha - \varepsilon)} \leq      
\frac{2 \cdot \alpha \pm \beta / w}{ 2 \cdot \alpha} 
= 1 \pm 1/w. 
\end{equation}
This also applies to case when 
$x' \in (- B + \beta/w, - B + \alpha)$. 

Putting all together, we see any 
$x' \in [-B, B]$ satisfies   
\begin{equation}
\label{eq:conditionalbound_all}
\Pr_{x} \{ |h(x) - h(x')| > \beta 
\mid d(x,x') \leq \alpha \} \leq 1 \pm 1/w. 
\end{equation}

Let $\&$ denote logic `and'. Define event 
\begin{equation}
I_{\alpha}^{\beta}(x,x';h) 
:= d(x,x') \leq \alpha\ \&\ 
|h(x) - h(x')| > \beta. 
\end{equation}

Assume $w > 0$. We have 
\begin{align}
\begin{split}
& \Pr_{x, x'} \{ I_{\alpha}^{\beta}(x,x';h) \}\\
& = \mathbb{E}_{x' \in [-B, B]} \left[ \Pr_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} \right]\\
& = 2\ \mathbb{E}_{x' \in [0, B]} \left[ \Pr_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} \right]\\ 
& = 2 \int_{x' \in [0, B - \alpha]} \Pr_{x} 
\{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} p(x') \\
& \quad + 2 \int_{x' \in (B - \alpha, B - \frac{\beta}{w}]} 
\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} p(x') \\
& \quad + 2 \int_{x' \in (B - \frac{\beta}{w}, B]} 
\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} p(x'). 
\end{split}
\end{align}

We will analyze each integral separately. 

If $x' \in (B - \frac{\beta}{w}, B]$, then 
$h(B) - h(x') = w(B-x') < \beta$. This 
implies $\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) 
\mid x' \} = 0$ and thus the third integral 
is zero. 

If $x' \in (B - \alpha, B - \frac{\beta}{w}]$, 
by similar analysis we can show 
\begin{equation}
\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) \mid x' \} 
= 1 - \frac{\beta}{w (B - x')}. 
\end{equation}
Then, the second integral is 
$\frac{1}{B} \left(\alpha - \frac{\beta}{w} 
\left( 1 - \ln \frac{\beta}{w \alpha}\right)\right)$. 

If $x' \in [0, B - \alpha]$, 
by similar analysis we can show 
\begin{equation}
\Pr_{x} \{ I_{\alpha}^{\beta}(x,x';h) 
\mid x' \} = 1 - \frac{\beta}{w \alpha}. 
\end{equation}
This implies the first integral is 
$(1 - \frac{\beta}{w \alpha}) (1 - \frac{\alpha}{B})$. 

Putting all together, we have 
\begin{equation}
\Pr_{x, x'} \{ I_{\alpha}^{\beta}(x,x';h) \} 
=  1 - \frac{\beta}{\alpha w} + 
\frac{\beta}{ B w} \ln \frac{\beta}{\alpha w}. 
\end{equation}



\textcolor{red}{key is to show the above is 
monotonic under mild conditions. Let $w^*$ be 
the model that achieves $r$. Based on this, 
we can then show the probability in (23) equals 
to the probability in (29) with $w = w^*$. 
Thus the coefficient is 1.}

Setting $\frac{\beta}{\alpha} = 1$ and 
$\frac{\beta}{B} = 0.01$, and applying 
the inequality $\ln \frac{1}{t} \leq 
\frac{1}{t} - 1$ whenever $t \geq 1$, we have 
\begin{equation}
\Pr_{x, x'} \{ I_{\alpha}^{\beta}(x,x';h) \} 
\leq \frac{0.01}{w^2} - \frac{1.01}{w} + 1. 
\end{equation}

Now, bounding the RHS by $r$ and solving 
for $w$ gives 
\begin{equation}
w \leq \frac{0.1}{5.05 - \sqrt{r + 24.5025}}.     
\end{equation}
Note that as $r$ increases from 0 to 1, this 
upper bound increases from 1 to $+\infty$. 
In other words, we can write 
\begin{equation}
\mathcal{B}_{\alpha,\beta}(r) 
\subseteq \left\{ h_w ; w \in \left[\frac{1}{2 B r - 1}, 
\frac{1}{1 - 2 B r}\right] \right\}. 
\end{equation}

\end{comment}


\begin{comment}
\begin{figure}[t!]
     \centering
     \includegraphics[width=.3\textwidth]{figure/Xi_Demo2.PNG}
     \caption{}
     \label{fig:xi_demo2}
\end{figure}
\end{comment}




\begin{comment}
Now, from an algorithm perspective, a natural way 
to learn a $(\alpha,\beta,\varepsilon)$-metric 
fair model is to solve 
\begin{align}
\begin{split}
\label{eq:optPAMFL}
\min_{h \in H}\ \frac{1}{n} 
\sum_{i=1}^n (h(x_i) - y_i)^2,\quad 
s.t.\ \Delta_{\alpha,\beta}(h; S_n)  \leq \varepsilon.
\end{split}
\end{align}
However, (\ref{eq:optPAMFL}) is not easy to 
solve since $\Delta_{\alpha,\beta}(h; S_n)$ 
is not differentiable. 
We propose to tackle this challenge by 
replacing $\Delta_{\alpha,\beta}(h; S_n)$ 
with the following quantity in the constraint: 
\begin{equation}
\label{eq:passiveAMFL}
\tilde{\Delta}_{\alpha,\beta}(h; S_n) 
= \frac{1}{n^2 \beta^2} \sum_{i,j=1}^n M_{ij} 
\cdot |h(x_i) - h(x_j)|^2,  
\end{equation}
where $M$ is an $n$-by-$n$ matrix with $M_{ij} 
= \mathbb{I}\{d(x_i,x_j) \leq \alpha\}$. 
Such replacement is justified in the following lemma. 
\end{comment}



\begin{comment}
Based on the above discussions, we propose 
a $(\alpha,\beta,\varepsilon)$-metric 
fair learner in Algorithm \ref{alg:optPAMFL}.  
Note it is a passive learner since 
its training set $S_n$ is randomly sampled.  

In practice, problem (\ref{eq:optPAMFL}) can be solved 
using the Lagrange multiplier technique, which is 
equivalent to solving 
\begin{equation}
\label{eq:optPAMFL_lag}
\min_{h \in H}\ \frac{1}{n} \sum_{i = 1}^n 
(h(x_i) - y_i)^2 + \lambda 
\tilde{\Delta}_{\alpha,\beta}(h; S_n)      
\end{equation}
for some hyper-parameter $\lambda$. 
\end{comment}


\begin{comment}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/insurance_active_L_delta.png}
         \caption{$\Delta_{\alpha,\beta}(h)$ on Insurance}
     \end{subfigure} 
\end{comment}

\begin{comment}
\begin{figure*}[t!]
     \centering
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/a.png}
         \caption{Bias of linear model on Insurance}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/b.png}
         \caption{Bias of linear model on Life}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/c.png}
         \caption{Bias of linear model on COVID}
     \end{subfigure}
     \\[.5em] 
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/d.png}
         \caption{Bias of rff model on Insurance}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/rfflifebias.png}
         \caption{Bias of rff model on Life}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/rffcovidbias.png}
         \caption{Bias of rff model on COVID}
     \end{subfigure}     
     \\[1.5em]
     
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/g.png}
         \caption{Error of linear model on Insurance}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/h.png}
         \caption{Error of linear model on Life}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/i.png}
         \caption{Error of linear model on COVID}
     \end{subfigure}
     \\[.5em]
     \centering
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/j.png}
         \caption{Error of rff model on Insurance}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/rfflifermse.png}
         \caption{Error of rff model on Life}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/rffcovidrmse.png}
         \caption{Error of rff model on COVID}
     \end{subfigure}
     \\[1.5em]
     
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/insurance_lambda_delta.png}
         \caption{$\Delta$ versus $\lambda$}
     \end{subfigure} 
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figure/insurance_lambda_RMSE.png}
         \caption{RMSE versus $\lambda$}
     \end{subfigure}
     \begin{subfigure}{.33\textwidth}
         \centering
         \includegraphics[width=\textwidth]{}
    %     \caption{}
    \end{subfigure}
    \vspace{-5pt}
    \caption{Experimental Results}
    \label{fig:expresults}
\end{figure*}

\end{comment}