\appendix 
\onecolumn

\section{Supplementary Material}

\begin{lemma}[Lemma 3.5]
Fix any $t, \beta > 0$. 
Let $F: X \times X \rightarrow \mathbb{R}$ 
be a hypothesis class induced from $H$ 
such that $\forall f \in F$, $f(x,x') 
= \tau_{\beta}^t (|h(x) - h(x')|)$ where 
$\tau_{\beta}^{t}(z)$ is a piecewise model 
outputting $1$ if $z > \beta + \frac{1}{t}$, outputting 
$0$ if $z \leq \beta$ and $t(z-\beta)$ otherwise. 
Then $\mathcal{R}_m(F) \leq 8t \cdot \mathcal{R}_{m}(H)$.
\end{lemma}
\begin{proof}
Let $G : X \times X \rightarrow \mathbb{R}$ 
be the set of functions induced from $H$ 
and defined as $\forall g \in G$, 
$g(a,b) = h(a) - h(b)$ for some $h \in H$. Let 
${abs}$ be the absolute function. Then 
$f(a,b) = \tau_{\beta}^t \circ abs \circ g(a,b)$ 
and we can write, accordingly, 
\begin{equation}
F = \tau_{\beta}^t \circ abs \circ G.      
\end{equation}
We first show $\mathcal{R}_{m}(F) \leq 4t \cdot \mathcal{R}_{m}(G)$. 
This is true because 
\begin{align}
\label{eq:lem_Rad_1}
\begin{split}
\mathcal{R}_{m}(F) 
= \mathcal{R}_{m}(\tau_{\beta}^t \circ abs \circ G)
\leq 2t \cdot \mathcal{R}_{m}(abs \circ G)
\leq 4t \cdot \mathcal{R}_{m}(G),  
\end{split}
\end{align}
where both inequalities are by the property of 
Rademacher complexity for composite function 
with one component being Lipschitz continuous 
e.g., \cite[Theorem 12]{bartlett2002rademacher} and 
the facts that $\tau_{\beta}^{t}$ and $abs$ are 
both Lipschitz with constants $t$ and 1 respectively. 


We then show $\mathcal{R}_{m}(G) \leq 2 \cdot 
\mathcal{R}_m(H)$. This is true because 
\begin{align}
\label{eq:lem_Rad_2}
\begin{split}
\mathcal{R}_{m}(G) 
& = \mathbb{E}_{\{(a_i,b_i)\}} \mathbb{E}_{\sigma}
\sup_{g \in G} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i g(a_i,b_i)\\ 
& = \mathbb{E}_{\{(a_i,b_i)\}} \mathbb{E}_{\sigma} 
\sup_{h \in H} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i [h(a_i) - h(b_i)]\\ 
& \leq \mathbb{E}_{\{(a_i,b_i)\}} \mathbb{E}_{\sigma} 
\sup_{h \in H} \frac{1}{m} \sum_{i = 1}^m 
\sigma_i h(a_i)  + \mathbb{E}_{\{(a_i,b_i)\}} 
\mathbb{E}_{\sigma} \sup_{h \in H} \frac{1}{m} 
\sum_{i = 1}^m \sigma_i h(b_i)\\ 
& = 2 \cdot \mathbb{E}_{\{x_i\}} 
\mathbb{E}_{\sigma} \sup_{h \in H} \frac{1}{m} 
\sum_{i = 1}^m \sigma_i h(x_i) \\
& = 2 \cdot \mathcal{R}_m(H),  
\end{split}
\end{align}
where the inequality follows from the sub-additivity 
of the supremum and the fact 
that $\sigma_i$ is uniform in $\{-1, 1\}$, so 
the expectation with respect to $\sigma_i$ is the 
same as the expectation with respect to $-\sigma_i$. 

Combining (\ref{eq:lem_Rad_1}) and 
(\ref{eq:lem_Rad_2}) proves the lemma. 
\end{proof}

\begin{theorem}[Theorem 3.6]
\label{thm:generalization}
Fix any $\alpha, \beta, t > 0$. 
Suppose $\mathcal{R}_m(H) \in O(1/\sqrt{m})$. 
Any model $h \in H$ returned by the AMF learner 
satisfies $\Delta_{\alpha,\beta + 1/t}(h) 
\leq \varepsilon$ with probability 
at least $1 - \delta$ if 
$m \geq \frac{1}{\varepsilon^2}
\left( 16 t c + \sqrt{\frac{1}{2}
\log \frac{1}{\delta}}\right)$, 
where $m$ is the number of 
$(x,x') \in S$ satisfying 
$d(x,x') \leq \alpha$ 
and $c$ is a constant inherited 
from $O(1/\sqrt{m})$. 
\end{theorem}

\begin{proof}
To facilitate discussion, define two functions 
\begin{equation}
\tau_{\beta}(z) = \left\{ 
\begin{array}{ll}
1, & \text{ if } z > \beta \\[.5em]
0, & \text{ if } z \leq \beta 
\end{array}
\right.,  
\end{equation}
and 
\begin{equation}
\tau_{\beta}^{t}(z) = \left\{ 
\begin{array}{ll}
1, & \text{ if } z > \beta + \frac{1}{t} \\[.5em]
t (z - \beta), & \text{ if } \beta 
< z \leq \beta + \frac{1}{t} \\[.5em] 
0, & \text{ if } z \leq \beta 
\end{array}
\right.. 
\end{equation}
By definition, we have 
\begin{equation}
\label{eq:taufunction}
\tau_{\beta+\frac{1}{t}}(z) 
\leq \tau_{\beta}^t(z) 
\leq \tau_{\beta}(z). 
\end{equation}
Recall $S = \{ (x_i, x_j) \}_{i,j = 1, \ldots, n}$. 
Let $S_{\alpha}$ be a subset of $S$ defined as 
\begin{equation}
S_{\alpha} = \{ (a,b) \in S \mid d(a,b) \leq \alpha\}.      
\end{equation}
Suppose the size of $S_{\alpha}$ is $m$. Then, 
\begin{align}
\label{eq:generalization_v0}
\begin{split}
\Delta_{\alpha,\beta}(h; S) 
& = \frac{1}{n^2} \sum_{i, j = 1}^n 
\mathbb{I}\{|h(x_i) - h (x_j)| 
> \beta,\ d(x_i,x_j) \leq \alpha\}\\[.5em] 
& = \frac{m}{n^2} \cdot \frac{1}{m} \sum_{(a,b) \in S_{\alpha}}
\mathbb{I}\{|h(a) - h (b)| > \beta\}\\[.5em] 
& = \frac{m}{n^2} \cdot \frac{1}{m} \sum_{(a,b) \in S_{\alpha}}
\tau_{\beta}(|h(a) - h (b)|). 
\end{split}
\end{align}

\begin{comment}
Let $S$ be a sample of $X \times X$ 
with $m$ elements, and $S_\alpha$ be the 
subset of $S$ with all $(a,b) \in S_{\alpha}$ 
satisfying $d(a,b) \leq \alpha$. Suppose 
the size of $S_{\alpha}$ is $m_{\alpha}$. Then 
\end{comment}

Recall $F: X \times X \rightarrow \mathbb{R}$ is 
the set of functions induced from $\tau_{\beta}^t$ 
and defined as $\forall f \in F$, 
$f(a,b) = \tau_{\beta}^t (|h(a) - h(b)|)$. 
We have that, with probability at least 
$1 - \delta$, 
\begin{align}
\label{eq:generalization}
\begin{split}
\frac{1}{m} \sum_{(a,b) \in S_{\alpha}}
\tau_{\beta}(|h(a) - h (b)|) 
& \geq \frac{1}{m} \sum_{(a,b) 
\in S_{\alpha}} \tau_{\beta}^t (|h(a) - h (b)|) \\
& \geq \mathbb{E} [ \tau_{\beta}^t (|h(a) - h (b)|)  
\mid d(a,b) \leq \alpha] - 2 \mathcal{R}_{m}(F) 
- \sqrt{\frac{\log \frac{1}{\delta}}{2m}}\\
& \geq \mathbb{E} [ \tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha]  
- 16 t \mathcal{R}_{m}(H) 
- \sqrt{\frac{\log \frac{1}{\delta}}{2m}} \\
& \geq \mathbb{E} [\tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha] 
- \frac{1}{\sqrt{m}} 
\left( 16t c + \sqrt{\frac{1}{2} 
\log \frac{1}{\delta}} \right). 
\end{split}
\end{align}
for some constant $c$. In (\ref{eq:generalization}), 
the first inequality is by (\ref{eq:taufunction}); 
the second one is by standard generalization 
bound\footnote{Here we follow 
\cite{yona2018probably} and treat $S_{\alpha}$  
as an i.i.d. sample. If it is not, we can either 
add an additional constraint that no two pairs 
in $S_{\alpha}$ share the same instance so it 
can be viewed as an i.i.d. sample, or apply a generalization 
error bound on non-i.i.d. sample e.g. \cite{mohri2008rademacher}. 
In either case, the order of our result remains the same.} with Rademacher complexity e.g. \cite[Theorem 3.3]{mohri2018foundations} conditioned on $d(a,b) \leq \alpha$; 
the third one is by (\ref{eq:taufunction}) 
and Lemma \ref{lem:tool_generalization}; 
and the last one holds since $\mathcal{R}_{m}(H) \in O(1/\sqrt{m})$. Note the expectation over $(a,b) \in S_{\alpha}$ 
in $\mathcal{R}_{m}(H) \in O(1/\sqrt{m})$ is also 
conditioned on $d(a,b) \leq \alpha$, and we always 
assume $\mathcal{R}_{m}(H) \in O(1/\sqrt{m})$ w.r.t.\ any 
proper data distribution.

Combining (\ref{eq:generalization_v0}) and  (\ref{eq:generalization}), we see  
$\Delta_{\alpha,\beta}(h; S) = 0$ implies 
\begin{equation}
\label{app:generalization_l1}
\mathbb{E} [ \tau_{\beta +
\frac{1}{t}} (|h(a) - h (b)|) \mid d(a,b) \leq \alpha]  
\leq \frac{1}{\sqrt{m}} 
\left( 16t c + \sqrt{\frac{1}{2} 
\log \frac{1}{\delta}} \right). 
\end{equation}

Further, we can show 
\begin{equation}
\label{app:generalization_l2}
\Delta_{\alpha,\beta+\frac{1}{t}} (h) \leq 
\mathbb{E} [\tau_{\beta + \frac{1}{t}} 
(|h(a) - h (b)|) \mid d(a,b) \leq \alpha],    
\end{equation}
because 
\begin{align}
\begin{split}
\Delta_{\alpha,\beta+\frac{1}{t}} (h) & = 
\int_{(a,b) \in X \times X} \mathbb{I}\{|h(a) - h(b)| > 
\beta + 1/t\} \cdot \mathbb{I} \{ d(a,b) \leq \alpha\} \cdot p(a,b) \\ 
& \leq \int_{(a,b) \in X \times X} 
\mathbb{I}\{|h(a) - h(b)| > \beta + 1/t\} 
\cdot p(a,b) \\[.5em]
& \leq \int_{(a,b) \in X \times X} 
\mathbb{I}\{|h(a) - h(b)| > \beta + 1/t\} 
\cdot p(a,b \mid d(a,b) \leq \alpha) \\[.5em]
& = \mathbb{E} [ \tau_{\beta + 
\frac{1}{t}} (|h(a) - h (b)|) \mid d(a,b) \leq \alpha]. 
\end{split}    
\end{align}

Combining (\ref{app:generalization_l1}) and 
(\ref{app:generalization_l2}), and upper bounding 
the RHS of (\ref{app:generalization_l1}) by $\varepsilon$ 
implies that $\Delta_{\alpha,\beta+\frac{1}{t}} (h) \leq 
\varepsilon$ whenever 
\begin{equation}
m \geq \frac{1}{\varepsilon^2}
\left( 16 t c + \sqrt{\frac{1}{2}
\log \frac{1}{\delta}}\right).     
\end{equation}
The theorem is proved. 
\end{proof}



\begin{theorem}[Theorem 4.2]
Fix any $\alpha, \beta > 0$. 
Suppose $\mathcal{R}_m(H) \in O(1/\sqrt{m})$ 
and the counter $(\alpha,\beta)$ AMF 
coefficient w.r.t. $H$ is bounded. Then, 
with probability at least $1 - \delta$, 
any $h \in H$ returned by Algorithm 
\ref{alg:optPAMFL2} satisfies 
$\Delta_{\alpha,\beta}(h) \leq \varepsilon$ 
after $O(\log\frac{1}{\varepsilon})$ labeling. 
\end{theorem}
\begin{proof}
Suppose we have performed $q$ rounds of 
labeling. Let $L_{q}$ be the updated  
training set and $S_{q}$ be the associated 
set of instance pairs 
in Definition \ref{def_AMFlearner}. Define 
\begin{equation}
V_q = \{ h \in H; 
\Delta_{\alpha, \beta}(h; S_{q}) = 0\}.  
\end{equation} 

Consider labeling $m$ instances in 
round $q+1$. First, note that all 
labeled instances fall in 
$\mathcal{C}_{\alpha,\beta}(V_q)$ 
and thus will add to $S_q$ at least 
$m$ pairs of $(x,x')$ 
satisfying $d(x,x') \leq \alpha$.
Then, by Theorem \ref{thm:generalization} 
and setting $t = 1/\beta$, if 
$m \geq \frac{1}{4\xi^2} 
\left( 32 c/\beta + \sqrt{\frac{1}{2}
\log \frac{1}{\delta'}}\right)$,
with probability at least $1 - \delta'$, 
any $h \in V_{q+1}$ satisfies 
\begin{equation}
\label{eq:thm_proof_deltapassive}
\Delta_{\alpha,\beta}(h) \leq 1/(2 \xi).     
\end{equation}

Let $\&$ be logic `AND' and define event 
\begin{equation}
I_{\alpha}^{\beta}(x,x';h) 
:= d(x,x') \leq \alpha\ \&\ 
|h(x) - h(x')| > \beta. 
\end{equation}
Then, with probability at least $1 - \delta'$, 
any $h \in V_{q+1}$ satisfies 
\begin{align}
\label{thm2_proof_boundbyxi}
\begin{split}
\Pr \{ I_{\alpha}^{\beta}(x,x';h) \}
& = \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\,  
(x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}
+ \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\, 
(x,x') \notin 
\mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em] 
& = \Pr \{ I_{\alpha}^{\beta}(x,x';h)\, \&\, (x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em]
& = \Pr \{I_{\alpha}^{\beta}(x,x';h)  \mid 
(x,x') \in \mathcal{C}_{\alpha,\beta}(V_q)\}
\cdot \Pr \{ (x,x') 
\in \mathcal{C}_{\alpha,\beta}(V_q)\}\\[.5em]
& \leq \frac{\Pr\{ (x,x') \in
\mathcal{C}_{\alpha,\beta}(V_q) \}}{2 \xi},  
\end{split}
\end{align}
where the second equality is by the fact that 
$\Pr \{ I_{\alpha}^{\beta}(x,x';h)\ \&\ (x,x') 
\notin \mathcal{C}_{\alpha,\beta}(V_{q})\} 
\leq \Pr \{ I_{\alpha}^{\beta}(x,x';h)\ \&\ (x,x') 
\notin \mathcal{C}_{\alpha,\beta}(V_{q+1})\} = 0$, 
and the inequality is by  (\ref{eq:thm_proof_deltapassive})
conditioned on an additional fact that all labeled instances 
fall in $\mathcal{C}_{\alpha,\beta}(V_{q})$. 
For conciseness, we will write $\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}$ for $\Pr\{ (x,x') \in \mathcal{C}_{\alpha,\beta}(V_q) \}$ . 
 
Result in (\ref{thm2_proof_boundbyxi}) 
implies $V_{q+1} \subseteq  \mathcal{B}_{\alpha,\beta}\left(\frac{\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}}{2 
\xi}\right)$ and  
\begin{align}
\begin{split}
\Pr\{ \mathcal{C}_{\alpha,\beta}(V_{q+1})\} 
\leq \Pr\left\{ \mathcal{C}_{\alpha,\beta} \left( \mathcal{B}_{\alpha,\beta}\left(\frac{\Pr\{ 
\mathcal{C}_{\alpha,\beta}(V_q) \}}{
2 \xi}\right)\right)\right\}
\leq \xi \cdot \frac{\Pr\{ \mathcal{C}_{\alpha,\beta}(V_q) \}}{2 \xi}
= \frac{\Pr\{\mathcal{C}_{\alpha,\beta}(V_q) \}}{2},
\end{split}
\end{align}
where the first inequality follows from the containment 
above and the second is by the definition 
of $\xi$. This result means
$\Pr\{\mathcal{C}_{\alpha,\beta}(V_q) \}$ 
is halved after each round of labeling. 
Therefore, after $Q := \log_2 \frac{1}{\varepsilon}$ 
rounds of labeling, 
\begin{equation}
\Delta_{\alpha, \beta}(h) \leq \Pr\{ 
\mathcal{C}_{\alpha,\beta}(V_{Q}) \} 
\leq \varepsilon, 
\end{equation}
with probability at least $1 - Q \delta'$, 
where the left inequality is by definition. 
By then, the total number of labeled 
instances is $\log_{2}\frac{1}{\varepsilon} \cdot 
\frac{1}{4\xi^2} \left( 32 c / \beta 
+ \sqrt{\frac{1}{2} \log \frac{1}{\delta'}}\right)$. 
Setting $\delta = Q \delta'$ and plugging 
$\delta' = \delta/Q$ in completes the proof. 
\end{proof}


\begin{example}[Example 4.4]
Fix $\alpha, \beta > 0$. 
Let $h_z(x)=1_{x>z}$ be a threshold function 
defined on $[0, 1]$. Let $H = \{ h_z; \alpha 
\leq z \leq 1-\alpha \}$. 
Assume points are uniformly distributed in $[0, 1]$.
Then, $\Delta_{\alpha,\beta}(h)=\frac{\alpha(1-\alpha^2)}{1-2\alpha}$ and $\xi \leq \frac{1}{2}$.
\end{example}
\begin{proof}
We can apply the same proof strategy and show
$\Delta_{\alpha,\beta}(h)=\frac{\alpha(1-\alpha^2)}{1-2\alpha}\geq\alpha(1+\alpha)$. (For two points to be within $\alpha$ distance apart, the probability is $(1-2\alpha)2\alpha+2\alpha(\alpha+\frac{\alpha}{2})=1-\alpha^2$; for $z$ to land in a certain $\alpha$-length interval within its whole range of length $1-2\alpha$, the probability is $\alpha/(1-2\alpha)$), $\mathcal{C}_{\alpha,\beta}(\mathcal{B}_{\alpha,\beta}(r))=[z-\frac{\alpha}{2},z]\times[z,z+\frac{\alpha}{2}]\cup[z,z+\frac{\alpha}{2}]\times[z-\frac{\alpha}{2},z]$, so $\Pr[\mathcal{C}_{\alpha,\beta}(\mathcal{B}_{\alpha,\beta}(r))]=\frac{\alpha^2}{2}\leq \frac{r}{2}$ (because $\alpha+\alpha^2\leq r$ implies $\alpha^2\leq r$). Hence $\xi\leq \frac{1}{2}$.
\end{proof}


\begin{lemma}[Lemma 5.1]
Fix any $\alpha, \beta > 0$. We have 
$\Delta_{\alpha,\beta}(h; S) \leq 
\tilde{\Delta}_{\alpha,\beta}(h; S)$ for 
any $h \in H$ and sample $S$.
\end{lemma}
\begin{proof}
Since $\mathbb{I}_{x \geq t} \leq \frac{x}{t}$ for any 
$x \geq 0$ and $t > 0$, we have 
\begin{align}
\begin{split}
\mathbb{I} \{ d(x_i, x_j) \leq \alpha, 
\,  |h(x_i) - h(x_j)| \geq \beta \} 
& = \mathbb{I} \{ d(x_i, x_j) \leq \alpha\} \cdot 
\mathbb{I} \{ |h(x_i) - h(x_j)|^2 \geq \beta^2 
\} \\[.5em] 
& \leq \frac{1}{\beta^2} \cdot 
\mathbb{I} \{ d(x_i, x_j) \leq \alpha \} 
\cdot |h(x_i) - h(x_j)|^2 \\[.5em] 
& = \frac{1}{\beta^2} \cdot M_{ij} \cdot |h(x_i) 
- h(x_j)|^2.   
\end{split}
\end{align}
Plugging this back into (\ref{eq:empdelta}) 
proves the lemma. 
\end{proof}












\begin{comment}

If $h$ is a linear model, we can easily show 
\footnote{\url{https://people.cs.uchicago.edu/~niyogi/papersps/Laplacianface.pdf}}
\begin{equation}
\tilde{\Delta}_{\alpha,\beta}(h; L) = 
\frac{2}{n \beta^2} \cdot h^T X (D - M) X^T h,  
\end{equation}
where $D$ is diagonal with $D_{ii} = \sum_{j=1}^n M_{ij}$ 
and $M$ is a symmetric matrix with $M_{ij} = \mathbb{I}(d(x_i, x_j) \leq \alpha)$ and $X \in \mathbb{R}^{p \times n}$ is the sample matrix. 

This is similar to a graph Laplacian regularization 
problem. The solution is 
\begin{equation}
h = ( X (I - \frac{2 \lambda}{\beta^2} 
(D - M)) X^T)^{-1}(XY). 
\end{equation}

\newpage 

Let $X = \{ x_1, x_2, \ldots, x_{10}\}$ be a population. Consider 
three different processes of constructing samples.\\ 

\underline{Construction Process I}

Step 1. Sample three instances in $X$ uniformly and independently 
and obtain e.g., $x_1, x_3, x_4$. 

Step 2. Construct $a := (x_1, x_3)$ and $b := (x_1, x_4)$. 

Question: can we view $a$ and $b$ as independently sampled? 

Analysis: if we want to prove independence, we should prove 
$\Pr(a, b) = \Pr(a) \Pr(b)$. 

Since 
\begin{equation}
\Pr \{a\} = \Pr \{ x_1, x_3\} = \Pr \{ x_1 \} Pr \{ x_3 \} = \frac{1}{100},  
\end{equation}
and 
\begin{equation}
\Pr \{b\} = \Pr \{ x_1, x_4\} = \Pr \{ x_1\} \Pr \{ x_4 \} = \frac{1}{100},  
\end{equation}
the RHS is  
\begin{equation}
\Pr \{a\} \cdot \Pr \{b\} = \frac{1}{10^4}. 
\end{equation}

The LHS is 
\begin{equation}
\Pr \{a, b\} = \Pr \{ (x_1, x_4), (x_1, x_3)\} 
= \Pr \{ x_1, x_3, x_4 \} = \frac{1}{10^3},  
\end{equation}
where the second equality is by the fact 
that the same copy $x_1$ is shared by the two pairs.  
(Compare this with the next construction process.) 

Thus $RHS \neq LHS$ so we cannot view $a$ and $b$ as 
independently sampled.\\ 

\underline{Construction Process II}

Step 1. Sample four instances uniformly and independently 
and obtain e.g., $x_1^a, x_1^b, x_3, x_4$. 

-- Here, we assume $x_1$ is obtained in two samplings. 
Note the two copies $x_1^a, x_1^b$ are independently sampled. 

Step 2. Construct $a := (x_1^a, x_3)$ and $b := (x_1^b, x_4)$. 

Question: can we view $a$ and $b$ as independently sampled? 

Analysis: if we want to prove independence, we should prove 
$\Pr(a, b) = \Pr(a) \Pr(b)$. 

The LHS is 
\begin{equation}
\Pr \{a, b\} = \Pr \{ x_1^a, x_1^b, x_3, x_4 \} 
= \Pr \{ x_1^a, x_3\} \Pr \{x_1^b, x_4 \} 
= \Pr \{a\} \cdot \Pr \{b\},  
\end{equation}
where the second equality is by the fact that 
$x_1^a$ and $x_1^b$ (and $x_3, x_4$) are 
independently sampled. 

Thus $RHS = LHS$ so we can view $a$ and $b$ 
as independently sampled.\\  


\underline{Construction Process III}

Step 1. Sample a pair of instances e.g. $a := (x_1^a, x_3)$,  
uniformly and independently from $X \times X$. 

Step 2. Sample another pair of instances e.g. $ b: = (x_1^b, 
x_4)$, uniformly and independently from $X \times X$. 

-- Here, we assume $x_1$ is obtained in both sampled pairs. 
Note the two copies $x_1^a, x_1^b$ are independently sampled. 

Question: can we view $a$ and $b$ as independently sampled? 

Analysis: if we want to prove independence, we should prove 
$\Pr(a, b) = \Pr(a) \Pr(b)$. 

The LHS is 
\begin{equation}
\Pr \{a, b\} = \Pr \{x_1^a, x_3, x_1^b, x_4\} 
= \Pr \{x_1^a, x_3\}  \Pr \{x_1^b, x_4\} = 
\Pr \{a \} \Pr \{b\}, 
\end{equation}
where the second equality is by the fact that 
$x_1^a$ and $x_1^b$ (and $x_3, x_4$) are 
independently sampled. 

Thus $RHS = LHS$ so we can view $a$ and $b$ as 
independently sampled. 
\end{comment}




