\section{Appendix}
\setcounter{theorem}{0}
\setcounter{lemma}{0}
\begin{theorem}
%\label{Coverage Theorem}
The coverage-regret of Algorithm~\ref{SMOMABA} is \(R = O\left( T^\frac{2}{3} (n\log T)^\frac{1}{3}\right)\).
% In other words, for any PO solution \( a^* \in \mathcal{P^*}\), there exists an arm \(a^t \in P^t\) such that 
% \[
% \sum_{t=1}^T r_{d}(a^*) - \sum_{t=1}^T r_d(a^t) \leq R_d \quad \forall d = 1, 2, \dots, D
% \]
% , where \(R_d = O( \sqrt{K T \log T})\) for all \(d=1,2,..,D\).
\end{theorem}


\begin{proof}

Let \(a^*\) be an arm in \(\mathcal{A^*}\). The goal is to construct a sequence of arms \( S = \{ a^t \}_{t=1}^T \), where $a^t$ is the arm pulled by the algorithm in round $t$ for $t=1,2,\ldots, T$, such that 

\[
 \mathbb{E} \left[\sum_{t=1}^T r_{d}^t(a^*) - \sum_{t=1}^T r_d^t(a^t) \right] \leq  O\left( T^\frac{2}{3} (n\log T)^\frac{1}{3}\right), \forall d \in [D].
\]

To construct $S$, we simply choose arm $a^*$ in the first $T'$ rounds, and for the next rounds ($t=T'+1$ to $t=T$) we choose $a^*$ if it is not removed from $B$. Otherwise, we choose an arm $b \in B$ such that $b+2r \succeq a^*$.

For such a sequence of arms, if \(a^* \in B\), the regret is zero. We therefore analyze the case $a^* \notin B$, in which there exists an arm $b \in B$ such that \(b+2r \succeq a^*\).

We define the clean event~\citep{slivkins2019introduction} as
\[
C = \left\{ \left| \bar{\mu}_d^{t}(a) - \mu_d(a) \right| \leq r, \, \forall a \in \mathcal{A}, \, \forall d \in [D], \, \forall t \in [T]\right\},
\]

where the confidence radius is $r= \sqrt{\frac{2 \log T}{T'}}$.
Define the \textit{unclean event} \( \overline{C} \) as the complement of the clean event \(C\). By the law of total expectation,
\[
\mathbb{E}[R(T)] = \mathbb{E}[R(T) | C] \, P[C] + \mathbb{E}[R(T) | \overline{C}] \, P[\overline{C}].
\]

Applying Hoeffding's inequality together with a union bound over all arms, objectives, and rounds, the probability of the unclean event can be bounded as
\[
P[\overline{C}] \leq  \frac{2 n D T}{T^4} \leq \frac{2}{T}.
\]

Here we used the facts that \(n < T\) and \(D < T\). Hence the term \(\mathbb{E}[R(T) | \overline{C}] \, P[\overline{C}]\) is negligible, and since \(P[C] \leq 1\), it suffices to bound the regret under the clean event, namely \(\mathbb{E}[R(T) | C]\). 

Since arm $b+2r$ dominates arm $a^*$, and both arms $a^*$ and $b$ are clean, we can write

\[
\bar \mu_d(a^*) \leq \bar \mu_d(b) + 2 r, \forall d \in [D],
\]

and 
\[
\mu_d(a^*) \leq \bar \mu_d(a^*)+ r \quad \text{and} \quad \bar \mu_d(b) \leq \mu_d(b) + r, \quad \forall d \in [D].
\]

Thus,
\[
\mu_d(a^*) - \mu_d(b) \leq 4 r = 4 \sqrt{\frac{2 \log T}{T'}}, \quad \forall d \in [D].
\]

Summing over the remaining \(T-T'\) rounds gives

\[
\mathbb{E}[R(T) | C] \leq 4 (T-T') \sqrt{\frac{2 \log T}{T'}} < 4T \sqrt{\frac{2 \log T}{T'}}.
\]

By choosing \(T'= \left( \frac{T}{n}\right)^\frac{2}{3} \left ( 2 \log T \right)^\frac{1}{3}\), we have

\[
\mathbb{E}[R(T) | C] \leq 4 T^{\frac{2}{3}} (2 n \log T)^\frac{1}{3}.
\]

Finally, 
\begin{align*}
\mathbb{E}[R(T)] &= \mathbb{E}[R(T) | C] \, P[C] + \mathbb{E}[R(T) | \overline{C}] \, P[\overline{C}] \\
&\leq 4 T^{\frac{2}{3}} (2 n \log T)^\frac{1}{3} + (T-T') \frac{2}{T} \\
&= O\left( T^\frac{2}{3} (n\log T)^\frac{1}{3}\right).
\end{align*}


\end{proof}

\begin{theorem}
%\label{Convergence Theorem}
The outcome of Algorithm \ref{SMOMABA} converges to the PO arms \( \mathcal{A}^* \) as \( T \to \infty \).
\end{theorem}

\begin{proof}
The outcome of Algorithm~\ref{SMOMABA} is the set $B$, which remains unchanged after round \(T'\). To prove convergence, we need to show that as the time horizon \( T \) approaches infinity, the probability that $B = \mathcal{A^*}$ approaches 1. As $T \to \infty$, \(T'= \left( \frac{T}{n}\right)^\frac{2}{3} \left ( 2 \log T \right)^\frac{1}{3}\) approaches infinity, and consequently the confidence radius $r= \sqrt{\frac{2 \log T}{T'}}$ approaches zero. Therefore, no PO arm is dominated by the shifted rewards $b+2r \approx b$ of another arm $b$ in the second phase of Algorithm~\ref{SMOMABA}, so all PO arms appear in $B$. On the other hand, as the confidence radius $r$ approaches zero, every non-PO arm is dominated by at least one PO arm and is therefore excluded from $B$. As a result, the set $B$ consists of exactly the PO arms.



\begin{comment}

Because of the fairness property of MO-UCB, assume \(a^*\) is the arm with maximum $f_1$ (or \(\mu_{a^*1}\)) value. Without loss of generality, assume there is no other solution with such $f_1$ \footnote{If there is another arm $b$ such that \(\mu_{1} = \mu_{b1}\), we can ignore the dimension $f_1$ and follow the proof by the next dimension that the true mean reward arm $a$ is greater than the true mean reward $a$. Note that, since we assumed $a$ is a PO arm, such a dimension will always exist.}. 
So, at least in one of the objectives, i.e., $d=1$, \(\mu_{a^*1}\) is greater than any other arm. Thus, in round $T$, the arm \({a^*}\) is not in $P^T$, if there is a solution $a$ such that $a$ dominates $a^*$. As discussed in the proof of \ref{Coverage Theorem}, for such an arm $a$, the following inequality is hold 

\[
\Delta_1^t(a^*,a^t) = \mu_1(a^*) - \mu_1(a^t) \leq 2r^t(a^t) = 2 \sqrt{\frac{2 \log T}{n^t(a^t)}}.
\]

Since \(\Delta_1^t(a^*,a^t)\) is positive, 

\[
n^t(a^t) \leq \frac{8}{(\Delta_1^t(a^*,a^t))^2} \log T
\]

Considering \(T \to \infty \), \(\frac{8}{(\Delta_1^t(a^*,a^t))^2}\) is a constant, and so therefore \(n^t(a^t) = O(\log T)\). Therefore, number of times that an arm \(a^t\) dominates \(a^*\) and is pulled limited to \( O(\log T)\). Considering all the arms, this will limited to \( O(K \log T)\) times. So, the probability of \( a^*\) does not appear in \(P^T\) is \( O(\frac{K \log T}{T})\). As a result when \(T \to \infty \), the probability of \( a^*\) appears in $P^T$ approaches to 1. Simultaneously, since the probability of a dominated arm \(a\) appears in \(P^T\) is \( O(\frac{K \log T}{T})\), which approaches to 0 by \(T \to \infty \).

% We show that the number of times that \(UCB_{a^*1}\) is less than \(UCB_{a1}\) is at most 
% $\frac{8}{\Delta^2} \log T$. Then in total, \(a^*\) can be at most \(\frac{8K}{\Delta^2} \log T\) times is dominated by other arms. Which means probability of dominated is less than  \(O(\frac{\log T}{T})\) which approaches to 0 by increasing $T$.


    


Thus, the exploration term diminishes relative to the average reward, leading to:
   \[
   \text{UCB}_{id}^t \to \mu_{id} \text { for all } d\in [D] \quad \text{as } n_i \to \infty
   \]
where  



UCB estimates converge to the true means.
   Then MO-UCB identifies the non-dominated set \( Q^t \) based on these converging UCB scores. As \( T \) increases, the UCB scores increasingly reflect the true Pareto relationships among the arms, ensuring that only those arms that are truly non-dominated remain in \( Q^t \). Finally, it  filters these non-dominated arms to form the strong non-dominated set \( P^t \), which includes arms that are not \(\epsilon\)-dominated by any other arm. With \( \epsilon_i = 2 \sqrt{\frac{\log T}{n_i}} \), the tolerance becomes negligible as \( T \) grows, thus allowing identification of PO arms as found in $Q^T$.
\end{comment}

\end{proof}

%\textbf{Proposition:} As \( T \to \infty \), the set \( B \) converges to the efficient PO set \( \mathcal{EA}^* \), and consequently, the average regret of algorithm \ref{SMOMABA} approaches zero.



\begin{lemma}
%\label{lemmaSetCover}
    If the clean event $C$ holds, the size of the optimal solution to the minimum set cover of arms, computed in Step 9 of Algorithm~\ref{SMOMABA}, is at most \( |\mathcal{A^*}| \).
\end{lemma}

\begin{proof}
    Note that no PO arm is removed up to Step 9 of the algorithm. Moreover, the PO arms together dominate all other arms. When all arms are clean with radius \( r \), the PO arms improved by $2r$ also dominate all other arms, i.e., \( \bigcup_{a^* \in \mathcal{A^*}} Dom(a^*) = \mathcal{A} \).

    Therefore, the minimum set cover problem over the arms has a feasible solution of size \( |\mathcal{A^*}| \). Consequently, the size of the optimal solution is at most \( |\mathcal{A^*}| \).
\end{proof}




We now prove the cumulative adjustment-regret bound for Algorithm~\ref{SMOMABA}.

\begin{theorem}
%\label{cumulative adjustment-regret Theorem}
The cumulative adjustment-regret of Algorithm~\ref{SMOMABA} is \(R = O\left( T^\frac{2}{3} (n\log T)^\frac{1}{3}\right)\). 
\end{theorem}

\begin{proof}
To prove this theorem, we need to show that

\begin{align*}
\mathbb{E} \left[ \sum_{t=1}^T \sum_{a^t \in A^t} \min \left\{ \epsilon \geq 0 \mid \exists a^* \in \mathcal{A}^* : a^t + \epsilon \succeq a^* \right\} \right] \\
\leq |\mathcal{A}^*| O \left( T^{\frac{2}{3}} \left( n \log T \right)^{\frac{1}{3}} \right).
\end{align*}




Similar to Theorem~\ref{Coverage Theorem}, the probability of the unclean event, \( P[\overline{C}] \), is bounded by \( \frac{2}{T} \). Therefore, we consider only the case where all arms are clean. For this case, we decompose the left-hand side of the above inequality into two terms, which we bound separately:

\[
\text{Term 1} = \mathbb{E} \left[ \sum_{t=1}^{T'} \sum_{a^t \in A^t} \min \{ \epsilon \geq 0 \mid \exists a^* \in \mathcal{A}^* : a^t + \epsilon \succeq a^* \} \right]
\]

and

\[
\text{Term 2} = \mathbb{E} \left[ \sum_{t=T'+1}^T \sum_{a^t \in A^t} \min \{ \epsilon \geq 0 \mid \exists a^* \in \mathcal{A}^* : a^t + \epsilon \succeq a^* \} \right]
\]

Based on the behavior of Algorithm \ref{SMOMABA},

\[
\text{Term 1} = \sum_{t=1}^{T'} \sum_{a \in \mathcal{A}} \min \{ \epsilon \geq 0 \mid \exists a^* \in \mathcal{A}^* : a + \epsilon \succeq a^* \}
\]
that is, all arms in \(\mathcal{A}\) are pulled in each of the first \(T'\) rounds. So, 
\[
\text{Term 1} \leq \sum_{t=1}^{T'} \sum_{a \in \mathcal{A}} 1 \leq n T' = O\left( T^\frac{2}{3} (n\log T)^\frac{1}{3}\right).
\]

We now proceed to bound 
\[
\text{Term 2} = \mathbb{E} \left[ \sum_{t=T'+1}^T \sum_{b \in B} \min \{ \epsilon \geq 0 \mid \exists a^* \in \mathcal{A}^* : b + \epsilon \succeq a^* \} \right],
\]
where \( B \) is the minimum set of arms such that every arm \( a \in \mathcal{A} \), including PO arms, is weakly dominated by \( b + 2r \) for some \( b \in B \). If \( b \) is a PO arm, then \( \epsilon = 0 \). Otherwise, we show that under these conditions, there always exists a PO arm \( a^* \) such that \( a^* \succeq b \) and \( \mu_d(a^*) - \mu_d(b) \leq 4r \) for all \( d \in [D] \). Specifically, this \( a^* \) can be expressed as:
\[
a^* = \operatorname*{argmin}_{a \in \mathcal{A}^* \text{ and } a \succeq b} \quad \max_{d \in [D]} \bigl( \mu_d(a) - \mu_d(b) \bigr).
\]

Assume, for contradiction, that there exists some \( k \in [D] \) such that \( \mu_k(a^*) - \mu_k(b) > 4r \). Since both \( b \) and \( a^* \) are clean ($\forall d \in [D]: |\mu_d(a^*) - \bar \mu_d(a^*)| \leq r$ and \(|\mu_d(b) - \bar \mu_d(b)| \leq r\)), this implies \( b \in Dom(a^*) \) but \( a^* \notin Dom(b) \). However, as the weak domination relation (\( \succeq \)) is transitive, it follows that \( Dom(b) \subset Dom(a^*) \).

Given that \( B \) is the minimum set covering all arms:
\begin{itemize}
    \item If \( a^* \in B \), then \( b \notin B \), as including both would violate the minimality of the set cover \( B \).
    \item If \( a^* \notin B \), there must exist some clean arm \( b' \in B \) such that \( a^* \in Dom(b') \). By transitivity, this implies \( b \in Dom(b') \), again contradicting the minimality of the set cover \( B \).
\end{itemize}

Thus, both cases lead to contradictions, proving that \( \mu_d(a^*) - \mu_d(b) \leq 4r \) for all \( d \in [D] \).

So, we can bound Term 2 as:
\[
\text{Term 2} \leq \mathbb{E} \left[ \sum_{t=T'+1}^T \sum_{b \in B} 4r \right]
= (T - T') |B| 4 \sqrt{\frac{2 \log T}{T'}}.
\]

This simplifies further to:
\[
\text{Term 2} < 4T |B| \sqrt{\frac{2 \log T}{T'}}.
\]

By Lemma~\ref{lemmaSetCover}, $|B| \leq |\mathcal{A^*}|$. So, substituting \( T'= \left( \frac{T}{n}\right)^\frac{2}{3} \left ( 2 \log T \right)^\frac{1}{3} \), we obtain:
\[
\text{Term 2} \leq  O\left( |\mathcal{A^*}| T^\frac{2}{3} (n\log T)^\frac{1}{3}\right) .
\]

Finally,

\[
\text{Term 1} + \text{Term 2} \leq |\mathcal{A^*}| O\left(  T^\frac{2}{3} (n\log T)^\frac{1}{3}\right) .
\]

While the main proof concludes here, we note that the polynomial-time approximation for the minimum covering set satisfies $|B| \leq \log n \cdot |\mathcal{A}^*|$. Consequently, the polynomial-time algorithm achieves the regret bound of $|\mathcal{A}^*| \, O\left( \log n \cdot T^{\frac{2}{3}} (n \log T)^{\frac{1}{3}} \right)$ while maintaining correctness.


\end{proof}


\begin{theorem}
%\label{Efficient Pareto Theorem}
In Algorithm \ref{SMOMABA}, if after computing the minimum arm covering set \( B \), the non-efficient arms are removed, Theorem \ref{Coverage Theorem}, Theorem \ref{Convergence Theorem}, and Theorem \ref{cumulative adjustment-regret Theorem} remain valid for the efficient PO arms, \(\mathcal{EA^*}\).
\end{theorem}

\begin{proof}
An arm \( a \in B \) is identified and removed as a non-efficient arm if a convex combination of other arms in \( B \) weakly dominates \( a \). Specifically, in this case, there exists a subset \( B' \subseteq B \) with \( m \leq n\) arms \( a_1, \dots, a_m \) and coefficients \(\alpha = (\alpha_1, \alpha_2, \dots, \alpha_m)\), with \( \alpha_i \geq 0 \) and \(\sum_{i=1}^m \alpha_i = 1\), such that the \emph{artificial arm} \( b_\alpha = \sum_{i=1}^m \alpha_i a_i \) dominates \( a \). The term ``artificial'' reflects that no individual arm has rewards identical to \( b_\alpha \), but since Theorem~\ref{Coverage Theorem} and Theorem~\ref{cumulative adjustment-regret Theorem} concern the expected value of the defined regret, the rewards of \( b_\alpha \) can be approximated for sufficiently large \( T \) by selecting arm \( a_i \in B' \) with probability \( \alpha_i \).
Replacing \( b \) with such an artificial arm \( b_\alpha \) in Theorems \ref{Coverage Theorem} and \ref{cumulative adjustment-regret Theorem} confirms their validity for the efficient PO arms, \(\mathcal{EA^*}\). 
For Theorem~\ref{Convergence Theorem}, since \( r = \sqrt{\frac{2 \log T}{T'}} \) converges to zero as \( T \to \infty \), removing non-efficient arms from \( B \) ensures that only the efficient PO arms, \(\mathcal{EA^*}\), remain in \( B \).
\end{proof}



