
\section{Proving the Main Result}
\label{sec:NashQproof}

%We have shown that the principle of optimism has allowed us to obtain finite-sample guarantees for Nash Q-learning. The proof of our main theorem shows that, remarkably, it was obtained by following the general proof outline of single-agent RL with optimism and value iteration -- found in~\citep{CJ-ZY-ZW-MIJ:20}-- along with the introduction of some precise and important modifications. 

We first present two lemmas that make use of the fact that we solve for Nash equilibria in the stage games. All missing proofs and results are in the supplementary material.

%\pc{This section is self-contained in that the whole proof can be understood from following the results just cited --- for example, some simple adaptations from various results in~\citep{CJ-ZY-ZW-MIJ:20} which are mainly in terms of notation or not too complicated mathematically, have been omitted. However, for the sake of the reader, we have incorporated a more detailed treatment of all these parts in Appendix~\ref{subsec:proof_main_parallel}.} 

%\subsection{Important lemmas for Nash Q-learning}
%
%
%We study reward-free RL with an underlying zero-sum Markov game to demonstrate the power of parallel exploration in the MARL context. We propose the \emph{Reward-Free Markov Game Parallel Optimistic Least-Squares Value Iteration} (RFMG-POLSVI) algorithm. The \emph{exploration phase}, described in Algorithm~\ref{alg:main_LIN_RFMG_POLSVI_exp} in Appendix~\ref{App:RFMG-POLSVI}, is basically the same as RF-POLSVI with the difference that the action space is extended to a product of action spaces corresponding to each of the two players in the MG. In the \emph{planning phase}, in Algorithm~\ref{alg:main_LIN_RFMG_POLSVI_plan}, the central server computes the policies for each player through the computation of two Nash Equilibria at each step of the MG --- lines 11 and 12 of Algorithm~\ref{alg:main_LIN_RFMG_POLSVI_plan} are minimax problems. The following result summarizes the performance of the RFMG-POLSVI algorithm.

%The following is a key lemma in our analysis.
%
%\subsubsection*{Bounding covering numbers}

\begin{lemma}[Bounding the covering number]\label{lem:bound-import}
%%
%
Let $i\in[n]$, let $\bar{w}_i\in\R^d$ be such that $\norm{\bar{w}_i}\leq L$, let $\bar{\Lambda}\in\R^{d\times d}$ be such that its minimum eigenvalue is greater than or equal to $\lambda$, let $\phi(x,a)\in\R^d$ be such that $\norm{\phi(x, a)}\leq 1$ for all $(x,a)\in\S\times\A$, and let $\beta>0$.
%
Define the function class 
%\begin{multline}
%\label{eq:function_class}
%\mathcal{V}_i = \left\{V:\S\to\R\;\Big|\; V(\cdot)\right.\\
%\left.=\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(\cdot)}}\left[\min\left\{ \bar{w}_i^\top\phi(\cdot, a) \right.\right.\right.\\
%\left.\left.\left.
%+ \beta \sqrt{\phi(\cdot, a)^\top\bar{\Lambda}^{-1} %\phi(\cdot, a)}, H \right\}\right]\right\},
%\end{multline}
\begin{multline}
\label{eq:function_class}
\mathcal{V}_i = \Big\{V:\S\to\R\;\Big|\; V(\cdot)\\
=\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(\cdot)}}\Big[\min\Big\{ \bar{w}_i^\top\phi(\cdot, a)\\
+ \beta \sqrt{\phi(\cdot, a)^\top\bar{\Lambda}^{-1} \phi(\cdot, a)}, H \Big\}\Big]\Big\},
\end{multline}
where %$i\in[n]$ and
$\pi_{-i}(\cdot)\in\Delta(\A_{-i})$. Let $\mathcal{N}_{\epsilon_i}$ be the $\epsilon_i$-covering number of $\mathcal{V}_i$ with respect to the distance $\operatorname{dist}(V, V') = \sup_{x\in\S} |V(x) - V'(x)|$. Then, 
\[
\log \mathcal{N}_{\epsilon_i} \leq d  \log (1+ 4L/ \epsilon_i ) + d^2 \log \bigl [ 1 +  8 d^{1/2} \beta^2  / (\lambda\epsilon_i^2)  \bigr ].
\]
\end{lemma}

In Lemma~B.2 of the supplementary material, we introduce an event $\mathcal{E}_i$ which defines a concentration bound over a cumulative quantity of the value function associated with agent $i\in[n]$ across iterations. We use this event in the lemma below. 

\begin{lemma}[Optimism bounds]% -- Lemma~B.5 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{lem:optimism_bound} Consider the setting of Theorem~\ref{thm:main-nashQ}. Given the event $\mathcal{E}_i$ defined in Lemma~B.2, we have for all $(x, a, h, k) \in \S\times\A\times[H]\times[K]$ that  \begin{multline*}
Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_h(x,a)\leq Q^{i,k}_h(x,a)\\ \text{ and }\quad V^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_h(x)\leq V^{i,k}_h(x).
\end{multline*}
%where $\pi^*$ is any Nash equilibrium of the underlying stochastic game.
\end{lemma}

{\par \textbf{The importance of Lemma~\ref{lem:bound-import} and Lemma~\ref{lem:optimism_bound}.}} Lemma~\ref{lem:bound-import} defines a function class $\mathcal{V}_i$ to which the function $V^{i,k}_h(\cdot)=\E_{a\sim\pi^k_h(\cdot)}[Q^{i,k}_h(\cdot,a)]$ belongs. Indeed, the characterization of $\mathcal{V}_i$ includes that of a Nash equilibrium for a static game; however, we remark that $\pi_{-i}(\cdot)\in\Delta(\A_{-i})$ in the statement of Lemma~\ref{lem:bound-import} does not need to be a product measure. Through a covering number argument, Lemma~\ref{lem:bound-import} is used to prove a series of results that are, in turn, used in the proof of Lemma~\ref{lem:optimism_bound}. Fundamentally, Lemma~\ref{lem:optimism_bound} makes use of (i) the optimism bonus at each episode -- the term starting with $\beta$ in line 10 of \algbrev ~-- and (ii) the selection of Nash equilibria across all stage games. Bounding the best-response value functions across agents in Lemma~\ref{lem:optimism_bound} is important because it upper bounds one of the terms of the regret; see~\eqref{eq:regret_nash}. 
Finally, we end our discussion by pointing out that these two lemmas are the only places in the proof of Theorem~\ref{thm:main-nashQ} that make direct use of the notion of Nash equilibria.
 
%{\par \textbf{The importance of Lemma~\ref{lem:bound-import}}.} Lemma~\ref{lem:bound-import} presents two cases for the policy profile that defines any member of the function class $\mathcal{V}_i$, such as the function function $V^{i,k}_h(\cdot)=\E_{a\sim\pi^k_h(\cdot)}[Q^{i,k}_h(\cdot,a)]$ -- line 14 from Algorithm~\ref{alg:main_LIN_UCB_LSVI}. Indeed, according to the assumption of Theorem~\ref{thm:main-nashQ}: (i) if $\pi_h^k$ is a global optimal equilibrium, then $\mathcal{V}_i$ is defined according to Case~\ref{case-a} in Lemma~\ref{lem:bound-import}; whereas (ii) if $\pi_h^k$ is a saddle Nash equilibrium, then $\mathcal{V}_i$ is defined according to Case~\ref{case-b} in Lemma~\ref{lem:bound-import}. 
%%The proof of our main theorem uses a covering number argument to upper bound a specific quantity of interest %(see \pc{YYY}), 
%%for which Lemma~\ref{lem:bound-import} plays an important role. 
%Lemma~\ref{lem:bound-import} is the only result in our proof which makes use of the assumption on the existence of such Nash equilibria in the stage games (the assumption is stated in Theorem~\ref{thm:main-nashQ}). 

%{\par \textbf{Relationship between the proofs of \algbrev ~and the classic Nash Q-learning algorithm.}} 
%%
%%%{\par \textbf{The key piece in our proof}} We describe the key piece in our proof which allows us to use the same conditions on the Nash equilibria of the stage games as in~\citep{Hu2003NashQ}.
%%As explained, the important of Lemma~\ref{lem:bound-import} is that this is the only part in our proof where we make use of the assumption on 
%The assumption on the existence of specific Nash equilibria in the stage games was used by~\cite{Hu2003NashQ}; 
%%, but its use in their proof is different -- recall that \cite{Hu2003NashQ}'s proof is for discounted stochastic games and provides only asymptotic guarantees. 
%however, since their proof works for discounted Markov games and only provides asymptotic guarantees, its use is quite different. 
%%
%\cite{Hu2003NashQ} based their proof on the value-iteration idea that the estimated Q-functions originate from the successive application of an operator defined by the selection of Nash equilibria in the stage games. 
%In order to prove asymptotic convergence, 
%%they use a contraction approach and thus require such operator to be a contracting one. 
%%
%%\cite{Hu2003NashQ} based their proof on a contraction analysis where an operator is defined that performs such contraction based on the selection of Nash equilibria in the stage games. 
%%
%\cite{Hu2003NashQ} finds that two sufficient conditions to ensure contraction of the operator is the selection or global optimal or saddle Nash equilibria. %%~\citep[Lemma~16]{Hu2003NashQ}. 
%In contrast, in our proof, linear function approximation requires an analysis based on covering numbers whose upper bound make use of such equilibria.
%%
%%. It is in the computation of the coverin number of the class of functions in which the estimated Q-function with optimism belongs to, where we make use of the assumptions on finding a global optimal or a saddle Nash equilibria at every stage game~[\pc{Our Lemma...}].
%%
%%
%%\subsubsection*{Using Optimism for Bounding}

%

%{\par \textbf{The importance of Lemma~\ref{lem:optimism_bound}.}
%The use of the optimism bonus at each episode -- the factor starting with $\beta$ in line 10 of \algbrev ~-- let us bound best-response value functions across agents in Lemma~\ref{lem:optimism_bound}. This is important because it upper bounds one of the terms of the regret, see~\eqref{eq:regret_nash}. The proof of this lemma makes use of the selection of Nash equilibria across all stage games, though they do not need to be of a specific type. 
%The proof of Lemma~\ref{lem:optimism_bound} requires proving Lemma~\ref{lem:bound-import} first.
%%Moreover, proving Lemma~\ref{lem:bound-import} is needed before proving Lemma~\ref{lem:optimism_bound}.

%============================================================================================
% =============================================================================================

\subsection{Proof sketch
of Theorem~\ref{thm:main-nashQ}}

%In this proof sketch of our main result, we will be citing auxiliary results from the appendix, which are mostly adaptations of different results in~\cite{CJ-ZY-ZW-MIJ:20}.  

%========================

We present the proof sketch of our main result. A more detailed version of the proof, along with all auxiliary results and their proofs, can be found in the supplementary material.  

%=============
  

Let us first condition on the event $\bigcap_{i=1}^n\mathcal{E}_i$ where $\mathcal{E}_i$ is defined in Lemma~B.2. %\ref{lem:stochastic_term}. 
Since $\Pe[\text{not }  \mathcal{E}_i]\leq \delta$ for every $i\in[n]$, applying the union bound lets us conclude that $\Pe[\bigcap_{i\in[n]}\mathcal{E}_i]\geq 1-n\delta
$.
Conditioning on this event allows us to use Lemma~\ref{lem:optimism_bound} for every $i\in[n]$.

% Using a simple union bound, we find that this holds with probability at least $1-n\delta$. \pc{[THIS WILL HAVE REPERCURSION IN THE THEOREM STATEMENT AND ITS COMMENTS, SINCE $n$ WILL APPEAR MORE ON $\iota$!]}

For any $k\in[K]$, given the policy $\pi^k=\{\pi^k_i\}_{i\in[n]}$ defined by \algbrev, we define the functions $\hat{Q}_h^k$ %$:\S\times\A\to[0,2\beta]$
and $\hat{V}_h^k$ %$:\S\to[0,2\beta]$
recursively as: $\hat{V}_{H+1}^k(x)=\hat{Q}_{H+1}^k(x,a)=0$ and 
\begin{align*}
%\hat{Q}_h^k(x,a) &= \E_{x'\sim \P_h(\cdot|x,a)}[\hat{V}^k_{h+1}(x')]\\
%&\quad
\hat{Q}_h^k(x,a) &= \Pe_h\hat{V}^k_{h+1}(x,a)+2\beta\sqrt{\phi(x,a)^\top(\Lambda^k_h)^{-1}\phi(x,a)},\\
\hat{V}_h^k(x)&=\E_{a\sim\pi^k_h(x)}[\hat{Q}_h^k(x,a)]
\end{align*}
for any $h=H,\dots,1$ and $(x,a)\in\S\times\A$. Notice that, since the minimum eigenvalue of $\Lambda^k_h$ is at least one, we have $2\beta\sqrt{\phi(x,a)^\top(\Lambda^k_h)^{-1}\phi(x,a)}\leq 2\beta\sqrt{\phi(x,a)^\top\phi(x,a)}=2\beta\norm{\phi(x,a)}\leq 2\beta$ for any $(x,a)\in\S\times\A$, and therefore $\hat{Q}_h^k$ %$:\S\times\A\to[0,2\beta]$
and $\hat{V}_h^k$ are nonnegative and bounded above by $2\beta H$.

%Let $i\in[n]$ and 
Let $k\in[K]$. We 
can show that
%claim that 
for any $(h,x,a)\in[H]\times\S\times\A$, 
\begin{equation}
\label{eq:claim-upper-bound}
\begin{aligned}
        \max_{i\in[n]}(Q^{i,k}_h(x,a)-Q^{i,\pi^k}_h(x,a))&\leq \hat{Q}_h^k(x,a)\text{, and}\\
        \max_{i\in[n]}(V^{i,k}_h(x)-V^{i,\pi^k}_h(x))&\leq \hat{V}_h^k(x).
\end{aligned}
\end{equation}
%
%We prove the claim by induction in $h\in\{H+1,\dots,1\}$. The base case $H+1$ is trivial, since $Q^{i,k}_{H+1}(x,a)=Q^{i,\pi^k}_{H+1}(x,a)=\hat{Q}^k_{H+1}(x,a)=0$ for every $i\in[n]$. Now, at step $h+1$ we have the induction hypothesis $\max_{i\in[h]}(Q^{i,k}_{h+1}(x,a)-Q^{i,\pi^k}_{h+1}(x,a))\leq \hat{Q}_{h+1}^k(x,a)$. Taking expectations over $a\sim\pi^k_{h+1}$ let us immediately obtain $\max_{i\in[h]}(V^{i,k}_{h+1}(x)-V^{i,\pi^k}_{h+1}(x))\leq \hat{V}_{h+1}^k(x)$. 
%
%Now, for any $i\in[n]$,
%\begin{equation}
%%\label{eq:boundV-at-h1}
%\begin{aligned}
%&Q^{i,k}_h(x,a)-Q^{i,\pi^k}_h(x,a)\\
%&= \min\{(w^{i,k}_h)^\top\phi(x,a)\\
%&\quad+\beta\sqrt{\phi(x,a)^\top(\Lambda_h^k)^{-1}\phi(x,a)},H\}-Q^{i,\pi^k}_h(x,a)\\
%&\overset{(a)}{\leq}
%\Pe_h (V^{i,k}_{h+1} - V^{i,\pi^k}_{h+1})(x, a)\\
%&\quad+ 2\beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
%&\leq 
%\Pe_h\hat{V}^{k}_{h+1}(x,a) + 2\beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
%%\overset{(c)}{\implies}Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h}(x,a)&\leq \min\{\langle\phi(x, a), w^{i,k}_h\rangle + \beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)},H\}\\
%&=\hat{Q}^{k}_h(x,a),
%\end{aligned}
%\end{equation}
%where (a) follows from Lemma~\ref{lem:basic_relation}. Taking expectations let us obtain  $V^{i,k}_h(x)-V^{i,\pi^k}_h(x)\leq \hat{V}^{k}_h(x)$. This finishes the proof for the claim in~\eqref{eq:claim-upper-bound}.

We now introduce the following notation: $\delta^{k}_h := \E_{a\sim\pi^k_h(x^k_h)}[\hat{Q}^k_h(x^k_h,a)]-\hat{Q}^k_h(x^k_h,a^k_h)$, and $\xi^k_{h+1} := 
\Pe_h\hat{V}^k_{h+1}(x^k_h,a^k_h) - \hat{V}^k_{h+1}(x^k_{h+1})$ with $\xi^k_1:=0$. Then, for any $(h,k) \in [H] \times [K]$, we can show that 
\begin{align*}
\hat{V}^k_h(x^k_h)&=\delta^k_h+\xi^k_{h+1}+2\beta\sqrt{(\phi^k_h)^\top(\Lambda^k_h)^{-1}\phi^k_h}\\
    &\quad+\hat{V}^k_{h+1}(x^k_{h+1}).
\end{align*}
%we use Lemma~\ref{lem:basic_relation} (with $x=x_h^k$ and $a=a^{k}_h$ following the lemma's notation) to obtain, 
%\begin{equation}
%\label{eq:recursive-paral-aux}
%\begin{aligned}
%&Q^{i,k}_h(x_h^{k}, a^{k}_h) - Q^{i,\pi_i^k,\pi^*_{-i}}_h(x_h^{k}, a^{k}_h)\\
%&\quad \leq \Pe_h (V^{i,k}_{h+1} - V^{i,\pi_i^k,\pi^*_{-i}}_{h+1})(x_h^{k}, a^{k}_h)+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}\\
%%
%\implies & V^{i,k}_h(x^{k}_{h}) - V^{i,\pi_i^k,\pi^*_{-i}}_h(x^{k}_{h})\\
%&\quad \leq (\Pe_h (V^{i,k}_{h+1} - V^{i,\pi_i^k,\pi^*_{-i}}_{h+1})(x_h^{k},a_h^{k})) - \delta^{i,k}_{h+1})+ \delta^{i,k}_{h+1} 
%+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}\\
%%
%\implies & \delta^{i,k}_h\leq 
%\xi^{i,k}_{h+1} + \delta^{i,k}_{h+1}
%+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}.
%%
%\end{aligned}
%\end{equation}
%
%We define $\xi_1^{i,k}=0$ for every $(i,k)\in[n]\times[K]$.
%

Now, let us focus on the regret performance metric.
% 
\begin{equation}
\label{eq:regret_prev_p}
\begin{aligned}
\textnormal{Regret}(K) &=  \sum_{k=1}^K\max_{i\in[n]}(V_1^{i,\bre(\pi^k_{-i}),\pi^k_{-i}}(s_o) - V_1^{i,\pi^k}(s_o))\\
&\overset{(a)}{\leq}\sum_{k=1}^{K} \max_{i\in[n]}(V^{i,k}_1(s_o) - V^{i,\pi^k}_1 (s_o))\\
%
&\overset{(b)}{\leq}\sum_{k=1}^{K} \hat{V}^{k}_1(s_o)
\\
%
&=\underbrace{\sum_{k=1}^K\sum_{h=1}^H\xi^k_h}_{\textrm{(I)}}+\underbrace{\sum_{k=1}^{K}\sum_{h=1}^H \delta^{k}_{h}}_{\textrm{(II)}} \\
&\quad+ \underbrace{2\beta \sum_{k=1}^{K}\sum_{h=1}^H \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}}_{\textrm{(III)}},
\end{aligned}
\end{equation}
%
where (a) follows from Lemma~\ref{lem:optimism_bound} and the fact that we are conditioned on the event $\bigcap^n_{i=1}\mathcal{E}_i$; (b) follows from~\eqref{eq:claim-upper-bound}.
%found in Lemma~B.5 from~\citep{CJ-ZY-ZW-MIJ:20} but using the event $\mathcal{E}_i$ from Lemma~\ref{lem:stochastic_term}, 
%and (b) follows from the recursive formula in~\eqref{eq:recursive-paral-aux} and the fact that $\delta_{H+1}^{i,k}=\xi_{H+1}^{i,k}=0$ and and $\xi^{i,k}_1=0$ for every $(i,k)\in[n]\times[K]$.

We first analyze the term (I) from~\eqref{eq:regret_prev_p}. 
%
By defining an appropriate infinite sequence of tuples $\L^\star\subset \mathbb{Z}_{\geq 1}\times[H]$, we can show that $\{\xi^k_h\}_{(k,h)\in\L^\star}$ is a martingale difference sequence.  
%
%Let us define the filtration $\{\F_{(k,h)}\}_{(k,h)\in\L^\star}$ where
%$\L^\star$ is a sequence such that $\L^\star\subset\mathbb{Z}_{\geq 1}\times[H]$ and its elements are arranged as follows. Firstly, we let the second coordinate take values from $1$ to $H$ and repeat this periodically \emph{ad infinitum}, so that each period has $H$ elements of $\L^\star$. Finally, we let the first coordinate take the value corresponding to the current number of periods so far in the second coordinate (and so its value is unbounded). 
%%
%Consider any element $(k,h)\in\L^\star$. We denote by $(k,h)^{-1}$ its previous element in $\L^\star$. We let $\F_{(k,h)}$ contain the information of the tuple $(x^{\bar{k}}_{\bar{h}},a^{\bar{k}}_{\bar{h}})$ whose indexes $(\bar{k},\bar{h})$ belong to the set $\L^\star$ up to the element $(k,h)\in\L^\star$
%
%We then can conclude that $\{\xi^k_h\}_{(k,h)\in\L^\star}$ is a 
%%Since the computation of $V_h^k$ is independent of the new observation $x^k_h$ at episode $k$, 
%martingale difference sequence due to the following two properties:
%\begin{enumerate}
%    \item $\xi^k_h\in\F_{(k,h)^{-1}}$.
%    For $h=1$, $\E[\xi^k_{h}|\F_{(k,h)^{-1}}]=0$ is trivial, so we focus on $h=2,\dots,H$. 
%    Then, since $x^{k}_h\sim\P_{h-1}(\cdot|x^{k}_{h-1},a^{k}_{h-1})$ (line 16 of NQOVI), we have $\E[\hat{V}^k_h(x^k_h)|\F_{(k,h)^{-1}}]=\E_{x'\sim\P_{h-1}(\cdot|x_{h-1}^{k},a_{h-1}^{k})}[\hat{V}^{k}_h(x')]=\Pe_{h-1}\hat{V}^k_h(x^k_{h-1},a^k_{h-1})$, which immediately implies $\E[\xi^k_{h}|\F_{(k,h)^{-1}}]=0$.
%    \item $|\xi^k_h|\leq 
%    |\Pe_{h-1}\hat{V}^k_h(x^{k}_{h-1}, a^{k}_{h-1})| +|\hat{V}^k_h(x^k_h)|\leq 4\beta H <\infty$
%    since $\hat{V}^k_h(x)\in[0,2\beta H]$ for any $x\in\S$.
%\end{enumerate}
%
%
Therefore, we can use the Azuma--Hoeffding inequality to conclude that, for any $\epsilon > 0$,
\begin{equation*}
\Pr \left(\sum_{k=1}^{K}\sum_{h=1}^H  \xi^{k}_{h}> \epsilon \right) \leq \exp \bigg (\frac{-2 \epsilon^2 } {(KH)(16\beta^2H^2) } \bigg ).
\end{equation*}
We choose $\epsilon=\sqrt{8KH^3\beta^2\log\left(\frac{1}{\delta}\right)}$. Then, with probability at least $1 -\delta$,   
\begin{equation}
\label{eq:final2}
  \textrm{(I)}=\sum_{k=1}^{K}\sum_{h=1}^H  \xi^k_{h}\leq \sqrt{8KH^3\beta^2\log\left(\frac{1}{\delta}\right)} \leq 8\beta H\sqrt{KH\iota}, 
\end{equation}
when setting $\iota = \log\left(\frac{dKH}{\delta}\right)$. We call $\bar{\mathcal{E}}$ the event such that~\eqref{eq:final2} holds.

The term (II) can be analyzed in the same way as (I): one shows that $\{\delta^k_h\}_{(k,h)\in\L^\star}$ is a martingale difference sequence and thus obtains that, with probability at least $1-\delta$,
\begin{equation}
\label{eq:final3}
  \textrm{(II)}=\sum_{k=1}^{K}\sum_{h=1}^H  \delta^k_{h}\leq 8\beta H\sqrt{KH\iota}. 
\end{equation}
We call $\tilde{\mathcal{E}}$ the event such that~\eqref{eq:final3} holds.

We now analyze the term (III) from~\eqref{eq:regret_prev_p}.
%Then, 
%$$
%%\sum_{i=1}^{n}\sum_{h=1}^{H}\left(
%\sum_{k=1}^{K}(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h
%%\right)
%%
%\overset{\textrm{(a)}}{\leq} 2\log\left[\frac{\det(\Lambda_h^{K+1})}{\det(\lambda I_d)}\right]
%$$
%%
%%\beta \sum_{k=1}^{K}\I[\mathcal{D}_t^c]\sum_{p=1}^{P}\sum_{h=1}^H \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}
%%
%where (a) follows from the so-called elliptical potential lemma~\citep[Lemma~11]{YAY-DP-CS:11}, whose conditions are satisfied from our bounded sequence $\{\phi_h^{k}\}_{k=1}^K$ and the fact that the minimum eigenvalue of $\Lambda_h^k$ is lower bounded by $\lambda=1$ for every $(h,k)\in[H]\times[K]$. Now, we have that $\Lambda_h^{K+1}$ is a positive definite matrix whose maximum eigenvalue can be bounded as 
%$\norm{\Lambda_h^{K+1}}\leq \norm{\sum_{k=1}^K\phi^{k}_h(\phi^{k}_h)^\top}+\lambda\leq K+\lambda$, and so $\det(\Lambda^{K+1}_h)\leq \det((K+\lambda)I_d)=(K+\lambda)^d$. We also have that $\det(\lambda I_d)=\lambda^d$. Then, using these results in our previous equation back in term (II), we obtain that
%\begin{multline}
%\label{eq:aux_last_1}
%\sum_{k=1}^{K}(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h
%%
%\leq
%2\log\left[\frac{K+\lambda}{\lambda}\right]^d\\=
%2d\log(K+1)\leq 2d\iota,
%\end{multline}
%where the last inequality holds since $\log(K+1)\leq \log\left(\frac{dKH}{\delta}\right)=\iota$ for $d\geq 2$. 
%
%
%Now, going back to term (III), 
%we assume the event $\bigcap_{i=1}^n\bar{\mathcal{E}}_i$ holds, which, after realizing that $\Pe[\text{not }  \bar{\mathcal{E}}_i]\leq \delta$ and applying union bound, holds with probability at least $1-n\delta$,
%%Then, taking the second term in~\eqref{eq:double_round_fact},   
\begin{multline}
\label{eq:aux_last_2}
\textrm{(III)}=2\beta \sum_{h=1}^H\sum_{k=1}^{K} \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}\\
%
  \overset{\textrm{(a)}}{\leq}2\beta \sum_{h=1}^H  \sqrt{K}\sqrt{ \sum_{k=1}^K (\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}\\
%
\overset{(b)}{\leq} 2\beta H \sqrt{2dK\iota}, 
%
%\overset{(b)}{\leq} 2\beta H\sqrt{dK\iota},
\end{multline}
where (a) follows from the Cauchy--Schwarz inequality, and (b) can be shown 
%
by using the so-called elliptical potential lemma~\citep[Lemma~11]{YAY-DP-CS:11}.
% 

Now, using the results in~\eqref{eq:final2}, \eqref{eq:final3}, and~\eqref{eq:aux_last_2} back in~\eqref{eq:regret_prev_p}, we conclude that,
%
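% NOTE(review): the third term below carries the factor \sqrt{2} from \eqref{eq:aux_last_2}; the resulting leading constant (19) should be checked against the statement of Theorem~\ref{thm:main-nashQ}.
\begin{multline}
\label{eq:regret_almost_l}
\textnormal{Regret}(K) \leq  
8\beta H\sqrt{KH\iota} + 8\beta H\sqrt{KH\iota}
+ 
2\beta H \sqrt{2dK\iota}\\
%
=16c_\beta\sqrt{d^2 K H^5\iota^2}
+ 
2\sqrt{2}c_\beta\sqrt{ d^3KH^4\iota^2}
\\
%
\overset{\textrm{(a)}}{\leq} 19c_\beta\sqrt{d^3KH^5\iota^2},
\end{multline}
%
where the equality follows by substituting $\beta=c_\beta dH\sqrt{\iota}$, and (a) follows from $d\geq 1$, $H\geq 1$, and $16+2\sqrt{2}\leq 19$.
% (previously: (a) follows from $\sqrt{\iota}\leq \iota$, which follows from equation~\eqref{eq:iota_bound}.)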

Finally, %since $\Pe[\text{not }  \bigcap_{i\in[n]}\mathcal{E}_i]\leq n\delta$ and $\Pe[\text{not } \bigcap_{i\in[n]}\bar{\mathcal{E}}_i]\leq n\delta$, %
applying the union bound lets us conclude that 
%$\Pe[\bigcap_{i\in[n]}\mathcal{E}_i\cap\bigcap_{i\in[n]}\bar{\mathcal{E}}_i]\geq 1-2n\delta
%$,
$\Pe[\bigcap_{i\in[n]}\mathcal{E}_i\cap\bar{\mathcal{E}}\cap\tilde{\mathcal{E}}]\geq 1-(n+2)\delta
$,
i.e., our final result holds with probability at least $1-(n+2)\delta
$. We apply the change of variables $\delta':=(n+2)\delta$ so that all results hold with probability at least $1-\delta'$, and the regret bound now has the logarithmic factor $\iota = \log\left(\frac{dKH(n+2)}{\delta'}\right)$. This finishes the proof of Theorem~\ref{thm:main-nashQ}.
%
%\input{lower_bound}
% \input{break_log_gap}
