We restate Theorem~\ref{thm:main-nashQ} below with a change of variables that moves the dependency on the number of agents from the upper bound of the regret to the probability expression. This is the version of the Theorem we will prove when referring to Theorem~\ref{thm:main-nashQ} from the main paper.

\begin{theorem*}[Performance of the \algbrev ~algorithm]
%Assume that either all stage games (line 14) have a global optimal equilibrium or that all stage games have a saddle Nash equilibrium. Then, 
There exists an absolute constant $c_\beta>0$ such that, for any fixed $\delta\in(0,1)$, if we set $\lambda=1$ and $\beta=c_\beta dH\sqrt{\iota}$, with $\iota:=\log(dKH/\delta)$, then, with probability at least $1-(n+2)\delta$, 
\begin{equation}
\label{eqn:regret-res2}
%\begin{aligned}
\textnormal{Regret}(K)
\leq 
\cO\bigg(\sqrt{K}\sqrt{d^3H^5\iota^2}\bigg).
%+ \underbrace{\cO\bigg(\sqrt{ d^4H^4\iota}P\log\left(1+\frac{KP}{d}\right)\bigg)}_{\text{Overhead term}}.
%\end{aligned}
\end{equation}
\end{theorem*}

Let $\Z_{\geq 0}$ ($\Z_{\geq 1}$) be the set of non-negative (positive) integers. 

All results that are direct adaptations or restatements of existing results in the single-agent RL literature from~\citep{CJ-ZY-ZW-MIJ:20} will be accompanied by detailed proofs whenever these are necessary to understand the nuances of their adaptation to our setting. Such proofs are deferred to Section~\ref{sec:remaining-proofs}, the last section of the supplementary material.

%===================================
\section{AUXILIARY RESULTS}

The following proposition is an immediate adaptation of an existing one in
%we present adaptations 
%-- possibly with different notation and providing a more detailed proof if pertinent -- 
%of some results existing in
\citep{CJ-ZY-ZW-MIJ:20} for MDPs.
%which will be helpful to our analysis.

\begin{proposition}[Bounded parameters for Q-functions -- Proposition~2.3 and Lemma~B.1 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{prop:lin-Q}
Consider a linear stochastic game  $\M$. Given a policy profile $\pi$, we have that for any $i\in[n]$, there exist parameters $w^{i,\pi}_h\in\R^d$, $h\in[H]$, such that $Q_h^{i,\pi}(x, a) = \langle\phi (x, a),w^{i,\pi}_h\rangle$ for any $(x, a) \in \S\times\A$ and $\norm{w^{i,\pi}_h} \leq 2H\sqrt{d}$.
\end{proposition}

%\begin{lemma}[Bound on quadratic form]
%\label{lem:basic_ineq} Consider $\Lambda_{A} = \lambda I_d + \sum_{a=1}^A \phi_{a,b} \phi_{a,b}^\top$ where $\phi_{a,b} \in \R^d$ and $\lambda > 0$. Then,
%\begin{equation*}
%\sum_{a=1}^{A}\sum_{b=1}^{B} \phi_{a,b}^\top (\Lambda_{A})^{-1} \phi_{a,b} \leq d.
%\end{equation*}
%\end{lemma}
%\begin{proof}
%Observe that $$\sum_{a=1}^{A} \phi_{a,b}^\top (\Lambda_{A})^{-1} \phi_{a,b}
%= \sum_{a=1}^{A}\tr( \phi_{a,b}^\top (\Lambda_{A})^{-1} \phi_{a,b})
%= \tr((\Lambda_{A})^{-1} \sum_{a=1}^{A}\phi_{a,b}\phi_{a,b}^\top).$$
%Given the eigenvalue decomposition  $\sum_{a=1}^A\phi_{a,b}\phi_{a,b}^\top = U \diag(\lambda_1,  \ldots, \lambda_d)  U^\top$ with $\lambda_i\geq 0$, $i\in[d]$, we obtain $\Lambda_{A} = U \diag(\lambda_1+\lambda,  \ldots, \lambda_d+\lambda) U^\top$, and so $\tr((\Lambda_{A})^{-1} \sum_{a=1}^{A}\phi_{a,b}\phi_{a,b}^\top)=
%\sum_{i=1}^d \lambda_i/(\lambda_i + \lambda) \leq d$.
%\end{proof}

%We now introduce an auxiliary definition. 
%\begin{definition}[Sequences $\L_P$, $\bar{\L}_P$ and their truncation]
%We now define the infinite sequence $\L_P$, with integer $P\geq 1$, as %$(1,1),(1,2),\dots,(1,P)$, %$(2,1),(2,2),\dots,(2,P)$,$(3,1),\dots$;
%$$
%(1,1),(1,2),\dots,(1,P),(2,1),(2,2),\dots,(2,P),(3,1),\dots
%$$
%where the first term of the elements is an increasing sequence which takes values in $\Z_{\geq 1}$ but the second term only takes values in $[P]$ periodically. We can index sequences using $\L_P$, as for example in $\{\varphi_{\tau}\}_{\tau\in\L_P}$ where $\varphi_{\tau}\in \R^n$, $n\geq 1$. We denote by $\L_P(\bar{a},\bar{b})$ the finite sequence resulting from the truncation of the sequence $\L_P$ at its element $(\bar{a},\bar{b})\in\L_P$. 
%
%Similarly, we define the infinite sequence $\bar{\L}_P$, with integer $P\geq 1$, by appending the sequence $(0,1),(0,2),\dots,(0,P)$ at the beginning of $\L_P$. %$(1,1),(1,2),\dots,(1,P)$,$(2,1),\dots$;
%%$$
%%(0,1),(0,2),\dots,(0,P),(1,1),(1,2),\dots,(1,P),(2,1),\dots\;.
%%$$
%Thus $\L_P\subsetneq\bar{\L}_P$.
%For any appropriate element $(\bar{a},\bar{b})$ of $\L_P$ or $\bar{\L}_P$, we have that $(\bar{a},\bar{b})^{-k}$, $k\geq 1$, represents the $k$th previous element to $(\bar{a},\bar{b})$.
%\end{definition}


The following lemma is a restatement of another one in~\citep{CJ-ZY-ZW-MIJ:20}, though with some different notation. 
%We also provide a detailed proof.

%which will be helpful to our analysis.
%Using the recently introduced definition, we present another technical lemma.
%
%
\begin{lemma}[Concentration bound for self-normalized processes -- Lemma D.4 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{lem:self_norm_covering}
Let $\{\F_{\tau}\}_{\tau=0}^\infty$ be a filtration. Let $\{x_{\tau}\}_{\tau=1}^\infty$ be a stochastic process on $\S$ such that $x_{\tau}\in\F_{\tau}$, and let $\{\phi_{\tau}\}_{\tau=1}^\infty$ be an $\R^d$-valued stochastic process such that $\phi_{\tau} \in \F_{\tau-1}$ and $\norm{\phi_{\tau}}\leq 1$. Let $\G$ be a function class of real-valued functions such that $\sup_{x\in\S} |g(x)| \leq H$ for any $g\in\G$, and with $\epsilon$-covering number $\mathcal{N}_{\epsilon}$ with respect to the distance $\mathrm{dist}(g, g') = \sup_{x\in\S} |g(x) - g'(x)|$. 
%
Let $\Lambda_{A} = \lambda I_d + \sum_{\tau=1}^A\phi_{\tau} \phi_{\tau}^\top$. Then for every $A\in \Z_{\geq 1}$, every $g \in \G$, and any $\delta\in(0,1]$, we have that with probability at least $1-\delta$,
\begin{equation*} 
\norm{\sum_{\tau = 1}^A\phi_{\tau} \{ g(x_{\tau}) - \E[g(x_{\tau})|\F_{\tau-1}] \} }^2_{\Lambda_{A}^{-1}}
\leq 4H^2 \left[ \frac{d}{2}\log\biggl( \frac{\lambda+A/d}{\lambda}\biggr )  + \log\frac{\mathcal{N}_{\epsilon}}{\delta}\right]  + \frac{8A^2\epsilon^2}{\lambda}.
\end{equation*}
\end{lemma}
%

The following lemma is a key result in the proof of Theorem~\ref{thm:main-nashQ}, as seen in Section~\ref{sec:NashQproof} from the main paper.
%,: here we make use of the assumptions on the existence of global optimal and saddle Nash equilibria. 


\begin{lemma}[Bounding the covering number]\label{lem:bound-import-app}
%%
%
Let $i\in[n]$, and let $\bar{w}_i\in\R^d$ be such that $\norm{\bar{w}_i}\leq L$, $\bar{\Lambda}\in\R^{d\times d}$ be such that its minimum eigenvalue is greater than or equal to $\lambda$, and, for all $(x,a)\in\S\times\A$, let $\phi(x,a)\in\R^d$ be such that $\norm{\phi(x, a)}\leq 1$, and let $\beta>0$.
%
Define the function class 
%such that, for any $V\in\mathcal{V}$, we have  $V:\S\to\R$ with the form 
\begin{equation}
\label{eq:function_class}
\mathcal{V}_i = \Big\{V:\S\to\R\;\Big|\; V(\cdot)
=\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(\cdot)}}\Big[\min\Big\{ \bar{w}_i^\top\phi(\cdot, a)
+ \beta \sqrt{\phi(\cdot, a)^\top\bar{\Lambda}^{-1} \phi(\cdot, a)}, H \Big\}\Big]\Big\},
\end{equation}
where %$i\in[n]$ and
$\pi_{-i}(\cdot)\in\Delta(\A_{-i})$. Let $\mathcal{N}_{\epsilon_i}$ be the $\epsilon_i$-covering number of $\mathcal{V}_i$ with respect to the distance $\operatorname{dist}(V, V') = \sup_{x\in\S} |V(x) - V'(x)|$. Then, 
$$
\log \mathcal{N}_{\epsilon_i} \leq d  \log (1+ 4L/ \epsilon_i ) + d^2 \log \bigl [ 1 +  8 d^{1/2} \beta^2  / (\lambda\epsilon_i^2)  \bigr ].
$$
\end{lemma}
%
%
%
%
%
%\begin{proof}
%Then, we can use Lemma~D.6 of~\citep{CJ-ZY-ZW-MIJ:20} to obtain the bound
%\begin{equation}
%\label{eq:cover-aux_0}
%\log \mathcal{N}_{\epsilon} \le d  \log (1+ 4L / \epsilon ) + d^2 \log \bigl [ 1 +  8 d^{1/2} \beta^2  / (\lambda\epsilon^2)  \bigr ].
%\end{equation}
%\end{proof}
\begin{proof}
Let $V,V'\in\mathcal{V}_i$ be parameterized by $(\bar{w}_i,\bar{\Lambda})$ and $(\bar{w}_i',\bar{\Lambda}')$, respectively. Let $\bar{u}(x,a):=\sqrt{\phi(x, a)^\top\bar{\Lambda}^{-1} \phi(x,a)}$ and $\bar{u}'(x,a):=\sqrt{\phi(x, a)^\top(\bar{\Lambda}')^{-1} \phi(x, a)}$, and let $\bar{g}(x,a)=\min\left\{ \bar{w}_i^\top\phi(x, a) + \beta \bar{u}(x,a), H \right\}$ and $\bar{g}'(x,a)=\min\left\{ (\bar{w}_i')^\top\phi(x, a) + \beta \bar{u}'(x,a), H \right\}$. Then, 
\begin{equation}
%\label{eq:dist-covering-MG-0-app}
\begin{aligned}
\dist(V,V')
&=\sup_{x\in\S}
\Big|\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}[\bar{g}(x,a)]-\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}[\bar{g}'(x,a)]\Big|\\
&\overset{(a)}{\leq}\sup_{\substack{x\in\S\\\nu\in\Delta(\A_i)}}
\Big|\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}[\bar{g}(x,a)]-\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}[\bar{g}'(x,a)]\Big|\\
&=\sup_{\substack{x\in\S\\\nu\in\Delta(\A_i)}}
\Big|\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}[\bar{g}(x,a)-\bar{g}'(x,a)]\Big|\\
&\leq\sup_{\substack{x\in\S\\\nu\in\Delta(\A_i)}}
\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi_{-i}(x)}}|\bar{g}(x,a)-\bar{g}'(x,a)|\\
&\leq\sup_{\substack{x\in\S\\a\in\A}}
\Big|\bar{g}(x,a)-\bar{g}'(x,a)\Big|\\
&\leq\sup_{\substack{x\in\S\\a\in\A}}|\min\{ \bar{w}_i^\top\phi(x, a) + \beta \bar{u}(x,a), H \} - \min\{ (\bar{w}_i')^\top\phi(x, a) + \beta \bar{u}'(x,a), H \}|\\
&\overset{(b)}{\leq}\sup_{\substack{x\in\S\\a\in\A}}|\bar{w}_i^\top\phi(x, a) + \beta \bar{u}(x,a) - ((\bar{w}_i')^\top\phi(x, a) + \beta \bar{u}'(x,a))|\\
&\leq \sup_{\substack{x\in\S\\a\in\A}}|(\bar{w}_i-\bar{w}_i')^\top\phi(x,a)|+\beta\sup_{\substack{x\in\S\\a\in\A}}|\bar{u}(x,a)-\bar{u}'(x,a)|
%
%
\end{aligned}
\end{equation}
Inequality (a) follows from the property $|\max_{\nu\in\Delta(\A_i)}f(\nu)-\max_{\nu\in\Delta(\A_i)}h(\nu)|\leq \max_{\nu\in\Delta(\A_i)}|f(\nu)-h(\nu)|$ for any $f,h:\Delta(\A_i)\to\R$ since, letting $\bar{\nu}=\argmax_{\nu\in\Delta(\A_i)}f(\nu)$ and $\tilde{\nu}=\argmax_{\nu\in\Delta(\A_i)}h(\nu)$, we observe that:  (i) if $\max_{\nu\in\Delta(\A_i)}f(\nu)>\max_{\nu\in\Delta(\A_i)}h(\nu)$, then $\max_{\nu\in\Delta(\A_i)}f(\nu)-\max_{\nu\in\Delta(\A_i)}h(\nu)\leq f(\bar{\nu})-h(\bar{\nu})\leq\max_{\nu\in\Delta(\A_i)}|f(\nu)-h(\nu)|$; and (ii) if $\max_{\nu\in\Delta(\A_i)}f(\nu)\leq\max_{\nu\in\Delta(\A_i)}h(\nu)$, then $\max_{\nu\in\Delta(\A_i)}h(\nu)-\max_{\nu\in\Delta(\A_i)}f(\nu)\leq h(\tilde{\nu})-f(\tilde{\nu})\leq\max_{\nu\in\Delta(\A_i)}|f(\nu)-h(\nu)|$. Inequality (b) follows from $\min\{\cdot,H\}$ being a non-expansive operator. 

We can continue bounding,
\begin{equation}
\label{eq:dist-covering-MG-0-app}
\begin{aligned}
\dist(V,V')
&\overset{(a)}{\leq}\sup_{\phi:\norm{\phi}\leq 1}\Big|
(\bar{w}_i-\bar{w}_i')^\top\phi\Big|\\
%&\quad +\sup_{\phi:\norm{\phi}\leq 1}\beta\Big|
%\sqrt{\phi(x,a)^\top\bar{\Lambda}^{-1}\phi(x,a)}-
%\sqrt{\phi(x,a)^\top(\bar{\Lambda}')^{-1}\phi(x,a)}
%\Big|\\
&\quad +\sup_{\phi:\norm{\phi}\leq 1}\beta\Big|
\sqrt{\phi^\top\bar{\Lambda}^{-1}\phi}-
\sqrt{\phi^\top(\bar{\Lambda}')^{-1}\phi}
\Big|\\
&\overset{(b)}{\leq}%textrm
\norm{\bar{w}_i-\bar{w}_i'} + \sup_{\phi:\norm{\phi}\leq 1}
\beta\sqrt{\left|\phi^\top(\bar{\Lambda}^{-1}-(\bar{\Lambda}')^{-1})\phi\right|}\\
&=\norm{\bar{w}_i-\bar{w}_i'} + 
\beta\sqrt{\norm{\bar{\Lambda}^{-1}-(\bar{\Lambda}')^{-1}}}\\
&\leq\norm{\bar{w}_i-\bar{w}_i'} + 
\beta\sqrt{\norm{\bar{\Lambda}^{-1}-(\bar{\Lambda}')^{-1}}_F}
%
%
\end{aligned}
\end{equation}
where (a) follows from the assumption $\sup_{x\in\S}\max_{a\in\A}\norm{\phi(x,a)}\leq 1$, and (b) follows from the inequality $|\sqrt{p}-\sqrt{q}|\leq\sqrt{|p-q|}$ for any $p,q\geq 0$. Now, we notice that~\eqref{eq:dist-covering-MG-0-app} is a bound of the same form as equation~(28) from~\citep[Lemma~D.6]{CJ-ZY-ZW-MIJ:20}, and so we can use the proof of this lemma to obtain that the $\epsilon_i$-covering number of $\mathcal{V}_i$, denoted by $\mathcal{N}_{\epsilon_i}$,
%, with respect to the distance $\dist(\cdot,\cdot)$ 
can be upper bounded as $\log \mathcal{N}_{\epsilon_i}  \leq d  \log (1+ 4L/ \epsilon_i ) + d^2 \log \bigl [ 1 +  8 d^{1/2} \beta^2  / (\lambda\epsilon_i^2)  \bigr ]$. %\begin{equation*}
%%\label{eq:cover-aux-0}
%%\log \mathcal{N}_{\epsilon} \le d  \log (1+ 4H\sqrt{dK}/ \epsilon ) + d^2 \log \bigl [ 1 +  32 d^{1/2} \beta^2  / (\lambda\epsilon^2)  \bigr ].
%\log \mathcal{N}_{\epsilon}  \leq d  \log (1+ 4L/ \epsilon ) + d^2 \log \bigl [ 1 +  8 d^{1/2} \beta^2  / (\lambda\epsilon^2)  \bigr ].
%\end{equation*}
This finishes the proof.
%Then, the proof of the lemma follows immediately from closely following the proof of 
%Lemma~\ref{lem:stochastic_term}.
\end{proof}

%
%

%===================================


\section{PROVING THEOREM~\ref{thm:main-nashQ}}
\label{subsec:proof_main_parallel}
For simplicity, we will use the following notation: at episode $k$, we denote $\pi^{i,k}=\{\pi^{i,k}_h\}_{h\in[H]}$ as the policy induced by $\{Q_h^{i,k}\}_{h=1}^H$ as performed by agent $i\in[n]$ (line 14 of Algorithm~1) across time steps $h\in[H]$, thus for a fixed step $h\in[H]$ we let $V_h^{i,k}(x_h^{k})= \E_{a\sim \pi^{k}_h(x_h^{k})}[Q_h^{i,k}(x_h^{k},a)]$ with $\pi^{k}_h(x_h^{k})$ being a Nash equilibrium from the stage game $(Q^{i,k}_h(x_h^k,\cdot))_{i\in[n]}$. With some abuse of notation, we similarly define $V_h^{i,k}(x)= \E_{a\sim \pi^{k}_h(x)}[Q_h^{i,k}(x,a)]$ with $\pi^{k}_h(x)$ being a Nash equilibrium from the game $(Q^{i,k}_h(x,\cdot))_{i\in[n]}$. Let $\phi^\tau_h:=\phi(x^\tau_h,a^\tau_h)$. 

\subsection{Preliminary technical results}
%
%The following three lemmas -- Lemma~\ref{lem:wn_estimate}, Lemma~\ref{lem:stochastic_term}, and Lemma~\ref{lem:basic_relation} -- are adaptations of results found in~\citep{CJ-ZY-ZW-MIJ:20}. We include detailed proofs in Section~\ref{sec:remaining-proofs} for completeness and a better understanding of the nuanced differences with respect to the (single-agent) RL proofs established by~\cite{CJ-ZY-ZW-MIJ:20}. 

We now bound the parameters $\{w^{i,k}_h\}_{(i,h,k)\in[n]\times[H]\times[K]}$ from the \algbrev ~algorithm.

\begin{lemma}[Parameter bound -- Lemma~B.2 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{lem:wn_estimate}
For any $(i,k, h) \in[n]\times[K]\times[H]$, the parameter $w^{i,k}_h$ in the \algbrev ~algorithm satisfies
$\norm{w^{i,k}_h}\leq(1+H) \sqrt{\frac{d(k-1)}{\lambda}}$.
\end{lemma}
%

Now we use Lemma~\ref{lem:bound-import-app} and Lemma~\ref{lem:self_norm_covering} to prove a useful concentration bound for \algbrev.

\begin{lemma}[Concentration bound on value functions for \algbrev ~-- Lemma~B.3 in~\citep{CJ-ZY-ZW-MIJ:20}] \label{lem:stochastic_term}
Consider the setting of Theorem~\ref{thm:main-nashQ}. There exists an absolute constant $C$ independent of $c_{\beta}$ such that for any fixed $\delta\in(0, 1)$, the following event $\mathcal{E}_i$ holds with probability at least $1-\delta$ for a fixed $i\in[n]$: for every $(k, h)\in [K]\times [H]$,
%\begin{multline*}
\begin{equation*}
\norm{\sum_{\tau = 1}^{k-1} \phi^{\tau}_h [V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau})]}_{(\Lambda^k_h)^{-1}}
\leq CdH\sqrt{\log [(c_\beta+1)dKH/\delta]}. 
\end{equation*}
%\end{multline*}
\end{lemma}
%

The following lemma crucially depends on the principle of optimism.  
%used to upper bound the difference between the estimated Q-function by POLSVI before adding the optimism bonus (see line 9 in algorithm~\ref{alg:main_LIN_UCB_LSVI}) and the Q-function for any fixed policy. The optimism bonus will play an important role in such an upper bound.

\begin{lemma}[Difference with an arbitrary Q-function -- Lemma~B.4 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{lem:basic_relation} Consider the setting of Theorem~\ref{thm:main-nashQ}. There exists an absolute constant $c_\beta$ such that, for $\beta = c_\beta dH\sqrt{\iota}$ with $\iota = \log (dKH/\delta)$,
any fixed joint policy $\bar{\pi}$, and any $i\in[n]$, the following holds: given the event $\mathcal{E}_i$ defined in Lemma~\ref{lem:stochastic_term}, we have for all $(x, a, h, k) \in \S\times\A\times[H]\times[K]$ that
\begin{equation*}
\langle\phi(x, a), w^{i,k}_h\rangle - Q_h^{i,\bar{\pi}}(x, a)  =  \Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x, a) + \Delta^{i,k}_h(x, a),
\end{equation*}
for some $\Delta^{i,k}_h(x, a)$ such that $|\Delta^{i,k}_h(x, a)| \leq \beta \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}$.
\end{lemma}
%

The following key lemma makes use of optimism by using Lemma~\ref{lem:basic_relation} and of the fact that we choose a Nash equilibrium at each stage game.

\begin{lemma}[Optimism bounds]% -- Lemma~B.5 in~\citep{CJ-ZY-ZW-MIJ:20}]
\label{lem:optimism_bound_app} Consider the setting of Theorem~\ref{thm:main-nashQ}. Given the event $\mathcal{E}_i$ defined in Lemma \ref{lem:stochastic_term}, we have that for all $(x, a, h, k) \in \S\times\A\times[H]\times[K]$, $$Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_h(x,a)\leq Q^{i,k}_h(x,a)\quad \text{ and }\quad V^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_h(x)\leq V^{i,k}_h(x).$$
%where $\pi^*$ is any Nash equilibrium of the underlying stochastic game.
\end{lemma}
\begin{proof}
We prove the claims by induction in $h=H+1,\dots,1$. The base case $H+1$ is trivial, since $Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{H+1}(x,a)= Q^{i,k}_{H+1}(x,a)=0$. Now, at step $h+1$ we have the induction hypothesis $Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h+1}(x,a)\leq Q^{i,k}_{h+1}(x,a)$. Then we have that 
\begin{equation}
\label{eq:boundV-at-h1}
\begin{aligned}
V^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h+1}(x)&\overset{(a)}{=}\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi^k_{-i,h+1}(x)}}[Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h+1}(x,a)]\\
&\overset{(b)}{\leq}\max_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\pi^k_{-i,h+1}(x)}}[Q^{i,k}_{h+1}(x,a)]\\
&\overset{(c)}{=}\E_{a\sim\pi^k_{h+1}(x)}[Q^{i,k}_{h+1}(x,a)]\\
&=V^{i,k}_{h+1}(x),
\end{aligned}
\end{equation}
%\E_{a\sim\pi^k_{h+1}}[Q^{i,k}_{h+1}(x,a)]
where (a) follows by definition of best response, (b) from the induction hypothesis, and (c) from the fact that \algbrev ~chooses a Nash equilibrium at every stage game.

Now, we have
\begin{equation}
%\label{eq:boundV-at-h1}
\begin{aligned}
Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h}(x,a)&\overset{(a)}{\leq}
\langle\phi(x, a), w^{i,k}_h\rangle +  \Pe_h (V^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h+1} - V^{i,k}_{h+1})(x, a) + \beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
&\overset{(b)}{\leq} 
\langle\phi(x, a), w^{i,k}_h\rangle + \beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
\overset{(c)}{\implies}Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h}(x,a)&\leq \min\{\langle\phi(x, a), w^{i,k}_h\rangle + \beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)},H\}\\
&=Q^{i,k}_h(x,a),
\end{aligned}
\end{equation}
where (a) follows from Lemma~\ref{lem:basic_relation}, (b) from~\eqref{eq:boundV-at-h1}, and (c) from $Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h}\leq H$. From here we can repeat the steps in~\eqref{eq:boundV-at-h1} to obtain $V^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_h(x)\leq V^{i,k}_h(x)$. This finishes the proof. 
%
%
\end{proof}
%%
%================================================================

\subsection{Proof of Theorem~\ref{thm:main-nashQ}}
%
%We now continue with our proof of Theorem~\ref{thm:main-nashQ}. 
Let us first condition on the event $\bigcap_{i=1}^n\mathcal{E}_i$ where $\mathcal{E}_i$ is defined in Lemma~\ref{lem:stochastic_term}. Since $\Pe[\text{not }  \mathcal{E}_i]\leq \delta$, applying the union bound lets us conclude that $\Pe[\bigcap_{i\in[n]}\mathcal{E}_i]\geq 1-n\delta
$.
% Using a simple union bound, we find that this holds with probability at least $1-n\delta$. \pc{[THIS WILL HAVE REPERCURSION IN THE THEOREM STATEMENT AND ITS COMMENTS, SINCE $n$ WILL APPEAR MORE ON $\iota$!]}

For any $k\in[K]$, given the policy $\pi^k=\{\pi^k_i\}_{i\in[n]}$ defined by \algbrev, we define the functions $\hat{Q}_h^k$ %$:\S\times\A\to[0,2\beta]$
and $\hat{V}_h^k$ %$:\S\to[0,2\beta]$
recursively as: $\hat{V}_{H+1}^k(x)=\hat{Q}_{H+1}^k(x,a)=0$ and 
\begin{align*}
\hat{Q}_h^k(x,a) &= 
%\E_{x'\sim \P_h(\cdot|x,a)}[\hat{V}^k_{h+1}(x')]
\Pe_h\hat{V}^k_{h+1}(x,a)+2\beta\sqrt{\phi(x,a)^\top(\Lambda^k_h)^{-1}\phi(x,a)},\\
\hat{V}_h^k(x)&=\E_{a\sim\pi^k_h(x)}[\hat{Q}_h^k(x,a)]
\end{align*}
for any $h= H,\dots,1$ and $(x,a)\in\S\times\A$. Notice that since $\lambda=1$ implies $\Lambda^k_h\succeq I_d$, we have $2\beta\sqrt{\phi(x,a)^\top(\Lambda^k_h)^{-1}\phi(x,a)}\leq 2\beta\sqrt{\phi(x,a)^\top\phi(x,a)}=2\beta\norm{\phi(x,a)}\leq 2\beta$, and therefore $\hat{Q}_h^k$ %$:\S\times\A\to[0,2\beta]$
and $\hat{V}_h^k$ are nonnegative with maximum value $2\beta H$.

%Let $i\in[n]$ and 
Let $k\in[K]$. We claim that for any $(h,x,a)\in[H]\times\S\times\A$, 
\begin{equation}
\label{eq:claim-upper-bound-app}
\begin{aligned}
        \max_{i\in[n]}(Q^{i,k}_h(x,a)-Q^{i,\pi^k}_h(x,a))&\leq \hat{Q}_h^k(x,a)\text{, and}\\
        \max_{i\in[n]}(V^{i,k}_h(x)-V^{i,\pi^k}_h(x))&\leq \hat{V}_h^k(x).
\end{aligned}
\end{equation}
%
We prove the claim by induction in $h= H+1,\dots,1$. The base case $H+1$ is trivial, since $Q^{i,k}_{H+1}(x,a)=Q^{i,\pi^k}_{H+1}(x,a)=\hat{Q}^k_{H+1}(x,a)=0$ for every $i\in[n]$. Now, at step $h+1$ we have the induction hypothesis $\max_{i\in[n]}(Q^{i,k}_{h+1}(x,a)-Q^{i,\pi^k}_{h+1}(x,a))\leq \hat{Q}_{h+1}^k(x,a)$. Taking expectations over $a\sim\pi^k_{h+1}(x)$ lets us immediately obtain 
\begin{equation}
\label{eq:max_Vi}
\max_{i\in[n]}(V^{i,k}_{h+1}(x)-V^{i,\pi^k}_{h+1}(x))\leq \hat{V}_{h+1}^k(x).    
\end{equation} 

Now, for any $i\in[n]$,
\begin{equation}
%\label{eq:boundV-at-h1}
\begin{aligned}
Q^{i,k}_h(x,a)-Q^{i,\pi^k}_h(x,a)&= \min\{(w^{i,k}_h)^\top\phi(x,a)+\beta\sqrt{\phi(x,a)^\top(\Lambda_h^k)^{-1}\phi(x,a)},H\}-Q^{i,\pi^k}_h(x,a)\\
&\overset{(a)}{\leq}
\Pe_h (V^{i,k}_{h+1} - V^{i,\pi^k}_{h+1})(x, a) + 2\beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
&\overset{(b)}{\leq} 
\Pe_h\hat{V}^{k}_{h+1}(x,a) + 2\beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)}\\
%\overset{(c)}{\implies}Q^{i,\bre(\pi_{-i}^k),\pi_{-i}^k}_{h}(x,a)&\leq \min\{\langle\phi(x, a), w^{i,k}_h\rangle + \beta \sqrt{\phi(x,a)^\top (\Lambda^k_h)^{-1}  \phi(x,a)},H\}\\
&=\hat{Q}^{k}_h(x,a),
\end{aligned}
\end{equation}
where (a) follows from Lemma~\ref{lem:basic_relation} and (b) from \eqref{eq:max_Vi}. Taking expectations over $a\sim\pi^k_h(x)$ lets us obtain  $V^{i,k}_h(x)-V^{i,\pi^k}_h(x)\leq \hat{V}^{k}_h(x)$. This finishes the proof for the claim in~\eqref{eq:claim-upper-bound-app}.

We now introduce the following notation: $\delta^{k}_h := \E_{a\sim\pi^k_h(x^k_h)}[\hat{Q}^k_h(x^k_h,a)]-\hat{Q}^k_h(x^k_h,a^k_h)$, and $\xi^k_{h+1} := 
\Pe_h\hat{V}^k_{h+1}(x^k_h,a^k_h) - \hat{V}^k_{h+1}(x^k_{h+1})$ with $\xi^k_1:=0$. Then, for any $(h,k) \in [H] \times [K]$, 
\begin{align*}
    \hat{V}^k_h(x^k_h)&=\E_{a\sim\pi^k_h(x^k_h)}[\hat{Q}^k_h(x^k_h,a)]\\
    &=\delta^k_h+\hat{Q}^k_h(x^k_h,a^k_h)\\
    &=\delta^k_h+\Pe_h\hat{V}^k_{h+1}(x^k_h,a^k_h)+2\beta\sqrt{(\phi^k_h)^\top(\Lambda^k_h)^{-1}\phi^k_h}\\
    &=\delta^k_h+\xi^k_{h+1}+2\beta\sqrt{(\phi^k_h)^\top(\Lambda^k_h)^{-1}\phi^k_h}+\hat{V}^k_{h+1}(x^k_{h+1}).
\end{align*}
%we use Lemma~\ref{lem:basic_relation} (with $x=x_h^k$ and $a=a^{k}_h$ following the lemma's notation) to obtain, 
%\begin{equation}
%\label{eq:recursive-paral-aux}
%\begin{aligned}
%&Q^{i,k}_h(x_h^{k}, a^{k}_h) - Q^{i,\pi_i^k,\pi^*_{-i}}_h(x_h^{k}, a^{k}_h)\\
%&\quad \leq \Pe_h (V^{i,k}_{h+1} - V^{i,\pi_i^k,\pi^*_{-i}}_{h+1})(x_h^{k}, a^{k}_h)+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}\\
%%
%\implies & V^{i,k}_h(x^{k}_{h}) - V^{i,\pi_i^k,\pi^*_{-i}}_h(x^{k}_{h})\\
%&\quad \leq (\Pe_h (V^{i,k}_{h+1} - V^{i,\pi_i^k,\pi^*_{-i}}_{h+1})(x_h^{k},a_h^{k})) - \delta^{i,k}_{h+1})+ \delta^{i,k}_{h+1} 
%+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}\\
%%
%\implies & \delta^{i,k}_h\leq 
%\xi^{i,k}_{h+1} + \delta^{i,k}_{h+1}
%+ \beta \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}  \phi^{k}_h}.
%%
%\end{aligned}
%\end{equation}
%
%We define $\xi_1^{i,k}=0$ for every $(i,k)\in[n]\times[K]$.
%

Now, let us focus on the regret performance metric.
% 
\begin{equation}
\label{eq:regret_prev_p_app}
\begin{aligned}
\textnormal{Regret}(K) &=  \sum_{k=1}^K\max_{i\in[n]}(V_1^{i,\bre(\pi^k_{-i}),\pi^k_{-i}}(s_o) - V_1^{i,\pi^k}(s_o))\\
&\overset{(a)}{\leq}\sum_{k=1}^{K} \max_{i\in[n]}(V^{i,k}_1(s_o) - V^{i,\pi^k}_1 (s_o))\\
%
&\overset{(b)}{\leq}\sum_{k=1}^{K} \hat{V}^{k}_1(s_o)
\\
%
&=\underbrace{\sum_{k=1}^K\sum_{h=1}^H\xi^k_h}_{\textrm{(I)}}+\underbrace{\sum_{k=1}^{K}\sum_{h=1}^H \delta^{k}_{h}}_{\textrm{(II)}} + \underbrace{2\beta \sum_{k=1}^{K}\sum_{h=1}^H \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}}_{\textrm{(III)}},
\end{aligned}
\end{equation}
%
where (a) follows from Lemma~\ref{lem:optimism_bound_app} and the fact that we are conditioned on the event $\bigcap^n_{i=1}\mathcal{E}_i$; (b) follows from~\eqref{eq:claim-upper-bound-app}.
%found in Lemma~B.5 from~\citep{CJ-ZY-ZW-MIJ:20} but using the event $\mathcal{E}_i$ from Lemma~\ref{lem:stochastic_term}, 
%and (b) follows from the recursive formula in~\eqref{eq:recursive-paral-aux} and the fact that $\delta_{H+1}^{i,k}=\xi_{H+1}^{i,k}=0$ and and $\xi^{i,k}_1=0$ for every $(i,k)\in[n]\times[K]$.

We first analyze the term (I) from~\eqref{eq:regret_prev_p_app}. 
%Let us define the filtration $\{\F_{(k,h,p)}\}_{(k,h,p)\in\L^\star}$ where
%$\L^\star$ is a sequence such that $\L^\star\subset\mathbb{Z}_{\geq 1}\times[H]\times[P]$ and its elements are arranged as follows. Firstly, we let the third coordinate take values from $1$ to $P$ and repeat this periodically \emph{ad infinitum}, so that each period has $P$ elements of $\L^\star$.
%Secondly, the second coordinate takes the value $1$ for all elements in the first period of the third coordinate, then it takes the value $2$ for all elements of the second period of the third coordinate, and so on until taking the value of $H$ for all elements in the $H$-th period of the third coordinate --- this will constitute a period in the second coordinate --- after which the second coordinate takes the value $1$ again and continue describing periods (of $H$ elements each) \emph{ad infinitum}. Finally, we let the third coordinate take the value corresponding to the number of periods so far in the second coordinate (so the values of the first coordinate is unbounded). 
%
%Consider any element $(k,h,p)\in\L^\star$. We denote by $(k,h,p)^{-1}$ its previous element in $\L^\star$. We let $\F_{(k,h,p)}$ contain the information of all states $x^{\bar{k},\bar{p}}_{\bar{h}}$ and actions $a^{\bar{k},\bar{p}}_{\bar{h}}$ whose indexes $(\bar{k},\bar{h},\bar{p})$ belong to the set $\L^\star$ up to the element $(k,h,p)\in\L^\star$.
%
Let us define the filtration $\{\F_{(k,h)}\}_{(k,h)\in\L^\star}$ where
$\L^\star$ is a sequence such that $\L^\star\subset\mathbb{Z}_{\geq 1}\times[H]$ and its elements are arranged as follows. Firstly, we let the second coordinate take values from $1$ to $H$ and repeat this periodically \emph{ad infinitum}, so that each period has $H$ elements of $\L^\star$. Finally, we let the first coordinate take the value corresponding to the current number of periods so far progressed in the second coordinate (and so its value is unbounded). 
%
Consider any element $(k,h)\in\L^\star$. We denote by $(k,h)^{-1}$ its previous element in $\L^\star$. We let $\F_{(k,h)}$ contain the information of the tuple $(x^{\bar{k}}_{\bar{h}},a^{\bar{k}}_{\bar{h}})$ whose indexes $(\bar{k},\bar{h})$ belong to the set $\L^\star$ up to the element $(k,h)\in\L^\star$.

We then can conclude that $\{\xi^k_h\}_{(k,h)\in\L^\star}$ is a 
%Since the computation of $V_h^k$ is independent of the new observation $x^k_h$ at episode $k$, 
martingale difference sequence due to the following two properties:
\begin{enumerate}
    \item $\xi^k_h\in\F_{(k,h)}$ and $\E[\xi^k_h|\F_{(k,h)^{-1}}]=0$.
    For $h=1$, $\E[\xi^k_{h}|\F_{(k,h)^{-1}}]=0$ is trivial, so we focus on $h=2,\dots,H$. Then, since $x^{k}_h\sim\P_{h-1}(\cdot|x^{k}_{h-1},a^{k}_{h-1})$ (line 16 of \algbrev), we have $\E[\hat{V}^k_h(x^k_h)|\F_{(k,h)^{-1}}]=\E_{x'\sim\P_{h-1}(\cdot|x_{h-1}^{k},a_{h-1}^{k})}[\hat{V}^{k}_h(x')]=\Pe_{h-1}\hat{V}^k_h(x^k_{h-1},a^k_{h-1})$, which immediately implies $\E[\xi^k_{h}|\F_{(k,h)^{-1}}]=0$. 
    %   
    %and  
    %\begin{align*}
    %\E[\xi^{i,k}_{h}|\F_{(k,h)^{-1}}]&=\E_{x'\sim\P_{h-1}(\cdot|x_{h-1}^{k},a_{h-1}^{k})}[\delta^{i,k}_{h}]-\E[\delta^{i,k}_{h}|\F_{(k,h)^{-1}}]\\
    %&=\E_{x'\sim\P_{h-1}(\cdot|x_{h-1}^{k},a_{h-1}^{k})}[\delta^{i,k}_{h}]-\E[\delta^{i,k}_{h}|x_{h-1}^{k},a_{h-1}^{k}]=0
    %\end{align*}
    %(where we have the notation
    %$\E_{x'\sim\P_{h-1}(\cdot|x_{h-1}^{k},a_{h-1}^{k})}[g(x')]\equiv\E[g(x')|x_{h-1}^{k},a_{h-1}^{k}]$ for any function $g:\S\to\R$).
    \item $|\xi^k_h|\leq 
    |\Pe_{h-1}\hat{V}^k_h(x^{k}_{h-1}, a^{k}_{h-1})| +|\hat{V}^k_h(x^k_h)|\leq 4\beta H <\infty$
    since $\hat{V}^k_h(x)\in[0,2\beta H]$ for any $x\in\S$.
\end{enumerate}
Therefore, we can use the Azuma-Hoeffding inequality to conclude that, for any $\epsilon > 0$,
\begin{equation*}
\Pr \left(\sum_{k=1}^{K}\sum_{h=1}^H  \xi^{k}_{h}> \epsilon \right) \leq \exp \bigg (\frac{-2 \epsilon^2 } {(KH)(16\beta^2H^2) } \bigg ).
\end{equation*}
We choose $\epsilon=\sqrt{8KH^3\beta^2\log\left(\frac{1}{\delta}\right)}$. Then, with probability at least $1 -\delta$,   
\begin{equation}
\label{eq:final2-app}
  \textrm{(I)}=\sum_{k=1}^{K}\sum_{h=1}^H  \xi^k_{h}\leq \sqrt{8KH^3\beta^2\log\left(\frac{1}{\delta}\right)} \leq 8\beta H\sqrt{KH\iota}, 
\end{equation}
recalling that $\iota = \log\left(\frac{dKH}{\delta}\right)$. We call $\bar{\mathcal{E}}$ the event such that~\eqref{eq:final2-app} holds.

The term (II) can be analyzed in a very similar way as in (I) to show that $\{\delta^k_h\}_{(k,h)\in\L^\star}$ is a martingale difference sequence, and thus obtain that with probability at least $1-\delta$,
\begin{equation}
\label{eq:final3-app}
  \textrm{(II)}=\sum_{k=1}^{K}\sum_{h=1}^H  \delta^k_{h}\leq 8\beta H\sqrt{KH\iota}. 
\end{equation}
We call $\tilde{\mathcal{E}}$ the event such that~\eqref{eq:final3-app} holds.

We now analyze the term (III) from~\eqref{eq:regret_prev_p_app}. Then for a fixed $h\in[H]$,
$$
%\sum_{i=1}^{n}\sum_{h=1}^{H}\left(
\sum_{k=1}^{K}(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h
%\right)
%
\leq 2\log\left[\frac{\det(\Lambda_h^{K+1})}{\det(\lambda I_d)}\right]
$$
%
%\beta \sum_{k=1}^{K}\I[\mathcal{D}_t^c]\sum_{p=1}^{P}\sum_{h=1}^H \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}
%
where the inequality follows from the so-called elliptical potential lemma~\citep[Lemma~11]{YAY-DP-CS:11}, whose conditions are satisfied from our bounded sequence $\{\phi_h^{k}\}_{k=1}^K$ and the fact that the minimum eigenvalue of $\Lambda_h^k$ is lower bounded by $\lambda=1$ for every $(h,k)\in[H]\times[K]$. Now, we have that $\Lambda_h^{K+1}$ is a positive definite matrix whose maximum eigenvalue can be bounded as 
$\norm{\Lambda_h^{K+1}}\leq \norm{\sum_{k=1}^K\phi^{k}_h(\phi^{k}_h)^\top}+\lambda\leq K+\lambda$, and so $\det(\Lambda^{K+1}_h)\leq \det((K+\lambda)I_d)=(K+\lambda)^d$. We also have that $\det(\lambda I_d)=\lambda^d$. Then, we obtain that
\begin{equation}
\label{eq:aux_last_1}
\sum_{k=1}^{K}(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h
%
\leq
2\log\left[\frac{K+\lambda}{\lambda}\right]^d=
2d\log(K+1)\leq 2d\iota,
\end{equation}
where the last inequality holds since $\log(K+1)\leq \log\left(\frac{dKH}{\delta}\right)=\iota$ for $d\geq 2$, $\delta>0$. 

Now, going back to term (III), 
%we assume the event $\bigcap_{i=1}^n\bar{\mathcal{E}}_i$ holds, which, after realizing that $\Pe[\text{not }  \bar{\mathcal{E}}_i]\leq \delta$ and applying union bound, holds with probability at least $1-n\delta$,
%%Then, taking the second term in~\eqref{eq:double_round_fact},   
\begin{equation}
\label{eq:aux_last_2-app}
\textrm{(III)}=2\beta \sum_{h=1}^H\sum_{k=1}^{K} \sqrt{(\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}
%
  \overset{\textrm{(a)}}{\leq}2\beta \sum_{h=1}^H\sqrt{K}\sqrt{ \sum_{k=1}^K (\phi^{k}_h)^\top (\Lambda^k_h)^{-1}\phi^{k}_h}
%
\overset{(b)}{\leq} 2\beta H \sqrt{2dK\iota}, 
%
%\overset{(b)}{\leq} 2\beta H\sqrt{dK\iota},
\end{equation}
where (a) follows from the Cauchy--Schwarz inequality, and (b) from~\eqref{eq:aux_last_1}.
% 

Now, using the results in~\eqref{eq:final2-app}, \eqref{eq:final3-app}, and~\eqref{eq:aux_last_2-app} back in~\eqref{eq:regret_prev_p_app}, we conclude that,
%
\begin{multline}
\label{eq:regret_almost_l}
\textnormal{Regret}(K) \leq  
8\beta H\sqrt{KH\iota} + 8\beta H\sqrt{KH\iota}
+ 
2\beta H \sqrt{2dK\iota}\\
%
=16c_\beta\sqrt{d^2 K H^5\iota^2}
+ 
2\sqrt{2}c_\beta\sqrt{ d^3KH^4\iota^2}
%
\overset{\textrm{(a)}}{\leq} 19c_\beta\sqrt{d^3KH^5\iota^2},
\end{multline}
%
where (a) follows from $\sqrt{\iota}\leq \iota$ which follows from equation~\eqref{eq:iota_bound}.

Finally, %since $\Pe[\text{not }  \bigcap_{i\in[n]}\mathcal{E}_i]\leq n\delta$ and $\Pe[\text{not } \bigcap_{i\in[n]}\bar{\mathcal{E}}_i]\leq n\delta$, %
applying a union bound lets us conclude that 
%$\Pe[\bigcap_{i\in[n]}\mathcal{E}_i\cap\bigcap_{i\in[n]}\bar{\mathcal{E}}_i]\geq 1-2n\delta
%$,
$\Pe[\bigcap_{i\in[n]}\mathcal{E}_i\cap\bar{\mathcal{E}}\cap\tilde{\mathcal{E}}]\geq 1-(n+2)\delta
$,
i.e., our final result holds with probability at least $1-(n+2)\delta
$. This finishes the proof of Theorem~\ref{thm:main-nashQ}.\qed
%
%================================================================
%================================================================ 

\section{REMAINING PROOFS}
\label{sec:remaining-proofs}

\begin{proof}[Proof of Lemma~\ref{lem:self_norm_covering}]
First, from our assumptions, for any $g \in \G$, there exists a $\tilde{g}$ in the $\epsilon$-covering such that
$g = \tilde{g} + \Delta_g$ with $\sup_{x\in\S} |\Delta_g(x)| \leq \epsilon$. Then,
\begin{equation}
\label{eq:conc-aux}
\begin{aligned}
&\norm{\sum_{\tau = 1}^A \phi_{\tau} \{ g(x_\tau) - \E[g(x_\tau)|\F_{\tau-1}] \}  }^2_{\Lambda_{A}^{-1}}\\
&\quad\leq  2\underbrace{\norm{\sum_{\tau = 1}^A \phi_{\tau} \{ \tilde{g}(x_{\tau}) - \E[\tilde{g}(x_{\tau})|\F_{\tau-1}]\} }^2_{\Lambda_{A}^{-1}}}_{\textrm{(I)}}
%&\quad+
+2\underbrace{\norm{\sum_{\tau = 1}^A \phi_{\tau} \{ \Delta_g(x_{\tau}) - \E[\Delta_g(x_{\tau})|\F_{\tau-1}] \} }^2_{\Lambda_{A}^{-1}}}_{\textrm{(II)}},
\end{aligned}
\end{equation}
where we used $\norm{a+b}\leq \norm{a}+\norm{b}\implies \norm{a+b}^2\leq \norm{a}^2+\norm{b}^2+2\norm{a}\norm{b}\leq 2\norm{a}^2+2\norm{b}^2$ for any $a,b\in\R^d$, and which actually holds for any weighted Euclidean norm. 

We start by analyzing the term (I) in equation~\eqref{eq:conc-aux}. Let $\varepsilon_{\tau}:=\tilde{g}(x_{\tau}) - \E[\tilde{g}(x_{\tau})|\F_{\tau-1}]$. Now, we observe that 1) $\E[\varepsilon_{\tau}|\F_{\tau-1}]=0$ and 2) $\varepsilon_{\tau}\in[-H,H]$ since $\tilde{g}(x_{\tau})\in[0,H]$. From these two facts we obtain that $\varepsilon_{\tau}|\F_{\tau-1}$ is $H$-sub-Gaussian. Therefore we can apply the concentration bound of self-normalized processes from Theorem~1 of~\citep{YAY-DP-CS:11} along with a union bound over the $\epsilon$-covering of $\G$ to conclude that, with probability at least $1-\delta$,
\begin{multline}
\label{eq:upp-b-lem-aux1}
\textrm{(I)}=\norm{\sum_{\tau=1}^A\phi_{\tau}\varepsilon_{\tau}}_{\Lambda_{A}^{-1}}^2\leq 2H^2\log\left(\frac{\det(\Lambda_{A})^{1/2}\det(\lambda I_d)^{-1/2}}{\delta/\N_\epsilon}\right)
\overset{\textrm{(a)}}{\leq} 2H^2\left(
\frac{d}{2}\log\left(\frac{\lambda+AB/d}{\lambda}\right)+\log\left(\frac{\N_\epsilon}{\delta}\right)
\right),
\end{multline}
where (a) follows from $\det(\lambda I_d)=\lambda^d$ and from the determinant-trace inequality from Lemma~10 in~\citep{YAY-DP-CS:11} which lets us obtain $\det(\Lambda_{A})\leq(\lambda+AB/d)^d$.

Now we analyze the term (II) in equation~\eqref{eq:conc-aux}. Let $\bar{\varepsilon}_{\tau}:=\Delta_g(x_{\tau}) - \E[\Delta_g(x_{\tau})|\F_{\tau-1}]$. Then, 
\begin{equation*}
\norm{\sum_{\tau=1}^A\phi_{\tau}\bar{\varepsilon}_{\tau}}\leq \sum_{\tau=1}^A\norm{\phi_{\tau}\bar{\varepsilon}_{\tau}}\overset{\textrm{(a)}}{\leq} \sum_{\tau=1}^A|\bar{\varepsilon}_{\tau}|
\leq \sum_{\tau=1}^A\bigl(|\Delta_g(x_{\tau})|+|\E[\Delta_g(x_{\tau})|\F_{\tau-1}]|\bigr)\leq \sum_{\tau=1}^A 2\epsilon=2A\epsilon,
\end{equation*}
where (a) follows from $\norm{\phi_{\tau}}\leq 1$. Thus, using this result, we obtain
$$
\textrm{(II)}\leq\frac{1}{\lambda}\norm{\sum_{\tau=1}^A\phi_{\tau}\bar{\varepsilon}_{\tau}}^2\leq \frac{1}{\lambda}4A^2\epsilon^2.$$

We finish the proof by multiplying the bounds for the terms (I) and (II) by two and adding them up, which yields an upper bound on the left-hand side of~\eqref{eq:conc-aux}.
%
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:wn_estimate}]
For any vector $v \in \R^d$,
\begin{align*}
%|v^\top w^{i,k}_h| & = |v^\top (\Lambda^k_h)^{-1} \sum_{\tau=1}^{k-1} \phi^{\tau}_h [r^{\tau}_h + \max_{a\in\A} Q_{h+1}^{i,k}(x^{\tau}_{h+1}, a)]|\\
|v^\top w^{i,k}_h| & = |v^\top (\Lambda^k_h)^{-1} \sum_{\tau=1}^{k-1} \phi^{\tau}_h [r^i_h + \max_{\substack{a\sim\pi^*\\\pi^*\text{ as in line 7 of Algorithm~1}}} Q_{h+1}^{i,k}(x^{\tau}_{h+1}, a)]|\\
%
& \overset{\textrm{(a)}}{\leq}(1+H)\sum_{\tau = 1}^{k-1}  |v^\top (\Lambda^k_h)^{-1} \phi^{\tau}_h|\\
%
&\overset{(b)}{\leq} (1+H)\sqrt{ \bigg[ \sum_{\tau = 1}^{k-1}  v^\top (\Lambda^k_h)^{-1}v\bigg]  \biggl [ \sum_{\tau = 1}^{k-1}  (\phi^\tau_h)^\top (\Lambda^k_h)^{-1}\phi^\tau_h\bigg] }\\
& \overset{\textrm{(c)}}{\leq} (1+H)\sqrt{d}\sqrt{\sum_{\tau = 1}^{k-1}  v^\top (\Lambda^k_h)^{-1}v}\\
&\overset{\textrm{(d)}}{\leq}(1+H)\sqrt{\frac{d(k-1)}{\lambda}}\norm{v}, %
\end{align*}
where (a) follows from the bounded rewards and $Q^{i,k}_{h+1}(\cdot,\cdot)\leq H$; (b) from applying Cauchy-Schwarz twice as in the following series of inequalities: given $q = (q_1,\dots,q_m)$ and $p = (p_1,\dots,p_m)$ where $q_i$ and $p_i$ are vectors of the same arbitrary dimension, we have $\sum^m_{i=1}|q_i^\top p_i|\leq \sum^m_{i=1}\norm{q_i}\norm{p_i}\leq \sqrt{\sum^m_{i=1}\norm{q_i}^2}\sqrt{\sum^m_{i=1}\norm{p_i}^2}$; (c) follows from~\citep[Lemma~D.1]{CJ-ZY-ZW-MIJ:20}; and (d) from $(\Lambda_h^k)^{-1}\preceq \lambda^{-1}I_d$. The proof concludes by considering that $\norm{w^{i,k}_h} = \max_{v:\norm{v} = 1} |v^\top w^{i,k}_h|$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:stochastic_term}]
%
We obtain that, with probability at least $1-\delta$, $\delta\in(0,1)$,
\begin{equation}
    \label{eq:bound_aux_long}
\begin{aligned}
&\norm{\sum_{\tau = 1}^{k-1} \phi^{\tau}_h [V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau})]}_{(\Lambda^k_h)^{-1}}^2\\
&\overset{\textrm{(a)}}{\leq} 4H^2 \left[ \frac{d}{2}\log\biggl( \frac{\lambda+(k-1)/d}{\lambda}\biggr )  + \log\N_{\epsilon_i} + \log\frac{1}{\delta}
\right]  + \frac{8(k-1)^2\epsilon^2}{\lambda}\\
&\overset{(b)}{\leq} 4H^2 \left[ \frac{d}{2}\log\biggl( \frac{\lambda+(k-1)/d}{\lambda}\biggr )  
+ d\log \left(1+ \frac{4(1+H) \sqrt{d(k-1)}}{\epsilon\sqrt{\lambda}}\right) \right. \\
&\quad\left. + d^2 \log\left( 1 + \frac{8 d^{1/2}\beta^2}{\lambda\epsilon^2}\right) + \log\frac{1}{\delta}
\right]  + \frac{8(k-1)^2\epsilon^2}{\lambda}
\end{aligned}
\end{equation}
where (a) is a direct application of Lemma~\ref{lem:self_norm_covering}; and (b) follows from the realization that, from lines 9 and 10 in Algorithm~1, $V^{i,k}_{h+1}(\cdot)\in\mathcal{V}$ with $\mathcal{V}$ as in Lemma~\ref{lem:bound-import-app} and so we can use the bound on the covering number derived in such lemma with $L=(1+H) \sqrt{\frac{d(k-1)}{\lambda}}$ by using the bound from Lemma~\ref{lem:wn_estimate}.

Recalling that $\lambda = 1$ and $\beta=c_\beta dH\sqrt{\iota}$ with $\iota=\log(dKH/\delta)$ in the setting of Theorem~\ref{thm:main-nashQ}, we claim that, after setting $\epsilon = \frac{dH}{K}$ in our previous equation, there exists an absolute constant $C > 0$ independent of $c_\beta$ such that 
\begin{equation}
\label{eq:final-bound-1}
\norm{\sum_{\tau = 1}^{k-1} \phi^{\tau}_h [V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau})]}_{(\Lambda^k_h)^{-1}}^2 \leq C d^2  H^2 \log ((c_\beta+1)dKH/
\delta).
\end{equation}
Proving~\eqref{eq:final-bound-1} would conclude the proof.
%

We first introduce a couple of useful results:
\begin{align}
\label{eq:iota_bound}
&\iota=\log\left(\frac{dKH}{\delta}\right)\geq \log(dKH)\geq \log(4)>1,\\ 
%\end{equation}
%and so
%\begin{equation}
\label{eq:upp_low_bound}
& \log \left(\frac{(c_\beta+1)dKH}{\delta}\right)= 
\log(c_\beta+1)+\iota\geq \iota>1.
\end{align}
%
Replacing $\lambda =1$ and $\epsilon=\frac{dH}{K}$ in the right-hand side of~\eqref{eq:bound_aux_long} and doing some algebraic calculations lets us conclude that
%
\begin{equation}
\label{eq:aux-first-res}
\begin{aligned}
\eqref{eq:bound_aux_long}
&\leq 4d^2H^2\left[\log\left( 1+\frac{K}{d}\right)  
+ \log \left(1+ \frac{8K^{3/2}}{d^{1/2}}\right) + \log\left(\frac{1}{\delta}\left( 1 + \frac{8 \beta^2K^2}{d^{3/2}H^2}\right)\right) \right]\\
&\quad +8d^2H^2.
\end{aligned}
\end{equation}
Replacing $\beta=c_\beta dH\sqrt{\iota}$ in the previous expression and doing some algebraic work lets us obtain
%
\begin{equation}
\label{eq:to_upper_bound}
\begin{aligned}
\eqref{eq:aux-first-res}&\leq \underbrace{8d^2H^2\log \left(1+ \frac{8K^{3/2}}{d^{1/2}}\right)}_{\textrm{(I)}} + 
\underbrace{4d^2H^2\log\left(\frac{1}{\delta}\left( 1 + 8 c_\beta^2d^{1/2}\iota K^2\right)\right)}_{\textrm{(II)}}\\
&\quad 
+8d^2H^2\log\left(\frac{(c_\beta+1)dKH}{\delta}\right)
\end{aligned}
\end{equation}
where the inequality has made use of~\eqref{eq:upp_low_bound}. We now upper bound the terms highlighted in~\eqref{eq:to_upper_bound}.
Then, 
\begin{align*}
\textrm{(I)}&\leq 8d^2H^2\log \left(1+ 8K^{3/2}\right)\\
&\overset{\textrm{(a)}}{\leq} 8d^2H^2\log \left(\frac{(1+c_\beta)^2(dKH)^2}{\delta^2}\right)+8d^2H^2\log(9)\log \left(\frac{(c_\beta+1)dKH}{\delta}\right)\\
&=(16+8\log(9))d^2H^2\log \left(\frac{(c_\beta+1)dKH}{\delta}\right),
%
\end{align*}
where (a) follows from~\eqref{eq:upp_low_bound} and $c_\beta>0$. Similarly,
%
\begin{align*}
\textrm{(II)}
&\overset{\textrm{(a)}}{\leq} 
4d^2H^2\log\left(\frac{8(c_\beta+1)^2\iota(dKH)^2}{\delta}\right)\\
&\overset{(b)}{\leq} 
4d^2H^2\log\left(\frac{(c_\beta+1)^2\iota(dKH)^2}{\delta^2}\right)+4d^2H^2\log(8)\\
&=
4d^2H^2\log\left(\frac{(c_\beta+1)^2(dKH)^2}{\delta^2}\right)+4d^2H^2\log(\iota)+4d^2H^2\log(8)\\
&\overset{\textrm{(c)}}{\leq} 8d^2H^2\log\left(\frac{(c_\beta+1)dKH}{\delta}\right)+4d^2H^2\iota+4d^2H^2\log(8)\\
&\overset{\textrm{(d)}}{\leq} (12+4\log(8))d^2H^2\log\left(\frac{(c_\beta+1)dKH}{\delta}\right)
%
\end{align*}
where (a) follows from $c_\beta>0$, (b) from $\delta^2<\delta$, (c) from $\log(\iota)<\iota$ (since $\iota>1$ from~\eqref{eq:iota_bound}), and (d) from $\iota\leq \log\left(\frac{(c_\beta+1)dKH}{\delta}\right)$ and from~\eqref{eq:upp_low_bound}.

Now, joining the upper bounds for (I) and (II) in~\eqref{eq:to_upper_bound}, we finally obtain
\begin{equation*}
\norm{\sum_{\tau = 1}^{k-1} \phi^{\tau}_h [V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau})]}_{(\Lambda^k_h)^{-1}}^2 \leq 
(36+8\log(9)+4\log(8))d^2H^2 \log ((c_\beta+1)dKH/
\delta)
\end{equation*}
which proves the claim and thus concludes the proof.
%
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:basic_relation}]
For any $(i,k)\in[n]\times[K]$,
\begin{align*}
w^{i,k}_h -  w^{i,\bar{\pi}}_h
&= (\Lambda^k_h)^{-1} \sum_{\tau = 1}^{k-1} \phi^{\tau}_h (r^{\tau}_h + V^{i,k}_{h+1}(x^{\tau}_{h+1}))- w^{i,\bar{\pi}}_h  \\
%
&\overset{\textrm{(a)}}{=} (\Lambda^k_h)^{-1} \sum_{\tau = 1}^{k-1} \phi^{\tau}_h ({\phi^{\tau}_h}^\top w^{i,\bar{\pi}}_h - \Pe_h V^{i,\bar{\pi}}_{h+1}(x_h^{\tau}, a_h^{\tau}) + V^{i,k}_{h+1}(x^{\tau}_{h+1}))- w^{i,\bar{\pi}}_h  \\
%
&= (\Lambda^k_h)^{-1} \left(
\left(
\sum_{\tau = 1}^{k-1}
\phi^{\tau}_h(\phi^{\tau}_h)^\top
-\Lambda_h^k\right)w^{i,\bar{\pi}}_h \right. \\
&\quad \left.  
 + \sum_{\tau = 1}^{k-1} \phi^{\tau}_h \bigl (V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,\bar{\pi}}_{h+1}(x_h^{\tau}, a_h^{\tau}) \bigr )\right) \\
%
&\overset{(b)}{=} (\Lambda^k_h)^{-1} \left(-\lambda w^{i,\bar{\pi}}_h + \sum_{\tau = 1}^{k-1} \phi^{\tau}_h (V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,\bar{\pi}}_{h+1}(x_h^{\tau}, a_h^{\tau}))\right) \\
%
&= \underbrace{-\lambda (\Lambda^k_h)^{-1} w^{i,\bar{\pi}}_h}_{\textrm{(I)}} + 
\underbrace{(\Lambda^k_h)^{-1} \sum_{\tau = 1}^{k-1} \phi^{\tau}_h (V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau}))}_{\textrm{(II)}} \\
&\quad + \underbrace{(\Lambda^k_h)^{-1}\sum_{\tau = 1}^{k-1} \phi^{\tau}_h \Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x_h^{\tau}, a_h^{\tau})}_{\textrm{(III)}}, %
\end{align*}
where (a) follows from the fact that,  for any $(x, a, h) \in \S \times \A \times [H]$, $Q^{i,\bar{\pi}}_h(x, a) :=  \langle \phi(x, a), w^{i,\bar{\pi}}_h\rangle = (r_h + \Pe_h V^{i,\bar{\pi}}_{h+1})(x, a) 
$ for some $w^{i,\bar{\pi}}_h\in\R^d$ (this follows from Proposition~\ref{prop:lin-Q} and the Bellman equation); and (b) follows from the definition of $\Lambda_h^k$. Since $\langle\phi(x, a), w^{i,k}_h\rangle - Q_h^{i,\bar{\pi}}(x, a)=\langle\phi(x, a), w^{i,k}_h-w^{i,\bar{\pi}}_h\rangle$ for any $(x,a)\in\S\times\A$, then we look to bound the inner product of each of the terms (I) -- (III) with the term $\phi(x, a)$.

Regarding the term (I),
\begin{multline*}
|\langle\phi(x,a),\textrm{(I)}\rangle|=|\langle \phi(x, a),\lambda (\Lambda^k_h)^{-1} w^{i,\bar{\pi}}_h\rangle| = |\lambda \langle (\Lambda^k_h)^{-1/2}\phi(x, a), (\Lambda^k_h)^{-1/2} w^{i,\bar{\pi}}_h\rangle|\\
\leq \lambda \norm{w_h^{i,\bar{\pi}}}_{ (\Lambda^k_h)^{-1}} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}
\leq \sqrt{\lambda} \norm{w_h^{i,\bar{\pi}}} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}
\end{multline*}
where the last inequality follows from $\norm{\,\cdot\,}_{(\Lambda_h^k)^{-1}}\leq \frac{1}{\sqrt{\lambda}}\norm{\,\cdot\,}$.

For the term (II), since the event $\mathcal{E}_i$ from Lemma~\ref{lem:stochastic_term} is given and $\lambda=1$, we directly obtain
\begin{multline*}
|\langle\phi(x,a),\textrm{(II)}\rangle|=\left|\left\langle \phi(x, a), (\Lambda^k_h)^{-1} \sum_{\tau = 1}^{k-1} \phi^{\tau}_h (V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau}))\right\rangle\right|
\\
%
\leq
\norm{\sum_{\tau = 1}^{k-1} \phi^{\tau}_h (V^{i,k}_{h+1}(x^{\tau}_{h+1}) - \Pe_h V^{i,k}_{h+1}(x_h^{\tau}, a_h^{\tau}))}_{(\Lambda^k_h)^{-1}}\norm{\phi(x,a)}_{(\Lambda^k_h)^{-1}}\\
%
\leq %\frac{1}{\sqrt{\lambda}} 
C dH\sqrt{\log ((c_\beta+1)dKH/\delta)} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)} 
\end{multline*}
where $C$ is an absolute constant independent of $c_\beta>0$.
%; where for the last inequality we used the assumption $\norm{\phi(x,a)}\leq 1$. 

For the term (III),
\begin{align*}
\langle\phi(x,a),\textrm{(III)}\rangle&=\left \langle \phi(x, a), (\Lambda^k_h)^{-1}\sum_{\tau = 1}^{k-1} \phi^{\tau}_h \Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x_h^{\tau}, a_h^{\tau}) \right \rangle\\
%
&= \bigg \langle \phi(x, a), (\Lambda^k_h)^{-1}\sum_{\tau = 1}^{k-1} \phi^{\tau}_h (\phi^{\tau}_h)^\top \int_{\S} (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x') d \mu_h(x')\bigg\rangle\\
%
&\overset{\textrm{(a)}}{=} \underbrace{\bigg \langle \phi(x, a), \int_{\S} (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x') d \mu_h(x')\bigg \rangle}_{\textrm{(III.1)}}\\
&\quad 
\underbrace{-\lambda \bigg\langle \phi(x, a), (\Lambda^k_h)^{-1}\int_{\S} (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x') d \mu_h(x') \bigg\rangle}_{\textrm{(III.2)}}
\end{align*}
where (a) follows from the definition of $\Lambda_h^k$. We immediately see from our assumption on linear stochastic game that $\textrm{(III.1)}=\Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x, a)$ and 
\begin{multline*}
|\textrm{(III.2)}|
\leq \lambda\norm{\int_\S (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x')d \mu_h(x')}_{(\Lambda_h^k)^{-1}} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}\\
%
\leq \sqrt{\lambda}\norm{\int_\S (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x')d \mu_h(x')} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}\\
%
\overset{\textrm{(a)}}{\leq} \sqrt{\lambda}2H\int_\S \norm{\mu_h(x')} dx'  \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}
%
\overset{(b)}{\leq} 2 H \sqrt{d\lambda} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}
\end{multline*}
where (a) follows from the value functions being bounded, and (b) from the definition of the linear MDP.

Finally, putting it all together with $\lambda=1$, we conclude that, 
\begin{multline*}
|\langle\phi(x, a), w^{i,k}_h\rangle - Q_h^{i,\bar{\pi}}(x, a)  -  \Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x, a)| \\
%
\leq \left(\norm{w_h^{i,\bar{\pi}}}+CdH \sqrt{\log ((c_\beta+1)dKH/\delta)}+2H\sqrt{d}\right)\sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}\\
%
\leq\left(4H\sqrt{d}+CdH \sqrt{\log ((c_\beta+1)dKH/\delta)}\right)\sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)}
\end{multline*}
where the last inequality follows from Proposition~\ref{prop:lin-Q}.

Now, from equation~\eqref{eq:upp_low_bound} in Lemma~\ref{lem:stochastic_term}, we have $\sqrt{\log ((c_\beta+1)dKH/\delta)}>1$ independently from $c_\beta>0$, and thus
$$
|\langle\phi(x, a), w^{i,k}_h\rangle - Q_h^{i,\bar{\pi}}(x, a)  -  \Pe_h (V^{i,k}_{h+1} - V^{i,\bar{\pi}}_{h+1})(x, a)| \leq \bar{C} dH\sqrt{\log ((c_\beta+1)dKH/\delta)} \sqrt{\phi(x, a)^\top (\Lambda^k_h)^{-1}  \phi(x, a)},
$$
for an absolute constant $\bar{C}= C+4$ independent of $c_{\beta}$.

Finally, to prove this lemma, we only need to show that there exists a choice of the absolute positive constant $c_\beta$ so that
$
\bar{C}\sqrt{\log ((c_\beta+1)dKH/\delta)}\leq c_\beta \sqrt{\iota}
$, which is equivalent to
%
\begin{equation} \label{eq:choice_beta_constant}
\bar{C}\sqrt{\iota + \log(c_\beta + 1)} \le c_\beta \sqrt{\iota}
\end{equation}
since $\sqrt{\log\left(\frac{(1+c_\beta)dKH}{\delta}\right)}=\sqrt{\log\left(\frac{dKH}{\delta}\right)+\log(1+c_\beta)}=\sqrt{\iota+\log(1+c_\beta)}$.

Two facts are known: 1) $\iota \in [\log(2), \infty)$ by its definition and $d\geq 2$; and 2) $\bar{C}$ is an absolute constant independent of $c_\beta$. 

Since we know we are looking for  $c_\beta>0$ and using the bound $\log(x)\leq x-1$ for any positive $x\in\R$, we conclude that proving the following equation implies~\eqref{eq:choice_beta_constant},
\begin{equation} \label{eq:choice_beta_constant_2}
\bar{C}\sqrt{\iota+c_\beta}\leq c_\beta\sqrt{\iota}.
\end{equation}
%Since both sides are nonegative, we square them and obtain that it becomes equivalent to showing that $c_\beta$ satisfies $0\leq \iota c_\beta^2-\bar{C}^2c_\beta-\bar{C}^2\iota$, and solving this quadratic expression let us conclude that this is satisfied if $c_\beta\geq g(\iota)$ with $g(\iota)=\frac{\bar{C}^2}{2\iota}+\frac{1}{2}\sqrt{\frac{\bar{C}^4}{\iota^2}+4\bar{C}^2}$. We now observe that $\iota\mapsto g(\iota)$ is a non-increasing function for $\iota \in [\log(2), \infty)$; therefore, if we want~\eqref{eq:choice_beta_constant_2} (and so~\eqref{eq:choice_beta_constant} to hold for any $\iota \in [\log(2), \infty)$, it suffices to choose 
%
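Since both sides of~\eqref{eq:choice_beta_constant_2} are nonnegative, squaring them shows that it is equivalent to the quadratic inequality $\iota c_\beta^2-\bar{C}^2c_\beta-\bar{C}^2\iota\geq 0$, which holds whenever $c_\beta\geq \frac{\bar{C}^2}{2\iota}+\frac{1}{2}\sqrt{\frac{\bar{C}^4}{\iota^2}+4\bar{C}^2}$. Since this threshold is non-increasing in $\iota$ and $\iota\geq\log(2)$, choosing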
\begin{equation}
\label{eq:c_beta_lower}
c_\beta \geq \frac{\bar{C}^2}{2\log(2)}+\frac{1}{2}\sqrt{\frac{\bar{C}^4}{(\log(2))^2}+4\bar{C}^2}    
\end{equation}
suffices. This finishes the proof.
\end{proof}

\bibliography{NashQ}