\section{Proof of Regret Upper Bound}\label{sec:app:ftrl}
To start with, notice that $\Pi_{\max}$ is an affine subspace of $\R^{XA}_{\geq 0}$ satisfying $X$ linear constraints: for any $x_h\in\gX$, 
\begin{equation*}
    \sum_{a_h\in\gA}\mu_{1:h}(x_h,a_h)=\mu_{1:h-1}(x_{h-1},a_{h-1})\,,
\end{equation*}where $(x_{h-1},a_{h-1})$ is the unique predecessor of $x_h$ under the perfect recall condition.
Thus $\Pi_{\max}$ can be decomposed as $\Pi_{\max }=(F+u) \cap \mathbb{R}_{\geq 0}^{XA}$ where $F$ is a linear subspace and $u\in \Pi_{\max}$. 

With a slight abuse of notation, we further denote $\Psi_{\eta}(\mu)= \frac{1}{\eta}\sumH\Psi_h\left(p^\star_{1:h}\cdot\mu_{1:h}\right)$ and define its convex conjugate function $\Psi^{\star}_{\eta}$ on $\R^{XA}_{\geq 0}$ as
\begin{equation}\label{eq:conjugate}
    \Psi^{\star}_{\eta} (\vy)\coloneqq \sup_{\vx\in \R^{XA}_{\geq 0}} \left\langle \vx,\vy\right \rangle -\Psi_{\eta}(\vx)\,.
\end{equation}
Also, we denote $D_{\Psi^{\star}_{\eta}}(\vx,\vy)= \Psi^{\star}_{\eta}(\vx)-\Psi^{\star}_{\eta}(\vy)-\left\langle\nabla\Psi^{\star}_{\eta}(\vy),\vx-\vy\right\rangle$ as the Bregman divergence induced by $\Psi^{\star}_{\eta}$.
The following lemma shows the canonical regret decomposition of the FTRL framework~\citep{zimmert2019optimal,lattimore2020bandit}.
\begin{lemma}\label{lemma:reg_decompose_ftrl}
    The regret of \LSFTRL can be decomposed as
    \begin{equation*}\label{eq:ftrl_reg}
        \Reg_{\max}^T\leq \underbrace{\max_{\mu\in\Pi_{\max}}\left[-\Psi_{\eta}(\mu)\right]}_{\textsc{Penalty}}
        +\underbrace{\E\left[\sum_{t=1}^TD_{\Psi^\star_{\eta}}(\nabla\Psi_{\eta}(\mu^t)-\hat{\ell}^t, \nabla\Psi_{\eta}(\mu^t))\right]}_{\textsc{Stability}}\,.
    \end{equation*}
\end{lemma}
\begin{proof}
Let $\mu^{\dagger} \in \Pi_{\max }$ be some policy (in the sequence-form representation). For all $t \in [T]$, the instantaneous regret against $\mu^{\dagger}$ at step $t$ can be decomposed into
\begin{align*}
\left\langle\mu^{t}-\mu^{\dagger}, \hat{\ell}^{t}\right\rangle 
& =\left[\Phi_{\eta}\left(-\hat{L}^{t-1}\right)-\Phi_{\eta}\left(-\hat{L}^{t}\right)-\left\langle\mu^{\dagger}, \hat{\ell}^{t}\right\rangle\right]+\left[\left\langle\mu^{t}, \hat{\ell}^{t}\right\rangle+\Phi_{\eta}\left(-\hat{L}^{t}\right)-\Phi_{\eta}\left(-\hat{L}^{t-1}\right)\right]\,,
\end{align*}
where $\Phi_{\eta}(\vy)\coloneqq\sup _{\vmu \in \Pi_{\max }}\left\langle\vmu, \vy\right\rangle-\Psi_{\eta}\left(\vmu\right)$.

Taking summation of the above display over $t$ yields 
\begin{align*}
&\sum_{t=1}^{T}\left[\Phi_{\eta}\left(-\hat{L}^{t-1}\right)-\Phi_{\eta}\left(-\hat{L}^{t}\right)-\left\langle\mu^{\dagger}, \hat{\ell}^{t}\right\rangle\right] \\
=& \Phi_{\eta}(0)-\Phi_{\eta}\left(-\hat{L}^{T}\right)-\left\langle\mu^{\dagger}, \hat{L}^{T}\right\rangle \\
\overset{(i)}{\leq}&  \max _{\mu \in \Pi_{\max }}\left[-\Psi_{\eta}\left(\mu\right)\right]+\Psi_{\eta}\left(\mu^{\dagger}\right) \\
\overset{(ii)}{\leq}&  \max _{\mu \in \Pi_{\max }}\left[-\Psi_{\eta}\left(\mu\right)\right]\,,
\end{align*}
where $(i)$ comes from $\mu^{\dagger} \in \Pi_{\max }$; and $(ii)$ is due to the fact that $\Psi_{\eta}$ is a non-positive function.

On the other hand, since
$\Pi_{\max }=(F+u) \cap \mathbb{R}_{\geq 0}^{XA}$, we have
\begin{align*}
&\left\langle\mu^{t}, \hat{\ell}^{t}\right\rangle+\Phi_{\eta}  \left(-\hat{L}^{t}\right)-\Phi_{\eta}\left(-\hat{L}^{t-1}\right) \\
\overset{(i)}{=}&\left\langle\mu^{t}, \hat{\ell}^{t}\right\rangle+\Phi_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)+\vg^t-\hat{\ell}^{t}\right)-\Phi_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)+\vg^t\right) \\
\overset{(ii)}{=}&\left\langle\mu^{t}, \hat{\ell}^{t}\right\rangle+\Phi_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)-\hat{\ell}^{t}\right)-\Phi_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)\right) \\
\overset{(iii)}{\leq}& \left\langle\mu^{t}, \hat{\ell}^{t}\right\rangle+\Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)-\hat{\ell}^{t}\right)-\Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)\right) \\
\overset{(iv)}{=}&D_{\Psi^\star_{\eta}}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)-\hat{\ell}^{t}, \nabla \Psi_{\eta}\left(\mu^{t}\right)\right) \,,
\end{align*}
% \zhao{TBF: further clarifications are needed}
where $(i)$ follows from $\hat{L}^{t-1}+\nabla \Psi_{\eta}\left(\mu^{t}\right)+\vg^t=0$ for some $\vg^t \in F^{\perp}$; 
$(ii)$ is due to the fact that, for any $\vy \in \mathbb{R}^{XA}$, 
\begin{align*}
\Phi_{\eta}\left(\vy+\vg^t\right)
&=\sup _{\vmu \in (F+u) \cap \mathbb{R}_{\geq 0}^{XA}}\left\langle\vmu, \vy+\vg^t\right\rangle-\Psi_{\eta}\left(\vmu\right)\\
&= \left(\sup _{\vmu \in F \cap \mathbb{R}_{\geq 0}^{XA}}\left\langle\vmu, \vy+\vg^t\right\rangle-\Psi_{\eta}\left(\vmu\right)\right) + \left\langle u, \vy+\vg^t\right\rangle\\
&= \left(\sup _{\vmu \in F \cap \mathbb{R}_{\geq 0}^{XA}}\left\langle\vmu, \vy\right\rangle-\Psi_{\eta}\left(\vmu\right)\right) + \left\langle u, \vy+\vg^t\right\rangle\\
& = \left(\sup _{\vmu \in (F+u) \cap \mathbb{R}_{\geq 0}^{XA}}\left\langle\vmu, \vy\right\rangle-\Psi_{\eta}\left(\vmu\right)\right)+\left\langle u, \vg^t\right\rangle\\
 &= \Phi_{\eta}(\vy)+\left\langle u, \vg^t\right\rangle\,;
\end{align*}
$(iii)$ is by the observation that $\forall \vy \in \R^{XA}, \Phi_{\eta}(\vy) \leq \Psi^{\star}_{\eta}(\vy)$ and 
$\mu^{t}=\operatorname{argmax}_{\vx \in \R_{\geq 0}^{XA}}\left\langle \vx, \nabla \Psi_{\eta}\left(\mu^{t}\right)\right\rangle-\Psi_{\eta}(\vx)$ which implies that $\Phi_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)\right)=\Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^{t}\right)\right)$;
and $(iv)$ comes from the definition of $D_{\Psi^{\star}_{\eta}}(\vx,\vy)$.
\end{proof}

% The following lemma shows that the \textsc{Stability} term can be bounded by the variance of the loss estimate, which is the expected version of Lemma E.6 in~\citet{Fiegel2023adapting}. We also present the proof here for completeness.
% \begin{lemma}\label{lemma:varbound}
% Let $v_t=D_{\Psi^{\star}_{\eta}}(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right))$ for all $t \in[T]$, which means that $\textsc{Stability} =\E\left[\sum_{t=1}^T v_t \right]$.
% Then, it holds that
% \[\E\left[\sum_{t=1}^T v_t \right] \leq \E\left[\frac{\eta}{2}\sum_{t=1}^T\sumH\sumlevel\frac{1}{p^\star_{1:h}(x_h)}\E^{\mu^t,\nu^t}\left[\muxa\hat{\ell}^t(x_h,a_h)^2\right] \right]\,.\]
% \end{lemma}
% \begin{proof}
% To begin with, for all $t \in[T]$, we define
% \[f_t(u)=D_{\Psi^{\star}_{\eta}}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-u \hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right)\right)\,,\]
% for $u \in[0,1]$, such that $f_t(0)=0$ and $f_t(1)=v_t$.  
% Also notice that $\operatorname{dom}(\Psi^\star_{\eta})=\R^{XA}_{\geq 0}$ 
% and both $\Psi_{\eta}$ and $\Psi^\star_{\eta}$ can be decomposed according to each infoset-action pair $(x_h,a_h)$. 
% Specifically, we have
% \[\Psi_{\eta}\left(\mu\right)=\sumH \sumlevel \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}\left(x_h, a_h\right)\right)\,,\] 
% and 
% \[\Psi^{\star}_{\eta}(y)=\sumH \sumlevel \Psi_{\eta, x_h, a_h}^{\star}\left(y\left(x_h, a_h\right)\right)\,.\] 

% Then the derivative of $f_t$ can be expressed as
% \begin{align}\label{eq:proof_ftrl_stability1}
%     f_t^{\prime}(u)=\sumH \sumlevel \hat{\ell}_h^t\left(x_h, a_h\right)\left[\mu_{1: h}^t\left(x_h, a_h\right)-\nabla \Psi_{\eta, x_h, a_h}^{\star}\left(\nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)\right] \,.
% \end{align}

% Moreover, recall that we choose negative entropy as the potential function. Therefore, it holds that
% \begin{align*}
% \nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}\left(x_h, a_h\right)\right) & =\frac{p_{1: h}^{\star}\left(x_h\right)}{\eta}\left[\log \left(p_{1: h}^{\star}\left(x_h\right) \mu_{1: h}\left(x_h, a_h\right)\right)+1\right] \,,\\
% \nabla \Psi_{\eta, x_h, a_h}^{\star}\left(y\left(x_h, a_h\right)\right) & =\exp \left[\frac{\eta}{p_{1: h}^{\star}\left(x_h\right)}\left(y\left(x_h, a_h\right)\right)-1-\log \left(p_{1: h}^{\star}\left(x_h\right)\right)\right]\,,
% \end{align*}
% and
% \begin{align}\label{eq:proof_ftrl_stability2}
% \nabla \Psi_{\eta, x_h, a_h}^{\star}  &\left(\nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)\notag \\
% &= \exp \left[\frac{\eta}{p_{1: h}^{\star}\left(x_h\right)}\left(\frac{p_{1: h}^{\star}\left(x_h\right)}{\eta} \log \left(p_{1: h}^{\star}\left(x_h\right) \mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)-\log \left(p_{1: h}^{\star}\left(x_h\right)\right)\right]\notag \\
% &= \mu_{1: h}^t\left(x_h, a_h\right) \exp \left[-u \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\right]\notag \\
% &\geq \mu_{1: h}^t\left(x_h, a_h\right)\left[1-u \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\right] \,,
% \end{align}
% where the last inequality follows from $e^{-x}\ge 1-x$ for all $x \in \mathbb{R}$. 

% Substituting \Eqref{eq:proof_ftrl_stability2} into \Eqref{eq:proof_ftrl_stability1} shows that 
% \begin{align*}
%     f_t^{\prime}(u) \leq u \sumH \sumlevel \hat{\ell}_h^t\left(x_h, a_h\right) \mu_{1: h}^t\left(x_h, a_h\right) \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\,.
% \end{align*}
% The proof is concluded by integrating the above display from $0$ to $1$ over $u$ and taking the expectation on both sides.
% \end{proof}





% \subsection{Proof of Theorem \ref{thm:ftrl_trans}}\label{app:ftrl_trans}
% In this section, we provide the proof of Theorem \ref{thm:ftrl_trans}, which takes $p^{\star}$ as the transition probability function over infoset-action pairs.
We are now ready to prove Theorem \ref{thm:ftrl_trans}.

\begin{proof}[Proof of Theorem \ref{thm:ftrl_trans}]
Combining Lemmas~\ref{lemma:reg_decompose_ftrl}, \ref{lemma:reg_trans}, and \ref{lem:ftrl_stability_trans}, with $p^{\star}$ computed in Algorithm~\ref{algo:compute_pstar}, we have that 
\begin{align}
\Reg_{\max}^T &\le \textsc{Penalty}+\textsc{Stability}\nonumber\\
&\leq \frac{H}{\eta}\log \left(XA\right) + 
\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\lambda dHT\,,\label{eq:ftrl_trans}
\end{align}
% which along with choosing $\eta=\sqrt{\frac{2\log (AX)}{Td\lambda}}$ finishes the proof.
which along with choosing $\eta=\sqrt{\frac{2\log (AX)}{Td}}$ finishes the proof.
\end{proof}

% \zhao{TBF}
% Denote $\beta=\min _{h \in[H], x_h \in \mathcal{X}_h} p_{1: h}^{\star}\left(x_h\right)$ with $p^{\star}$ defined in Eq. \eqref{eq:p_star}.
% We note that leveraging $\beta$ as well as Assumption \ref{ass:ftrl_trans} necessitates identifying a transition probability function $p^\star$ with its minimum visitation probability achieving $\beta$.
% Finding such $p^\star$ is done by the procedure illustrated in Appendix \ref{app:maxlambda}.
% \zhao{TBF}

\subsection{Bounding the \textsc{Penalty} Term}
% The lemma below directly follows from Lemma E.5 of \citet{Fiegel2023adapting}, with its proof provided here for completeness.
The lemma below upper bounds the \textsc{Penalty} term.
\begin{lemma}\label{lemma:reg_trans}
For any fixed learning rate $\eta$ and transition probability $p^{\star}$ over infoset-action space, it holds that
\[
\textsc{Penalty} \leq \frac{H}{\eta} \log \left(XA\right)\,.
\]
\end{lemma}

\begin{proof}
It is clear that 
\begin{align*}
-\Psi_{\eta}(\mu)= -\frac{1}{\eta}\sum_{h=1}^{H}  \Psi_{h}\left(p_{1: h}^{\star} \cdot \mu_{1: h}\right) \overset{(i)}{\leq} \frac{1}{\eta}\sum_{h=1}^{H}  \log \left(X_hA\right) \leq \frac{1}{\eta}\sum_{h=1}^{H}  \log \left(XA\right)=\frac{H}{\eta} \log \left(XA\right) \,,
\end{align*}
where $(i)$ comes from Lemma \ref{lemma:transition}.
\end{proof}

\subsection{Bounding the \textsc{Stability} Term}\label{sec:app:ftrl_stability}
% Recall $\beta^\star_h=\min_{x_h\in\gX_h} p^{\star}_{1:h}(x_h)$ and $\beta^{\nu}_h=\max_{t\in[T],x_h\in\gX_h}p^{\nu^t}_{1:h}(x_h)$.
% The following lemma shows that the \textsc{Stability} term can be bounded by the variance of the loss estimate, which is the expected version of Lemma E.6 in \citet{Fiegel2023adapting}. We also present the proof here for completeness.
% \zhao{
% In the following, we define $\beta^\star_h\coloneqq\min_{x_h\in\gX_h} p^{\star}_{1:h}(x_h)$. 
% Besides, we let $\beta^{\nu}_h\coloneqq\max_{t\in[T],x_h\in\gX_h}p^{\nu^t}_{1:h}(x_h)$.
% Note that $\beta^\star_H=\min_{x_H\in\gX_H} p_{1:H}^\star(x_H)$ due to the ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar}.
% In the following, we define $\beta^\star_H\coloneqq\max_{\tilde{p}\in \sP^\star}\min_{h\in[H],x_h\in \gX_h}\tilde{p}_{1:h}(x_h)$. Note that $\beta^\star_H=\min_{x_H\in\gX_H} p_{1:H}^\star(x_H)$ due to the ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar}.
% }
% \zhao{
Before bounding the stability term, we first introduce the following lemma, which bounds the magnitude of the loss estimate.
\begin{lemma}\label{lem:loss_bound}
    For any $h\in[H]$ and any $(x_h,a_h)\in\gX_h\times \gA$, it holds that $|\hatellxa|\leq \frac{L^2}{\rho}$.
\end{lemma}
\begin{proof}
First notice that for any $\nu^t$ and any $(x_h,a_h)\in\gX_h\times \gA$, we have
\begin{align}\label{eq:phixa_bounded_norm}
    &\left\|\vphixa\right\|_2\notag\\
    =&\left\| -\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\vphi(s_h,a_h,b_h)\right\|_2\notag\\
    \leq&\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\left\|\vphi(s_h,a_h,b_h)\right\|_2\notag\\
    \overset{(i)}{\le}&L\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\notag\\
    \overset{(ii)}{\le}&L\,,
\end{align}
where $(i)$ is due to Assumption \ref{assumption:linear}; and $(ii)$ follows from the proof of Lemma 2 by \citet{kozuno2021learning}.

% ---------------------------------- 2024.08.05 ----------------------------------
% Recall that $\mu^{t}=(1-\gamma)\hat{\mu}^{t}+\gamma\pi$. Let $\Phi_h^t\coloneqq \left\{\vphi^{\nu^t}(x_h,a_h)\right\}_{(x_h,a_h)\in \gX_h\times\gA}$. It is then clear that
% \begin{align*}
% |\hatellxa|& =|\vphi^{\nu^t}(x_h,a_h)^\top(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)|\\
% &\overset{(i)}{\le}|\vphi^{\nu^t}(x_h,a_h)^\top(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)|\\
% &\overset{(ii)}{\le} \|\vphi^{\nu^t}(x_h,a_h)\|_{(\mQ_{\mu^t,h}^t)^{-1}}\cdot \sup_{\vphi\in \Phi_h^t}\|\vphi\|_{(\mQ_{\mu^t,h}^t)^{-1}}\\
% &\le \sup_{\vphi\in\Phi_h^t}\|\vphi\|^2_{(\mQ_{\mu^t,h}^t)^{-1}}\\
% &\le \sup_{\vphi\in\Phi_h^t}\|\vphi\|^2_{(\gamma\mQ_{\pi,h}^t)^{-1}}\\
% &\le \sup_{\vphi\in\Phi_h^t}\|\vphi\|^2_{(\gamma\rho\mI)^{-1}}\\
% &\overset{(iii)}{\le} \frac{1}{\gamma\rho}\,,
% \end{align*}
% where $(i)$ is because $|r_h(s_h^t,a_h^t,b_h^t)|\leq 1$;
%  $(ii)$ is by the Cauchy-Schwarz inequality; and $(iii)$ comes from Eq. \eqref{eq:phixa_bounded_norm}.
% ---------------------------------- 2024.08.05 ----------------------------------

% Recall that $\mu^{t}=(1-\gamma)\hat{\mu}^{t}+\gamma\pi$. 
Let $\Phi_h^t\coloneqq \left\{\vphi^{\nu^t}(x_h,a_h)\right\}_{(x_h,a_h)\in \gX_h\times\gA}$. It is then clear that
\begin{align*}
|\hatellxa|& =|\vphi^{\nu^t}(x_h,a_h)^\top(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)|\\
&\overset{(i)}{\le}|\vphi^{\nu^t}(x_h,a_h)^\top(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)|\\
&\overset{(ii)}{\le} \|\vphi^{\nu^t}(x_h,a_h)\|_{(\mQ_{\mu^t,h}^t)^{-1}}\cdot \|\vphi^{\nu^t}(x_h^t,a_h^t)\|_{(\mQ_{\mu^t,h}^t)^{-1}}\\
&\le \|\vphi^{\nu^t}(x_h,a_h)\|_{(\mQ_{\mu^t,h}^t)^{-1}}\cdot \sup_{\vphi\in \Phi_h^t}\|\vphi\|_{(\mQ_{\mu^t,h}^t)^{-1}}\\
&\le \sup_{\vphi\in\Phi_h^t}\|\vphi\|^2_{(\mQ_{\mu^t,h}^t)^{-1}}\\
&\le \sup_{\vphi\in\Phi_h^t}\|\vphi\|^2_{(\rho\mI)^{-1}}\\
&\overset{(iii)}{\le} \frac{L^2}{\rho}\,,
\end{align*}
where $(i)$ is because $|r_h^t(s_h^t,a_h^t,b_h^t)|\leq 1$;
 $(ii)$ is by the Cauchy-Schwarz inequality; and $(iii)$ comes from Eq. \eqref{eq:phixa_bounded_norm}.
\end{proof}
% }

% Recall $\beta^\star_H\coloneqq\min_{x_H\in\gX_H} p^{\star}_{1:H}(x_H)$. 
Recall $\beta^\star_h=\min_{x_h\in\gX_h} p^{\star}_{1:h}(x_h)$ and $\beta^{\nu}_h=\max_{t\in[T],x_h\in\gX_h}p^{\nu^t}_{1:h}(x_h)$.
The following lemma shows that the \textsc{Stability} term can be bounded by the variance of the loss estimate.
% \zhao{
\begin{lemma}\label{lemma:varbound}
% Let $v_t=D_{\Psi^{\star}_{\eta}}(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right))$ for all $t \in[T]$, which means that $\textsc{Stability} =\E\left[\sum_{t=1}^T v_t \right]$.
% Then, it holds that
% \[\E\left[\sum_{t=1}^T v_t \right] \leq \E\left[\frac{\eta}{2}\sum_{t=1}^T\sumH\sumlevel\frac{1}{p^\star_{1:h}(x_h)}\E^{\mu^t,\nu^t}\left[\muxa\hat{\ell}^t(x_h,a_h)^2\right] \right]\,.\]

The one-step stability term satisfies 
\begin{align*}
    D_{\Psi^{\star}_{\eta}}(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right))\leq \frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{h=1}^{H}\sum_{(\xhah)\in\XhA}\frac{\sfr{\mu}}{\pstarx}\hatelltxah^2\,.
\end{align*}
\end{lemma}
\begin{proof}
    In what follows, we let $\tilde{\mu}^{t+1}\coloneqq \nabla \Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t\right)$. Thus it is clear that 
    \begin{align*}
        \tilde{\mu}^{t+1}=\argmin_{\mu\in\sR^{XA}_{\geq 0}}\langle\mu, \hatellt\rangle+D_{\psieta}(\mu,\mu^t)\,,
    \end{align*}
    and 
    \begin{align}\label{eq:tildemu}
        \nabla \Psi_{\eta}\left(\tilde{\mu}^{t+1}\right)=\nabla\Psi_{\eta}\left(\mu^t\right)-\hatellt\,,
    \end{align}
    where the latter follows from the first-order optimality condition.
    
    Then, one can deduce that
    \begin{align}\label{eq:one_step_stability1}
        &\quad\,\, D_{\Psi^{\star}_{\eta}}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right)\right)\notag\\
        &=D_{\Psi_{\eta}}\left(
       \nabla \Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^t\right)\right),
        \nabla \Psi^{\star}_{\eta}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t\right) \right)\notag\\
        &=D_{\Psi_{\eta}}\left(\mu^t,\tilde{\mu}^{t+1} \right)\notag\\
        &=-D_{\Psi_{\eta}}\left(\tilde{\mu}^{t+1},\mu^t \right)
            +\left\langle \mu^t-\tilde{\mu}^{t+1},
        \nabla \Psi_{\eta}\left(\mu^t\right)-\nabla \Psi_{\eta}\left(\tilde{\mu}^{t+1}\right)\right\rangle\notag\\
        &\overset{(i)}{\leq}-D_{\Psi_{\eta}}\left(\tilde{\mu}^{t+1},\mu^t \right)
        +\frac{1}{2}\| \nabla\Psi_{\eta}\left(\mu^t\right)-\nabla \Psi_{\eta}\left(\tilde{\mu}^{t+1}\right)\|^2_{\nabla^{-2}\Psi_{\eta}\left(z^t\right)}
        +\frac{1}{2}\| \mu^t-\tilde{\mu}^{t+1}\|^2_{\nabla^{2}\Psi_{\eta}\left(z^t\right)}\notag\\
        &\overset{(ii)}{=}-D_{\Psi_{\eta}}\left(\tilde{\mu}^{t+1},\mu^t \right)
        +\frac{1}{2}\| \hat{\ell}^t\|^2_{\nabla^{-2}\Psi_{\eta}\left(z^t\right)}
        +\frac{1}{2}\| \mu^t-\tilde{\mu}^{t+1}\|^2_{\nabla^{2}\Psi_{\eta}\left(z^t\right)}\notag\\
        &\overset{(iii)}{=}\frac{1}{2}\| \hat{\ell}^t\|^2_{\nabla^{-2}\Psi_{\eta}\left(z^t\right)}\notag\\
        &\overset{(iv)}{=}\frac{\eta}{2}\sum_{h=1}^{H}\sum_{(\xhah)\in\XhA}\frac{\sfr{z}}{\pstarx}\hatelltxah^2\,,
    \end{align}
    where $(i)$ is by the Cauchy--Schwarz inequality followed by the AM--GM inequality, with $z^t=\alpha\mu^t+(1-\alpha)\tilde{\mu}^{t+1}$ for some $\alpha\in[0,1]$; $(ii)$ follows from Eq. \eqref{eq:tildemu}; $(iii)$ is due to the mean value theorem; and $(iv)$ is by noticing that
    $\frac{\partial^2\psieta(z^t)}{\partial\sfr{z}^2}=\frac{\pstarx}{\eta\sfr{z}}$. 
    % Note that $\frac{\partial\psieta(z_t)}{\partial\sfr{z}}=\frac{\pstarxa}{\eta}\left(\log\pstarxa\sfr{z}+1 \right)$ and 
    % $\frac{\partial^2\psieta(z_t)}{\partial\sfr{z}^2}=\frac{\pstarxa}{\eta\sfr{z}}$. 

    Further, using Eq. \eqref{eq:tildemu} again and noticing that $\frac{\partial\psieta(\mu^t)}{\partial\sfr{\mu}}=\frac{\pstarx}{\eta}\left(\log\left(\pstarx\sfr{\mu}\right)+1 \right)$, one can see that $\forall(\xhah)\in\XhA$,
    \begin{align*}
        \sfrtp{\tilde{\mu}}=\sfr{\mu}\exp\left(-\frac{\eta\hatelltxah}{\pstarx} \right)\,.
    \end{align*}
    The above display along with the fact that $z^t$ in Eq. \eqref{eq:one_step_stability1} satisfies $z^t=\alpha\mu^t+(1-\alpha)\tilde{\mu}^{t+1}$ for some $\alpha\in[0,1]$ implies that $\forall(\xhah)\in\XhA$,
    \begin{align}\label{eq:z_range1}
        \sfr{z}\in\left[\sfr{\mu}\exp\left(-\frac{\eta|\hatelltxah|}{\pstarx} \right),\sfr{\mu}\exp\left(\frac{\eta|\hatelltxah|}{\pstarx} \right)\right]\,.
    \end{align}
    Combining Eq. \eqref{eq:z_range1} and Lemma \ref{lem:loss_bound} leads to 
    \begin{align}\label{eq:z_range2}
        \sfr{\mu}\exp\left(-\frac{\eta L^2}{\pstarx\rho} \right)\leq \sfr{z}
        \leq \sfr{\mu}\exp\left(\frac{\eta L^2}{\pstarx\rho} \right)\,.
    \end{align}

    Substituting Eq. \eqref{eq:z_range2} into Eq. \eqref{eq:one_step_stability1}, we have
    \begin{align*}
        D_{\Psi^{\star}_{\eta}}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right)\right)\leq& \frac{\eta}{2}\sum_{h=1}^{H}\sum_{(\xhah)\in\XhA}\frac{\sfr{z}}{\pstarx}\hatelltxah^2\\
        \leq&\frac{\eta}{2}\sum_{h=1}^{H}\sum_{(\xhah)\in\XhA}\exp\left(\frac{\eta L^2}{\pstarx\rho} \right)\frac{\sfr{\mu}}{\pstarx}\hatelltxah^2\\
        \leq& \frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{h=1}^{H}\sum_{(\xhah)\in\XhA}\frac{\sfr{\mu}}{\pstarx}\hatelltxah^2\,,
    \end{align*}
    which completes the proof.
\end{proof}
% }
% ---------------------------------------- 2024.08.05 ----------------------------------------
% \begin{lemma}\label{lemma:varbound}
% Let $v_t=D_{\Psi^{\star}_{\eta}}(\nabla \Psi_{\eta}\left(\mu^t\right)-\hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right))$ for all $t \in[T]$, which means that $\textsc{Stability} =\E\left[\sum_{t=1}^T v_t \right]$.
% Then, it holds that
% \[\E\left[\sum_{t=1}^T v_t \right] \leq \E\left[\frac{\eta}{2}\sum_{t=1}^T\sumH\sumlevel\frac{1}{p^\star_{1:h}(x_h)}\E^{\mu^t,\nu^t}\left[\muxa\hat{\ell}^t(x_h,a_h)^2\right] \right]\,.\]
% \end{lemma}
% \begin{proof}
% To begin with, for all $t \in[T]$ and $u \in[0,1]$, we define
% \[f_t(u)=D_{\Psi^{\star}_{\eta}}\left(\nabla \Psi_{\eta}\left(\mu^t\right)-u \hat{\ell}^t, \nabla \Psi_{\eta}\left(\mu^t\right)\right)\,,\]
% such that $f_t(0)=0$ and $f_t(1)=v_t$.  
% Also notice that $\operatorname{dom}(\Psi^\star_{\eta})=\R^{XA}_{\geq 0}$ 
% and both $\Psi_{\eta}$ and $\Psi^\star_{\eta}$ can be decomposed in accordance with each infoset-action pair $(x_h,a_h)$. 
% Specifically, we have
% \[\Psi_{\eta}\left(\mu\right)=\sumH \sumlevel \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}\left(x_h, a_h\right)\right)\,,\] 
% and 
% \[\Psi^{\star}_{\eta}(y)=\sumH \sumlevel \Psi_{\eta, x_h, a_h}^{\star}\left(y\left(x_h, a_h\right)\right)\,.\] 

% Then the derivative of $f_t$ can be expressed as
% \begin{align}\label{eq:proof_ftrl_stability1}
%     f_t^{\prime}(u)=&\sumH \sumlevel \hat{\ell}_h^t\left(x_h, a_h\right)\notag\\
%     &\cdot\left[\mu_{1: h}^t\left(x_h, a_h\right)-\nabla \Psi_{\eta, x_h, a_h}^{\star}\left(\nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)\right] \,.
% \end{align}

% Moreover, recall that we choose negative entropy as the potential function, implying that
% \begin{align*}
% \nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}\left(x_h, a_h\right)\right) & =\frac{p_{1: h}^{\star}\left(x_h\right)}{\eta}\left[\log \left(p_{1: h}^{\star}\left(x_h\right) \mu_{1: h}\left(x_h, a_h\right)\right)+1\right] \,,\\
% \nabla \Psi_{\eta, x_h, a_h}^{\star}\left(y\left(x_h, a_h\right)\right) & =\exp \left[\frac{\eta}{p_{1: h}^{\star}\left(x_h\right)}\left(y\left(x_h, a_h\right)\right)-1-\log \left(p_{1: h}^{\star}\left(x_h\right)\right)\right]\,,
% \end{align*}
% and
% \begin{align}\label{eq:proof_ftrl_stability2}
% &\nabla \Psi_{\eta, x_h, a_h}^{\star}  \left(\nabla \Psi_{\eta, x_h, a_h}\left(\mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)\notag \\
% =& \exp \left[\frac{\eta}{p_{1: h}^{\star}\left(x_h\right)}\left(\frac{p_{1: h}^{\star}\left(x_h\right)}{\eta} \log \left(p_{1: h}^{\star}\left(x_h\right) \mu_{1: h}^t\left(x_h, a_h\right)\right)-u \hat{\ell}_h^t\left(x_h, a_h\right)\right)-\log \left(p_{1: h}^{\star}\left(x_h\right)\right)\right]\notag \\
% =& \mu_{1: h}^t\left(x_h, a_h\right) \exp \left[-u \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\right]\notag \\
% \geq& \mu_{1: h}^t\left(x_h, a_h\right)\left[1-u \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\right] \,,
% \end{align}
% where the last inequality follows from $e^{-x}\ge 1-x$ for all $x \in \mathbb{R}$. 

% Substituting \Eqref{eq:proof_ftrl_stability2} into \Eqref{eq:proof_ftrl_stability1} shows that 
% \begin{align*}
%     f_t^{\prime}(u) \leq u \sumH \sumlevel \hat{\ell}_h^t\left(x_h, a_h\right) \mu_{1: h}^t\left(x_h, a_h\right) \frac{\eta \hat{\ell}_h^t\left(x_h, a_h\right)}{p_{1: h}^{\star}\left(x_h\right)}\,.
% \end{align*}
% The proof is concluded by integrating the above display from $0$ to $1$ over $u$ and taking the expectation on both sides.
% \end{proof}
% ---------------------------------------- 2024.08.05 ----------------------------------------

With the ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar}, the following lemma upper bounds the \textsc{Stability} term via $\lambda$ and $d$ by considering the ratio between the transition $p^{\nu^t}_{1:h}$, contributed by the environment state transition $\sP$ as well as the opponent's policy $\nu^t$, and the ``balanced transition'' $p^\star$.
\begin{lemma}\label{lem:ftrl_stability_trans}
With ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar}, for any fixed learning rate $\eta$, it holds that
\begin{align*}
    \textsc{Stability}\leq \frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\lambda dHT\,.
\end{align*}
\end{lemma}
\begin{proof}
% In the following, we define $\beta\coloneqq\max_{\tilde{p}\in \sP^\star}\min_{h\in[H],x_h\in \gX_h}\tilde{p}_{1:h}(x_h)$. Note that $\beta=\min_{x_H\in\gX_H} p_{1:H}^\star(x_H)$ due to the ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar}.
Using Lemma \ref{lemma:varbound}, one can see that
\begin{align*}
        &\textsc{Stability} \\
        =& \E\left[\sum_{t=1}^TD_{\Psi^\star_{\eta}}(\nabla\Psi_{\eta}(\mu^t)-\hat{\ell}^t, \nabla\Psi_{\eta}(\mu^t))\right]\\
        \leq& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\sumlevel\frac{1}{p^{\star}_{1:h}(x_h)}\E^{\mu^t,\nu^t}\left[\muxa\hat{\ell}^t(x_h,a_h)^2\right]\right] \\
        =& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H
        \sumlevel\frac{1}{p^{\star}_{1:h}(x_h)}\muxa \E^{\mu^t,\nu^t}\left[(\hat{\vtheta}_h^t)^\top\vphixa\vphixa^\top\hat{\vtheta}_h^t\right]\right] \\
        \overset{(i)}{=}& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H \sumlevel\frac{1}{p^{\star}_{1:h}(x_h)}\muxa\right. \\
        &\left.\cdot\E^{\mu^t,\nu^t}\left[r_h^t(s_h^t,a_h^t,b_h^t)^2 \vphi^{\nu^t}(x_h^t,a_h^t)^\top (\mQ_{\mu^t,h}^t)^{-1}\vphixa\vphixa^\top(\mQ_{\mu^t,h}^t)^{-1} \vphi^{\nu^t}(x_h^t,a_h^t)\right]\right] \\
        \overset{(ii)}{\leq}& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H \sumlevel\frac{1}{p^{\star}_{1:h}(x_h)}\muxa\right. \\
        &\left.\cdot\E^{\mu^t,\nu^t}\left[\vphi^{\nu^t}(x_h^t,a_h^t)^\top (\mQ_{\mu^t,h}^t)^{-1}\vphixa\vphixa^\top(\mQ_{\mu^t,h}^t)^{-1} \vphi^{\nu^t}(x_h^t,a_h^t)\right]\right] \\
        \overset{(iii)}{\leq}& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\frac{1}{\betastarH} \right. \\
        &\left.\cdot\E^{\mu^t,\nu^t}\left[\vphi^{\nu^t}(x_h^t,a_h^t)^\top (\mQ_{\mu^t,h}^t)^{-1}\left(\sumlevel\muxa\vphixa\vphixa^\top\right)(\mQ_{\mu^t,h}^t)^{-1} \vphi^{\nu^t}(x_h^t,a_h^t)\right]\right] \\
        =& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\frac{1}{\betastarH}\E^{\mu^t,\nu^t}\left[\vphi^{\nu^t}(x_h^t,a_h^t)^\top (\mQ_{\mu^t,h}^t)^{-1} \vphi^{\nu^t}(x_h^t,a_h^t)\right]\right] \\
        =& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\frac{1}{\betastarH}\operatorname{tr}\left(\sumlevel p^{\nu^t}_{1:h}(x_h)\muxa\vphixa\vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\right)\right]\\
        \overset{(iv)}{\leq}& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\lambda\operatorname{tr}\left(\sumlevel\muxa\vphixa\vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\right)\right]\\
        =& \E\left[\frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)\sum_{t=1}^T\sum_{h=1}^H\lambda\operatorname{tr}\left(\mI_d\right)\right]\\
        =& \frac{\eta}{2}\exp\left(\frac{\eta L^2}{\beta^\star_H\rho} \right)THd\lambda \,,
\end{align*}
where
% \zhao{
% $(i)$ is by Lemma \ref{lemma:varbound};
% }
$(i)$ comes from the definition of $\hat{\vtheta}_h^t$ in Eq. \eqref{eq:hat_theta};
$(ii)$ holds because $|r_h^t(s_h^t,a_h^t,b_h^t)|\leq 1$; $(iii)$ is because $\betastarH\leq p^\star_{1:h}(x_h)$ for any $x_h\in\gX_h$ and $h\in[H]$; and $(iv)$ comes from the definition of $\lambda$.

The proof is thus concluded.
\end{proof}

\subsection{Properties of Balanced Transition $p^\star$}\label{app:sec:balanced_transition_property}

The lemma below delineates the property of $p^\star$ as transition probability over infoset-action space.
\begin{lemma}\label{lemma:transition}
    For any $h\in[H]$, any $p^{\star}$ as transition probability over infoset-actions and any policy $\mu\in\Pi_{\max}$ of the max-player, it holds that
    \[\sumlevel p^{\star}_{1:h}(x_h)\mu_{1:h}(x_h,a_h)=1\,.\]
\end{lemma}
% \begin{proof}
%     See Appendix~\ref{app:proof_lem_transition}.
% \end{proof}
% \subsection{Proof of Lemma~\ref{lemma:transition}}
\begin{proof}
    By the definition of perfect recall and transition probability over infoset-action space, we have
    \begin{align*}
        \sP^{\mu,\nu}(x_h,a_h)&= \sP^{\mu,\nu}(x_1,\dots,x_h,a_h)\\
        &= p^{\star}_0(x_1) \prod_{h^\prime=1}^{h-1} p_{h^\prime}^{\star}(x_{h^\prime+1}|x_{h^\prime},a_{h^\prime}) \cdot \prod_{h^\prime=1}^h \mu_{h^\prime}(a_{h^\prime}|x_{h^\prime})\\
        &= p^\star_{1:h}(x_h) \mu_{1:h}(x_h,a_h)\,.
    \end{align*}
    % The lemma holds since 
    The proof is thus concluded by noticing that $\sumlevel \sP^{\mu,\nu}(x_h,a_h) = 1$.
\end{proof}

% For $p^\star$ computed in Algorithm \ref{algo:compute_pstar}, the following lemma guarantees that $\lambda\leq 1$ when the environment state transition is uniformly at random and the game tree is $k$-ary tree.
The following lemma shows that $p^\star$ computed in Algorithm \ref{algo:compute_pstar} guarantees $\lambda\leq 1$ when the environment state transition is uniformly at random and the game tree is a $k$-ary tree.
\begin{lemma}\label{lem:lambda_leq_1}
When the environment state transition $\sP$ is a uniform distribution and the game tree is a $k$-ary tree, the ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar} guarantees that $\lambda\leq 1$.
\end{lemma}
\begin{proof}
    Let $n,m>0$ be the number of the states and the number of the infosets that the game can transit to when taking any action $a$. Note that $m\leq n$ due to the properties of the perfect recall and tree structure conditions defined in Section \ref{sec:setting}.
    
    Fix some $h\in[H]$. Recall $p_{1: h}^{\nu^t}\left(x_h\right)=\sum_{s_h:x(s_h)= x_h} p_{1: h}\left(s_h\right) \nu_{1: h-1}^t(y\left(s_{h-1}\right),b_{h-1})$. Since the environment state transition $\sP$ is a uniform distribution, it holds that 
    \begin{align}\label{eq:local_fdsfs}
        p_{1: h}^{\nu^t}\left(x_h\right)&=\sum_{s_h:x(s_h)= x_h} p_{1: h}\left(s_h\right) \nu_{1: h-1}^t(y\left(s_{h-1}\right),b_{h-1})\notag\\
        &=\left(\frac{1}{n}\right)^{h-1}\sum_{s_h:x(s_h)= x_h} \nu_{1: h-1}^t(y\left(s_{h-1}\right),b_{h-1})\notag\\
        &\leq \left(\frac{1}{n}\right)^{h-1}\left(\frac{n}{m} \right)^{h-1}\notag\\
        &= \left(\frac{1}{m}\right)^{h-1}\,,
    \end{align}
    where the inequality is by noticing that $\nu_{1: h-1}^t(y\left(s_{h-1}\right),b_{h-1})\leq 1$ for all $(y\left(s_{h-1}\right),b_{h-1})\in\gY_{h-1}\times\gB$ and that each infoset $x_h$ contains at most $\left(\frac{n}{m}\right)^{h-1}$ states. On the other hand, it is easy to check that $p^\star$ computed in Algorithm \ref{algo:compute_pstar} satisfies $p^\star_{1:h}(x_h)=\left(\frac{1}{m}\right)^{h-1}$ for all $x_h\in\gX_h$, which together with Eq. \eqref{eq:local_fdsfs} concludes the proof.
\end{proof}

% The lemma below demonstrates that $p^\star$ computed in Algorithm \ref{algo:compute_pstar} guarantees that $\lambda\leq X$ in the worst-case scenario.

In the worst case, $p_{1: H}^{\nu^t}(x_H)=1$ for some $x_H\in\mathcal{X}_H$ (note again that this is almost impossible to happen in practice as discussed in Section \ref{sec:ftrl_analysis}), meaning that $\lambda=1/\min_{x_H\in\mathcal{X}_H }p^\star_{1:H}(x_H)$.  Intuitively, $\lambda$ can be well-controlled if the ``balanced transition'' $p^\star_{1:h}(\cdot)$ is ``balanced'' enough in the sense that the ``transition probability'' of visiting $x_h$ is lower bounded for any $ x_h\in\mathcal{X}_h$ and $h\in[H]$. This is exactly guaranteed by the design of our ``balanced transition'' $p^\star_{1:h}(\cdot)$ specified in Eq. \eqref{eq:p_star}, the computation of which is solved by our Algorithm \ref{algo:compute_pstar}. This is formalized by the following lemma.

\begin{lemma}\label{lem:lambda_leq_x}
The ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar} guarantees that $\lambda\leq X$.
\end{lemma}
\begin{proof}
It suffices to show that $p^\star_{1:h}(x_h)\geq \nicefrac{1}{X}$ for any $x_h\in\gX_h$ and $h\in[H]$.
Clearly, $p^\star_{1:h}(\cdot)$ is minimized at $h=H$ for 
some $x_H\in\mathcal{X}_H$ by its definition. 
By the construction of $p^\star_{1:h}(\cdot)$ in Algorithm \ref{algo:compute_pstar}, one can deduce that 
$\forall x_H\in\mathcal{X}_H$, we have 
(understanding $\{(x_h,a_h)\}_{h\in[H-1]}$ as the unique trajectory leading to $x_H$ below)
\begin{align*}
\notag p^\star_{1:H}(x_H)&=p[x_H]\\
&=p\left[x_{H-1}\right] \cdot \frac{f\left[x_{H}\right]}{\sum_{x_{H}^\prime \in C\left(x_{H-1}, a_{H-1}\right)} f\left[x_{H}^\prime \right]}\\
&=p\left[x_{H-2}\right] \cdot \frac{f\left[x_{H-1}\right]}{\sum_{x_{H-1}^\prime \in C\left(x_{H-2}, a_{H-2}\right)} f\left[x_{H-1}^\prime \right]}\cdot \frac{f\left[x_{H}\right]}{\sum_{x_{H}^\prime \in C\left(x_{H-1}, a_{H-1}\right)} f\left[x_{H}^\prime \right]}\\
&=p\left[x_{H-2}\right] \cdot \frac{f\left[x_{H-1}\right]}{\sum_{x_{H-1}^\prime \in C\left(x_{H-2}, a_{H-2}\right)} f\left[x_{H-1}^\prime \right]}\cdot \frac{f\left[x_{H}\right]}{ C\left[x_{H-1}, a_{H-1}\right]}\\
&\overset{(i)}{\geq} p\left[x_{H-2}\right] \cdot \frac{f\left[x_{H-1}\right]}{\sum_{x_{H-1}^\prime \in C\left(x_{H-2}, a_{H-2}\right)} f\left[x_{H-1}^\prime \right]}\cdot \frac{f\left[x_{H}\right]}{ f[x_{H-1}]}\\
&= p\left[x_{H-2}\right] \cdot \frac{f\left[x_{H}\right]}{\sum_{x_{H-1}^\prime \in C\left(x_{H-2}, a_{H-2}\right)} f\left[x_{H-1}^\prime \right]}\\
&\geq\ldots\\
&\geq \frac{f\left[x_H\right]}{\sum_{x_1 \in \mathcal{X}_1} f\left[x_1\right]}\\
&\geq \frac{f\left[x_H\right]}{X_H}\\
&\geq \frac{f\left[x_H\right]}{X}\\
&=\frac{1}{X},
\end{align*}
where $(i)$ is due to $f[x_{H-1}]=\max_{a\in\mathcal{A}} C[x_{H-1},a]\geq C[x_{H-1},a_{H-1}]$ in Algorithm \ref{algo:compute_pstar}.

The proof is thus completed.
\end{proof}
% \zhao{TBF}