

% \vspace{-0.4cm}
\section{Fictitious Least-Squares Follow-the-Regularized-Leader}\label{sec:lsftrl}
% In \Cref{sec:Linear_Loss_Estimator}, we introduce the proposed \textit{fictitious} least-squares loss estimator. Then in \Cref{sec:f2trl_details}, we introduce the algorithmic details of our \LSFTRL algorithm, with the pseudocodes illustrated in Algorithm \ref{algo:f2trl}.
In \Cref{sec:Linear_Loss_Estimator}, we present the proposed \textit{fictitious} least-squares loss estimator for learning the unknown reward parameter. Subsequently, in \Cref{sec:f2trl_details}, we provide the algorithmic details of the \LSFTRL algorithm, along with its pseudocode shown in Algorithm \ref{algo:f2trl}.

% \vspace{-0.2cm}
\subsection{Fictitious Least-Squares Loss Estimator}\label{sec:Linear_Loss_Estimator}
% This section presents the proposed \textit{fictitious} least-squares loss estimator for learning the unknown reward parameter.

For a fixed $\nu^t$,  Eq. \eqref{eq:reaching_prob} indicates that the value function $V^{\mu^t,\nu^t}$ is linear in $\mu^t$ \citep{kozuno2021learning}:
% \begin{align*}
% V^{\mu^t, \nu^t} =\sum_{h=1}^H \sum_{\left(x_h, a_h\right) \in \mathcal{X}_h \times \mathcal{A}} \mu^t_{1: h}\left(x_h, a_h\right) \times \sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right) \bar{r}_h\left(s_h, a_h, b_h\right)\,.
% \end{align*}
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
V^{\mu^t, \nu^t} &=\sum_{h=1}^H \sum_{\left(x_h, a_h\right) \in \mathcal{X}_h \times \mathcal{A}}
\mu^t_{1: h}\left(x_h, a_h\right) \\
&\times \sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right) \bar{r}_h\left(s_h, a_h, b_h\right)\,.
\end{align*}
\endgroup
Hence, the regret in Eq. \eqref{equation:regret} can be rewritten as $\Reg^T_{\max} = \max _{\mu^{\dagger} \in \Pi_{\max }} \mathbb{E}\left[\sum_{t=1}^T\left\langle\mu^t-\mu^{\dagger}, \ell^t\right\rangle\right]$,
% \begin{align}
%       \Reg^T_{\max} = \max _{\mu^{\dagger} \in \Pi_{\max }} \sum_{t=1}^T\left\langle\mu^t-\mu^{\dagger}, \ell^t\right\rangle\,,
% \end{align}
where $\ell_h^t$ is the \textit{loss function} in round $t$ such that
\begin{align*}
&\ell_h^t\left(x_h, a_h\right) \coloneqq\\
-&\sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right)\bar{r}_h\left(s_h, a_h, b_h\right)\,.
\end{align*}
This implies that one can translate the regret minimization in Eq. \eqref{equation:regret} into a linear regret minimization problem.

To learn the unknown parameter $\vtheta_h$ by leveraging the linear structure of the reward function,
one may construct a linear loss estimator $\hat{\vtheta}_h$ of $\vtheta_h$.
However, this is more challenging in our case than that of 
linear bandits \citep{Abbasi-YadkoriPS11}, 
linear MDPs \citep{JinYWJ20}, and 
fully observable linear MGs \citep{XieCWY20}, as the learning agent only observes the infoset $x(s_h)$ and does not even know the underlying state $s_h$ and its associated feature vector $\vphi(s_h,a_h,b_h)$, making it impossible to regress $r_h(s_h,a_h,b_h)$ against $\vphi(s_h,a_h,b_h)$. To cope with this issue and build a ``least-squares loss estimator'', we instead consider constructing the following feature vector for each $(x_h,a_h)$, which is a composite feature vector weighted by the opponent's policy $\nu^t$ and transition $\sP$:
% \begingroup
% \setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
% \setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
% \begin{align*}
% \vphi^{\nu^t}&(x_h,a_h) \coloneqq\\
% &-\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\vphi(s_h,a_h,b_h)\,.
% \end{align*}
% \endgroup
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
&\vphi^{\nu^t}(x_h,a_h) \\
\coloneqq&-\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\vphi(s_h,a_h,b_h)\,.
\end{align*}
\endgroup
% \zhao{
Intuitively, the constructed composite feature vector $\vphi^{\nu^t}(x_h,a_h)$ can be regarded as the  ``feature vector'' of corresponding infoset-action $(x_h,a_h)$.
\footnote{
% Note that the construction of $\vphi^{\nu^t}(x_h,a_h)$ relies on the knowledge of the policy $\nu_t$ of the min-player. Though this is not required in some works studying tabular POMGs \citep{kozuno2021learning,bai2022nearoptimal}, requiring the knowledge of the policies of the opponents and even requiring all the players to be controlled by a central controller can be seen in various works studying (fully-observable) Markov games with linear function approximation (\textit{e.g.}, \citep{ChenZG22a,XieCWY20,CuiZD23}).
Note that our construction of $\vphi^{\nu^t}(x_h,a_h)$ depends on the knowledge of the min-player's policy $\nu^t$. While this is not necessary in some works on tabular POMGs \citep{kozuno2021learning,bai2022nearoptimal}, the requirement for knowledge of opponents' policies---and even the more restrictive assumption that all players are controlled by a central controller---can be seen in various studies on (fully-observable) MGs with linear function approximation (\textit{e.g.}, \citep{ChenZG22a,XieCWY20,CuiZD23}).
% \zhao{
% By ``offline setting'' we refer to that the policy $\nu_t$ of the opponent (\textit{i.e.}, the min-player)  in episode $t$ is accessible to the max-player \textit{after} the $t$-th episodes ends.
% We use the terminology ``offline setting'' following \citet{ChenZG22a,XieCWY20}, which is also termed as ``self-play setting'' in the literature. However, note that ``offline setting'' in this work is slightly more general than it in \citet{ChenZG22a,XieCWY20} as we do not require that both the max-player and min-player are controlled by a central controller.
}
Further, one can see that $\ell_h^t\left(x_h, a_h\right)$ is indeed linear in both $\vtheta_h$ and $\vphi^{\nu^t}(x_h,a_h)$:
% \begingroup
% \setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
% \setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
% \begin{align*}
%     &\left\langle \vphi^{\nu^t}(x_h,a_h),\vtheta_h\right\rangle \\
%     =&\left\langle-\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu^t_{1:h}(y(s_h),b_h)\vphi(s_h,a_h,b_h),\vtheta_h\right\rangle\\
%     =& \ell_h^t(x_h,a_h)\,.
% \end{align*}
% \endgroup
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
    &\left\langle -\vphi^{\nu^t}(x_h,a_h),\vtheta_h\right\rangle \\
    =&\left\langle\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu^t_{1:h}(y(s_h),b_h)\vphi(s_h,a_h,b_h),\vtheta_h\right\rangle\\
    =& -\ell_h^t(x_h,a_h)\,.
\end{align*}
\endgroup
Based on $\vphi^{\nu^t}(x_h,a_h)$,
we define the ``feature covariance matrix'' $\mQ_{\mu,h}^t$ for any policy $\mu$ at step $h$ in episode $t$ as
% \begin{align}\label{eq:covariance_mat}
%     \mQ_{\mu,h}^t\!=\! \sumlevel \mu_{1: h}\left(x_h, a_h\right) \vphixa \vphixa^\top\,.
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align}\label{eq:covariance_mat}
    &\mQ_{\mu,h}^t \notag\\
    =&\sumlevel \mu_{1: h}\left(x_h, a_h\right) \vphixa \vphixa^\top\,.
\end{align}
\endgroup
We are now ready to introduce the proposed ``least-squares loss estimator'' $\hat{\vtheta}_h^t$:
% \begin{align}\label{eq:hat_theta}
%     \hat{\vtheta}_h^t= -(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)\,,
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{5pt} \setlength{\belowdisplayshortskip}{5pt}
\setlength{\abovedisplayskip}{4pt} \setlength{\abovedisplayshortskip}{4pt}
\begin{align}\label{eq:hat_theta}
    \hat{\vtheta}_h^t= -(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)\,,
\end{align}
\endgroup
which we call the \textit{fictitious} least-squares loss estimator. Importantly, $\hat{\vtheta}_h^t$ is an unbiased estimator of the unknown $\vtheta_h$,
as guaranteed by the following lemma.
Its proof is deferred to Appendix \ref{app:proof_unbiased_estimator}.
\begin{lemma}\label{lemma:unbiased_estimator}
    For any $t\in [T]$ and $h\in[H]$, it holds that $\E^{t-1}[\hat{\vtheta}_h^t] = \vtheta_h$.
\end{lemma}
% \zhao{
\begin{remark}
Intuitively, this ``least-squares loss estimator'' shares a similar spirit with its counterpart in the adversarial linear bandit literature \citep{lattimore2020bandit}. However, note that there are two crucial distinctions between $\hat{\vtheta}_h^t$ defined above and the common least-squares loss estimator in adversarial linear bandits: (a) $\mu_{1: h}^t(\cdot,\cdot)$ in the definition of the ``feature covariance matrix'' $\mQ_{\mu^t,h}^t$ is not necessarily a probability distribution over $\mathcal{X}_h \times \mathcal{A}$ (thus $\mQ_{\mu^t,h}^t$ itself is not a true feature covariance matrix); and (b) the regressand $r_h^t(s_h^t, a_h^t, b_h^t)$ is de facto not necessarily linear in the ``feature vector'' $\boldsymbol{\phi}^{\nu^t}(x_h^t, a_h^t)$ (recall $\bar{r}_h\left(s_h, a_h, b_h\right)=\left\langle\boldsymbol{\phi}\left(s_h, a_h, b_h\right), \boldsymbol{\theta}_h\right\rangle$, which only means $\bar{r}_h$ is linear in $\boldsymbol{\phi}(\cdot,\cdot,\cdot)$ instead of $\boldsymbol{\phi}^{\nu^t}(\cdot,\cdot)$).
Due to the above two reasons, $\hat{\vtheta}_h^t$ is not a real least-squares loss estimator and this is why we term $\hat{\vtheta}_h^t$ as the \textit{fictitious} least-squares loss estimator. On the other hand, as shown in Lemma \ref{lemma:unbiased_estimator},
via constructing $\hat{\vtheta}_h^t$, we indeed address the challenge that we cannot regress $r_h(s_h,a_h,b_h)$ against $\vphi(s_h,a_h,b_h)$ due to the partial observability in POMGs.
\end{remark}

\begin{remark}
When constructing the composite feature vector $\vphi^{\nu^t}(x_h,a_h)$, our algorithm uses the product of the sequence-form transition probability $p_{1:h}(s_h)$ and the sequence-form policy $\nu_{1: h}^t\left(y\left(s_h\right), b_h\right)$ to weight the feature vectors over state-action triplets. Some works studying adversarial linear Markov decision processes (MDPs) (\textit{e.g.}, \citet{0002Z0024} and \citet{LiuWZ24}) use the occupancy measure (OM) $\mu^{\pi,p}(s, a)$, which is the probability of visiting state-action pair $(s,a)$ under policy $\pi$ and transition probability $p$, to weight the feature vectors over state-action pairs. We would like to note that there remain several key differences between our idea and theirs. First, from an algorithmic design perspective, for each infoset-action pair $(x_h,a_h)$, our weighting operation is performed only on a subset of state-action pairs $\{s_h\in\mathcal{S}_h:x(s_h)=x_h\} \times \mathcal{B}$ and the weight $p_{1: h}\left(\cdot\right) \nu_{1: h}^t\left(\cdot,\cdot\right)$ actually is not a probability measure over $\{s_h\in\mathcal{S}_h:x(s_h)=x_h\} \times \mathcal{B}$. In contrast, such a weighting operation in the works of \citet{0002Z0024} and \citet{LiuWZ24} is performed on all the state-action pairs $\mathcal{S}_h\times\mathcal{A}$ ($\mathcal{S}_h$ is the set of all the states on step $h$ of a layered MDP) and $\mu^{\pi,p}(\cdot, \cdot)$ is a probability measure over $\mathcal{S}_h\times\mathcal{A}$. More importantly, the purpose of our weighting operation is mainly to construct a kind of composite feature vector $\vphi^{\nu^t}\left(x_h, a_h\right)$ for infoset-action pairs $(x_h,a_h)$ so as to construct an unbiased least-squares loss estimator. After such a weighting operation, for each step $h$, we obtain a set of feature vectors $\{\vphi^{\nu^t}(x_h, a_h)\}_{(x_h, a_h)\in\mathcal{X}_h\times\mathcal{A}}$.
On the contrary, the weighting operation in the works of \citet{0002Z0024} and \citet{LiuWZ24} is to construct a kind of feature vector $\phi^\pi$ for each policy $\pi$ so as to reduce learning adversarial linear MDPs into the problem of learning adversarial linear bandits with $\left(\phi^\pi\right)_{\pi \in \Pi}$ as the underlying action set.
\end{remark}
% }

% ----------------------------- algo. 2024.08.08 -----------------------------
\begin{algorithm}[!thb]
\caption{\LSFTRL (max-player version)}\label{algo:f2trl}
\begin{algorithmic}[1]
\STATE \textbf{Input:} Tree-like structure of $\gX\times \gA$, learning rate $\eta$, and ``balanced transition'' $p^{\star}$.
\STATE \textbf{Initialization:} Set $\mu^1$ as the uniform policy.\alglinelabel{algo:ftrl:init}
% $\mu^1=\argmin_{\mu\in \Pi_{\max}} \sum_h\Psi_h\left(p^\star_{1:h}\cdot\mu_{1:h}\right)$.
\FOR{$t=1$ to $T$} 
    \FOR{$h=1$ to $H$}\alglinelabel{algo:ftrl:interact1}
        \STATE Observe infoset $x^t_h$, execute action $a_h^t \sim \mu^t_h(\cdot|x_h^t)$ and receive reward $r_h^t(s_h^t,a_h^t,b_h^t)$.
    \ENDFOR\alglinelabel{algo:ftrl:interact2}
    \STATE Construct composite features $\{\vphi^{\nu^t}(x, a)\}_{(x,a)\in\gX\times\gA}$\,.\alglinelabel{algo:ftrl:loss_estimate1}
    \FOR{ $h=1$ to $H$}
        \STATE Compute 
        % $\mQ_{\mu^t,h}^t= \sumlevel \muxa \vphixa \vphixa^\top$
        $\mQ_{\mu^t,h}^t$ as defined in Eq. \eqref{eq:covariance_mat}.
        \STATE Compute 
        % $\hat{\vtheta}_h^t= -(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)$.
        $\hat{\vtheta}_h^t$ as defined in Eq. \eqref{eq:hat_theta}.
    \ENDFOR\alglinelabel{algo:ftrl:fic_loss_estimate}
    \STATE Construct loss estimate for all $(x_h,a_h)$ and $h\in[H]$: $\displaystyle \hatellxa=\langle \vphi^{\nu^t}(x_h,a_h),\hat{\vtheta}_h^t\rangle$.\alglinelabel{algo:ftrl:loss_estimate2}
    \STATE Compute cumulative loss estimate at episode $t$: $\hat{L}^t=\hat{L}^{t-1}+\hat{\ell}^t$.\alglinelabel{algo:ftrl:cum_loss_estimate}
    \STATE  Solve Eq. \eqref{update:FTRL} to update policy $\mu^{t+1}$ via Algorithm \ref{algo:upftrl}.\alglinelabel{algo:ftrl:policy_update}
\ENDFOR
\end{algorithmic}
\end{algorithm}
% ----------------------------- algo. 2024.08.08 -----------------------------

% \vspace{-0.2cm}
\subsection{Algorithmic Details}\label{sec:f2trl_details}
With the constructed fictitious loss estimator in Eq. \eqref{eq:hat_theta}, we are now ready to introduce the algorithmic design of our \LSFTRL algorithm.

In each episode $t$, 
after interacting with the min-player using policy $\mu^t$ (Lines~\ref{algo:ftrl:interact1}--\ref{algo:ftrl:interact2}), \LSFTRL constructs the composite feature vectors and the fictitious least-squares loss estimator $\hat{\vtheta}_h^t$ defined in Eq.~\eqref{eq:hat_theta} (Lines~\ref{algo:ftrl:loss_estimate1}--\ref{algo:ftrl:fic_loss_estimate}). Then \LSFTRL computes the loss estimate $\hat{\ell}_h^t\left(x_h, a_h\right)$ from $\hat{\vtheta}_h^t$ and the composite feature vectors for all $(x_h,a_h)\in\gX_h\times\gA$ and $h\in[H]$, and updates the cumulative loss estimate $\hat{L}^t$ (Lines~\ref{algo:ftrl:loss_estimate2}--\ref{algo:ftrl:cum_loss_estimate}).
At the end of episode $t$, to update the policy $\mu^{t+1}$ used in episode $t+1$, it solves the following linear optimization problem regularized by the potential functions $\{\Psi_h\}_{h\in[H]}$ (Line~\ref{algo:ftrl:policy_update}):
\begingroup
\setlength{\belowdisplayskip}{2pt} \setlength{\belowdisplayshortskip}{2pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align}\label{update:FTRL}
\mu^{t+1} =\argmin_{\mu\in \Pi_{\max}} \left\langle\mu,\hat{L}^{t}\right\rangle + \frac{1}{\eta}\sumH\Psi_h\left(p^\star_{1:h}\cdot\mu_{1:h}\right)\,,
\end{align}
\endgroup
where $\hat{L}^{t}=\sum_{k=1}^t\hat{\ell}^k$ is the cumulative loss estimate, 
$\Psi_h(w_h)=\sumlevel \evw_h(x_h,a_h)\log(\evw_h(x_h,a_h))$ is the negentropy potential function,
$p_{1:h}^\star(x_h)= p^{\star}_0(x_1) \prod_{h^\prime=1}^{h-1} p_{h^\prime}^{\star}(x_{h^\prime+1}|x_{h^\prime},a_{h^\prime})$ 
with $p_{h}^{\star}(\cdot|x_{h},a_{h})\in\Delta_{C(x_h,a_h)}$ being a kind of transition probability over
$\gX_h\times\gA\times\gX_{h+1}$, and $p^\star_{1:h}\cdot\mu_{1:h}$ is defined as 
$[p^\star_{1:h}\cdot\mu_{1:h}](x_h,a_h)=p^\star_{1:h}(x_h)\mu_{1:h}(x_h,a_h)$.
% We remark that the computation of Eq. \eqref{update:FTRL} can be solved by reducing the update of \LSFTRL to an OMD-like update and we defer the details to Appendix \ref{app:efficient_upd_ftrl}.
We note that $p^{\star}$ is well-defined due to the perfect recall condition, and $p^\star_{1:h}\cdot\mu_{1:h}$ is a probability distribution over the infoset-action space $\gX_h\times\gA$ at step $h$.

When bounding the regret of FTRL, it is essential to keep the stability term well controlled in the analysis. To this end,
% In particular, 
we construct the following ``balanced transition'' as our
transition probability $p_{1:h}^\star(\cdot)$ over infoset-action space:
% \begin{align}\label{eq:p_star}
%     p^\star = \argmax_{\tilde{p}\in \sP^\star}\min_{h\in[H],x_h\in \gX_h}\tilde{p}_{1:h}(x_h)\,,
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{2pt} \setlength{\belowdisplayshortskip}{2pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align}\label{eq:p_star}
p^\star = \argmax_{\tilde{p}\in \sP^\star}\min_{h\in[H],x_h\in \gX_h}\tilde{p}_{1:h}(x_h)\,,
\end{align}
\endgroup
where $\sP^\star$ denotes the set of all the valid transition probabilities over infoset-action space. 
We remark that similar approaches that utilize FTRL or OMD with
``balanced transition'' over $\gX_h\times\gA\times\gX_{h+1}$
have also been exploited in previous works (see, \textit{e.g.}, \citet{bai2022nearoptimal,Fiegel2023adapting}).
However, the design of our ``balanced transition'' $p_{1:h}^\star(\cdot)$ differs from the previous ones in the following two aspects:
% \vspace{-0.2cm}
\begin{itemize}
    \item $p^\star(x_{h+1}|x_h,a_h)$ in this work is proportional to the number of reachable infosets $|C_H(x_{h+1},a_{h+1})|$ in $\gX_H$ when taking some fixed action $a_{h+1}\in\gA$ at infoset $x_{h+1}$. In contrast, the ``balanced transitions'' of \citet{bai2022nearoptimal,Fiegel2023adapting} are devised by only considering the reachable infosets in $\gX_{h^\prime}$ for some $h^\prime\geq h+1$ or all the reachable infosets in the whole sub-tree.
    \item Our $p^\star(x_{h+1}|x_h,a_h)$ is contributed by some fixed action $a_{h+1}\in\gA$ that maximizes the number of the reachable infosets $|C_H(x_{h+1},a_{h+1})|$ in $\gX_H$, while previous ``balanced transitions'' of \citet{bai2022nearoptimal,Fiegel2023adapting} are contributed by 
    the sum of all the reachable infosets by taking all actions $a_{h+1}\in\gA$ at infoset $x_{h+1}$.
\end{itemize}
As we shall see in \Cref{sec:ftrl_analysis}, the property of our $p^\star$ plays a crucial role when bounding the stability term of the \LSFTRL algorithm in the regret analysis.

\paragraph{Computation} 
We prove that the computation of Eq. \eqref{update:FTRL} has a closed-form update and can be solved by backward dynamic programming, as illustrated in \Cref{algo:upftrl} in Appendix \ref{app:efficient_upd_ftrl}.
% We remark that the computation of Eq. \eqref{update:FTRL} can be solved by reducing the update of \LSFTRL to an OMD-like update and we defer the details to Appendix \ref{app:efficient_upd_ftrl}.
% We remark that the computation of Eq. \eqref{update:FTRL} can be solved by reducing the update of \LSFTRL to an OMD-like update and we defer the details to Appendix \ref{app:efficient_upd_ftrl}.
Besides, the computation of our ``balanced transition'' $p^\star$ in Eq.~\eqref{eq:p_star} can also be carried out efficiently
by Algorithm~\ref{algo:compute_pstar}, and we defer the details to Appendix~\ref{app:maxlambda}.

