% \vspace{-0.2cm}
\section{Analysis}\label{sec:ftrl_analysis}
% In this section, we first present the regret upper bound of our \LSFTRL algorithm. Then in \Cref{sec:lower_bound}, we present the regret lower bound for learning IIEFGs with linearly parameterized rewards.
In this section, we first derive the regret upper bound for our \LSFTRL algorithm. Then, in \Cref{sec:lower_bound}, we provide the regret lower bound for learning IIEFGs with linearly parameterized rewards.

\subsection{Regret Upper Bound}
Let $p_{1: h}^\nu\left(x_h\right)\coloneqq\sum_{s_h\in x_h} p_{1: h}\left(s_h\right) \nu_{1: h-1}(y\left(s_{h-1}\right),$ $ b_{h-1})$, 
which can be seen as the probability of reaching $x_h$ contributed by environment transition $\sP=\{p_h\}_{h=0}^{H-1}$ and opponent's policy $\nu$. 
Denote by $\beta^\star_h\coloneqq\min_{x_h\in\gX_h} p^{\star}_{1:h}(x_h)$ and $\beta^{\nu}_h\coloneqq\max_{t\in[T],x_h\in\gX_h}p^{\nu^t}_{1:h}(x_h)$.
Then we define the ``balance coefficient'' $\lambda$ as $\lambda\coloneqq \max_{h\in[H]} \beta^{\nu}_h/\beta^\star_h$.
Besides, let $\rho=\min_{t\in[T],h\in[H]} \rho_{\min}(\mQ_{\mu^t,h}^t)$ be the minimum of all the minimal eigenvalues of the feature covariance matrices.\footnote{Note that $\rho>0$ is guaranteed because $\mu^1$ is set as the uniform policy in Line~\ref{algo:ftrl:init} of \Cref{algo:f2trl} and because of the closed-form update of $\mu^t$ shown in \Cref{algo:upftrl}.}

% \vspace{-0.2cm}
We are now ready to present the regret upper bound of \LSFTRL, the proof of which is postponed to  Appendix \ref{sec:app:ftrl}.
\begin{theorem}\label{thm:ftrl_trans}
For IIEFGs with linearly realizable rewards and known sequence-form transition probabilities,
    by setting learning rate $\eta=\sqrt{\frac{2\log (XA)}{Td}}$, the regret of \LSFTRL is upper bounded by 
    $\Reg_{\max}^T\leq\gO(\exp(\nicefrac{ L^2\sqrt{\log (XA)}}{(\beta^\star_H\rho\sqrt{Td})} )\lambda H\sqrt{ d T \log(XA)})$.
    % $\Reg_{\max}^T\leq\gO(\exp( L^2\sqrt{\log (XA)}/(\beta^\star_H\rho\sqrt{Td}))\lambda H\sqrt{ d T \log(XA)})$.
\end{theorem}
\begin{remark}
Intuitively, $\lambda$ measures the balance effect of the ``balanced transition'' $p^\star_{1: h}$ compared with the transition over the infoset-action space contributed by the environment state transition $\sP$ and the opponent's policy $\nu^t$. Indeed, due to the design of our ``balanced transition'' $p^\star$, $\lambda$ remains moderate when the environment state transition $\sP$ is nearly uniform (in particular, $\lambda\leq 1$ when the environment state transition $\sP$ is a uniform distribution and the game tree is a $k$-ary tree; see Lemma \ref{lem:lambda_leq_1} in Appendix \ref{app:sec:balanced_transition_property}).
On the other hand, the design of our ``balanced transition'' $p^\star$ also guarantees that $\beta^\star_H\geq \nicefrac{1}{X}$ and thus $\lambda\leq X$ in the worst-case scenario (see Lemma \ref{lem:lambda_leq_x} in Appendix \ref{app:sec:balanced_transition_property}).
Nevertheless, we should note that this worst case is very unlikely to happen in practice unless it simultaneously happens that (a) the environment state transitions along the trajectory $\{(s_h,a_h,b_h)\}_{h\in[H-1]}$ leading to $s_H$ such that
$p^{\star}_{1:H}(x(s_H))=\beta^\star_H$ 
satisfy $p_h(s_{h+1}| s_h,a_h,b_h)=1$ for all $(s_h,a_h,b_h)$ along the trajectory; and (b) the opponent \textit{knows} the underlying environment transitions and the mapping $y:\gS\to\gY$ so that the opponent can intentionally ensure $\nu^t_{1: H-1}\left(y\left(s_{H-1}\right), b_{H-1}\right)=1$ by setting $\nu^t(b_h| y(s_h))=1$ for all $(s_h,b_h)$ along the trajectory. 
Notice that condition (b) hardly happens in the self-play setting where the policies of the min-player are also generated by an algorithm.
Also, if the opponent is a pure adversary aiming to maximize the regret of the max-player and only condition (a) holds, the best that the opponent can do is to pick an action $b_y\in\mathcal{B}$ uniformly at random at each infoset $y\in\mathcal{Y}$ and set her policy $\nu^t$ such that $\nu^t(b_y|y)=1$.
This can only guarantee that $\nu^t_{1: H-1}\left(y\left(s_{H-1}\right), b_{H-1}\right)=1$ (and thus  $\lambda=X$) happens with an exponentially small probability of $B^{-(H-1)}$.
Additionally, we remark that $\lambda$ has nothing to do with the commonly discussed ``concentrability coefficient'' in offline RL literature \citep{KumarFSTL19}, which might be arbitrarily large in practice \citep{0009SAB20,XieJWXB21}.
\end{remark}
\begin{remark}
% For the adversarial linear bandit problem, which is a more amenable special case of IIEFGs with linearly parameterized rewards, one may eliminate the dependence of regret bound on $\nicefrac{1}{\rho}$ by mixing $\mu^t$ with an optimal design distribution. Yet for general linear IIEFG problems, it is highly unclear how to achieve this as for all $\mu\in\Pi_{\max}$, $\mu_{1:h}(\cdot,\cdot)$ is not even a probability distribution over $\gX_h\times \gA$. On the other hand, note that the dependence of $\nicefrac{1}{\rho}$ only appears in the exponential term in our regret upper bound, which is inversely correlated with the number of episodes $T$ and will approach $1$ for large enough $T$ (\textit{i.e.}, $T\geq\Omega(\nicefrac{ L^4\log (XA)}{((\beta^\star_H\rho)^2 d)})$). We also remark that the $\sqrt{\log (XA)}$ dependence has also appeared in works studying the more amenable (fully-observable) adversarial linear (mixture) MDPs \citep{neu2021online,ZhaoY0023,Li0Z24,LiZZ24}.
For the adversarial linear bandit problem, a more tractable special case of IIEFGs with linearly parameterized rewards, one can eliminate the dependence of the regret bound on $\nicefrac{1}{\rho}$ by mixing $\mu^t$ with an optimal design distribution. However, for general linear IIEFG problems, it is highly unclear how to achieve this, as for all $\mu \in \Pi_{\max}$, $\mu_{1:h}(\cdot,\cdot)$ is not even a valid probability distribution over $\gX_h \times \gA$. On the other hand, it is worth noting that the dependence on $\nicefrac{1}{\rho}$ only appears in the exponential term of our regret upper bound, which is inversely related to the number of episodes $T$ and approaches $1$ as $T$ grows large (\textit{i.e.}, $T \geq \Omega(\nicefrac{ L^4 \log (XA)}{((\beta^\star_H \rho)^2 d)})$). Besides, we remark that the $\sqrt{\log (XA)}$ dependence has also appeared in previous works studying the more tractable (fully observable) adversarial linear (mixture) MDPs \citep{neu2021online,ZhaoY0023,Li0Z24,LiZZ24}.
% Ignoring the logarithmic factor, \LSFTRL obtains the regret guarantee of order $\widetilde{\gO}(\lambda\sqrt{dH^2 T})$  when $T$ is large enough.
% Compared with previous results, the regret upper bound in Theorem \ref{thm:ftrl_trans} improves over the minimax optimal regret $\widetilde{\gO}(\sqrt{XAT})$ of \citet{Fiegel2023adapting} by an $\widetilde{\gO}(\sqrt{XA/H^2})$ factor in large $T$ regime (omitting the dependence on $d$ and $\lambda$). 
\end{remark}

\begin{figure*}[htbp!]
\centering
\includegraphics[width=0.95\linewidth]{Contents/figs/linear_iiefg_0520.pdf}
% \vspace{-0.4cm}
\caption{\label{fig:exp}
% Experiment results of baseline methods and \LSFTRL on two linear IIEFG environments. The curves show the value of Eq. \eqref{equation:regret} against the number of episodes and are averaged over $10$ different seeds, with the shaded areas as the $1$ standard error.
Experimental results of baseline methods and our \LSFTRL algorithm on two linear IIEFG environments. The curves depict the value of Eq.~\eqref{equation:regret} as a function of the number of episodes, averaged over 10 different seeds, with shaded areas representing one standard error.
}
\end{figure*}

% \vspace{-0.2cm}
% \paragraph{Technique Overview}
\subsubsection{Technique Overview}
In the following, 
% we briefly discuss the technical difficulties in deriving the regret upper bound in \Cref{thm:ftrl_trans} and how we overcome them.
we briefly explain the technical challenges involved in deriving the regret upper bound in \Cref{thm:ftrl_trans} and the approaches we use to address them.
\paragraph{Loss Estimates with Large Negative Magnitudes}
We bound the regret of \LSFTRL by considering the common analysis scheme of FTRL 
to decompose the regret into the \textit{penalty} term and the \textit{stability} term.
However, when bounding the stability term, simply following the previous analysis for tabular IIEFGs and other online learning problems with linear function approximation (say, adversarial linear bandits) does not address our problem. In detail, we note that the analysis of \citet{Fiegel2023adapting} to bound the stability term can only work with non-negative loss estimates, which naturally hold in the tabular case but not in the linear case. A plausible remedy from the analysis of adversarial linear bandits (see, \textit{e.g.}, Chap.~27 of \citet{lattimore2020bandit}) is to explicitly bound the Bregman divergence between $\nabla \Psi(\mu^t)$ and $\nabla \Psi(\mu^t)-\eta\hat{\ell}^t$ and use the inequality $\exp(x)\leq 1+x+x^2$ for $x\leq 1$, which in turn requires $\eta\hatellxa/\pstarxa\geq -1$ in our case. Unfortunately, $\eta\hatellxa/\pstarxa\geq -1$ does not hold in our case and thus thwarts this remedy. To tackle this issue, we instead first evaluate the Bregman divergence between $\nabla \Psi(\mu^t)$ and $\nabla \Psi(\mu^t)-\eta\hat{\ell}^t$ using a local norm with respect to some $z^t=\alpha\mu^t+(1-\alpha)\tilde{\mu}^{t+1}$ for some $\alpha\in[0,1]$,
where $\tilde{\mu}^{t+1}\coloneqq \nabla \Psi^{\star}(\nabla \Psi(\mu^t)-\eta\hat{\ell}^t)$ and $\Psi^{\star}$ is the convex conjugate of $\Psi$.
Then, with the observation that $\sfr{z}\leq\max\{\sfr{\mu},\sfrtildemu\}$ and $\sfrtildemu$ is proportional to $\sfr{\mu}$, we bound this local norm by upper bounding $\sfr{z}$ using $\sfr{\mu}$ (see Lemma \ref{lemma:varbound} for details).
\paragraph{Non-zero Loss Estimates and Balanced Effects of $p^\star$}
% Further, concluding the final bound of the stability term in our linear case requires particular care so that the variances of the loss estimates are well-controlled by $\lambda$ as well as $d$, which is enabled by the design of our ``balanced transition'' $p^\star$ computed in Algorithm \ref{algo:compute_pstar} (see Lemma \ref{lem:ftrl_stability_trans} for details). Besides, previous works studying tabular IIEFGs \citep{kozuno2021learning,bai2022nearoptimal,Fiegel2023adapting} solve the update in Eq. \eqref{update:FTRL} critically rely on the sparsity of the importance-weighted loss estimator, \textit{i.e.}, only the infoset-action pairs along the experienced trajectory $\{(x^t_h,a^t_h)\}_{h\in[H]}$ have non-zero loss estimates and the loss estimates of all other infoset-action pairs are zero in the tabular case. In contrast, all the infoset-action pairs in our linear case can have non-zero loss estimates and thus the methods in previous works are not applicable to our linear case. We overcome this obstacle by recursively considering all the descendants of one infoset-action pair $(x_h,a_h)$ instead of only one descendant $(x_{h+1}^{t},a_{h+1}^t)$ in the experienced trajectory of episode $t$ when updating policy $\mu^{t+1}_h(a_h|x_h)$ (see Appendix \ref{app:efficient_upd_ftrl}).
Moreover, in IIEFGs, bounding the stability term critically relies on the closed-form update of the policies (\textit{e.g.}, Eq. \eqref{update:FTRL} in our case). Nevertheless, previous works on tabular IIEFGs \citep{kozuno2021learning,bai2022nearoptimal,Fiegel2023adapting} solve the updates similar to Eq. \eqref{update:FTRL} by heavily relying on the sparsity of the importance-weighted loss estimator. Specifically, only the infoset-action pairs along the experienced trajectory $\{(x^t_h,a^t_h)\}_{h\in[H]}$ have non-zero loss estimates, while all other infoset-action pairs have zero loss estimates in the tabular case. However, in our linear case, all infoset-action pairs can have non-zero loss estimates, making the methods in previous works inapplicable. We address this challenge by recursively considering all descendants of a given infoset-action pair $(x_h, a_h)$, rather than just one descendant $(x_{h+1}^{t},a_{h+1}^t)$ from the experienced trajectory in episode $t$, when updating the policy $\mu^{t+1}_h(a_h|x_h)$ (see Appendix \ref{app:efficient_upd_ftrl} for details).
Finally, deriving the overall bound for the stability term in our case requires particular care to bound the ratio $\beta^{\nu}_h/(\min_{x_h} \tilde{p}_{1:h}(x_h))$ for any ``balanced transition'' $\tilde{p}$ adopted by the algorithm. To make this ratio well-controlled, we seek to maximize $\min_{x_h} \tilde{p}_{1:h}(x_h)$ for all $h\in[H]$. This is exactly what the design of our ``balanced transition'' $p^{\star}$ (defined in Eq.~\eqref{eq:p_star} and computed in Algorithm~\ref{algo:compute_pstar}) achieves: it guarantees that this ratio is upper bounded by the desired ``balance coefficient'' $\lambda$ (please refer to Lemma~\ref{lem:ftrl_stability_trans} for details).

% Finally, deriving the overall bound for the stability term in our linear case requires careful attention to ensure that the variances of the loss estimates are well-controlled by both $\lambda$ and $d$. This is facilitated by the design of our ``balanced transition'' $p^\star$, computed in Algorithm \ref{algo:compute_pstar} (please refer to Lemma \ref{lem:ftrl_stability_trans} for details).

% \vspace{-0.2cm}
\subsection{Regret Lower Bound}\label{sec:lower_bound}
The following theorem provides the regret lower bound of learning IIEFGs with linearly realizable rewards and known state transition probabilities, the proof of which is deferred to Appendix \ref{app:sec:lower_proof}.
\begin{theorem}\label{thm:regret_lower}
    Suppose $A\geq2$, $d\geq2$ and $T\geq 2d^2$. Then for any algorithm $\operatorname{Alg}$ that controls the max-player, generates and executes policies $\{\mu^t\}_{t\in[T]}$, there exists an IIEFG instance with linearly realizable rewards and known state transition probabilities on which $\Reg^T_{\max}\geq\Omega(\sqrt{d\min(d,H)T})$.
\end{theorem}
% \vspace{-0.1cm}
\begin{remark}
Note that neither the regret upper bound of our algorithm nor our regret lower bound has polynomial dependence on $X$ and $A$, as opposed to the $\Omega(\sqrt{X A T})$ regret lower bound of \citet{bai2022nearoptimal}.
However, we would like to note that this does not imply that our results contradict those of previous works, as both our regret upper and lower bounds are specifically established for IIEFG instances with linear structures over reward functions, while the regret lower bound of \citet{bai2022nearoptimal} is derived by considering learning on the IIEFG instances without any function approximation structures (\textit{i.e.}, tabular rewards).
Besides, we conjecture that the lower bound might be further improved to $\Reg^T_{\max}\!\geq\!\Omega(\sqrt{dHT})$, and currently
    the regret upper bound of \LSFTRL is loose by an $\widetilde{\gO}(\sqrt{H})$ factor in the large-$T$ regime (omitting the dependence on $\lambda$).
    Investigating possible improvements of the upper and lower bounds is an interesting yet challenging direction, which we leave for future work.
\end{remark}
% }
