% \vspace{-0.2cm}
\section{Preliminaries}\label{sec:setting}
% \vspace{-0.2cm}
% \zhao{TBC}
% 1. POMG: def
% 2. learning protocol
%     a. interaction procedure
%     b. value functions
% 3. perfect recall and tree structure
%     a. sequence-form transition probability
%     b. sequence-form policies 
%     c. sequence form representation (policy set)
% 4. linear assumption
% 5. regret and NE
%     a. learning objective
%     b. regret -> nash equilibrium

% Following previous works \citep{kozuno2021learning,bai2022nearoptimal}, we study 
% This section introduces the preliminaries of POMGs.
For ease of discussion, we study IIEFGs in the formulation of POMGs \citep{kozuno2021learning,bai2022nearoptimal}. In this section, we introduce the preliminaries of POMGs.

% We study two-player zero-sum iterative incomplete extensive form games (IIEFGs) by modeling them as Partially Observable Markov Games (POMGs), as proposed in \citep{kozuno2021learning,bai2022nearoptimal}.
% \vspace{-0.1cm}
\paragraph{Partially Observable Markov Games}
An episodic, finite-horizon, two-player zero-sum POMG is denoted by $\operatorname{POMG}(H,\gS,\gX,\gY,\gA,\gB,\sP,r)$, in which 
% $H$ is the length of the horizon; $\mathcal{S}=\bigcup_{h \in[H]} \mathcal{S}_h$ is a finite state space with cardinality $S=\sum_{h=1}^H S_h$ and $\left|\mathcal{S}_h\right|=S_h$; $\mathcal{X}=\bigcup_{h \in[H]} \mathcal{X}_h$ and $\mathcal{Y}=\bigcup_{h \in[H]} \mathcal{Y}_h$ are the spaces of information sets (short for \emph{infosets} below) for the \textit{max-player} and \textit{min-player}, respectively. In specific, the cardinality $X$ of $\gX$ satisfies $X\coloneqq\sum_{h=1}^H X_h$ with $\left|\mathcal{X}_h\right|=X_h$ and the cardinality $Y$ of $\gY$ satisfies $Y\coloneqq\sum_{h=1}^H Y_h$ with $\left|\mathcal{Y}_h\right|=Y_h$;  $\gA$ with $|\gA|=A$ and $\gB$ with $|\gB|=B$ are the finite action spaces for the max-player and min-player, respectively; $\mathbb{P}=\left\{p_0(\cdot) \in \Delta\left(\mathcal{S}_1\right)\right\} \cup\left\{p_h\left(\cdot \mid s_h, a_h, b_h\right) \in \Delta\left(\mathcal{S}_{h+1}\right)\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}, h \in[H-1]}$ are the transition
% probability functions\footnote{While in some games, $\{p_h\}_{h=1}^{H-1}$  might be time-homogeneous, \textit{i.e.}, $\{p_h\}_{h=1}^{H-1}$ does not depend on $h$,
% we retain the dependence on $h$ in our notations as it allows the results to be applicable more broadly without too much additional efforts in the analysis, following previous works \citep{bai2022nearoptimal,Fiegel2023adapting}.}, 
% with $p_0(\cdot)$ being the probability distribution of the initial states, and $p_h(s_{h+1}|s_h,a_h,b_h)$ being the probability of transmitting to the next state $s_{h+1}$ conditioned on $(s_h,a_h,b_h)$ at step $h$; and $r=\left\{r_h\left(s_h, a_h, b_h\right) \in[-1,1]\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}}$ are the stochastic reward functions with $\bar{r}_h\left(s_h, a_h, b_h\right)$ as means.
\begin{itemize}
% \vspace{-0.2cm}
\item $H$ is the length of the horizon;
\item $\mathcal{S}=\bigcup_{h \in[H]} \mathcal{S}_h$, where $\mathcal{S}_h\bigcap \mathcal{S}_{h^\prime}=\emptyset$ for all $h\neq h^\prime$, is a finite state space with cardinality $S=\sum_{h=1}^H S_h$ and $\left|\mathcal{S}_h\right|=S_h$, $\forall h\in[H]$;
% (we define $[H]=\{1,\ldots, H\}$);
% ($[H]$ denotes the set $\{1,\ldots, H\}$);
% \item $\mathcal{X}=\bigcup_{h \in[H]} \mathcal{X}_h$ and $\mathcal{Y}=\bigcup_{h \in[H]} \mathcal{Y}_h$ are the spaces of information sets (short for \emph{infosets} in what follows) for the \textit{max-player} and \textit{min-player}, respectively. The cardinality $X$ of $\gX$ satisfies $X=\sum_{h=1}^H X_h$ with $\left|\mathcal{X}_h\right|=X_h$ and the cardinality $Y$ of $\gY$ satisfies $Y=\sum_{h=1}^H Y_h$ with $\left|\mathcal{Y}_h\right|=Y_h$, $\forall h\in[H]$.
% Also, we denote by $x:\gS\to\gX$ and $y:\gS\to\gY$ the emission function for the max-player and min-player. 
\item $\mathcal{X}=\bigcup_{h \in[H]} \mathcal{X}_h$ is the finite space of information sets (short for \emph{infosets} in what follows) for the max-player, where $\mathcal{X}_h=\{x(s):s\in\gS_h\}$ 
% is a partition of $\gS_h$ 
with $x:\gS\to\gX$ as the emission function and $\mathcal{X}_h\bigcap \mathcal{X}_{h^\prime}=\emptyset$ for all $h\neq h^\prime$. 
The cardinality $X$ of $\gX$ satisfies $X=\sum_{h=1}^H X_h$ with $\left|\mathcal{X}_h\right|=X_h$. The finite space of infosets $\mathcal{Y}=\bigcup_{h \in[H]} \mathcal{Y}_h$ for the min-player and associated quantities are defined analogously;
% and $\mathcal{Y}=\bigcup_{h \in[H]} \mathcal{Y}_h$ are the spaces of information sets (short for \emph{infosets} in what follows) for the \textit{max-player} and \textit{min-player}, respectively. The cardinality $X$ of $\gX$ satisfies $X=\sum_{h=1}^H X_h$ with $\left|\mathcal{X}_h\right|=X_h$ and the cardinality $Y$ of $\gY$ satisfies $Y=\sum_{h=1}^H Y_h$ with $\left|\mathcal{Y}_h\right|=Y_h$, $\forall h\in[H]$.
% Also, we denote by $x:\gS\to\gX$ and $y:\gS\to\gY$ the emission function for the max-player and min-player. 

\item  $\gA$ with $|\gA|=A$ and $\gB$ with $|\gB|=B$ are the finite action spaces for the max-player and min-player, respectively;
\item 
% $\mathbb{P}\!=\!\left\{p_0(\cdot) \in \Delta_{\mathcal{S}_1}\right\} \cup\left\{p_h\left(\cdot| s_h, a_h, b_h\right) \in \Delta_{\mathcal{S}_{h+1}}\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}, h \in[H-1]}$ 
$\mathbb{P}=\left\{p_0(\cdot) \in \Delta_{\mathcal{S}_1}\right\} \bigcup\{p_h\left(\cdot| s_h, a_h, b_h\right) \in \Delta_{\mathcal{S}_{h+1}}\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}, h \in[H-1]}$ 
are the state transition probabilities,
% \footnote{While in some games, $\{p_h\}_{h=1}^{H-1}$  might be time-homogeneous, \textit{i.e.}, $\{p_h\}_{h=1}^{H-1}$ does not depend on $h$, we retain the dependence on $h$ in our notations as it allows the results to be applicable more broadly without too much additional efforts in the analysis, following previous works \citep{bai2022nearoptimal,Fiegel2023adapting}.} 
with $p_0(\cdot)$ as the probability distribution over initial states and $p_h(s_{h+1}|s_h,a_h,b_h)$ as the probability of transitioning to the next state $s_{h+1}$ conditioned on $(s_h,a_h,b_h)$ at step $h$;
\item $r=\left\{r_h\left(s_h, a_h, b_h\right) \in[-1,1]\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B},h\in[H]}$ are the random reward functions with $\bar{r}_h\left(s_h, a_h, b_h\right)$ as means.
\end{itemize} 

% \begin{itemize}
% \item $H$ is the length of the horizon;
% \item $\mathcal{S}=\bigcup_{h \in[H]} \mathcal{S}_h$ is a finite state space with cardinality $S=\sum_{h=1}^H S_h$ and $\left|\mathcal{S}_h\right|=S_h$;
% \item $\mathcal{X}=\bigcup_{h \in[H]} \mathcal{X}_h$ and $\mathcal{Y}=\bigcup_{h \in[H]} \mathcal{Y}_h$ are the spaces of information sets (short for \emph{infosets} below) for the \textit{max-player} and \textit{min-player}, respectively. In specific, the cardinality $X$ of $\gX$ satisfies $X\coloneqq\sum_{h=1}^H X_h$ with $\left|\mathcal{X}_h\right|=X_h$ and the cardinality $Y$ of $\gY$ satisfies $Y\coloneqq\sum_{h=1}^H Y_h$ with $\left|\mathcal{Y}_h\right|=Y_h$;
% \item  $\gA$ with $|\gA|=A$ and $\gB$ with $|\gB|=B$ are the finite action spaces for the max-player and min-player, respectively;
% \item $\mathbb{P}=\left\{p_0(\cdot) \in \Delta\left(\mathcal{S}_1\right)\right\} \cup\left\{p_h\left(\cdot \mid s_h, a_h, b_h\right) \in \Delta\left(\mathcal{S}_{h+1}\right)\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}, h \in[H-1]}$ are the transition
% probability functions\footnote{While in some games, $\{p_h\}_{h=1}^{H-1}$  might be time-homogeneous, \textit{i.e.}, $\{p_h\}_{h=1}^{H-1}$ does not depend on $h$,
% we retain the dependence on $h$ in our notations as it allows the results to be applicable more broadly without too much additional efforts in the analysis, following previous works \citep{bai2022nearoptimal,Fiegel2023adapting}.}, 
% with $p_0(\cdot)$ being the probability distribution of the initial states, and $p_h(s_{h+1}|s_h,a_h,b_h)$ being the probability of transmitting to the next state $s_{h+1}$ conditioned on $(s_h,a_h,b_h)$ at step $h$;
% \item $r=\left\{r_h\left(s_h, a_h, b_h\right) \in[-1,1]\right\}_{\left(s_h, a_h, b_h\right) \in \mathcal{S}_h \times \mathcal{A} \times \mathcal{B}}$ are the stochastic reward functions with $\bar{r}_h\left(s_h, a_h, b_h\right)$ as means.
% \end{itemize} 

% [TBC]
% We further introduce mapping $\gX : \gS \rightarrow \gX$ with $\gX(s_h)$ denoting the infoset $s_h$ is in.[TBC]

% \vspace{-0.3cm}
\paragraph{Learning Protocol} 
% Denote by $\mu= \{\mu_h\}_{h\in[H]}$ with $\mu_h:\gX_h\to \Delta_{\gA}$ the max-player's stochastic policy and by $\Pi_{\max}=\{\mu:\gX\to\Delta_{\gA}\}$ the set of the policies of the max-player. 
Let $\mu= \{\mu_h\}_{h\in[H]}$ be the max-player's (stochastic) policy, where $\mu_h:\gX_h\to \Delta_{\gA}$. We denote by $\Pi_{\max}=\{\mu:\gX\to\Delta_{\gA}\}$ the set of the policies of the max-player. 
% Denote by $\mu= \{\mu_h\}_{h\in[H]}$ with $\mu_h:\gX_h\to \Delta_{\gA}$ the max-player's stochastic policy and by $\Pi_{\max}=\{\mu:\gX\to\Delta_{\gA}\}$ the set of the policies of the max-player. 
Similarly, the min-player's (stochastic) policy is defined as $\nu=\{\nu_h\}_{h\in[H]}$ 
% with $\nu^{t}_h:\gY_h\to \Delta_{\gB}$ 
and
the set of the policies of the min-player is denoted by $\Pi_{\min}$. The game proceeds in $T$ episodes. 
% The min-player's stochastic policy $\nu$ and the set of the policies of the min-player $\Pi_{\min}$ are defined similarly. The game proceeds in $T$ episodes. 
% At the beginning of episode $t$, the max-player chooses a stochastic policy $\mu_t\in\Pi_{\max}$. And similarly, the min-player chooses $\nu_t\in\Pi_{\min}$.
At the beginning of episode $t$, the max-player and the min-player choose policies $\mu^t\in\Pi_{\max}$ and $\nu^t\in\Pi_{\min}$, respectively.
Then, an initial state $s^t_1$ will be sampled from $p_0(\cdot)$. At each step $h$, the max-player and min-player will only observe their infosets $x_h^t\coloneqq x\left(s_h^t\right)$ and $y_h^t\coloneqq y\left(s_h^t\right)$ respectively, 
% where we denote by $x:\gS\to\gX$ and $y:\gS\to\gY$ the emission function for the max-player and min-player. 
but \textit{not} the underlying state $s^t_h$. 
% Note that the underlying state $s^t_h$ is not observable to both players.
Conditioned on $x^t_h$, the max-player will take an action $a_h^t \sim \mu_h^t\left(\cdot | x_h^t\right)$ and simultaneously the min-player will take an action $b_h^t \sim \nu_h^t\left(\cdot | y_h^t\right)$. Subsequently, the game will transition to the next state $s^t_{h+1}\sim p_h\left(\cdot | s_h^t, a_h^t, b_h^t\right)$. Meanwhile, the max-player and min-player will receive rewards $r_h^t\coloneqq r_h\left(s_h^t, a_h^t, b_h^t\right)$ and $-r_h^t$ respectively. The $t$-th episode will terminate after the max-player and the min-player take actions $a^t_H$ and $b^t_H$ and receive rewards $r_H^t$ and $-r_H^t$, respectively.
% , \textit{i.e.}, the game will terminate in $H$ steps.
% [TBC] where we let $x:\gS\to\gX$ to denote
% \zhao{05.20}

% \vspace{-0.3cm}
\paragraph{Perfect Recall and Tree Structure}
As in previous works 
\citep{kozuno2021learning,bai2022nearoptimal,Fiegel2023adapting}, we suppose that the POMGs satisfy the \textit{tree structure} and the \textit{perfect recall} condition
% \shuai{justify they are common and mild assumptions} 
\citep{kuhn11953extensive}. Specifically, the tree structure means that for any $h\!=\!2,\ldots,H$ and $s_h\!\in\!\gS_h$, there exists a \textit{unique} trajectory $(s_1,a_1,b_1,\ldots, s_{h-1},a_{h-1},b_{h-1})$ leading to $s_h$.
% \shuai{seems quite strong, justify}. 
The perfect recall condition holds for the max-player if for any $h=2,\ldots,H$ and any infoset $x_h\in\gX_h$, there exists a \textit{unique} history $\left(x_1, a_1, \ldots, x_{h-1}, a_{h-1}\right)$ leading to $x_h$, and it is defined analogously for the min-player. In addition, we denote by $C_{h^\prime}(x_h,a_h)\subset\gX_{h^\prime}$ the descendants of $(x_h,a_h)$ at step $h^\prime\geq h$. With a slight abuse of notation, we also let $C_{h^\prime}(x_h)\coloneqq \cup_{a_h\in \gA}C_{h^\prime}(x_h,a_h)$ and $C(x_h,a_h)\coloneqq C_{h+1}(x_h,a_h)$.
% and $C(x_h)\coloneqq \cup_{a_h\in \gA}C_{h+1}(x_h,a_h)$.
% \zhao{
% TBF
% }
% \shuai{no $C(x_h)$?}

% \vspace{-0.2cm}
\paragraph{Sequence-form Representations}
For any product policy $(\mu,\nu)$, the tree structure and perfect recall condition enable the \textit{sequence-form representations} of the reaching probability of the state-action tuple $\left(s_h, a_h, b_h\right)$:
% \begin{align}\label{eq:reaching_prob}
% % \setlength\abovedisplayskip{3pt}
% % \setlength\belowdisplayskip{3pt}
%     \sP^{\mu,\nu}(\!s_h,\!a_h,\!b_h\!)\! =\! p_{1:h}(\!s_h\!)\mu_{1:h}(\!x(s_h),\!a_h\!)\nu_{1:h}(\!y(s_h),\!b_h\!)\,,
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align}\label{eq:reaching_prob}
    &\sP^{\mu,\nu}(s_h,a_h,b_h) \notag\\
    = &p_{1:h}(s_h)\mu_{1:h}(x(s_h),a_h)\nu_{1:h}(y(s_h),b_h)\,,
\end{align}
\endgroup
where 
% $p_{1:h}(s_h)$ is the sequence-form transition probability defined as $p_{1: h}\left(s_h\right)= p_0\left(s_1\right) \prod_{h^{\prime} \leq h-1} p_{h^{\prime}}\left(s_{h^{\prime}+1} \mid s_{h^{\prime}}, a_{h^{\prime}}, b_{h^{\prime}}\right)$,
$p_{1: h}\left(s_h\right)= p_0\left(s_1\right) \prod_{h^{\prime}=1}^{h-1} p_{h^{\prime}}\left(s_{h^{\prime}+1}| s_{h^{\prime}}, a_{h^{\prime}}, b_{h^{\prime}}\right)$ is the sequence-form transition probability, and
$\mu_{1: h}\left(x_h, a_h\right)\!\coloneqq\!\prod_{h^{\prime}=1}^h \mu_{h^{\prime}}\left(a_{h^{\prime}}| x_{h^{\prime}}\right)$ and $\nu_{1: h}\left(y_h, b_h\right)\coloneqq\prod_{h^{\prime}=1}^h \nu_{h^{\prime}}\left(b_{h^{\prime}}| y_{h^{\prime}}\right)$ are the sequence-form policies. Here, each product is taken along the unique trajectory (resp.\ history) leading to $s_h$ (resp.\ $x_h$ and $y_h$), which is well defined under the tree structure and perfect recall conditions.
% \zhao{In this work, we assume that the products of the transition probabilities of $(s_{h^\prime},a_{h^\prime},s_{h^\prime+1})$ along the trajectory $\{(s_{h^\prime},a_{h^\prime},s_{h^\prime+1})\}_{h^\prime=1}^h$ that leads to any $s_h$, \textit{i.e.}, $$}
% \zhao{
% In this work, we assume that the sequence-form transition probability $p_{1:h}(s_h)$ of any $s_h$ is known. 
% }
Under sequence-form representations, we slightly abuse the meanings of $\mu$ and $\nu$ by viewing $\mu=\{\mu_{1: h}\}_{h\in[H]}$ and 
$\nu=\{\nu_{1: h}\}_{h\in[H]}$.
Also, it is clear that $\Pi_{\max}$ is a convex compact subset of $\sR^{XA}$ satisfying constraints $\mu_{1: h}\left(x_h, a_h\right) \geq 0$ and $\sum_{a_h \in \mathcal{A}} \mu_{1: h}\left(x_h, a_h\right)\!=\!\mu_{1: h-1}\left(x_{h-1}, a_{h-1}\right)$ with $(x_{h-1}, a_{h-1})$ being such that $x_h\!\in\! C(x_{h-1}, a_{h-1})$ (understanding $\mu_{1:0} (x_0,a_0)=p(\emptyset)= 1$).

% We 
% In this work, we assume that the knowledge of the sequence-form transition probabilities is accessible to the player, illustrated in the following assumption.
In this work, we assume that the player has access to the knowledge of the sequence-form transition probabilities, as explained in the following assumption.
\begin{assumption}\label{ass:known_P}
    % In this work, we assume that t
    The sequence-form transition probability $p_{1:h}(s_h)$ of any $s_h$ is known. 
\end{assumption}
\begin{remark}
    Note that this is slightly weaker than assuming that $\sP$ is known, as Assumption \ref{ass:known_P} only requires that $p_{1: h}\left(s_h\right)= p_0\left(s_1\right) \prod_{h^{\prime}=1}^{h-1} p_{h^{\prime}}\left(s_{h^{\prime}+1}| s_{h^{\prime}}, a_{h^{\prime}}, b_{h^{\prime}}\right)$ is known. Although this assumption is not required in previous works studying tabular POMGs \citep{kozuno2021learning,bai2022nearoptimal}, we remark that a similar assumption of knowing $\sP$ is also required by \citet{neu2021online}, which takes the first step toward learning adversarial linear MDPs. We leave the question of whether this assumption can be eliminated in our problem as future work.
\end{remark}

% --------------------------------------------------------------------------------------
% In an IIEFG, due to imperfect information, a player will have to decide on his action based on his current infoset over time. At each infoset, a player's strategy is a probability distribution $p(\cdot|x_h)$ over $\Delta(\gA)$. Under perfect recall, sequence-form representation encodes this sequential decision making process: 
% \[\forall x_h,a_h\in \gX\times\gA,\mu_{1:h}(x_h,a_h)\coloneqq \prod_{h^\prime=1}^h \mu_{h^\prime}(a_{h^\prime}|x_{h^\prime})\]

% We call such $\mu$ \emph{realization plan}. It can be shown that realization plan demonstrates linearity : $\forall x_h,a_h\in \gX\times\gA,\sum_{a_h\in\gA}\mu_{1:h}(x_h,a_h) = \mu_{1:h-1}(x_{h-1},a_{h-1})$ where $(x_{h-1},a_{h-1})$ is the unique parent infoset and action prior to $x_h$, with $\mu_1:0(x_0,a_0)=p(\emptyset)= 1$.

% Back to our POMG setting, we can formulate the probability of reaching state-action $(s_h,a_h,b_h)$ as : 
% \begin{align}
%     \sP^{visit}(s_h,a_h,b_h) = p_{1:h}(s_h)\mu^t_{1:h}(\gX(s_h),a_h)\nu^t_{1:h}(\gY(s_h),b_h)\,
% \end{align}
% where the underlying state transition is Markovian 
% \[p_{1:h}(s_h)\coloneqq p_0(s_1)\prod_{h^\prime=1}^{h-1}p_{h^\prime}(s_{h^\prime+1}|s_{h^\prime},a_{h^\prime},b_{h^\prime})\,\]

% \iffalse
% \begin{align*}
% &\sumH\sum_{(s_h,a_h,b_h)\in \gS_h\times\gA\times\gB}p_{1:h}(s_h)\mu^t_{1:h}(\gX(s_h),a_h)\nu^t_{1:h}(\gY(s_h),b_h)\\&=\sumH\sum_{(s_h,a_h,b_h)\in \gS_h\times\gA\times\gB}\sP^{visit}(s_h,a_h,b_h)\\
% &=1
% \end{align*}
%\fi
% --------------------------------------------------------------------------------------

% [TBC]
% Here we formally define the \emph{perfect recall} assumption mentioned in the introduction: for each infoset $x_h\in\gX$, there exists a unique history of past infosets and actions : $(x_1,a_1,\dots,x_h)$ leading to $x_h$ (similar assumptions for the min-player).  
% We further introduce some further denotations, $C_{h^\prime}(x_h,a_h)\in\gX_{h^\prime}$ represents the descendants of $(x_h,a_h)$ at level $h^\prime$ and $C_{h^\prime}(x_h)\coloneqq \cup_{a_h\in \gA}C_{h^\prime}(x_h,a_h)$; specially,  $C(x_h,a_h)\in\gX_{h+1}$ denotes the descendants of $(x_h,a_h)$ at level $h+1$.
% Moreover, to exploit the structure of game tree in IIEFG, we are only interested in POMGs with \emph{tree structure}, that is : For $s_h\in\gS$, there exists a unique history $(s_1,a_1,b_1,\dots s_{h_1},a_{h-1},b_{h-1},s_h)$ of past states and  actions leading to $s_h$.
% [TBC]

% \vspace{-0.2cm}
\paragraph{POMGs with Linear Function Approximation} We now introduce the definition of linear realizability over the reward functions of POMGs, detailed as follows.

\begin{definition}[Linear Rewards in POMGs]\label{assumption:linear}
% \begin{assumption}[Linear Rewards]\label{assumption:linear}
The reward function $r$ in $\operatorname{POMG}(H,\gS,\gX,\gY,\gA,\gB,\sP,r)$ is linearly realizable with a known feature mapping $\vphi:\gS\times\gA\times\gB\to\sR^d$ if for each $h\in[H]$, there exists an unknown parameter vector $\vtheta_h\in \sR^d$ 
such that $\bar{r}_h(s_h,a_h,b_h)=\left\langle\vphi(s_h,a_h,b_h),\vtheta_h\right\rangle$ for any $(s_h,a_h,b_h)\!\in\! \gS_h\times\gA\times \gB$. Further, 
% w.l.o.g., 
we assume that 
% $\sup_{(s_h,a_h,b_h)\in \gS_h\times\gA\times \gB}|\bar{r}_h(s_h,a_h,b_h)|\leq 1$,
% $\sup_{(s_h,a_h,b_h)\in \gS_h\times\gA\times \gB}\|\vphi(s_h,a_h,b_h)\|_2\!\leq\! 1$
$\sup_{(s_h,a_h,b_h)\in \gS_h\times\gA\times \gB}\|\vphi(s_h,a_h,b_h)\|_2\leq L$
and $\{\vphi(s_h,a_h,b_h)\}_{(s_h,a_h,b_h)\in \gS_h\times\gA\times \gB}$ spans $\sR^d$, $\forall h\in[H]$.
\end{definition}

% \vspace{-0.2cm}
Similar definitions of linear reward functions can also be seen in fully observable linear MGs \citep{XieCWY20}. However, as we shall see in Section \ref{sec:Linear_Loss_Estimator}, the imperfect information in POMGs makes it significantly more difficult to exploit the linear structure of the reward functions than in fully observable MGs.
% \zhao{TBF: subject mismatch before and after `compared with'.}
% \shuai{Note that xxx}We also 
Note that the regularity assumption 
% imposed over $\bar{r}_h(\cdot,\cdot,\cdot)$ and $\vphi(\cdot,\cdot,\cdot)$ 
that the range of $\bar{r}_h(\cdot,\cdot,\cdot)$ and the norm of $\vphi(\cdot,\cdot,\cdot)$ are bounded
is only for the purpose of normalization, and the assumption that $\sR^d$ is spanned by the feature vectors is for convenience only \citep{lattimore2020bandit}.
% \end{remark}

% \paragraph{Offline Setting} 

% --------------------------------------------------------------------------------------
% \begin{assumption}[Linear POMGs]
% An IIG {\small$(\gS,\gX,\gY,\gA,\gB,H,p,r)$} is a \emph{linear IIG} with a feature mapping $\vphi: \gS\times\gA\times\gB \rightarrow \R^d$, where at each level $h$, $\rm span(\phi_h)=\R^d$. If for any $h\in[H-1]$, there exists an \emph{unknown} vector $\vtheta_h\in \R^d$, such that for any $(s_h,a_h,b_h)\in \gS\times\gA\times \gB$, we have
% \begin{align*}
%     r_h(s_h,a_h,b_h)=\left\langle\vphi(s_h,a_h,b_h),\vtheta_h\right\rangle.
% \end{align*}
% \end{assumption}
% That is to say, the player can only have access to the feature mapping and the rewards along his trajectory.
% --------------------------------------------------------------------------------------

% --------------------------- commented by canzhe -----------------------------------------------------------
% \paragraph{Regret and Nash Equilibrium} 
% To start with, for any pair of product policy $(\mu,\nu)$, define the value function of $(\mu,\nu)$ as 
% \begin{align}\label{equation:value}
%     V^{\mu,\nu}=\E\left[\sum_{h=1}^H r_h(s_h,a_h,b_h)\Big|\mu,\nu, \sP\right]\,,
% \end{align}
% where the expectation is taken over the randomness of the underlying state transitions and the policies of both players. Particularly, the max-player and min-player aim to maximize and minimize the value function, respectively.
% We consider two learning objectives: regret minimization and learning of NE.
% Without loss of generality, we consider the case where the max-player is the learning agent, and the min-player is the (potentially adversarial) opponent,
% % W.l.o.g., we are interested in minimizing the regret of the max-player and assume that the min-player 
% who might choose her policy $\nu^t$ arbitrarily, probably based on all the history information (including the knowledge of $\{\mu_k\}_{k=1}^{t-1}$) before episode $t$. In specific, the max-player aims to design policies $\left\{\mu^t\right\}_{t=1}^T$ to minimize the \textit{pseudo-regret} compared with the best fixed policy $\mu^\dagger$ in hindsight, defined as
% \begin{align}\label{equation:regret}
%   \Reg^T_{\max} = \max_{\mu^\dagger\in\Pi_{\max}} \E\left[\sum_{t=1}^T \left(V^{\mu^\dagger, \nu^t} - V^{\mu^t, \nu^t}\right)\right]\,.
% \end{align}
% % In the regret minimization setting, 
% In this work, we consider the regret minimization for the max-player in the \textit{offline} setting, in which the max-player has access to the 
% feature vectors of state-action weighted by min-player's policy $\nu^{t}$ in episode $t$ (as well as transitions)
% % min-player's policy $\nu^{t}$ in episode $t$ 
% after the $t$-th episode ends. Note that this is slightly more general than the ``offline" setting considered by \citet{ChenZG22a,XieCWY20}, as we neither require the policy $\nu^{t}$ to be accessible to the max-player nor require both players to be directly controlled by a central controller.
% [TBC]

% A produce policy $(\mu,\nu)$ is said to be an $\varepsilon$-approximate Nash equilibrium if 
% \begin{align*}
%   \negap(\mu, \nu) \coloneqq \max_{\mu^\dagger\in\Pi_{\max}} V^{\mu^\dagger, \nu} - \min_{\nu^\dagger\in\Pi_{\min}} V^{\mu, \nu^\dagger} \le \varepsilon\,,
% \end{align*}
% which means that $\mu$ and $\nu$ are $\varepsilon$-approximate best responses to each other.
% Moreover, the following standard online-to-batch conversion guarantees that sublinear regret for both players implies the pair of average policies $(\bar{\mu},\bar{\nu})$, defined as $\bar{\mu}\coloneqq \frac{1}{T} \sum_{t=1}^T \mu^t$ and $\bar{\nu}\coloneqq\frac{1}{T} \sum_{t=1}^T \nu^t$, is an approximate NE.

% \begin{proposition}[\citet{kozuno2021learning}, Theorem 1]\label{proposition:reg_to_nash}
%   For any sequence of policies $\left\{\mu^t\right\}_{t=1}^T\subset\Pi_{\max}$ and $\left\{\nu^t\right\}_{t=1}^T\subset\Pi_{\min}$, the average policies $\Bar{\mu},\Bar{\nu}$ satisfy
%   \begin{align*}
%     \negap(\Bar{\mu}, \Bar{\nu}) = \frac{\Reg_{\max}^T + \Reg_{\min}^T}{T}\,,
%   \end{align*}
% where $\mathfrak{R}_{\max }^T$ is the max-player's regret defined in Eq. \eqref{equation:regret} and $\Reg_{\min}^T\coloneqq\max _{\nu^{\dagger} \in \Pi_{\min }} \sum_{t=1}^T(V^{\mu^t, \nu^t}-V^{\mu^t, \nu^{\dagger}})$ is the min-player's regret.
% \end{proposition}
% Proposition \ref{proposition:reg_to_nash} indicates that an approximate NE can be learned if both players run algorithms to obtain the sublinear regret against each other, in a \textit{self-play} manner, where both players are controlled by a central controller \footnote{Since in \textit{self-play} setting the central controller can pass the min-player's policy $\nu^t$ on to the max-player after the $t$-th episode ends and vice versa, by ``self-play" we implicitly denote that both players are in the \textit{offline} setting. [TBC]}. 
% [TBC]

% \paragraph{Additional Notations} With sequence-form representations, for any $\mu\in\Pi_{\max}$ and a sequence of functions $f=(f_h)_{h\in[H]}$ with $f_h:\gX_h\times\gA\to\sR$, we let $\langle\mu, f\rangle\coloneqq \sum_{h \in[H]} \sum_{x_h \in \mathcal{X}_h, a \in \mathcal{A}} \mu_{1: h}\left(x_h, a_h\right) f_h\left(x_h, a_h\right)$.
% We denote by $\gF^t$ the $\sigma$-algebra generated by $\{(s^k_h,a^k_h,b^k_h,r^k_h)\}_{h\in[H], k\in[t]}$.  For simplicity, we abbreviate $\mathbb{E}\left[\cdot \mid \mathcal{F}^{t}\right]$ as $\mathbb{E}^{t}[\cdot]$.  The notation $\widetilde{\gO}(\cdot)$ in this paper hides all the logarithmic factors.
% --------------------------- commented by canzhe -----------------------------------------------------------
% \vspace{-0.1cm}
% \paragraph{Regret Minimization} 
\paragraph{Learning Objective} 
For any product policy $(\mu,\nu)$, 
denote by $V^{\mu,\nu}\!=\!\E_{\mu,\nu}\left[\sum_{h=1}^H r_h(s_h,a_h,b_h)\right]$ the value function of $(\mu,\nu)$,
% the value function of $(\mu,\nu)$ is defined as $ V^{\mu,\nu}\!=\!\E_{\mu,\nu}\left[\sum_{h=1}^H r_h(s_h,a_h,b_h)\right]$,
% \begin{align}\label{equation:value}
%     V^{\mu,\nu}=\E\left[\sum_{h=1}^H r_h(s_h,a_h,b_h)\middle|\mu,\nu, \sP\right]\,,
% \end{align}
% where the expectation is taken over the randomness of the 
% underlying state transitions and the policies of both players. 
where the expectation is taken over the randomness of the policies $(\mu,\nu)$ and the environment.
% the state transitions, and reward functions. 
% Particularly, the max-player and min-player aim to maximize and minimize the value function, respectively.
% \chen{This sentence may be unnecessary.}
% We consider two learning objectives: regret minimization and learning of NE.
% In this paper, the learning objective of regret minimization is considered.
In this paper, we focus on the learning objective of regret minimization.
% Without loss of generality, 
W.l.o.g.,
we consider the case where the max-player is the learning agent, and the min-player is the (potentially adversarial) opponent,
% W.l.o.g., we are interested in minimizing the regret of the max-player and assume that the min-player 
who might choose her policy $\nu^t$ arbitrarily, possibly based on all the history information (including the knowledge of $\{\mu^k\}_{k=1}^{t-1}$) up to episode $t-1$. Formally, the max-player aims to design policies $\left\{\mu^t\right\}_{t=1}^T$ to minimize the \textit{pseudo-regret} (\textit{regret} for short) compared with the best fixed policy $\mu^\dagger$ in hindsight:
% \begin{align}\label{equation:regret}
%   \Reg^T_{\max} = \max_{\mu^\dagger\in\Pi_{\max}} \E\left[\sum_{t=1}^T \left(V^{\mu^\dagger, \nu^t} - V^{\mu^t, \nu^t}\right)\right]\,,
% \end{align}
\begingroup
% \setlength{\belowdisplayskip}{4pt} \setlength{\belowdisplayshortskip}{4pt}
% \setlength{\abovedisplayskip}{4pt} \setlength{\abovedisplayshortskip}{4pt}
\begin{align}\label{equation:regret}
  \Reg^T_{\max} = \max_{\mu^\dagger\in\Pi_{\max}} \E\left[\sum_{t=1}^T \left(V^{\mu^\dagger, \nu^t} - V^{\mu^t, \nu^t}\right)\right]\,,
\end{align}
\endgroup
where the expectation is taken over the (potential) randomness of both the max-player and min-player.
% In the regret minimization setting, 
% In this work, we consider the regret minimization for the max-player in the \textit{offline} setting, in which the max-player has access to the 
% feature vectors of state-action weighted by min-player's policy $\nu^{t}$ in episode $t$ (as well as transitions)
% after the $t$-th episode ends. 
% Note that this is slightly more general than the ``offline" setting (also called self-play) considered by \citet{ChenZG22a,XieCWY20}, as we neither require the policy $\nu^{t}$ to be accessible to the max-player nor require both players to be directly controlled by a central controller.

% A produce policy $(\mu,\nu)$ is said to be an $\varepsilon$-approximate Nash equilibrium if 
% \begin{align*}
%   \negap(\mu, \nu) \coloneqq \max_{\mu^\dagger\in\Pi_{\max}} V^{\mu^\dagger, \nu} - \min_{\nu^\dagger\in\Pi_{\min}} V^{\mu, \nu^\dagger} \le \varepsilon\,,
% \end{align*}
% which means that $\mu$ and $\nu$ are $\varepsilon$-approximate best responses to each other.
% Moreover, the following standard online-to-batch conversion guarantees that sublinear regret for both players implies the pair of average policies $(\bar{\mu},\bar{\nu})$, defined as $\bar{\mu}\coloneqq \frac{1}{T} \sum_{t=1}^T \mu^t$ and $\bar{\nu}\coloneqq\frac{1}{T} \sum_{t=1}^T \nu^t$, is an approximate NE.

% \begin{proposition}[\citet{kozuno2021learning}, Theorem 1]\label{proposition:reg_to_nash}
%   For any sequence of policies $\left\{\mu^t\right\}_{t=1}^T\subset\Pi_{\max}$ and $\left\{\nu^t\right\}_{t=1}^T\subset\Pi_{\min}$, the average policies $\Bar{\mu},\Bar{\nu}$ satisfy
%   \begin{align*}
%     \negap(\Bar{\mu}, \Bar{\nu}) = \frac{\Reg_{\max}^T + \Reg_{\min}^T}{T}\,,
%   \end{align*}
% where $\mathfrak{R}_{\max }^T$ is the max-player's regret defined in Eq. \eqref{equation:regret} and $\Reg_{\min}^T\coloneqq\max _{\nu^{\dagger} \in \Pi_{\min }} \sum_{t=1}^T(V^{\mu^t, \nu^t}-V^{\mu^t, \nu^{\dagger}})$ is the min-player's regret.
% \end{proposition}
% Proposition \ref{proposition:reg_to_nash} indicates that an approximate NE can be learned if both players run algorithms to obtain the sublinear regret against each other, in a \textit{self-play} manner, where both players are controlled by a central controller \footnote{Since in \textit{self-play} setting the central controller can pass the min-player's policy $\nu^t$ on to the max-player after the $t$-th episode ends and vice versa, by ``self-play" we implicitly denote that both players are in the \textit{offline} setting. [TBC]}. 
% [TBC]
% \vspace{-0.1cm}
\paragraph{Additional Notations} 
We slightly abuse the notation to view $x_h$ as the set $\{s\in\gS_h:x(s)=x_h\}$, when writing $s\in x_h$.
With sequence-form representations, for any $\mu\in\Pi_{\max}$ and a sequence of functions $f=(f_h)_{h\in[H]}$ with $f_h:\gX_h\times\gA\to\sR$, let $\langle\mu, f\rangle\coloneqq \sum_{h \in[H],(x_h,a_h) \in \mathcal{X}_h\times \mathcal{A}} \mu_{1: h}\left(x_h, a_h\right) f_h\left(x_h, a_h\right)$. We denote by $\gF^t$ the $\sigma$-algebra generated by $\{(s^k_h,a^k_h,b^k_h,r^k_h)\}_{h\in[H], k\in[t]}$.  For simplicity, we abbreviate $\mathbb{E}\left[\cdot \mid \mathcal{F}^{t}\right]$ as $\mathbb{E}^{t}[\cdot]$. The notation $\widetilde{\gO}(\cdot)$ in this paper suppresses all the logarithmic factors.



% --------------------------------------------------------------------------------------
% \subsection{Episodic Unfolding, Adversarial Game and Bandit Feedback}
% \label{sec:episodic_unfolding}

% We view the max player's perspective, presuming the min player is an arbitrary, potentially \emph{adversarial} opponent who can select her policy $\nu^t$ founded on the entire history, comprehending $\mu^t$, preceding episode $t$'s commencement. Here \emph{adversarial} means that we allow one player (W.L.O.G min-player) to change the reward function of the extensive-form game at her will at each episode $t$. Moreover, we consider a more difficult setting where the other player (max-player) only receives partial information of the reward function. Specifically, he can only have access to the reward along the trajectory he went through (also called \emph{trajectory feedback} in \citep{Fiegel2023adapting}). 
% --------------------------------------------------------------------------------------

% --------------------------------------------------------------------------------------
% \paragraph{Value Functions and Regret}
% At episode $t$, given the min-player's policy $\nu^t$, we define the value function of the max-player as 
% \begin{align}
% \label{equation:value}
%     V^{\mu,\nu^t}=\E ^{\mu,\nu^t}[\sum_{h=1}^H \rr_h(\rs_h,\ra_h,\rb_h)]
% \end{align}
% The two players subsequently engage in episode $t$ jointly employing $(\mu^t,\nu^t)$. The max player's objective is crafting a policy sequence $\left\{\mu^t\right\}_{t=1}^T$ that minimizes regret relative to the optimal stationary policy in hindsight.
% \begin{align}
% \label{equation:regret}
%   \Reg^T_{\max} \coloneqq \max_{\mu^\dagger\in\Pi_{\max}} \sum_{t=1}^T \left(V^{\mu^\dagger, \nu^t} - V^{\mu^t, \nu^t}\right)\,.
% \end{align}
% --------------------------------------------------------------------------------------

% --------------------------------------------------------------------------------------
% \paragraph{Average Profile and $\varepsilon$-Nash-equilibrium}
% For sequence-form representation, we can define \emph{Average Profile} over $T$ episodes as 
% \begin{equation}\label{eq:average_profile}
%     \Bar{\mu}=\frac{1}{T}\sum_{t=1}^T\mu^t
% \end{equation}
% We say a product policy $(\mu, \nu)$ is an $\epsilon$-approximate Nash equilibrium ($\epsilon$-NE) if
% \begin{align*}
%   \negap(\mu, \nu) \coloneqq \max_{\mu^\dagger\in\Pi_{\max}} V^{\mu^\dagger, \nu} - \min_{\nu^\dagger\in\Pi_{\min}} V^{\mu, \nu^\dagger} \le \epsilon,
% \end{align*}
% i.e. $\mu$ and $\nu$ are each other's $\epsilon$-approximate best response. 
% It is a standard result that sublinear regret for both players ensures that the pair of average policies $(\Bar{\mu}, \Bar{\nu})$ is an approximate NE (see e.g.~\citet{kozuno2021learning,bai2022nearoptimal}):
% \begin{proposition}[Regret to Nash conversion]
%   \label{proposition:reg_to_nash}
%   For any sequence of policies $\left\{\mu^t\right\}_{t=1}^T\in\Pi_{\max}$ and $\left\{\nu^t\right\}_{t=1}^T\in\Pi_{\min}$, the average policies $\Bar{\mu},\Bar{\nu}$ satisfy
%   \begin{align*}
%     \negap(\Bar{\mu}, \Bar{\nu}) = \frac{\Reg_{\max}^T + \Reg_{\min}^T}{T},
%   \end{align*}
% \end{proposition}

% Therefore, an approximate NE can be learned by letting both players play some sublinear regret algorithm against each other in a self-play fashion.

% \begin{assumption}[Offline Setting and Self-play]\label{assumption:offline}
%     We assume the state transition probabilities of underlying states are known to each player in prior. Moreover, the players will receive other player's policy at the end of each episode $t$.
% \end{assumption}

% --------------------------------------------------------------------------------------















