% \begin{table*}[t]
% % \centering
% \caption{Comparisons of regret bounds with most related works studying IIEFGs with unknown transition and bandit feedback.
% % when the full knowledge of the game is not known a priori. 
% % $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% % ----------------------------------------------------------------------------
% \begin{tabular}{@{}|c|c|c|c|@{}}
% \hline
%  \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% \hline
%  \IXOMD\citep{kozuno2021learning}  &\multirow{3}{*}{Online} & $\widetilde{\gO}(HX \sqrt{AT})$ \\
%  \cline{1-1}\cline{3-3}
%  \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%  $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%  \cline{1-1}\cline{3-3}
%  \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%  $\widetilde{\gO}(\sqrt{XAT})$\\
%  \cline{1-2}\cline{3-3}
%  \cellcolor{LightGray}
%      \LSOMD (this paper)  & \multirow{2}{*}{Offline\tnote{1}} & 
%       $\widetilde{\gO}(\sqrt{(\nicefrac{1}{\rho}+d)HX^2T})$\;\tnote{2}\\
%      \cline{3-3}
%  \cellcolor{LightGray}
%      \LSFTRL (this paper)  &  & 
%      $\widetilde{\gO}(\sqrt{\lambda dH^2T})/\widetilde{\gO}(\sqrt{dHXT})$\;\tnote{3}\\
%  \hline
%  \cellcolor{LightGray}
%   Lower bound (this paper)& - &  $\Omega(\sqrt{d\min(d,H)T})$\\
%  \hline
% \end{tabular}
% {\scriptsize
% \begin{tablenotes}
% \item[1] See Section \ref{sec:Linear_Loss_Estimator} for details of the \textit{offline} setting.
% \item[2] See Assumption~\ref{assump:exploration_omd} for the definition of  $\rho$.
% \item[3] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
% The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table*}

% \begin{table*}[t]
% % \centering
% \caption{Comparisons of regret bounds with most related works studying IIEFGs with bandit feedback. 
% % \shuai{related work is not for linear structure, right? why to emphasize online/offline but not mention function approximation?}
% % when the full knowledge of the game is not known a priori. 
% % $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% % ----------------------------------------------------------------------------
% \begin{tabular}{@{}|c|c|c|c|@{}}
% \hline
%  \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% \hline
%  \IXOMD\citep{kozuno2021learning}  &\multirow{3}{*}{Online} & $\widetilde{\gO}(HX \sqrt{AT})$ \\
%  \cline{1-1}\cline{3-3}
%  \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%  $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%  \cline{1-1}\cline{3-3}
%  \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%  $\widetilde{\gO}(\sqrt{XAT})$\\
%  \cline{1-2}\cline{3-3}
%  \cellcolor{LightGray}
%      \LSOMD (this paper \zhao{TBF: whether to put FOMD here.})  & \multirow{2}{*}{Offline} & 
%       $\widetilde{\gO}(\sqrt{(\nicefrac{1}{\rho}+d)HX^2T})$\;\tnote{1}\\
%      \cline{3-3}
%  \cellcolor{LightGray}
%      \LSFTRL (this paper)  &  & 
%      $\widetilde{\gO}(\sqrt{\lambda dH^2T})$\;\tnote{2}\\
%  \hline
%  \cellcolor{LightGray}
%   Lower bound (this paper)& - &  $\Omega(\sqrt{d\min(d,H)T})$\\
%  \hline
% \end{tabular}
% {\scriptsize
% \begin{tablenotes}
% % \item[1] See Section \ref{sec:Linear_Loss_Estimator} for details of the \textit{offline} setting.
% \item[1] See Assumption~\ref{assump:exploration_omd} for the definition of  $\rho$.
% \item[2] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
% The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table*}

% % -------------------------------- 2024.05.20 --------------------------------
% \begin{table*}[t]
% % \centering
% \caption{Comparisons of regret bounds with most related works studying IIEFGs with bandit feedback. 
% % \shuai{related work is not for linear structure, right? why to emphasize online/offline but not mention function approximation?}
% % when the full knowledge of the game is not known a priori. 
% % $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% % ----------------------------------------------------------------------------
% \begin{tabular}{@{}|c|c|c|c|@{}}
% \hline
%  \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% \hline
%  \IXOMD\citep{kozuno2021learning}  &\multirow{3}{*}{Online} & $\widetilde{\gO}(HX \sqrt{AT})$ \\
%  \cline{1-1}\cline{3-3}
%  \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%  $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%  \cline{1-1}\cline{3-3}
%  \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%  $\widetilde{\gO}(\sqrt{XAT})$\\
%  \cline{1-2}\cline{3-3}
%  % \cellcolor{LightGray}
%  %     \LSOMD (this paper \zhao{TBF: whether to put FOMD here.})  & \multirow{2}{*}{Offline} & 
%  %      $\widetilde{\gO}(\sqrt{(\nicefrac{1}{\rho}+d)HX^2T})$\;\tnote{1}\\
%  %     \cline{3-3}
%  \cellcolor{LightGray}
%      \LSFTRL (this paper)  & \multirow{1}{*}{Offline \zhao{TBD: whether to add the ``offline'' setting}} & 
%      $\widetilde{\gO}(\sqrt{\lambda dH^2T})$\;\tnote{1}\\
%  \hline
%  \cellcolor{LightGray}
%   Lower bound (this paper)& - &  $\Omega(\sqrt{d\min(d,H)T})$\\
%  \hline
% \end{tabular}
% {\scriptsize
% \begin{tablenotes}
% % \item[1] See Section \ref{sec:Linear_Loss_Estimator} for details of the \textit{offline} setting.
% % \item[1] See Assumption~\ref{assump:exploration_omd} for the definition of  $\rho$.
% % \item[1] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
% % The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
% \item[1] $\lambda$ is a problem-dependent quantity, formally defined in Section~\ref{sec:ftrl_analysis}.
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table*}
% % -------------------------------- 2024.05.20 --------------------------------

% % -------------------------------- 2024.05.20 --------------------------------
% \begin{table*}[t]
% \caption{Comparisons of regret bounds with most related works studying IIEFGs with bandit feedback. 
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% % ----------------------------------------------------------------------------
% \begin{tabular}{@{}|c|c|c|c|@{}}
% \hline
%  \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% \hline
%  \IXOMD\citep{kozuno2021learning}  &\multirow{3}{*}{Tabular IIEFGs} & $\widetilde{\gO}(HX \sqrt{AT})$ \\
%  \cline{1-1}\cline{3-3} \BalancedOMDCFR\citep{bai2022nearoptimal}  & & $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%  \cline{1-1}\cline{3-3} \BalancedFTRL\citep{Fiegel2023adapting}  & & $\widetilde{\gO}(\sqrt{XAT})$\\
%  \cline{1-2}\cline{3-3} \cellcolor{LightGray} \LSFTRL (this paper)  & \multirow{2}{*}{Linear IIEFGs} & 
%      $\widetilde{\gO}(\lambda H\sqrt{dT})$\;\tnote{1}\\
%  % \hline
%  \cline{1-2}\cline{3-3}
%  \cellcolor{LightGray}
%   Lower bound (this paper)&  &  $\Omega(\sqrt{d\min(d,H)T})$\\
%  \hline
% \end{tabular}
% {\scriptsize
% \begin{tablenotes}
% \item[1] An exponential term that approaches $1$ for large enough $T$ is omitted for simplicity. Please see Theorem \ref{thm:ftrl_trans} for details.
% % Our upper bound holds in large $T$ regime. 
% The balance coefficient $\lambda$ is formally defined in Section \ref{sec:ftrl_analysis}.
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table*}
% % -------------------------------- 2024.05.20 --------------------------------



% \vspace{-0.3cm}
\subsection{Additional Related Works}
In addition to the works on tabular IIEFGs/POMGs, the other line of research most closely related to our work is
learning fully observable MGs with function approximation~\citep{XieCWY20,ChenZG22a,XiongZSSZ22,JinLY22,WangL0023,CuiZD23,Ni0ZDJW23,ZhangBJ23}.
% To tackle MGs in large state-action space, 
% there has been growing research interest in learning MGs with function approximation recently \citep{XieCWY20,ChenZG22a,XiongZSSZ22,JinLY22,WangL0023,CuiZD23,Ni0ZDJW23,ZhangBJ23}. 
% In particular, \citet{XieCWY20} assume both the transition and reward functions of the episodic two-player zero-sum MGs are linearly realizable and achieve an $\widetilde{\mathcal{O}}(\sqrt{d^3 H^4 T})$ regret.
These works generally fall into two categories. The first category aims to relax the assumption of linear function approximation by studying MGs with general function approximation~\citep{XiongZSSZ22,JinLY22,Ni0ZDJW23}, while the second category focuses on learning general-sum MGs~\citep{WangL0023,CuiZD23,Ni0ZDJW23,ZhangBJ23}.
% \zhao{
However, we note that all of these works study \textit{fully observable} MGs with function approximation: they assume that the underlying states are observable to the players, and are thus not applicable to POMGs. To our knowledge, no existing work studies POMGs with function approximation, which is the main focus of this work.
% }
% However, we note that all these works study \textit{perfect information}\shuai{better to state the limitations of their algorithms under the assumption of perfect information since we are going to relax} MGs with function approximation, and (to our knowledge) there are no existing works studying \textit{partially observable} MGs with function approximation, which is the main focus of this work.

% \begin{table}[t]
% \caption{Comparisons of regret bounds with most related works studying IIEFGs when the full knowledge of the game is not known a priori. $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% % ----------------------------------------------------------------------------
% \begin{tabular}{@{}|c|c|c|c|@{}}
% \hline
%  \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% \hline
%  \IXOMD\citep{kozuno2021learning}  &\multirow{3}{*}{Online} & $\widetilde{\gO}(HX \sqrt{AT})$ \\
%  \cline{1-1}\cline{3-3}
%  \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%  $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%  \cline{1-1}\cline{3-3}
%  \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%  $\widetilde{\gO}(\sqrt{XAT})$\\
%  \cline{1-2}\cline{3-3}
%  \cellcolor{LightGray}
%      \LSOMD (this paper)  & \multirow{2}{*}{Offline\tnote{1}} & 
%      {\revise $\widetilde{\gO}(\sqrt{(d+\nicefrac{1}{\rho})HX^2T})$}\;\tnote{2}\\
%      \cline{3-3}
%  \cellcolor{LightGray}
%      \LSFTRL (this paper)  &  & 
%      $\widetilde{\gO}(\sqrt{H^2d\lambda T})/\widetilde{\gO}(\sqrt{HXdT})$\;\tnote{3}\\
%  \hline
%  \cellcolor{LightGray}
%  {\revise Lower bound (this paper)}&{\revise -} & {\revise $\Omega(\sqrt{d\min(d,H)T})$}\\
%  \hline
% \end{tabular}
% {\scriptsize
% \begin{tablenotes}
% \item[1] See Section \ref{sec:setting} for the definition of our \textit{offline} setting.
% \item[2] See Assumption~\ref{assump:exploration_omd} for the definition of {\revise $\rho$}.
% \item[3] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
% The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table}




% ------------------------------------ Original Table ------------------------------------------------
% \begin{table}[t!]
% %\centering
% \caption{Algorithms for computing a $\varepsilon$-NE of an IIG with bandit feedback and their respective convergence rate. Adversarial game means that one opponent can change certain game settings arbitrarily over different episodes. Self-play corresponds to our assumption of known transition and access to opponent's past strategies(see Assumption~\ref{assumption:offline}).\chen{Regret bound for max-player}}
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
%  \resizebox{\textwidth}{!}{  
%     \begin{tabular}{@{}|c|c|c|c|@{}}
%     \hline
%      \textbf{Algorithm} & \textbf{Adv. Game} & \textbf{Self-Play} & \textbf{Regret}\\
% 		\hline
%      \MCCFR\citep{farina20stochastic} & \multirow{1}{*}{NO} & \multirow{4}{*}{NO} 
%      & $\widetilde{\gO}(H^2(X \sqrt{A}+Y\sqrt{B})/\sqrt{T})$ \\
%      \cline{1-2}\cline{4-4}
%      \IXOMD\citep{kozuno2021learning} & \multirow{5}{*}{YES} & & 
%      $\widetilde{\gO}(H(X \sqrt{A}+Y\sqrt{B})/\sqrt{T})$ \\
%      \cline{1-1}\cline{4-4}
%      \BalancedOMDCFR\citep{bai2022nearoptimal} & & & 
%      $\widetilde{\gO}(\sqrt{H^3(XA+YB)}/\sqrt{T})$\\
%      \cline{1-1}\cline{4-4}
%      \BalancedFTRL\citep{Fiegel2023adapting} & & & 
%      $\widetilde{\gO}(\sqrt{H(XA+YB)}/\sqrt{T})$\\
%      \cline{1-1}\cline{3-4}
%      \cellcolor{LightGray}
%          \LSOMD (this paper) & & \multirow{2}{*}{YES} & 
%          $\widetilde{\gO}(\sqrt{HX^2d\alpha^{-1}}/\sqrt{T})$\;\tnote{1}\\
%          \cline{4-4}
%      \cellcolor{LightGray}
%          \LSFTRL (this paper) & &  & 
%          $\widetilde{\gO}(\sqrt{H^2d\lambda^{-1}}/\sqrt{T})/\widetilde{\gO}(\sqrt{HXd}/\sqrt{T})$\;\tnote{2}\\
%      \hline
%     \end{tabular}}
% %\hspace{+0.3cm}
%     %\vspace{0.5cm}
%     {\scriptsize
%     \begin{tablenotes}
%     \item[1] Here $\alpha$ is similar to an exploration policy specified in Assumption~\ref{assump:exploration_omd}.
%     \item[2] Here $\lambda$ is a constant dependent on the game tree structure, see Section~\ref{sec:spread_factor}. The latter bound $\widetilde{\gO}(\sqrt{HXd}/\sqrt{T})$ is the worst case performance \\guarantee.
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table}
% ----------------------------------------------------------------------------------------







% \begin{table}[t]

% %\centering
% \caption{Comparisons of regret bounds with most related works studying IIEFGs when the full knowledge of the game is not known a priori. $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% % Algorithms for computing a $\varepsilon$-NE of an IIG with bandit feedback and their respective convergence rate. Adversarial game means that one opponent can change certain game settings arbitrarily over different episodes. 
% % Self-play corresponds to our assumption of known transition and access to opponent's past strategies (see Assumption~\ref{assumption:offline}).
% % \chen{Comparisons of regret bound for max-player with most related works studying IIEFGs with bandit feedback. $T$ is the number of episodes, $H$ is the horizon length, $X,A$ are the cardinalities of the information set space and action space, and $d$ is the ambient dimension of the feature mapping.; [Adjust] linespread.}
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.5} 
% %{\tiny
% % ----------------------------------------------------------------------------
%  \resizebox{\textwidth}{!}{  
%  {\tiny
%     \begin{tabular}{@{}|c|c|c|c|@{}}
%     \hline
%      \textbf{Algorithm}  & \textbf{Offline} & \textbf{Regret}\\
% 		\hline
%      \MCCFR\citep{farina20stochastic}  & \multirow{4}{*}{NO} 
%      & $\widetilde{\gO}(H^2X \sqrt{AT})$ \\
%      \cline{1-1}\cline{3-3}
%      \IXOMD\citep{kozuno2021learning}  & & 
%      $\widetilde{\gO}(HX \sqrt{AT})$ \\
%      \cline{1-1}\cline{3-3}
%      \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%      $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%      \cline{1-1}\cline{3-3}
%      \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%      $\widetilde{\gO}(\sqrt{XAT})$\\
%      \cline{1-2}\cline{3-3}
%      \cellcolor{LightGray}
%          \LSOMD (this paper)  & \multirow{2}{*}{YES} & 
%          $\widetilde{\gO}(\sqrt{HX^2d\alpha^{-1}T})$\;\tnote{1}\\
%          \cline{3-3}
%      \cellcolor{LightGray}
%          \LSFTRL (this paper)  &  & 
%          $\widetilde{\gO}(\sqrt{H^2d\lambda T})/\widetilde{\gO}(\sqrt{HXdT})$\;\tnote{2}\\
%      \hline
%     \end{tabular}}}
%     {\scriptsize
    
%     \begin{tablenotes}
%     \item[1] See Assumption~\ref{assump:exploration_omd} for the definition of $\alpha$.
%     \item[2] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
%     % The latter bound $\widetilde{\gO}(\sqrt{HXdT})$ is the worst case performance guarantee.
%     The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table}




% \begin{table}[t]

% %\centering
% \caption{Comparisons of regret bounds with most related works studying IIEFGs when the full knowledge of the game is not known a priori. $T$ is the number of episodes, $H$ is the horizon length, $d$ is the ambient dimension of the feature mapping, $X$ and $A$ are the cardinalities of the information set space and action space respectively.
% }
% \label{table:rate}
% \begin{center}
% \begin{threeparttable}
% % ----------------------------- Adjust linespread ----------------------------
% \renewcommand{\arraystretch}{1.6} 
% %{\tiny
% % ----------------------------------------------------------------------------
%  %\resizebox{\textwidth}{!}{  
 
%     \begin{tabular}{@{}|c|c|c|c|@{}}
%     \hline
%      \textbf{Algorithm}  & \textbf{Setting} & \textbf{Regret}\\
% 		\hline
%      \MCCFR\citep{farina20stochastic}  & \multirow{4}{*}{Online} 
%      & $\widetilde{\gO}(H^2X \sqrt{AT})$ \\
%      \cline{1-1}\cline{3-3}
%      \IXOMD\citep{kozuno2021learning}  & & 
%      $\widetilde{\gO}(HX \sqrt{AT})$ \\
%      \cline{1-1}\cline{3-3}
%      \BalancedOMDCFR\citep{bai2022nearoptimal}  & & 
%      $\widetilde{\gO}(\sqrt{H^3XAT})$\\
%      \cline{1-1}\cline{3-3}
%      \BalancedFTRL\citep{Fiegel2023adapting}  & & 
%      $\widetilde{\gO}(\sqrt{XAT})$\\
%      \cline{1-2}\cline{3-3}
%      \cellcolor{LightGray}
%          \LSOMD (this paper)  & \multirow{2}{*}{Offline\tnote{1}} & 
%          $\widetilde{\gO}(\sqrt{HX^2d\alpha^{-1}T})$\;\tnote{2}\\
%          \cline{3-3}
%      \cellcolor{LightGray}
%          \LSFTRL (this paper)  &  & 
%          $\widetilde{\gO}(\sqrt{H^2d\lambda T})/\widetilde{\gO}(\sqrt{HXdT})$\;\tnote{3}\\
%      \hline
%     \end{tabular}
%     {\scriptsize
    
%     \begin{tablenotes}
%     \item[1] See Section \ref{sec:setting} for the definition of our \textit{offline} setting.
%     \item[2] See Assumption~\ref{assump:exploration_omd} for the definition of $\alpha$.
%     \item[3] The $\lambda$ in the former bound depends on the game tree structure, defined in Assumption~\ref{ass:ftrl_trans}. 
%     % The latter bound $\widetilde{\gO}(\sqrt{HXdT})$ is the worst case performance guarantee.
%     The latter bound is obtained by the same algorithm but with a different initiation of parameters. 
    
% \end{tablenotes}}
% \end{threeparttable}
% \end{center}
% \end{table}

