% \vspace{-0.2cm}
\section{Fictitious Least-Squares Loss Estimator}\label{sec:Linear_Loss_Estimator}
This section presents the proposed \textit{fictitious} least-squares loss estimator for learning the unknown reward parameter.

For a fixed $\nu^t$, Eq.~\eqref{eq:reaching_prob} indicates that the value function $V^{\mu^t,\nu^t}$ is linear in $\mu^t$ \citep{kozuno2021learning}:
% \begin{align*}
% V^{\mu^t, \nu^t} =\sum_{h=1}^H \sum_{\left(x_h, a_h\right) \in \mathcal{X}_h \times \mathcal{A}} \mu^t_{1: h}\left(x_h, a_h\right) \times \sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right) \bar{r}_h\left(s_h, a_h, b_h\right)\,.
% \end{align*}
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
V^{\mu^t, \nu^t} =&\sum_{h=1}^H \sum_{\left(x_h, a_h\right) \in \mathcal{X}_h \times \mathcal{A}}
\mu^t_{1: h}\left(x_h, a_h\right) \\
&\times \sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right) \bar{r}_h\left(s_h, a_h, b_h\right)\,.
\end{align*}
\endgroup
Hence, the regret in Eq.~\eqref{equation:regret} can be rewritten as $\Reg^T_{\max} = \max _{\mu^{\dagger} \in \Pi_{\max }} \mathbb{E}\left[\sum_{t=1}^T\left\langle\mu^t-\mu^{\dagger}, \ell^t\right\rangle\right]$,
% \begin{align}
%       \Reg^T_{\max} = \max _{\mu^{\dagger} \in \Pi_{\max }} \sum_{t=1}^T\left\langle\mu^t-\mu^{\dagger}, \ell^t\right\rangle\,,
% \end{align}
where $\ell_h^t$ is the \textit{loss function} in round $t$, defined as
\begin{align*}
&\ell_h^t\left(x_h, a_h\right) \coloneqq\\
-&\sum_{s_h \in x_h, b_h \in \mathcal{B}} p_{1: h}\left(s_h\right) \nu_{1: h}^t\left(y\left(s_h\right), b_h\right)\bar{r}_h\left(s_h, a_h, b_h\right)\,.
\end{align*}
This implies that one can translate the regret minimization in Eq.~\eqref{equation:regret} into a linear regret minimization problem.

To learn the unknown parameter $\vtheta_h$ by leveraging the linear structure of the reward function,
one may construct a linear loss estimator $\hat{\vtheta}_h$ of $\vtheta_h$.
However, this is more challenging in our case than that of 
% linear bandits \citep{Abbasi-YadkoriPS11}, 
% linear MDPs \citep{JinYWJ20}, and 
fully observable linear MGs \citep{XieCWY20}, as the learning agent only observes the infoset $x(s_h)$ and does not even know the underlying state $s_h$ and its associated feature vector $\vphi(s_h,a_h,b_h)$, making it impossible to regress $r_h(s_h,a_h,b_h)$ against $\vphi(s_h,a_h,b_h)$. To cope with this issue and build a ``least-squares loss estimator'', we instead consider constructing the following feature vector for each $(x_h,a_h)$, which is a composite feature vector weighted by the opponent's policy $\nu^t$ and transition $\sP$:
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
\vphi^{\nu^t}&(x_h,a_h) \coloneqq\\
&-\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu_{1:h}^t(y(s_h),b_h)\vphi(s_h,a_h,b_h)\,.
\end{align*}
\endgroup
% \zhao{
Intuitively, the constructed composite feature vector $\vphi^{\nu^t}(x_h,a_h)$ can be regarded as the  ``feature vector'' of corresponding infoset-action $(x_h,a_h)$.
Further, one can see that $\ell_h^t\left(x_h, a_h\right)$ is indeed linear in both $\vtheta_h$ and $\vphi^{\nu^t}(x_h,a_h)$:
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align*}
    &\left\langle \vphi^{\nu^t}(x_h,a_h),\vtheta_h\right\rangle \\
    =&\left\langle-\sum_{(s_h,b_h)\in x_h\times \gB} p_{1:h}(s_h)\nu^t_{1:h}(y(s_h),b_h)\vphi(s_h,a_h,b_h),\vtheta_h\right\rangle\\
    =& \ell_h^t(x_h,a_h)\,.
\end{align*}
\endgroup
Based on $\vphi^{\nu^t}(x_h,a_h)$,%
\footnote{%
Note that the construction of $\vphi^{\nu^t}(x_h,a_h)$ relies on the knowledge of the policy $\nu^t$ of the min-player.

% \zhao{
By ``offline setting'' we mean that the policy $\nu^t$ of the opponent (\textit{i.e.}, the min-player) in episode $t$ is accessible to the max-player \textit{after} the $t$-th episode ends.
We use the terminology ``offline setting'' following \citet{ChenZG22a,XieCWY20}; this setting is also termed the ``self-play setting'' in the literature. However, note that the ``offline setting'' in this work is slightly more general than that in \citet{ChenZG22a,XieCWY20}, as we do not require that both the max-player and the min-player are controlled by a central controller.
}
we define the ``feature covariance matrix'' $\mQ_{\mu,h}^t$ for any policy $\mu$ at step $h$ in episode $t$ as
% \begin{align}\label{eq:covariance_mat}
%     \mQ_{\mu,h}^t\!=\! \sumlevel \mu_{1: h}\left(x_h, a_h\right) \vphixa \vphixa^\top\,.
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{3pt} \setlength{\belowdisplayshortskip}{3pt}
\setlength{\abovedisplayskip}{3pt} \setlength{\abovedisplayshortskip}{3pt}
\begin{align}\label{eq:covariance_mat}
    \mQ_{\mu,h}^t\!=\! \sumlevel \mu_{1: h}\left(x_h, a_h\right) \vphixa \vphixa^\top\,.
\end{align}
\endgroup
We are now ready to introduce the proposed ``least-squares loss estimator'' $\hat{\vtheta}_h^t$:
% \begin{align}\label{eq:hat_theta}
%     \hat{\vtheta}_h^t= -(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)\,,
% \end{align}
\begingroup
\setlength{\belowdisplayskip}{5pt} \setlength{\belowdisplayshortskip}{5pt}
\setlength{\abovedisplayskip}{4pt} \setlength{\abovedisplayshortskip}{4pt}
\begin{align}\label{eq:hat_theta}
    \hat{\vtheta}_h^t= -(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)r_h^t(s_h^t,a_h^t,b_h^t)\,,
\end{align}
\endgroup
which we call the \textit{fictitious} least-squares loss estimator. The following lemma shows that $\hat{\vtheta}_h^t$ is an unbiased estimator of the unknown $\vtheta_h$.
Its proof is deferred to Appendix~\ref{app:proof_unbiased_estimator}.
\begin{lemma}\label{lemma:unbiased_estimator}
    For any $t\in [T]$ and $h\in[H]$, it holds that $\E^{t-1}[\hat{\vtheta}_h^t] = \vtheta_h$.
\end{lemma}
% \zhao{
\begin{remark}
Intuitively, this ``least-squares loss estimator'' shares a similar spirit with its counterpart in the adversarial linear bandit literature \citep{lattimore2020bandit}. However, note that there are two crucial distinctions between $\hat{\vtheta}_h^t$ defined above and the common least-squares loss estimator in adversarial linear bandits: (a) $\mu_{1: h}^t(\cdot,\cdot)$ in the definition of the ``feature covariance matrix'' $\mQ_{\mu^t,h}^t$ is not necessarily a probability distribution over $\mathcal{X}_h \times \mathcal{A}$ (thus $\mQ_{\mu^t,h}^t$ itself is not a true feature covariance matrix); and (b) the regressand $r_h^t(s_h^t, a_h^t, b_h^t)$ is in fact not necessarily linear in the ``feature vector'' $\boldsymbol{\phi}^{\nu^t}(x_h^t, a_h^t)$ (recall $\bar{r}_h\left(s_h, a_h, b_h\right)=\left\langle\boldsymbol{\phi}\left(s_h, a_h, b_h\right), \boldsymbol{\theta}_h\right\rangle$, which only means $\bar{r}_h$ is linear in $\boldsymbol{\phi}(\cdot,\cdot,\cdot)$ instead of $\boldsymbol{\phi}^{\nu^t}(\cdot,\cdot)$).
For these two reasons, $\hat{\vtheta}_h^t$ is not a real least-squares loss estimator, which is why we term $\hat{\vtheta}_h^t$ the \textit{fictitious} least-squares loss estimator. On the other hand, as shown in Lemma~\ref{lemma:unbiased_estimator},
by constructing $\hat{\vtheta}_h^t$, we indeed address the challenge that we cannot regress $r_h(s_h,a_h,b_h)$ against $\vphi(s_h,a_h,b_h)$ due to the partial observability in IIEFGs.
\end{remark}
% }