% \onecolumn
% \setcounter{secnumdepth}{2}


% \section{Property of the Game}\label{app:proof_lem_transition}
% The lemma below delineates the key property of $p^\star$ as transition probability over infoset-action space.
% \begin{lemma}\label{lemma:transition}
%     For any $h\in[H]$, any $p^{\star}$ as transition probability over infoset-actions and any policy $\mu\in\Pi_{\max}$ of the max-player, it holds that
%     \[\sumlevel p^{\star}_{1:h}(x_h)\mu_{1:h}(x_h,a_h)=1\,.\]
% \end{lemma}
% % \begin{proof}
% %     See Appendix~\ref{app:proof_lem_transition}.
% % \end{proof}
% % \subsection{Proof of Lemma~\ref{lemma:transition}}
% \begin{proof}
%     By the definition of perfect recall and ``transition probability'' over infoset-action space, we have
%     \begin{align*}
%         \sP^{\mu,\nu}(x_h,a_h)&= \sP^{\mu,\nu}(x_1,\dots,x_h,a_h)\\
%         &= p^{\star}_0(x_1) \prod_{h^\prime=1}^{h-1} p_{h^\prime}^{\star}(x_{h^\prime+1}|x_{h^\prime},a_{h^\prime}) \cdot \prod_{h^\prime=1}^h \mu_{h^\prime}(a_{h^\prime}|x_{h^\prime})\\
%         &= p^\star(x_h) \mu_{1:h}(x_h,a_h)\,.
%     \end{align*}
%     % The lemma holds since 
%     The proof is thus concluded by noticing that $\sumlevel \sP^{\mu,\nu}(x_h,a_h) = 1$.
% \end{proof}


\section*{Supplementary Material}
\section{Properties of the Fictitious Least-Squares Loss Estimator}\label{app:properties}
This section presents the proofs of two key properties of the proposed fictitious least-squares loss estimator.
\subsection{Unbiasedness of the Fictitious Least-Squares Loss Estimator}
\label{app:proof_unbiased_estimator}
\begin{proof}[Proof of Lemma~\ref{lemma:unbiased_estimator}]
The definition of $\hat{\vtheta}_h^t$ in Eq.~\eqref{eq:hat_theta} implies that
    \begin{align*}
    \E^{t-1}\left[\hat{\vtheta}_h^t\right]
    &= \E^{\mu^t,\nu^t}\left[\hat{\vtheta}_h^t\right] \\
    &= \E^{\mu^t,\nu^t}\left[-(\mQ_{\mu^t,h}^t)^{-1}\cdot\vphi^{\nu^t}(x_h^t,a_h^t)\cdot r_h^t(s_h^t,a_h^t,b_h^t)\right]\\
    &= \E^{\mu^t,\nu^t}\left[-(\mQ_{\mu^t,h}^t)^{-1}\cdot\vphi^{\nu^t}(x_h^t,a_h^t)\cdot \bar{r}_h(s_h^t,a_h^t,b_h^t)\right]\\
    &= -(\mQ_{\mu^t,h}^t)^{-1} \sum_{x_h\in\gX_h}\sum_{s_h\in x_h}\sum_{a_h\in \gA} \sum_{b_h\in \gB} \sP^{\mu^t,\nu^t}(s_h,a_h,b_h) \vphixa \bar{r}_h(s_h,a_h,b_h)\\
    &=  -(\mQ_{\mu^t,h}^t)^{-1} \sum_{x_h\in\gX_h} \sum_{a_h\in \gA} \mu^t_{1:h}(x_h,a_h)\vphixa \sum_{s_h\in x_h} \sum_{b_h\in \gB} p_{1:h}(s_h)\nu^t_{1:h}(y(s_h),b_h)\bar{r}_h(s_h,a_h,b_h)\\
    &= (\mQ_{\mu^t,h}^t)^{-1} \sum_{x_h\in\gX_h} \sum_{a_h\in \gA} \mu^t_{1:h}(x_h,a_h)\vphixa \left\langle \vphi^{\nu^t}(x_h,a_h),\vtheta_h\right\rangle\\
    &= (\mQ_{\mu^t,h}^t)^{-1} \left(\sum_{x_h\in\gX_h} \sum_{a_h\in \gA} \mu^t_{1:h}(x_h,a_h)\vphixa\vphixa^\top\right)\vtheta_h\\
    &= \vtheta_h\,,
\end{align*}
which concludes the proof.
\end{proof}

% \zhao{TBD: I will place the following in the proof of regret of OMD and FTRL.}
% A corollary of Lemma~\ref{lemma:unbiased_estimator} is that the estimated loss $\E^{t-1}\left[\hat{\ell^t}\right]=\ell^t$ is also unbiased. Consequently, this suggests the regret remains unbiased. Formally,  
% \begin{align*}
%     \E\left[\hat{\Reg}_{\max}^T(\mu^\dagger)\right] &= \sum_{t=1}^T \E^{t-1}\left[\left\langle\mu^t - \mu^\dagger, \hat{\ell}^t\right\rangle\right]  \\
%     &= \sum_{t=1}^T\left\langle\mu^t - \mu^\dagger, \E^{t-1}\left[\hat{\ell^t}\right]\right\rangle\\
%     &= \sum_{t=1}^T\left\langle\mu^t - \mu^\dagger, \ell^t\right\rangle\\
%     &= \Reg^T_{\max}(\mu^\dagger)\,.
% \end{align*}
% \zhao{TBD}

\subsection{Variance of the Fictitious Least-Squares Loss Estimator}
\label{app:proof_quadraticbound}
% Importantly, the following lemma demonstrates the other key property of our least-squares loss estimator.
The following lemma shows that the ``variance'' of the proposed loss estimator is well controlled.
% the other key property of our least-squares loss estimator.
\begin{lemma}\label{lemma:quadraticbound}
For any $h\in[H]$, it holds that
    \begin{align}
 &\E^{t-1}\left[\sumlevel\muxa\hatellxa^2\right]
 \leq d\,.
\end{align}
\end{lemma}
% \begin{proof}
%     See Appendix \ref{app:proof_quadraticbound}.
% \end{proof}
% The proof draws inspiration from the trace technique leveraged in linear bandits. 
\begin{proof}
It is clear that
        \begin{align*}
& \E^{t-1}\left[\sumlevel\muxa\hatellxa^2\right]\\
=& \sumlevel \muxa \vphixa^\top \E^{\mu^t,\nu^t}\left[\hat{\vtheta}_h^t(\hat{\vtheta}_h^t)^\top\right]\vphixa\\
\overset{(i)}{=}& \sumlevel \muxa \vphixa^\top\\
&\qquad\cdot  \E^{\mu^t,\nu^t}\left[r_h^t(s_h^t,a_h^t,b_h^t)^2(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)\vphi^{\nu^t}(x_h^t,a_h^t)^\top(\mQ_{\mu^t,h}^t)^{-1}\right]\vphixa\\
\overset{(ii)}{\leq}&  \sumlevel \muxa \vphixa^\top \E^{\mu^t,\nu^t}\left[(\mQ_{\mu^t,h}^t)^{-1}\vphi^{\nu^t}(x_h^t,a_h^t)\vphi^{\nu^t}(x_h^t,a_h^t)^\top(\mQ_{\mu^t,h}^t)^{-1}\right]\vphixa\\
=& \sumlevel \muxa \vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\\
&\qquad\cdot\left(\sumlevelprime p^{\nu^t}_{1:h}(x_h^\prime)\muxaprime\vphixaprime\vphixaprime^\top\right)(\mQ_{\mu^t,h}^t)^{-1}\vphixa\\
=& \operatorname{tr}\left[\left(\sumlevel (\mQ_{\mu^t,h}^t)^{-1} \muxa \vphixa\vphixa^\top \right)\right.\\
&\qquad\cdot\left.\left(\sumlevel p^{\nu^t}_{1:h}(x_h)\muxa\vphixa\vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\right)\right]\\
=& \operatorname{tr}\left(\mI_d\cdot\sumlevel p^{\nu^t}_{1:h}(x_h)\muxa\vphixa\vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\right)\\
\overset{(iii)}{\leq}& \operatorname{tr}\left(\sumlevel\muxa\vphixa\vphixa^\top (\mQ_{\mu^t,h}^t)^{-1}\right)\\
=&\operatorname{tr}\left(\mI_d\right)=d\,,
\end{align*}
where $(i)$ is due to the definition of $\hat{\vtheta}_h^t$ in Eq.~\eqref{eq:hat_theta}; $(ii)$ is by $|r_h^t(s_h,a_h,b_h)|\leq 1$ for any $(s_h,a_h,b_h)\in\gS_h\times\gA\times\gB$; and $(iii)$ follows from $p^{\nu^t}_{1:h}(x_h)\leq 1$ for any $x_h\in\gX_h$. The proof is thus completed.
\end{proof}

\input{Contents/7_app_F2TRL_proof}
\input{Contents/7_app_computation}
\input{Contents/7_app_lower_bound}







