% {\revise 
\section{Proof of Regret Lower Bound}\label{app:sec:lower_proof}
In this section, we present the proof of Theorem \ref{thm:regret_lower}.
% \begin{theorem}\label{thm:regret_lower}
%     Suppose $A\geq2$, $d\geq2$ and $T\geq 2d^2$. Then for any algorithm $\operatorname{Alg}$ that controls the max-player, generates and executes policies $\{\mu^t\}_{t\in[T]}$, there exists a POMG instance on which $\Reg^T_{\max}\geq\Omega(\sqrt{d\min(d,H)T})$.
% \end{theorem}
% \begin{remark}
%     % Note that the regret lower bound is established on the worst-case instances satisfying 
%     % $H\geq d$, implying that $d\sqrt{T}\leq \sqrt{dHT}$. Thus the dependence on $d$ in our regret lower bound does not conflict with it in our regret upper bounds.
%     We conjecture that the regret lower bound can be further improved to $\Reg^T_{\max}\geq\Omega(\sqrt{dHT})$, and currently our regret upper bounds of \LSOMD and \LSFTRL with the second initialization are loose in $X$ and regret upper bound of \LSFTRL with the first initialization is loose in $H$.
% \end{remark}
\begin{proof}[Proof of Theorem \ref{thm:regret_lower}]

    We consider an $A$-ary tree IIEFG instance, in which 
    \begin{itemize}
        \item $B=1$, so that there is effectively no opponent (and hence the dependence on the opponent's action $b$ is omitted in what follows);
        \item $X_h=S_h=A^{h-1}$ for all $h\in[H]$, which means that $\gX_h=\gS_h$ and there is actually no partial observability;
        \item $r_h(s,a)=0$ for all $h\in[H-1]$, and $r_H(s,a)$ is a reward sampled from the Bernoulli distribution $\operatorname{Ber}(\bar{r}_H(s,a))$ with mean $\bar{r}_H(s,a)=\langle\vphi(s,a), \vtheta\rangle$.
    \end{itemize}
    By construction, each $s_h$ (and hence $x_h$) is determined by a unique action sequence $(a_1,\ldots,a_{h-1})$, and the transition is deterministic and known. Following arguments similar to those of \citet{bai2022nearoptimal,Fiegel2023adapting}, it can be shown that
    if algorithm $\operatorname{Alg}$ achieves regret $\Reg^T_{\max}$ on this IIEFG instance, then $\operatorname{Alg}$ can be used to solve a stochastic linear bandit problem with $A^H$ ``arms'' and attain regret of the same order as $\Reg^T_{\max}$, where the reward for ``arm'' $(a_1,a_2,\ldots,a_H)$ (\textit{i.e.}, $(s_H,a_H)$) is sampled from $\operatorname{Ber}(\langle\vphi(s_H,a_H), \vtheta\rangle)$.
    % with $\vphi(s_H,a_H)$ and $\vtheta$ defined above
    
We first consider the case $H\geq d$. In this case, $\vphi$ and $\vtheta$ satisfy 
$\vphi(s,a)_{[1:d-1]}\in\{-1,1\}^{d-1}$, $\vphi(s,a)_{d}=1/4$, $\vtheta_{[1:d-1]}\in\{-\Delta,\Delta\}^{d-1}$ with $\Delta=1/(8\sqrt{2T})$ and $\vtheta_{d}=1$. Moreover, since $|\gS_{H}\times\gA|=A^{H-1}\cdot A=A^H$ as well as $H\geq d$ and $A\geq 2$, 
$\vphi$ can be chosen such that $\{\vphi(s,a)_{[1:d-1]}\}_{(s,a)\in\gS_{H}\times\gA}=\{-1,1\}^{d-1}$ 
(omitting the duplicate feature vectors). Then, by the canonical analysis of the regret lower bound for stochastic linear bandits (see, \textit{e.g.}, Theorem 24.1 of \citet{lattimore2020bandit}; Lemma 25 of \citet{ZhouGS21}), there exists a $\vtheta_{[1:d-1]}^{\operatorname{Alg}}\in\{-\Delta,\Delta\}^{d-1}$ such that $\Reg^T\geq (d-1)\sqrt{T}/(16\sqrt{2})=\Omega(\sqrt{d^2T})$.

In the case $H< d$, we can choose $\vphi$ such that the stochastic linear bandit problem, on which $\operatorname{Alg}$ suffers the same regret as on the IIEFG instance, has $2^H$ distinct feature vectors, since $A\geq 2$ and hence $A^{H}\geq 2^H$. Then, by reasoning similar to the construction of 
$\vphi$ and $\vtheta$ in the case $H\geq d$ and to the proof of Corollary 3 of \citet{zhoulowerbound}, there exists a $\vtheta^{\operatorname{Alg}}$ such that $\Reg^T\geq \Omega(\sqrt{dHT})$. 

Combining the results of the two cases yields a regret lower bound of order $\Omega(\sqrt{d\min(d,H)T})$, which concludes the proof.
% since $|\gS_{H}\times\gA|=A^{H-1}\cdot A=A^H$

% where $\vphi(s,a)_{[1:d-1]}\in\{-1,1\}^{d-1}$, $\vphi(s,a)_{d}=1/4$, $\vtheta_{[1:d-1]}\in\{-\Delta,\Delta\}^{d-1}$ with $\Delta=1/(8\sqrt{2T})$ and $\vtheta_{d}=1$. Moreover, $\vphi$ can be chosen such that $\{\vphi(s,a)_{[1:d-1]}\}_{(s,a)\in\gS_{H}\times\gA}=\{-1,1\}^{d-1}$ since $|\gS_{H}\times\gA|=A^{H-1}\cdot A=A^H$ as well as $H\geq d$ and $A\geq 2$
    
    % Then by canonical analysis for the regret lower bound of stochastic linear bandits (see, \textit{e.g.}, Theorem 24.1 by \citet{lattimore2020bandit}; Lemma 25 by \citet{ZhouGS21}), there exists a $\vtheta_{[1:d-1]}^{\operatorname{Alg}}\in\{-\Delta,\Delta\}^{d-1}$ such that $\Reg^T\geq (d-1)\sqrt{T}/(16\sqrt{2})$, which concludes the proof.
\end{proof}
% \begin{remark}
%     Note that the regret lower bound is established on the worst-case instances satisfying 
%     $H\geq d$, implying that $d\sqrt{T}\leq \sqrt{dHT}$. Thus the dependence on $d$ in our regret lower bound does not conflict with it in our regret upper bounds.
%     We conjecture that the regret lower bound can be further improved to $\Reg^T_{\max}\geq\Omega(\sqrt{dHT})$ and currently our regret upper bounds of \LSOMD and \LSFTRL with the second initialization are loose in $X$ and regret upper bound of \LSFTRL with the first initialization is loose in $H$.
% \end{remark}
% }
% ------------------------------------------------------------------------------------
% [Chen]
% \chen{Refined lower bound}
% The above analysis reduces the lower bound problem into a linear bandit problem, for which we have:
% \begin{theorem}[Lower Bound for Stochastic Linear Bandit; see Theorem 24.1 of \citet{lattimore2020bandit}]
%      Let the action set of some stochastic linear bandits be $\mathcal{A}=[-1,1]^d$ and $\Theta=\left\{-T^{-1 / 2}, T^{-1 / 2}\right\}^d$. Then, for any policy, there exists a vector $\vtheta \in \Theta$ such that:
% $$
% \Reg_n(\mathcal{A}, \vtheta) \geq \frac{\exp (-2)}{8} d \sqrt{T}\,.
% $$
% \end{theorem}

%  When $A^H\geq 2^d$, we use duplicate feature vectors to restrict the action set to $[-1,1]^d$. Then the lower bound is given by the above theorem under stochastic linear bandit: $\Omega(\sqrt{d^2T})\leq \Omega(\sqrt{dHT\log A}) $; When $A^H < 2^d$, there exists $k\in [d]$ such that $A^H>2^k$. For simplicity, denote $n\coloneqq A^H\,.$ The idea here is to break the instance into smaller $k$ dimensional instances. Suppose we have $\beta=\frac{d}{k}$ many $k$-dimensional, $n$-arm, horizon $\frac{T}{\beta}$, instances $I_1, I_2, \ldots, I_\beta$ such that $\Reg_{\frac{T}{\beta}, I_j} \gtrsim k \sqrt{\frac{T}{\beta}}$. Construct $I=$ $I\left(I_1, I_2, \ldots, I_\beta\right)$ as follows
% \begin{itemize}
%     \item  Divide $d$ dimensions into $\beta$ blocks of size $k$
% \item Divide $T$ into $\beta$ consecutive periods, each having $\frac{T}{\beta}$ time steps
% \item Let the hidden vector be $\vec{\theta}=\left(\vec{\theta}_1, \vec{\theta}_2, \ldots, \vec{\theta}_\beta\right)$, where $\vec{\theta}_j \in I_j$. 
% \end{itemize}
% Feature vectors at time $\tau=(i-1) \frac{T}{\beta}+t$ offer the corresponding loss vector at time $t$ from $I_j$. That is, for all $i \in[n]$, $\mathcal{X}_{i, \tau}=\left(0^{\top}, \ldots, 0^{\top}, \mathcal{X}_{i, t}^{(j)}, \mathbf{0}^{\top}, \ldots, 0^{\top}\right)$, where the non-zero entries are located in the $j$-th block.

% Hence for all policies $\pi$, there exists policies $\pi_1, \pi_2, \ldots, \pi_\beta$ such that $\mathcal{R}_{T, I}^\pi=\sum_{j=1}^\beta \mathcal{R}_{\frac{T}{\beta}, I_j}^{\pi_j}$.
% Using Theorem 2, we can find instances $I_1, I_2, \ldots, I_\beta$ such that $\mathcal{R}_{T, I}^\pi=\sum_{j=1}^\beta \mathcal{R}_{\frac{T}{\beta}, I_j}^{\pi_j} \gtrsim \sum_{j=1}^\beta k \sqrt{\frac{T}{\beta}}=$ $k \sqrt{T \beta}=k \sqrt{\frac{T d}{k}}=\sqrt{d T k}=\sqrt{d T \log n}=\sqrt{dHT\log A}\,.$

% Thus, we conclude the lower bound is $\Omega(\sqrt{dHT\log A})\,.$ 
% ------------------------------------------------------------------------------------