% \newpage
\renewcommand{\thesection}{\Alph{section}}
\setcounter{section}{0}
\onecolumn


\title{Power Mean Estimation in Monte-Carlo Tree Search\\(Supplementary Material)}
\maketitle

\section{Outline}
\begin{itemize}
    \item Notations will be described in Section B.
    \item Supporting Lemmas are presented in Section C.
    \item The Convergence of Stochastic-Power-UCT in Non-stationary multi-armed bandits is shown in Section D.
    \item Experimental setup and Hyperparameter selection are provided in Section E.
\end{itemize}
\section{Notations}

\begin{table}[!ht]
\caption{List of all notations for non-stationary multi-armed bandits.}
\centering
\renewcommand*{\arraystretch}{2}
\begin{tabular}{ccc} 
    \toprule
    \textbf{Notation} &  \textbf{Type} & \textbf{Description}\\ \hline
    \midrule
 $K$ & $\mathbb{N}$ & Number of arms\\
 \hline
 $T_{a}(t)$ & $\mathbb{N}$ & Number of visitations at arm $a$ after $t$ timesteps\\
 \hline
 $\mu_a$ &$\mathbb{R}$ & {mean value of arm $a$}\\
 \hline
 $a_\star$ &$\mathcal{A}$ & {optimal action}\\
 \hline
 $\mu_\star$ &$\mathbb{R}$ & {mean value of an optimal arm. We assume it is unique.}\\
 \hline
 $\widehat{\mu}_n(p)$ &$\mathbb{R}$ & {power mean estimator, with a constant $p \in [1,+\infty)$}\\
 \hline
 $\widehat{\mu}_{a,n}$ &$\mathbb{R}$ & {mean estimator of arm $a$ after $n$ visitations}\\
 \hline
\bottomrule
\end{tabular}\label{list_notations_bandt}
\end{table}

\section{Supporting Lemmas}
In this section, we will present all necessary supporting Lemmas for the main theoretical analysis.

We start with the following lemma, which plays an important role in the analysis of our MCTS algorithm.
\begin{manuallemma}{1}\label{lem:concQ_appendix} For $m \in [M]$, let $(\widehat{V}_{m,n})_{n\geq 1}$ be a sequence of estimators satisfying $\widehat{V}_{m,n}\cv{\alpha,\beta} V_m$, and suppose there exists a constant $L$ such that $\widehat{V}_{m,n} \leq L, \forall n \geq 1$. Let $X_i$ be an iid sequence with mean $\mu$ and $S_i$ be an iid sequence from a distribution $p=(p_1,\dots,p_M)$ supported on $\{1,\dots,M\}$. Introducing the random variables $N_m^{n} = \# \{ i \leq n : S_i = m\}$, we define the sequence of estimators 
\[\widehat{Q}_n = \frac{1}{n}\sum_{i=1}^{n} X_i + \gamma \sum_{m=1}^{M} \frac{N_m^{n}}{n}\widehat{V}_{m,N_m^{n}}.\]
Then with $2\alpha \leq \beta, \beta > 1$, 
\[\widehat{Q}_n \cv{\alpha,\beta} \mu + \gamma \sum_{m=1}^{M} p_m V_m.\]
\end{manuallemma}

\begin{proof}
Let $p = (p_1,p_2,...p_M), p \in \triangle^M$ where $\triangle^M = \{x\in \mathbb{R}^M: \sum^M_{i=1}x_i=1, x_i\geq0\}$ is the $(M-1)$-dimensional simplex. Without loss of generality, we assume that $p_m>0$ for all $m$.
Let us study a random vector $\widehat{p}_n = (\frac{N^n_1}{n},\frac{N^n_2}{n},...,\frac{N^n_M}{n})$.
Let us define $V = (V_1,V_2,...V_M)$.
Let $\widehat{X}_n = \frac{1}{n}\sum^{n}_{i=1}X_i, \widehat{V}_n = (\widehat{V}_{1,N^n_1},\widehat{V}_{2,N^n_2},...,\widehat{V}_{M,N^n_M})$, $\sum^{M}_{i=1} N^n_i = n$, $N^n_i$ is the number of times that population $i$ was observed. We have $\widehat{Q}_n = \widehat{X}_n + \gamma \left\langle \widehat{p}_n, \widehat{V}_n \right\rangle$. Therefore,
\begin{flalign}
\bP \bigg( \widehat{Q}_n - \big(\mu + \gamma \left\langle p , V \right\rangle \big) \geq \epsilon \bigg) &\leq \bP \bigg(\widehat{X}_n - \mu \geq \frac{1}{2}\epsilon \bigg) +\bP \bigg( \gamma \left\langle \widehat{p}_n , \widehat{V}_n \right\rangle - \gamma \left\langle p , V \right\rangle \geq \frac{1}{2}\epsilon\bigg)\nonumber \\ 
&\leq \exp\{-2n\frac{\epsilon^2}{4}\} + \underbrace{\bP \bigg( \left\langle \widehat{p}_n , \widehat{V}_n \right\rangle - \left\langle p , V \right\rangle \geq \frac{1}{2\gamma}\epsilon\bigg)}_\text{A}.\nonumber
\end{flalign}
To upper bound $\text{A}$, let us consider $\left\langle \widehat{p}_n , \widehat{V}_n\right\rangle - \left\langle p , V \right\rangle = \left\langle (\widehat{p}_n - p),\widehat{V}_n \right\rangle + \left\langle p, (\widehat{V}_n-V) \right\rangle$. 
Then, 
\begin{flalign}
        A &\leq \underbrace{\bP \bigg( \left\langle (\widehat{p}_n - p),\widehat{V}_n \right\rangle \geq \frac{1}{4\gamma}\epsilon\bigg)}_\text{$A_1$} + \underbrace{\bP \bigg( \left\langle p, (\widehat{V}_n - V) \right\rangle \geq \frac{1}{4\gamma}\epsilon\bigg)}_\text{$A_2$}.\nonumber
\end{flalign}
By applying Hölder's inequality to $\widehat{p}_n - p$ and $\widehat{V}_n$, we obtain 
\begin{flalign}
        \left\langle (\widehat{p}_n - p),\widehat{V}_n \right\rangle \leq \parallel \widehat{p}_n - p \parallel_1 \parallel \widehat{V}_n \parallel_\infty \leq \parallel \widehat{p}_n - p \parallel_1 L,\nonumber
\end{flalign}
where $L \geq \parallel \widehat{V}_n\parallel_\infty$ is a constant.
Then we can derive
\begin{flalign}
        A_1 &= \bP \bigg( \left\langle (\widehat{p}_n - p),\widehat{V}_n \right\rangle \geq \frac{1}{4\gamma}\epsilon\bigg)\nonumber \\
        &\leq \bP \bigg( \parallel \widehat{p}_n - p \parallel_1 L \geq \frac{1}{4\gamma}\epsilon\bigg)\nonumber \\
        &= \bP \bigg( \parallel \widehat{p}_n - p \parallel_1 \geq \frac{1}{4\gamma L}\epsilon\bigg).\nonumber
\end{flalign}
According to \cite{weissman2003inequalities}, we have for any $M\geq 2$ and $\delta \in [0,1]$
\begin{flalign}
\bP \bigg( \parallel \widehat{p}_n - p \parallel_1 \geq \sqrt{\frac{2M \ln(2/\delta)}{n}}\bigg) \leq \delta.\nonumber
\end{flalign}
Define  $\epsilon = \sqrt{\frac{2M \ln(2/\delta)}{n}}$, therefore $\delta = 2\exp\{\frac{-n\epsilon^2}{2M}\}$, we have
\begin{flalign}
\bP \bigg( \parallel \widehat{p}_n - p \parallel_1 \geq \epsilon\bigg) \leq 2\exp\{\frac{-n\epsilon^2}{2M}\}.\nonumber
\end{flalign}
Therefore, applying this bound with $\epsilon$ replaced by $\frac{\epsilon}{4\gamma L}$,
\begin{flalign}
A_1 \leq \bP \bigg( \parallel \widehat{p}_n - p \parallel_1 \geq \frac{\epsilon}{4\gamma L}\bigg) \leq 2\exp\{\frac{-n\epsilon^2}{32M\gamma^2 L^2}\}.\nonumber
\end{flalign}
We also have
\begin{flalign}
A_2 &= \bP \bigg( \sum_{m=1}^{M} p_m(\widehat{V}_{m,N^n_m} - V_m) \geq \frac{1}{4\gamma }\epsilon \bigg)\nonumber\\ 
&\leq \sum_{m=1}^{M} \E\bigg[ \bP \bigg( \frac{1}{N^n_m}\sum^{N^n_m}_{t=1}V_{m,t} - V_m \geq \frac{1}{4\gamma p_m}\epsilon \big|N^n_m\bigg) \bigg]\nonumber \\
&\leq \sum_{m=1}^{M} \E\bigg[ c (N^n_m)^{-\alpha}(\frac{\epsilon}{4\gamma p_m})^{-\beta} \bigg].\nonumber
\end{flalign}
Let us define an event $\mathcal{E} = \bigg\{N^n_m > \frac{n p_m}{2}\bigg\}$.
Therefore, 
\begin{flalign}
    A_2 &\leq \sum_{m=1}^{M} \E\bigg[ c (\frac{n p_m}{2})^{-\alpha}(\frac{\epsilon}{4\gamma p_m})^{-\beta} \bigg] + \sum_{m=1}^{M} \E\bigg[ \bP(N^n_m \leq \frac{n p_m}{2}) \bigg] \nonumber \\
    &= \sum_{m=1}^{M}(c 2^{\alpha + 2\beta} \gamma^{\beta} p_m^{-\alpha + \beta}) n^{-\alpha} \epsilon^{-\beta} + \sum_{m=1}^{M} \E\bigg[ \bP(N^n_m - p_m n \leq -\frac{p_m n}{2}) \bigg]\nonumber \\
    &\leq \sum_{m=1}^{M}(c 2^{\alpha + 2\beta} \gamma^{\beta} p_m^{-\alpha + \beta}) n^{-\alpha} \epsilon^{-\beta} + \sum_{m=1}^{M} \exp \bigg\{ -2n(\frac{p_m}{2})^2 \bigg\}.
\end{flalign}
Therefore,
\begin{flalign}
A &\leq A_1 + A_2 \leq 2\exp\{\frac{-n\epsilon^2}{32M\gamma^2 L^2}\} + \sum_{m=1}^{M}(c 2^{\alpha + 2\beta} \gamma^{\beta} p_m^{-\alpha + \beta}) n^{-\alpha} \epsilon^{-\beta} + \sum_{m=1}^{M} \exp \bigg\{ -2n(\frac{p_m}{2})^2 \bigg\}.\nonumber
\end{flalign}
That leads to
\begin{flalign}
\bP \bigg( \widehat{Q}_n - \big(\mu + \gamma \left\langle p , V \right\rangle \big) \geq \epsilon \bigg) &\leq \exp\{-2n\frac{\epsilon^2}{4}\} + 2\exp\{\frac{-n\epsilon^2}{32M\gamma^2 L^2}\} + \sum_{m=1}^{M}(c 2^{\alpha + 2\beta} \gamma^{\beta} p_m^{-\alpha + \beta}) n^{-\alpha} \epsilon^{-\beta} \nonumber \\
&+ \sum_{m=1}^{M} \exp \bigg\{ -2n(\frac{p_m}{2})^2 \bigg\} \leq c^{'}n^{-\alpha}\epsilon^{-\beta},\nonumber
\end{flalign}
where $c^{'}>0$ depends on $c,M,\alpha,\beta,\gamma,L, p_i$. Here we need 
\begin{flalign}
2\alpha \leq \beta, \label{alpha_leq_beta}
\end{flalign}
to argue that $\exp(-cn\epsilon^2) = \mathcal{O}(n^{-\alpha}\epsilon^{-\beta})$.
By following the same steps, we can derive
\[
\bP \bigg( \widehat{Q}_n - \big(\mu + \gamma \left\langle p , V\right\rangle\big) \leq -\epsilon \bigg) \leq c^{'} n^{-\alpha}\epsilon^{-\beta}.\nonumber
\]
Therefore, with $n\geq 1, \epsilon > 0$,
\begin{flalign}
\bP \bigg( \left| \widehat{Q}_n - \big(\mu + \gamma \left\langle p , V\right\rangle\big) \right|
\geq \epsilon \bigg) \leq c^{'} n^{-\alpha}\epsilon^{-\beta}.\label{upperbound_Q}
\end{flalign}
%Furthermore,
%\begin{flalign}
   % &\lim_{n\longrightarrow\infty} \E\left[ \left| \widehat{Q} - \left(\mu + \gamma\sum_{m=1}^{M} p_m V_m\right) \right|\right] \nonumber \\
    %&= \lim_{n\longrightarrow\infty} \int^{\infty}_0 \bP \left( \left| \widehat{Q} - \left(\mu + \gamma\sum_{m=1}^{M} p_m V_m  \right) \right| \geq s \right) ds \nonumber \\
    %&\leq  \lim_{n\longrightarrow\infty} \left( \int^{n^{-\frac{\alpha}{\beta}}}_0 1 ds +  \int^{+\infty}_{n^{-\frac{\alpha}{\beta}}} c^{'} n^{-\alpha}s^{-\beta} \right) \nonumber \\
    %&= \lim_{n\longrightarrow\infty} \left(n^{-\frac{\alpha}{\beta}} + c^{'} n^{-\alpha} \left(\frac{s^{-\beta + 1}}{-\beta + 1} + C \right)\Big|^{+\infty}_{n^{-\frac{\alpha}{\beta}}} \right) =0 \nonumber\\
    %&= \lim_{n\longrightarrow\infty} \left(n^{-\frac{\alpha}{\beta}} - c^{'} n^{-\alpha} \left(\frac{n^{\frac{\alpha(\beta - 1)}{\beta}}}{-\beta + 1} \right) \right) =0 \nonumber \text{ (because $\alpha > 0, \beta > 1)$}
%\end{flalign}
%so that,
%\[
%    \lim_{n \rightarrow \infty} \E[\widehat{Q}_n] = \mu + \gamma \sum_{m=1}^M p_m V_m. \nonumber
%\]
This means \[\widehat{Q}_n \cv{\alpha,\beta} \mu + \gamma \sum_{m=1}^{M} p_m V_m,\]
which concludes the proof.
\end{proof}

\begin{manuallemma}{2}\label{lm:intermediate_inequality}
Consider non-negative variables $x, y\in \mathbb{R}^{+} $, and a constant $m$ such that $0 \leq m \leq 1$. Then 
\begin{flalign}
(x + y)^m \leq x^m + y^m. \label{intermediate_inequality}
\end{flalign}
\end{manuallemma}
\begin{proof}
    With $y = 0$ or $x= 0$, the inequality~(\ref{intermediate_inequality}) holds trivially. Let us consider the case where $x> 0, y> 0$; dividing both sides by $y^m$, the inequality~(\ref{intermediate_inequality}) can be written as 
    \[\left(\frac{x}{y} + 1\right)^m \leq \left(\frac{x}{y}\right)^m + 1.\]
    Let us define a function 
    \[
    f(t) = (t+1)^m - t^m - 1, (t \geq 0).
    \]
    We can see that 
    \[
    f^{'}(t) = m(t+1)^{m-1} - m t^{m-1} = m\left( (t+1)^{m-1} - t^{m-1} \right) \leq 0 \text{ with } m \in [0,1], t > 0,
    \]
    because $g(x) = x^{m-1}$ is a decreasing function with $m \in [0,1], x > 0$.
    Therefore, 
    \[
    f(t) \leq f(0) = 0 \text{ with } t > 0.
    \]
    So that, 
    \[
    (t+1)^m - t^m - 1 \leq 0, (t > 0).
    \]
    With $t = \frac{x}{y} > 0$, we can derive the inequality~(\ref{intermediate_inequality}).
\end{proof}

We use Minkowski's inequality as shown below
\begin{manuallemma} {3\textbf{(Minkowski's inequality)}} \label{Minkowski_inequality}
    Given $p \geq 1, \{x_i, y_i\} \in \mathbb{R}, i = 1,2,...,n$, then we have the following inequality
    \begin{flalign}
        \left( \sum_i (|x_i + y_i|)^p \right)^{\frac{1}{p}} \leq \left( \sum_i (|x_i|)^p \right)^{\frac{1}{p}} + \left( \sum_i (|y_i|)^p \right)^{\frac{1}{p}}
    \end{flalign}
\end{manuallemma}
\begin{proof}
    This is a basic result.
\end{proof}
\section{Convergence of Stochastic-Power-UCT in Non-stationary multi-armed bandits}
In an MCTS tree, each node functions as a non-stationary multi-armed bandit, with the average mean drifting due to the action selection strategy. To address this, we first study the convergence of Stochastic-Power-UCT in non-stationary multi-armed bandits, where action selection is based on Thompson sampling, and the power mean backup operator is used at the root node. Detailed descriptions of Stochastic-Power-UCT in non-stationary bandit settings can be found in the Theoretical Analysis section of the main article.

We establish the convergence and concentration properties of the power mean backup operator in non-stationary bandits, as detailed in Theorem~\ref{thm:theorem1_appendix} for Stochastic-Power-UCT, which is mainly based on the results of Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}. To derive the results for Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}, we need the results of Lemma~\ref{lem:concentration_optimal_arm_intermediate_appendix},
Lemma~\ref{lem:hp_bound_visits_appendix},
Lemma~\ref{lem:upper_bound_power_mean_and_optimal_appendix},
Lemma~\ref{lem:upper_bound_optimal_arm_in_power_mean_appendix}, 
and Lemma~\ref{lem:upper_bound_suboptimal_arm_in_power_mean_appendix}. 
These lemmas collectively support the theoretical understanding of Stochastic-Power-UCT in non-stationary multi-armed bandit settings.

Lemma~\ref{lem:concentration_optimal_arm_intermediate_appendix} shows the upper bound for probability of the difference between the mean value estimation at the optimal arm (with $T_{a_\star}(n)$ number of visitations) and the optimal value $\mu_\star$.

Lemma~\ref{lem:hp_bound_visits_appendix} shows the crucial high-probability bound on the number of selections of each sub-optimal arm, which is based on the results of Lemma~\ref{lem:basic_conc_appendix}.
Lemma~\ref{lem:upper_bound_power_mean_and_optimal_appendix} shows an upper bound on the absolute value of the difference between the power mean estimator and the optimal value. Lemma~\ref{lem:upper_bound_optimal_arm_in_power_mean_appendix}, 
and Lemma~\ref{lem:upper_bound_suboptimal_arm_in_power_mean_appendix}
show intermediate results that help to derive the results of Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}.
\begin{manuallemma}{4}\label{lem:concentration_optimal_arm_intermediate_appendix}
Consider a bandit problem defined as in Section~\ref{s:bandit_definition}. Let us define $A(n) = \big(\frac{2C n^{\frac{b}{\beta}}}{\Delta}\big)^{\frac{\beta}{\alpha}}$, where $\Delta = \min_{a\in[K], a\neq a_*}\{\mu_* - \mu_a\}$. Then, for $R \geq \epsilon \geq n^{-\frac{\alpha}{\beta}} $, we have
    \begin{flalign}
        \bP \left(\left| \widehat{\mu}_{a_{\star},T_{a_*}(n)} - \mu_{\star}\right| > \epsilon \right) \leq \sum^{K}_{a\neq a_*} \bP \left( T_a(n) > (A(n)+1)\right) + \frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1) (A(n)+1) - 1)^{-\alpha + 1}.
    \end{flalign}
\end{manuallemma}
\begin{proof}
Consider an event $\mathcal{E} \stackrel{\text{def}}{=} \left\{\sum^{K}_{a\neq a_*} T_a(n) > (K-1)(A(n)+1) \right\}$. Then,
\begin{flalign}
    \bP \left( \left| \widehat{\mu}_{a_{\star},T_{a_*}(n)} - \mu_{\star}\right| > \epsilon \right) &\leq \bP \left( \sum^{K}_{a\neq a_*} T_a(n) > (K-1) (A(n)+1) \right) \nonumber \\
    &+ \underbrace{\bP \left( \sum^{K}_{a\neq a_*} T_a(n) \leq (K-1) (A(n)+1);\left| \widehat{\mu}_{a_{\star},T_{a_*}(n)} - \mu_{\star}\right| \geq \epsilon \right)}_{D_1}.\label{eq_d1_t_a}
\end{flalign}
When $\sum^{K}_{a\neq a_*} T_a(n) \leq (K-1) (A(n)+1) \Rightarrow T_{a_*}(n) = n - \sum^{K}_{a\neq a_*} T_a(n) \geq n - (K-1) (A(n)+1) $, so that with $\alpha > 0$
\begin{flalign}
    D_1 &\leq  \bP \left( T_{a_*}(n) \geq n - (K-1) (A(n)+1); \left|\widehat{\mu}_{a_{\star},T_{a_*}(n)} - \mu_{\star}\right| \geq \epsilon \right) \leq \sum^{n}_{t = n - (K-1) (A(n)+1)} \bP \left(\left| \widehat{\mu}_{a_{\star},t} - \mu_{\star}\right| \geq \epsilon \right) \nonumber \\ 
    &\leq \sum^{n}_{t = n - (K-1) (A(n)+1)} c t^{-\alpha} \epsilon^{-\beta} \nonumber \\
    &\leq c\epsilon^{-\beta} \left( \int^{\infty}_{n - (K-1)(A(n)+1)-1} t^{-\alpha} dt\right) =  \frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1) (A(n)+1) - 1)^{-\alpha + 1} \label{eq_d1} (\text{ because $\alpha > 2$}).
\end{flalign}
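Combining Equation~(\ref{eq_d1_t_a}) and Equation~(\ref{eq_d1}), and noting that $\sum^{K}_{a\neq a_*} T_a(n) > (K-1)(A(n)+1)$ implies $T_a(n) > A(n)+1$ for at least one $a \neq a_*$ (so that a union bound over the sub-optimal arms applies to the first term), we can conclude the proof.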
\end{proof}

We introduce the notation $U_{a,t,s} = \widehat{\mu}_{a,s} + C \frac{t^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}}$ and we first borrow two lemmas of \cite{shah2022journal}. 

Introducing for all $a$ the quantity 
\[A_a(t) := \inf \left\{s \leq t : C \frac{t^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \leq \frac{\Delta_a}{2}\right\} =  \left(\frac{2C}{\Delta_a}\right)^{\frac{\beta}{\alpha}}t^{\frac{b}{\alpha}},\]
where $\Delta_a = \mu_* - \mu_a$, the concentration properties permit us to prove the following lemma.

\begin{manuallemma}{5}\label{lem:basic_conc_appendix} Let $n \geq 1$.  
\begin{enumerate}
 \item For all $s\in \{1, \dots,n\}$, $\bP\left(U_{a,n,s} < \mu_a\right) \leq c C^{-\beta} n^{-b}$
 \item For all $s\in \{A_a(n), \dots,n\}$, $\bP(U_{a,n,s} > \mu_\star) \leq c C^{-\beta} n^{-b}$
\end{enumerate}
\end{manuallemma}
\begin{proof}
    1. 
    \[
        \bP \left( U_{a,n,s} < \mu_a \right) = \bP \left( \widehat{\mu}_{a,s} -\mu_a < -C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \right) \leq c C^{-\beta}n^{-b} (\text{ Assumption~\ref{assumpt_1_main} })
    \]
    2. We have
    \[
        \bP \left( U_{a,n,s} > \mu_\star \right) = \bP \left( \widehat{\mu}_{a,s} + C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} > \mu_\star \right) = \bP \left( \widehat{\mu}_{a,s} -\mu_a > \Delta_a - C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \right)
    \]
    Because we choose 
    \[A_a(n) := \inf \left\{s \leq n : C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \leq \frac{\Delta_a}{2}\right\} =  \left(\frac{2C}{\Delta_a}\right)^{\frac{\beta}{\alpha}}n^{\frac{b}{\alpha}},\]
    therefore,
    \[
    \bP \left( U_{a,n,s} > \mu_\star \right) \leq \bP \left( \widehat{\mu}_{a,s} -\mu_a > C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \right) \leq c C^{-\beta}  n^{-b} (\text{ Assumption~\ref{assumpt_1_main} })
    \]
    that concludes the proof.
\end{proof}

In turn, Lemma~\ref{lem:basic_conc_appendix} permits us to prove the following crucial high-probability bound on the number of selections of each sub-optimal arm. 

\begin{manuallemma}{6}\label{lem:hp_bound_visits_appendix}
Consider a bandit problem defined as in Section~\ref{s:bandit_definition}. Assume $b > 1$. Let us define $A_a(n) := \inf \left\{s \leq n : C \frac{n^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \leq \frac{\Delta_a}{2}\right\} =  \left(\frac{2C}{\Delta_a}\right)^{\frac{\beta}{\alpha}}n^{\frac{b}{\alpha}}$. For all $u\geq A_a(n)$, 
 \[\bP\left(T_a(n) \geq u \right) \leq 2c C^{-\beta}\frac{\left(u-1\right)^{-(b-1)}}{b-1}.\]
\end{manuallemma}

\begin{proof}
    For any $\tau \in \mathbb{R}$, we study the following two events
\begin{flalign}
    &\mathcal{E}_1 = \{ \text{for each integer } t \in [u,n], \text{ we have } U_{a,t,u} \leq \tau\}, \label{E_1} \\
    &\mathcal{E}_2 = \{ \text{for each integer } t_0 \in [1,n-u], \text{ we have } U_{a_{*},u+t_0,t_0} > \tau\}. \label{E_2}
\end{flalign}
\noindent We want to prove that 
\[
\mathcal{E}_1 \cap \mathcal{E}_2 \Rightarrow T_{a}(n) \leq u.
\]
Recall that 
\begin{flalign}
U_{a,t,s} = \widehat{\mu}_{a,s} + C \frac{t^{\frac{b}{\beta}}}{s^{\frac{\alpha}{\beta}}} \Rightarrow U_{a,t,u} = \widehat{\mu}_{a,u} + C \frac{t^{\frac{b}{\beta}}}{u^{\frac{\alpha}{\beta}}} \text { and }
U_{a_*,u+t_0,t_0} = \widehat{\mu}_{a_*,t_0} + C \frac{(u+t_0)^{\frac{b}{\beta}}}{t_0^{\frac{\alpha}{\beta}}}. \nonumber
\end{flalign}
Then, for each $t_0$ such that $1\leq t_0 \leq n-u$, and each $t$ such that $u+t_0 \leq t \leq n$,
we have 
\begin{flalign}
U_{a_{*},t,t_0} = \widehat{\mu}_{a_*,t_0} + C \frac{t^{\frac{b}{\beta}}}{t_0^{\frac{\alpha}{\beta}}} \geq \widehat{\mu}_{a_*,t_0} + C \frac{(u+t_0)^{\frac{b}{\beta}}}{t_0^{\frac{\alpha}{\beta}}} > \tau \geq U_{a,t,u} =  \widehat{\mu}_{a,u} + C \frac{t^{\frac{b}{\beta}}}{u^{\frac{\alpha}{\beta}}}.\label{contrad}
\end{flalign}
We prove $T_a(n) \leq u$ by contradiction. Assume that $T_a(n) > u$, and let $t'$ denote the first time at which arm $a$ has been played $u$ times:
\[
t' = \min \{t: t \leq n, T_a(t) = u\}.
\]
Then at any time $t$ such that $t' < t \leq n$, that is, at any time $t$ after arm $a$ has been selected $u$ times, from (\ref{contrad}) we have
\[
U_{a_{*},t,t_0} > U_{a,t,u},
\]
which means that arm $a$ will not be selected again after it has been played $u$ times, contradicting our assumption that $T_a(n) > u$. Therefore
\[
\mathcal{E}_1 \cap \mathcal{E}_2 \Rightarrow T_{a}(n) \leq u.
\]
Then
\begin{flalign}
\{T_{a}(n) \geq u\} \subset (\mathcal{E}^c_1 \cup \mathcal{E}^c_2) = ( \{ \exists t: u \leq t \leq n, U_{a,t,u} > \tau \} \cup \{ \exists t_0: 1 \leq t_0 \leq n-u, U_{a_*, u+t_0, t_0} \leq \tau \} ).
\end{flalign}
Therefore, 
\begin{flalign}
    \bP \bigg( T_{a}(n) \geq u \bigg) \leq \sum^n_{t=u} \bP\bigg( U_{a,t,u} > \tau \bigg) + \sum^{n-u}_{t_0=1} \bP \bigg( U_{a_*, u+t_0, t_0} \leq \tau \bigg).
\end{flalign}
We set $\tau = \mu_*$, and since $u \geq A_a(n)$, from Lemma~\ref{lem:basic_conc_appendix}, we have the following result
\begin{flalign}
    \sum^n_{t=u} \bP\bigg( U_{a,t,u} > \tau \bigg) &= \sum^n_{t=u} \bP\bigg( U_{a,t,u} > \mu_* \bigg) \leq c C^{-\beta}\sum^{n}_{t=u} t^{-b} \leq c C^{-\beta}\int^{\infty}_{u-1} t^{-b}dt = c C^{-\beta}\frac{(u-1)^{-(b-1)}}{b-1}
\end{flalign}
Similarly,
\begin{flalign}
    \sum^{n-u}_{t_0=1} \bP\bigg( U_{a_*, u+t_0, t_0} \leq \tau \bigg) &= \sum^{n-u}_{t_0=1} \bP\bigg( \widehat{\mu}_{a_*,t_0} + C \frac{(u+t_0)^{\frac{b}{\beta}}}{t_0^{\frac{\alpha}{\beta}}} \leq \mu_* \bigg) \leq c C^{-\beta}\sum^{n-u}_{t_0=1} (u+t_0)^{-b} \leq c C^{-\beta}\int^{\infty}_{u-1} t^{-b}dt\\
    &= c C^{-\beta}\frac{(u-1)^{-(b-1)}}{b-1},
\end{flalign}
that concludes the proof.
\end{proof}

\begin{manuallemma}{7}\label{lem:upper_bound_power_mean_and_optimal_appendix}
    Let us define the power mean estimator $\widehat{\mu}_n(p)$ as $\widehat{\mu}_n(p) = \left(\sum^{K}_{a=1} \frac{T_{a}(n)}{n} \widehat{\mu}^{p}_{a,T_{a}(n)}\right)^{\frac{1}{p}}$. For any $p\geq 1$, we have
    \begin{flalign}
        \left|\widehat{\mu}_n(p) -  \mu_*\right| &\leq R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} + \left(\sum_{a=1}^{K} \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}}
    \end{flalign}
\end{manuallemma}
\begin{proof}
We observe that 
\begin{flalign}
\widehat{\mu}_{a,T_a(n)} \leq \mu_a + \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|. \label{absolute_inequality_1} 
\end{flalign}
    Since $\mu_* = \max_{a \in [K]} \{\mu_a\}$, we have  
    \begin{flalign}
    \widehat{\mu}_n(p) - \mu_* = \widehat{\mu}_n(p) - \sum^{K}_{a=1}\frac{T_a(n)}{n}\mu_{*} &\leq \left(\sum_{a=1}^{K} \frac{T_a(n)}{n}\left(\widehat{\mu}_{a,T_a(n)}\right)^{p}\right)^{\frac{1}{p}} - \left(\sum_{a=1}^{K} \frac{T_a(n)}{n}\left(\mu_a\right)^{p}\right)^{\frac{1}{p}}\\
    &= \frac{\left(\sum_{a=1}^{K} T_a(n)\left(\widehat{\mu}_{a,T_a(n)}\right)^{p}\right)^{\frac{1}{p}} - \left(\sum_{a=1}^{K} T_a(n)\left(\mu_a\right)^{p}\right)^{\frac{1}{p}}}{n^{\frac{1}{p}}}
    \end{flalign}
    Applying Minkowski's inequality from Lemma~\ref{Minkowski_inequality}, and the result of (\ref{absolute_inequality_1}), we have
    \begin{flalign}
        \widehat{\mu}_n(p) -  \mu_* &\leq \frac{\left(\sum_{a=1}^{K} T_a(n)\left(\mu_a + \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^{p}\right)^{\frac{1}{p}} - \left(\sum_{a=1}^{K} T_a(n)\left(\mu_a\right)^{p}\right)^{\frac{1}{p}}}{n^{\frac{1}{p}}}\\
        &\leq \frac{\left(\sum_{a=1}^{K} T_a(n) \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}} }{n^{\frac{1}{p}}}\label{first_half}
    \end{flalign}
    On the other hand,
    \begin{flalign}
        \mu_* - \widehat{\mu}_n(p) &= \frac{ n\mu_* - n\widehat{\mu}_n(p)}{n} = \frac{ n\mu_* - (\sum^{K}_{a=1}T_a(n)\mu_a) + \sum^{K}_{a=1}T_a(n)\mu_a - n\widehat{\mu}_n(p)}{n}\\
        &= \frac{ \sum^{K}_{a=1,a\neq a_*}T_a(n) \left|\mu_* - \mu_a\right| + \sum^{K}_{a=1}T_a(n)\mu_a - n\widehat{\mu}_n(p)}{n} \\
        &\leq  R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n}  + \sum^{K}_{a=1}\frac{T_a(n)}{n}\mu_a - \widehat{\mu}_n(p)  \label{inter_eq_1}
    \end{flalign}
    Because the power mean is an increasing function of $p$ and $p \geq 1$, we have $\sum^{K}_{a=1}\frac{T_a(n)}{n}\mu_a \leq \left(\sum^{K}_{a=1}\frac{T_a(n)}{n}\left(\mu_a\right)^p\right)^{1/p}$. Furthermore, we observe that \[ \mu_a \leq \widehat{\mu}_{a,T_a(n)} + \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\label{absolute_inequality_2}. \]
    So that, from Equation $(\ref{inter_eq_1})$ we have 
    \begin{flalign}
        \mu_* - \widehat{\mu}_n(p) &\leq R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n}  + \left(\sum^{K}_{a=1}\frac{T_a(n)}{n}\left(\mu_a\right)^p\right)^{1/p} - \widehat{\mu}_n(p) \nonumber \\
        &\leq R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} + \frac{\left(\sum_{a=1}^{K} T_a(n)\left(\widehat{\mu}_{a,T_a(n)} + \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^{p}\right)^{\frac{1}{p}} - \left(\sum_{a=1}^{K} T_a(n)\left(\widehat{\mu}_{a,T_a(n)}\right)^{p}\right)^{\frac{1}{p}}}{n^{\frac{1}{p}}} \nonumber \\
        &\leq R \sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} + \frac{\left(\sum_{a=1}^{K} T_a(n) \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}} }{n^{\frac{1}{p}}} \label{second_half}
    \end{flalign}    
    Therefore, from equation (\ref{first_half}), and equation (\ref{second_half}), we can derive
    \begin{flalign}
    \left|\widehat{\mu}_n(p) -  \mu_*\right| &\leq R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} + \left(\sum_{a=1}^{K} \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}}, \nonumber
    \end{flalign}
    that concludes the proof.
\end{proof}

\begin{manuallemma}{8}\label{lem:upper_bound_optimal_arm_in_power_mean_appendix}
    Consider a bandit problem defined as in Section~\ref{s:bandit_definition}. With $R \geq \epsilon \geq n^{-\frac{\alpha}{\beta}}$, we have
    \begin{flalign}
    \bP\left( \frac{T_{a_*}(n)}{n} \left( \left| \widehat{\mu}_{a_*,T_{a_*}(n)} - \mu_{*} \right|\right)^p  > \epsilon^p \right) \leq \frac{2c C^{-\beta}(K-1) A(n)^{-(b-1)} }{b-1} + \frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1) (A(n)+1)-1)^{-\alpha + 1}
    \end{flalign}
\end{manuallemma}
\begin{proof}
    We have
    \begin{flalign}
    \bP\left(  \frac{T_{a_*}(n)}{n} \left(\left| \widehat{\mu}_{a_*,T_{a_*}(n)} - \mu_{*} \right|\right)^p  > \epsilon^p \right) \leq \bP \left( \left| \widehat{\mu}_{a_*,T_{a_*}(n)} - \mu_{*} \right|  >  \epsilon \right). \nonumber 
    \end{flalign}
    Applying results of Lemma~\ref{lem:concentration_optimal_arm_intermediate_appendix}, we have
    \begin{flalign}
        \bP \left(\left| \widehat{\mu}_{a_{\star},T_{a_*}(n)} - \mu_{\star}\right| > \epsilon \right) \leq \underbrace{\sum^{K}_{a\neq a_*} \bP \left( T_a(n) > A(n) + 1\right)}_{F_{11}} + \underbrace{\frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1)( A(n)+1)- 1)^{-\alpha + 1}}_{F_{12}}.
    \end{flalign}
    From the result of Lemma~\ref{lem:hp_bound_visits_appendix}, with $b > 1$, we also have
    \[
    F_{11} \leq \sum^{K}_{a=1,a\neq a_*} \bP\left(  T_a(n) > A(n) +1 \right) \leq \sum^{K}_{a=1,a\neq a_*} 2c C^{-\beta} \frac{A(n)^{-(b-1)}}{b-1} = \frac{2c C^{-\beta}(K-1)A(n)^{-(b-1)}}{b-1}
    \]
    that concludes the proof.
\end{proof}

\begin{manuallemma}{9}\label{lem:upper_bound_suboptimal_arm_in_power_mean_appendix}
    Consider a bandit problem defined as in Section~\ref{s:bandit_definition}. With $a$ a suboptimal arm and $R \geq \epsilon \geq n^{-\frac{\alpha}{\beta}}$, we can find a constant $N_0$ such that, for all $n \geq N_0$, the following hold:
    \begin{itemize}
        \item With $1\leq p \leq 2, \alpha \leq \frac{\beta}{p}$, we have
        \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{2c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)} .\label{lem:upper_bound_suboptimal_arm_in_power_mean:case_1}
        \end{flalign}
        \item With $p > 2$, and $0 < \alpha - \frac{\beta}{p} < 1$, we have 
        \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c  (K-1)^{\frac{\beta}{p}}}{1 - (\alpha - \frac{\beta}{p})} \epsilon^{-\beta}  (A(n)+1)^{-(\alpha - 1)}. \label{lem:upper_bound_suboptimal_arm_in_power_mean:case_2}
        \end{flalign}
        \item With $p > 2$, and $\alpha - \frac{\beta}{p} > 1$, we have
        \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c  (K-1)^{\frac{\beta}{p}} (\alpha - \frac{\beta}{p}) }{(\alpha - \frac{\beta}{p}) - 1} \epsilon^{-\beta} (A(n)+1)^{-\frac{\beta}{p}} \label{lem:upper_bound_suboptimal_arm_in_power_mean:case_3}
        \end{flalign}
    \end{itemize}
\end{manuallemma}
\begin{proof}
     Recall that $\forall u > A_a(n) = \big(\frac{2C n^{\frac{b}{\beta}}}{\triangle_a}\big)^{\frac{\beta}{\alpha}} $, 
    \[
    \bP \left( T_a(n) > u \right) \leq 2c C^{-\beta}\frac{(u-1)^{-(b-1)}}{b-1}.
    \]
    We consider 2 events, $\mathcal{E}_1 = \{ T_a(n) > A_a(n) + 1)\}$, and $\mathcal{E}^c_1 = \{ T_a(n) \leq A_a(n) + 1\}$, then 
    \begin{flalign}
    \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right) &\leq \bP \left( T_a(n) > A_a(n) +1 \right)\nonumber \\ 
    &+ \bP \left( T_a(n) \leq A_a(n) + 1; \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right) \nonumber \\
    &\leq \underbrace{2c C^{-\beta}\frac{ A_a(n)^{-(b-1)}}{b-1}}_{G_1} + \underbrace{\sum^{ A_a(n)+1}_{t=1} \bP \left( \frac{t}{n} \left| \widehat{\mu}_{a,t} - \mu_a \right|^p > \frac{1}{K-1}\epsilon^p \right)}_{G2} \nonumber
    \end{flalign}
    For $G_2$, we can find $N_0$ such that, for all $n \geq N_0$ and all $t \leq A_a(n) + 1$, $\left(\frac{n}{t(K-1)}\right)^{\frac{1}{p}}\epsilon > \epsilon \geq n^{-\frac{\alpha}{\beta}}$. Therefore, 
    \[
    G_2 \leq \sum^{A_a(n)+1}_{t=1} \bP \left( \left| \widehat{\mu}_{a,t} - \mu_a \right| > \left(\frac{n}{t(K-1)}\right)^{\frac{1}{p}}\epsilon \right) \leq \sum^{A_a(n)+1}_{t=1} c t^{-\alpha}  \left(\left(\frac{n}{t(K-1)}\right)^{\frac{1}{p}}\epsilon\right)^{-\beta} \leq \sum^{A_a(n)+1}_{t=1} c  (K-1)^{\frac{\beta}{p}} t^{-(\alpha -\frac{\beta}{p}) } \epsilon^{-\beta} n^{-\frac{\beta}{p}}.
    \]
    We study 2 cases:\\
    Case 1: $\alpha - \frac{\beta}{p} \leq 0$. Since $\alpha \leq \frac{\beta}{2}$, this condition holds automatically whenever $1 \leq p \leq 2$. Then 
    \begin{flalign}
    G_2 &\leq  c  (K-1)^{\frac{\beta}{p}} \epsilon^{-\beta} n^{-\frac{\beta}{p}} \left( \int^{A_a(n)+1}_1 t^{-(\alpha - \frac{\beta}{p})} dt + (A_a(n)+1)^{-(\alpha - \frac{\beta}{p})} \right) \nonumber \\
    &= c  (K-1)^{\frac{\beta}{p}} \epsilon^{-\beta} n^{-\frac{\beta}{p}} \left( \left(\frac{t^{-(\alpha - \frac{\beta}{p}) + 1}}{-(\alpha - \frac{\beta}{p}) + 1} \right) \Bigg|^{A_a(n)+1}_1 + (A_a(n)+1)^{-(\alpha - \frac{\beta}{p})} \right) \nonumber \\
    &\leq c  (K-1)^{\frac{\beta}{p}} \epsilon^{-\beta} (A_a(n)+1)^{-\frac{\beta}{p}} \left( \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{-(\alpha - \frac{\beta}{p}) + 1} -\frac{1}{-(\alpha - \frac{\beta}{p}) + 1} + (A_a(n)+1)^{-(\alpha - \frac{\beta}{p})}\right). \nonumber
    \end{flalign}
    Because $-(\alpha - \frac{\beta}{p}) + 1 \geq 1$, we can find a constant $N_{\epsilon}$ such that $\forall n \geq N_{\epsilon}$, we have 
    \[
    G_2 \leq 2c (K-1)^{\frac{\beta}{p}} \epsilon^{-\beta} (A_a(n)+1)^{-\frac{\beta}{p}} \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{-(\alpha - \frac{\beta}{p}) + 1} = \frac{2c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)}.
    \]
    Therefore, we have
    \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{2c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)}.
    \end{flalign}
    This proves Inequality~\ref{lem:upper_bound_suboptimal_arm_in_power_mean:case_1}.
    
    Case 2: $\alpha - \frac{\beta}{p} > 0$, which can only happen if $p > 2$ because $\alpha \leq \frac{\beta}{2}$. We have
    \begin{flalign}
    \sum^{A_a(n)+1}_{t=1} t^{-(\alpha - \frac{\beta}{p})} &\leq 1 + \int^{A_a(n)+1}_1 t^{-(\alpha - \frac{\beta}{p})} dt = 1 + \left(\frac{t^{-(\alpha - \frac{\beta}{p}) + 1}}{-(\alpha - \frac{\beta}{p}) + 1} + C\right) \bigg|^{A_a(n)+1}_1\nonumber \\ 
    &= 1 + \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{-(\alpha - \frac{\beta}{p}) + 1} - \frac{1}{-(\alpha - \frac{\beta}{p}) + 1} \nonumber \\
    &= \frac{\alpha - \frac{\beta}{p}}{\alpha - \frac{\beta}{p} -1} - \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{(\alpha - \frac{\beta}{p}) - 1}, \nonumber
    \end{flalign}
    so that 
    \begin{flalign}
    G_2 &\leq c  (K-1)^{\frac{\beta}{p}} \left(\frac{\alpha - \frac{\beta}{p}}{\alpha - \frac{\beta}{p} -1} - \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{(\alpha - \frac{\beta}{p}) - 1}\right) \epsilon^{-\beta} n^{-\frac{\beta}{p}} \nonumber \\
    &\leq c  (K-1)^{\frac{\beta}{p}} \left( \frac{(A_a(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{1 - (\alpha - \frac{\beta}{p})} - \frac{\alpha - \frac{\beta}{p}}{1 - (\alpha - \frac{\beta}{p})} \right)\epsilon^{-\beta} (A(n)+1)^{-\frac{\beta}{p}}.\nonumber
    \end{flalign}
    If $0 < \alpha - \frac{\beta}{p} < 1$, then we can find a constant $N_{G2}$ such that $\forall n \geq N_{G2}$, we have
    \[
    G_2 \leq \frac{c  (K-1)^{\frac{\beta}{p}} }{1 - (\alpha - \frac{\beta}{p})} \epsilon^{-\beta}  (A(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1} (A(n)+1)^{-\frac{\beta}{p}} = \frac{c  (K-1)^{\frac{\beta}{p}}}{1 - (\alpha - \frac{\beta}{p})} \epsilon^{-\beta}  (A(n)+1)^{-(\alpha - 1)}.
    \]
    Therefore, 
    \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c  (K-1)^{\frac{\beta}{p}}}{1 - (\alpha - \frac{\beta}{p})} \epsilon^{-\beta}  (A(n)+1)^{-(\alpha - 1)}.
    \end{flalign}
    This proves Inequality~\ref{lem:upper_bound_suboptimal_arm_in_power_mean:case_2}.

    If $\alpha - \frac{\beta}{p} > 1$, we can find a constant $N_0$ such that $\forall n \geq N_0$, we have
    \[
    G_2 \leq c  (K-1)^{\frac{\beta}{p}} \left(\frac{\alpha - \frac{\beta}{p}}{\alpha - \frac{\beta}{p} -1} - \frac{(A(n)+1)^{-(\alpha - \frac{\beta}{p}) + 1}}{(\alpha - \frac{\beta}{p}) - 1}\right) \epsilon^{-\beta} (A(n)+1)^{-\frac{\beta}{p}} \leq \frac{c  (K-1)^{\frac{\beta}{p}} (\alpha - \frac{\beta}{p}) }{(\alpha - \frac{\beta}{p}) - 1} \epsilon^{-\beta} (A(n)+1)^{-\frac{\beta}{p}},
    \]
    which yields Inequality~\ref{lem:upper_bound_suboptimal_arm_in_power_mean:case_3}:
    \begin{flalign}
        \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K}\epsilon^p \right) \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c  (K-1)^{\frac{\beta}{p}} (\alpha - \frac{\beta}{p}) }{(\alpha - \frac{\beta}{p}) - 1} \epsilon^{-\beta} (A(n)+1)^{-\frac{\beta}{p}}.
    \end{flalign}
    In practice, the parameters should not be chosen to fall in this case: when $p$ is large, the exponent $-\frac{\beta}{p}$ gets close to $0$, which makes the bound looser.
\end{proof}

\begin{manuallemma}{10} \label{lem:pm_intermediate_results_tighter_fixed_appendix}
Consider a bandit problem defined as in Section~\ref{s:bandit_definition}.
Let us define the power mean estimator $\widehat{\mu}_n(p)$ as $\widehat{\mu}_n(p) = \left(\sum^{K}_{a=1} \frac{T_{a}(n)}{n} \widehat{\mu}^{p}_{a,T_{a}(n)}\right)^{\frac{1}{p}}$.
Define $A(n) = \big(\frac{2C n^{\frac{b}{\beta}}}{\triangle}\big)^{\frac{\beta}{\alpha}} $, where $\triangle = \min_{a \in [K]} \{\triangle_a\}, \triangle_a = \mu_* - \mu_a$. Let $\epsilon_0 = \frac{2^{\frac{1}{p}}n\epsilon}{x} + \frac{nR(K-1)}{x}(\frac{2^{\frac{1}{p}}(3 + A(n))x}{n})$, where $R \geq \epsilon \geq n^{-\frac{\alpha}{\beta}}$. Then there exists a constant $N_p$ such that, for any $n \geq N_p$ and $x \geq 1$,
\begin{flalign}
    \bP \bigg( \left| \widehat{\mu}_n(p) - \mu_*\right| \geq \frac{\epsilon_0 x}{n} \bigg ) \leq \frac{8c C^{-\beta}K R^{\beta}\epsilon^{-\beta} A(n)^{-(b-1)} }{b-1} +  2c C^{-\beta}(K-1) \frac{(2^{\frac{1}{p}}(3 + A(n))x - 1)^{-(b-1)}}{b-1}.
\end{flalign}
\end{manuallemma}
\begin{proof}
From the result of Lemma~\ref{lem:upper_bound_power_mean_and_optimal_appendix}, we can derive
    \begin{flalign}
    \left|\widehat{\mu}_n(p) -  \mu_*\right| &\leq R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} + \left(\sum_{a=1}^{K} \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}} \nonumber
    \end{flalign}
    Since $\frac{\epsilon_0 x}{n} = 2^{\frac{1}{p}}\epsilon + R(K-1) \left( \frac{2^{\frac{1}{p}}(3 + A(n))x}{n} \right)$, we have
    \begin{flalign}
    \bP\left( \left|\widehat{\mu}_n(p) -  \mu_*\right| > \frac{\epsilon_0 x}{n} \right) &\leq \underbrace{\bP\left( R\sum^{K}_{a=1,a\neq a_*} \frac{T_a(n)}{n} > R(K-1)(\frac{2^{\frac{1}{p}}(3 + A(n))x}{n}) \right)}_{H_1} \nonumber \\
    &+ \underbrace{\bP\left( \left(\sum_{a=1}^{K} \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p \right)^{\frac{1}{p}} \geq 2^{\frac{1}{p}}\epsilon \right)}_{H_2} \nonumber
    \end{flalign}
    \underline{To upper bound $H_1$}: with $x\geq 1$, we have
    \begin{flalign}
        H_1 &\leq \sum^{K}_{a=1,a\neq a_*} \bP \left( \frac{T_a(n)}{n} > \frac{2^{\frac{1}{p}}(3 + A(n))x}{n} \right) = \sum^{K}_{a=1,a\neq a_*} \bP \left( T_a(n) > 2^{\frac{1}{p}}(3 + A(n))x \right) \nonumber \\
        &\stackrel{(\text{Lemma}~\ref{lem:hp_bound_visits_appendix})}{\leq} 2c C^{-\beta}(K-1) \frac{(2^{\frac{1}{p}}(3 + A(n))x - 1)^{-(b-1)}}{b-1} \label{eq_h1}
    \end{flalign}
    
    \underline{To upper bound $H_2$}:
    \begin{flalign}
    H_2 &=  \bP\left( \sum_{a=1}^{K} \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > 2\epsilon^p \right) \nonumber\\
    &\leq \underbrace{\bP\left( \frac{T_{a_*}(n)}{n} \left( \left| \widehat{\mu}_{a_*,T_{a_*}(n)} - \mu_{a_*} \right|\right)^p  > \epsilon^p \right)}_{F_1} + \underbrace{\sum^{K}_{a=1, a\neq a_*} \bP\left( \frac{T_a(n)}{n} \left( \left| \widehat{\mu}_{a,T_a(n)} - \mu_a \right|\right)^p  > \frac{1}{K-1}\epsilon^p \right)}_{F_2} \nonumber
    \end{flalign} 
    \underline{With $F_1$}: According to Lemma~\ref{lem:upper_bound_optimal_arm_in_power_mean_appendix}, we can find a constant $N_0$, such that $\forall n \geq N_0$, we have
    \begin{flalign}
        F_1 \leq \frac{2c C^{-\beta}(K-1) A(n)^{-(b-1)} }{b-1} + \frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1) (A(n)+1)-1)^{-\alpha + 1}
    \end{flalign}
   \underline{With $F_2$}: According to Lemma~\ref{lem:upper_bound_suboptimal_arm_in_power_mean_appendix}, we can find a constant $N_0$, such that $\forall n \geq N_0$, we have
\begin{itemize}
        \item With {$1\leq p \leq 2, \alpha \leq \frac{\beta}{p}$}, we have
        \begin{flalign}
        F_2  \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{2c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)}.
        \end{flalign}
        \item With {$p > 2$, and $0 < \alpha - \frac{\beta}{p} < 1$}, we have 
        \begin{flalign}
        F_2 \leq \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)}.
        \end{flalign}
    \end{itemize}
    So that 
    \begin{flalign}
        H_2 &\leq F_1 + F_2 \leq \frac{2c C^{-\beta}(K-1) A(n)^{-(b-1)} }{b-1} + \frac{c}{\alpha-1}\epsilon^{-\beta} (n - (K-1) (A(n)+1)-1)^{-\alpha + 1} \\
        &+ \frac{2c C^{-\beta}}{(b-1)} A(n)^{-(b-1)} + \frac{c(K-1)^{\frac{\beta}{p}}}{-(\alpha - \frac{\beta}{p}) + 1}\epsilon^{-\beta} (A_a(n)+1)^{-(\alpha - 1)} \nonumber
    \end{flalign}
    where {$1\leq p \leq 2, \alpha \leq \frac{\beta}{p}$ or $p > 2$, and $0 < \alpha - \frac{\beta}{p} < 1$}.
    
    Because $b-1 < \alpha - 1$ and $n^{-\frac{\alpha}{\beta}} \leq \epsilon \leq R$, we can find a constant $N_p$ such that $\forall n \geq N_p$
    \begin{flalign}
        H_2 \leq \frac{8c C^{-\beta}K \left(\frac{R}{\epsilon}\right)^\beta A(n)^{-(b-1)} }{b-1} = \frac{8c C^{-\beta}K R^{\beta}\epsilon^{-\beta} A(n)^{-(b-1)} }{b-1} \label{eq_h2}.
    \end{flalign}
    Combining \eqref{eq_h1} and \eqref{eq_h2}, we conclude the proof.
\end{proof}

\begin{manualtheorem}{1}\label{thm:theorem1_appendix} For $a \in [K]$, let $(\widehat{\mu}_{a,n})_{n\geq 1}$ be a sequence of estimators satisfying $\widehat{\mu}_{a,n}\cv{\alpha,\beta} \mu_a$ and let $\mu_\star = \max_{a} \{ \mu_a\}$. Assume that the arms are sampled according to the strategy \eqref{action_select} with parameters $\alpha,\beta, b$ and $C$. Assume that $p,\alpha,\beta$ and $b$ satisfy one of these two conditions: 
\begin{enumerate}
    \item[(i)] $1\leq p \leq 2$ and $\alpha \leq \frac{\beta}{2}$ 
    \item[(ii)] $p > 2$ and $0 < \alpha - \frac{\beta}{p} < 1$
\end{enumerate}
If $\alpha\left(1 -\frac{b }{\alpha}\right) \leq b < \alpha$ then the sequence of estimators 
$\widehat{\mu}_n(p)$
satisfies \[\widehat{\mu}_n(p) \cv{\alpha',\beta'} \mu_\star\] for $\alpha' = (b-1)\left(1-\frac{b}{\alpha}\right)$ and $\beta' = (b-1)$ for some value of the constant $C$ in \eqref{action_select} that depends on $K, b, \alpha,p, \Delta_{\min}$ with $\Delta_{\min} = \min_{a : \mu_a < \mu_\star} (\mu_\star - \mu_a)$. 
\end{manualtheorem}

\begin{proof}  
We will use the results of Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix} to derive the proof of Theorem~\ref{thm:theorem1_appendix}. We want to have an upper bound 
\begin{flalign}
    D = \bP \bigg( \left| \widehat{\mu}_n(p) - \mu_* \right| \geq \epsilon \bigg ).\nonumber
\end{flalign}
Due to Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}, we have
\begin{flalign}
    \epsilon_0 = \frac{2^{\frac{1}{p}}n\epsilon^{'}}{x} + \frac{nR(K-1)}{x}(\frac{2^{\frac{1}{p}}(3+A(n))x}{n}) \Rightarrow \frac{\epsilon_0 x}{n} = 2^{\frac{1}{p}}\epsilon^{'} + R(K-1)\left(\frac{2^{\frac{1}{p}}(3+A(n))x}{n}\right).\nonumber
\end{flalign}
Also, from Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}, recall that $A(n) = \big(\frac{2C n^{\frac{b}{\beta}}}{\triangle}\big)^{\frac{\beta}{\alpha}} $, we study 
\begin{flalign}
    \epsilon = 2^{\frac{1}{p}}R(2K-1)\left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{\beta}{\alpha}} n^{\frac{b}{\alpha}}x}{n}\right) = 2^{\frac{1}{p}}R(2K-1)\left( \frac{A(n)x}{n}\right).\label{epsilon_def}
\end{flalign}
We want to find $N_0 > 0$ such that, for any $n \geq N_0$, $\epsilon \geq \frac{\epsilon_0 x}{n}$. To do that, we compute
\begin{flalign}
    \epsilon - \frac{\epsilon_0x}{n} &= 2^{\frac{1}{p}}R(2K-1)\left( \frac{A(n)x}{n}\right) - R(K-1)\left(\frac{2^{\frac{1}{p}}(3+A(n))x}{n}\right) - 2^{\frac{1}{p}}\epsilon^{'} \nonumber \\
    &= 2^{\frac{1}{p}}R(2K-1)\left( \frac{A(n)x}{n}\right) - 2^{\frac{1}{p}}R(K-1)(\frac{A(n)x}{n}) - 2^{\frac{1}{p}}R(K-1)(\frac{3x}{n}) - 2^{\frac{1}{p}} \epsilon^{'}\nonumber \\
    &= 2^{\frac{1}{p}}RK(\frac{A(n)x}{n}) - 2^{\frac{1}{p}}R(K-1)(\frac{3x}{n}) - 2^{\frac{1}{p}}\epsilon^{'} = \underbrace{2^{\frac{1}{p}}R(K-1)(\frac{x}{n})(A(n) - 3)}_{\text{$T_1$}} + \underbrace{2^{\frac{1}{p}}R(\frac{A(n)x}{n}) - 2^{\frac{1}{p}}\epsilon^{'}}_{\text{$T_2$}} \nonumber
\end{flalign}
Because $A(n) \sim \Theta(n^{\frac{b}{\alpha}})$ and $\frac{b}{\alpha} > 0$, there exists $N_1 > 0$ such that $T_1 > 0$ for all $n \geq N_1$. We can see that $\frac{A(n)}{n} \sim \Theta(n^{-(1-\frac{b}{\alpha})})$. We choose $\epsilon^{'} = n^{-\frac{\alpha}{\beta}}x$, which satisfies the condition $\epsilon' \geq n^{-\frac{\alpha}{\beta}}$ since $x \geq 1$. With $c\geq 1$ and $\frac{R}{\Delta^{\frac{\beta}{\alpha}}} > 1$, we have
\[
\frac{RA(n)x}{n} = \frac{R\big(\frac{2C n^{\frac{b}{\beta}}}{\triangle}\big)^{\frac{\beta}{\alpha}} x}{n} = (2C)^{\frac{\beta}{\alpha}}\frac{R}{\Delta^{\frac{\beta}{\alpha}}}n^{-(1-\frac{b}{\alpha})}x \geq n^{-(1-\frac{b}{\alpha})}x
\]
and because $1-\frac{b}{\alpha} \leq \frac{1}{2} \leq \frac{\alpha}{\beta}$ then 
\begin{flalign}
    T_2 = 2^{\frac{1}{p}}\left(R(\frac{A(n)x}{n}) - n^{-\frac{\alpha}{\beta}}x\right) \geq 0. \nonumber
\end{flalign}
Then we can define $N_0 = \min \{ t: R(2K-1)\left( \frac{A(t)x}{t}\right) - R(K-1)\left(\frac{(3+A(t))x}{t}\right) - \epsilon^{'} \geq 0 \}$. Therefore, for $n \geq N_{0}$, according to Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix}, with $1\leq p \leq 2, \alpha \leq \frac{\beta}{p}$ or $p > 2, 0 < \alpha - \frac{\beta}{p} < 1$, we have
\begin{flalign}
    D &= \bP \bigg(\left|\widehat{\mu}_n(p) - \mu_*\right| \geq \epsilon \bigg ) \leq \bP \bigg(\left|\widehat{\mu}_n(p) - \mu_*\right| \geq \frac{\epsilon_0x}{n} \bigg )\nonumber\\
    &\leq \frac{8c C^{-\beta}K R^{\beta}\epsilon^{-\beta} A(n)^{-(b-1)} }{b-1} +  2c C^{-\beta}(K-1) \frac{(2^{\frac{1}{p}}(3 + A(n))x - 1)^{-(b-1)}}{b-1} \text{ (Lemma~\ref{lem:pm_intermediate_results_tighter_fixed_appendix})} \nonumber
\end{flalign}
Furthermore, we observe that $2^{\frac{1}{p}}(3 + A(n))x - 1 > A(n)x$ with $x\geq 1$. So that,
\begin{flalign}
    D &\leq \frac{8c C^{-\beta}K R^{\beta}(n^{-(1-\frac{b}{\alpha})}x)^{-\beta} A(n)^{-(b-1)} }{b-1} +  2c C^{-\beta}(K-1) \frac{(A(n)x)^{-(b-1)}}{b-1} \nonumber \\
    &\leq \frac{8c C^{-\beta}K R^{\beta} n^{\beta(1-\frac{b}{\alpha})}A(n)^{-(b-1)} x^{-\beta}}{b-1} +  2c C^{-\beta}(K-1) \frac{A(n)^{-(b-1)} x^{-(b-1)}}{b-1} \nonumber \\
    &= \frac{8c C^{-\beta}K R^{\beta} n^{\beta(1-\frac{b}{\alpha})}(\big(\frac{2C n^{\frac{b}{\beta}}}{\triangle}\big)^{\frac{\beta}{\alpha}})^{-(b-1)} x^{-\beta}}{b-1} +  2c C^{-\beta}(K-1) \frac{(\big(\frac{2C n^{\frac{b}{\beta}}}{\triangle}\big)^{\frac{\beta}{\alpha}})^{-(b-1)} x^{-(b-1)}}{b-1} \nonumber\\
    &= \frac{8c C^{-\beta}K R^{\beta} n^{\beta(1-\frac{b}{\alpha})} n^{-\frac{b}{\alpha}(b-1)} \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} x^{-\beta}}{b-1} + 2c C^{-\beta}(K-1) \frac{\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} n^{-\frac{b}{\alpha}(b-1)} x^{-(b-1)}}{b-1} \nonumber\\
    &= \frac{8c C^{-\beta}K R^{\beta} \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} n^{\beta(1-\frac{b}{\alpha})} n^{-\frac{b}{\alpha}(b-1)}x^{-\beta}}{b-1} + 2c C^{-\beta}(K-1) \frac{\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} n^{-\frac{b}{\alpha}(b-1)} x^{-(b-1)}}{b-1} \nonumber
\end{flalign}
From \eqref{epsilon_def}, we have $A(n)x = \frac{n\epsilon}{(2^{\frac{1}{p}}R(2K-1))}$, and $x = \frac{1}{(2^{\frac{1}{p}}R(2K-1))} \epsilon n^{-(\frac{b}{\alpha} - 1)} \big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}$. Therefore,
\begin{flalign}
    D &\leq \frac{8c C^{-\beta}K R^{\beta} \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} n^{\beta(1-\frac{b}{\alpha})} n^{-\frac{b}{\alpha}(b-1)} }{b-1} \left(\frac{1}{(2^{\frac{1}{p}}R(2K-1))} \epsilon n^{-(\frac{b}{\alpha} - 1)} \big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}} \right)^{-\beta} \nonumber \\
    &+ 2c C^{-\beta}(K-1) \frac{\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)} n^{-\frac{b}{\alpha}(b-1)}}{b-1} \left(\frac{1}{(2^{\frac{1}{p}}R(2K-1))} \epsilon n^{-(\frac{b}{\alpha} - 1)} \big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}} \right)^{-(b-1)}  \nonumber \\
    &\leq \frac{8c C^{-\beta}K R^{\beta} \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{1}{(2^{\frac{1}{p}}R(2K-1))} \big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}} \right)^{-\beta} \epsilon^{-\beta} n^{-\beta(1-\frac{b}{\alpha})} n^{\beta(1-\frac{b}{\alpha})} n^{-\frac{b}{\alpha}(b-1)} \nonumber \\
    &+ 2c C^{-\beta}(K-1) \frac{\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{1}{(2^{\frac{1}{p}}R(2K-1))} \big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}} \right)^{-(b-1)} \epsilon^{-(b-1)} n^{-(b-1)(1-\frac{b}{\alpha})} n^{-\frac{b}{\alpha}(b-1)}  \nonumber\\
    &\leq c_0 n^{-\alpha'} \left(\frac{\epsilon}{R}\right)^{-\beta'} = c_0 R^{\beta'} n^{-\alpha'} \epsilon^{-\beta'} \nonumber
\end{flalign}
with 
\begin{flalign}
    c_0 &= 2\max \left\{ \frac{8c C^{-\beta}K \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}}{(2^{\frac{1}{p}}R(2K-1))}  \right)^{-\beta}, \frac{2c C^{-\beta}(K-1)\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}}{(2^{\frac{1}{p}}(2K-1))} \right)^{-(b-1)} \right \} \label{def_c} \\
    &= \frac{16c C^{-\beta}K \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}}{(2^{\frac{1}{p}}R(2K-1))}  \right)^{-\beta} \text{ because $\left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}}{(2^{\frac{1}{p}}R(2K-1))}  \right) < 1$ and $2K > 2(K-1)$} \nonumber \\
    \alpha' &= \min\{ \frac{b}{\alpha}(b-1),b-1 \} = \frac{b}{\alpha}(b-1) \nonumber\\
    \beta' &= \max\{b-1,\beta\} = b-1 \nonumber
\end{flalign}
But we need $\epsilon \geq n^{-\frac{\alpha'}{\beta'}}$. Then with the condition $1 - \frac{b}{\alpha} \leq \frac{b}{\alpha} \Rightarrow \alpha(1-\frac{b}{\alpha}) \leq b$, we can choose 
\begin{flalign}
\alpha^{'} = (b-1) (1 - \frac{b}{\alpha}), \nonumber \\
\beta^{'} = (b-1),\nonumber
\end{flalign}
and according to (\ref{def_c})
\[
c^{'} = c_0R^{\beta} = \frac{16c C^{-\beta}K R^{\beta} \big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1)}}{b-1} \left(\frac{\big(\frac{2C}{\triangle}\big)^{\frac{-\beta}{\alpha}}}{(2^{\frac{1}{p}}R(2K-1))}  \right)^{-\beta} = \frac{2^{4+\frac{\beta}{p}} c C^{-\beta}  K(2K-1)^{\beta} R^{2\beta}}{(b-1)}\big(\frac{2C }{\triangle}\big)^{-\frac{\beta}{\alpha}(b-1-\beta)}.
\]
This inequality is only correct for $n \geq N_0$. 
We want the inequality to be correct for all $n$. We want to show that the following inequality is correct for all $N_0 > n \geq 1$
\begin{flalign}
    &\bP \bigg( \left| \widehat{\mu}_n(p) - \mu_*\right| \geq \epsilon \bigg ) \leq c^{'} n^{- (b-1) (1 - \frac{b}{\alpha})} \epsilon^{-(b-1)}. \nonumber
\end{flalign}
We have $| \widehat{\mu}_n(p) - \mu_* | \leq R$.
We consider thresholds of the form $RN_0\epsilon$ in place of $\epsilon$.
Then we have to prove that for $1 \leq n < N_0$,
\begin{flalign}
    D &= \bP \bigg( \left| \widehat{\mu}_n(p) - \mu_* \right| \geq RN_0\epsilon\bigg ) \leq c^{'} n^{- (b-1) (1 - \frac{b}{\alpha})} (R\epsilon N_0)^{-(b-1)} = c^{'} \left(\frac{1}{RN_0}\right)^{(b-1)} \left(\frac{n^{-(1-\frac{b}{\alpha})}}{\epsilon}\right)^{(b-1)} \nonumber\\
    &= \underbrace{c^{'} \left(\frac{1}{RN_0}\right)^{(b-1)} \left(\frac{1}{n^{(1-\frac{b}{\alpha})}\epsilon}\right)^{(b-1)}}_{\text{$D_3$}}. \nonumber
\end{flalign}
In case $\epsilon > \frac{1}{N_0}$, we have $RN_0\epsilon > R$, but $| \widehat{\mu}_n(p) - \mu_* | \leq R$, which leads to $D = 0$. The inequality is trivially correct.

In case $\epsilon \leq \frac{1}{N_0}$, because $n < N_0$ and $2 < b < \alpha$, we have $0 < (1-\frac{b}{\alpha}) < 1$. Therefore $n^{(1-\frac{b}{\alpha})} \leq n < N_0$, so that $n^{(1-\frac{b}{\alpha})}\epsilon < 1$ and $\left(\frac{1}{n^{(1-\frac{b}{\alpha})}\epsilon}\right)^{b-1} > 1$. We can choose the constant $c^{'}>0$ large enough that $D_3 > 1$, so the inequality is trivially correct.

Furthermore, 
\begin{flalign}
&\lim_{n\longrightarrow\infty}  \left| \bE[\widehat{\mu}_n(p)] - \mu_{\star}\right| \leq \lim_{n\longrightarrow\infty}  \bE[\left|\widehat{\mu}_n(p)- \mu_{\star}\right|] = \lim_{n\longrightarrow\infty} \int^{\infty}_0 \bP \left( \left| \widehat{\mu}_n(p) - \mu_{\star} \right| \geq s \right) ds \nonumber \\
&\leq \lim_{n\longrightarrow\infty} \int^{\infty}_0 c^{'} n^{-\alpha'}s^{-\beta'} ds \leq \lim_{n\longrightarrow\infty} \int^{n^{-\frac{\alpha'}{\beta'}}}_{0} \1 ds 
 + \lim_{n\longrightarrow\infty} \int^{\infty}_{n^{-\frac{\alpha'}{\beta'}}} c^{'} n^{-\alpha'}s^{-\beta'} ds \nonumber\\
&= \lim_{n\longrightarrow\infty} c^{'} n^{-\alpha'} \left(s^{-\beta' + 1} + C \right)\Big|^{\infty}_{n^{-\frac{\alpha'}{\beta'}}} = 0 \nonumber (\text{we need } \beta' > 1, \text{ i.e., } b > 2)
\end{flalign}
\end{proof}

\newpage
\section{Experimental Setup and Hyperparameter Selection}
We conduct tests with $p = 1, 2, 4, 8, 10, 16$ in SyntheticTree and plot the results. We run experiments with different exploration constants $C = 0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5$ and find that for Fixed-Depth-MCTS, $C = 0.1$ yields the best performance. For Stochastic-Power-UCT and UCT, the best results are obtained with $C = 0.25$. For Power-UCT, $C = 0.5$ shows the best results. When using adaptive $\{\alpha_i\},\{ \beta_i\}, \{b_i\}$ values $i \in [0, H]$, we find that $C = 0.01$ works the best.

In FrozenLake and Taxi, we show results for $p=1, 2, 2.2$. Hyperparameter search for $C$ is performed via grid search: $C = 0.25, 0.5, 0.75, 1.0, 1.25, 1.5$. The best performance is achieved with $C=1.25, 1.5, 1.0, 1.0$ for UCT, Fixed-Depth-MCTS, Stochastic-Power-UCT $p=2$ and Stochastic-Power-UCT $p=2.2$ respectively in FrozenLake ($4\times4$), and with $C=1.5, 1.0, 0.75, 0.75$ for UCT, Fixed-Depth-MCTS, Stochastic-Power-UCT $p=2$ and Stochastic-Power-UCT $p=2.2$ respectively in FrozenLake ($8\times8$). In Taxi, we find $C=1.5, 1.5, 1.5, 1.0$ for UCT, Fixed-Depth-MCTS, Stochastic-Power-UCT $p=2$ and Stochastic-Power-UCT $p=2.2$ respectively.


% \begin{manualtheorem}{3 (Convergence of Failure Probability)}
% Let $a_k$ be the action returned by Stochastic-Power-UCT at iteration n at the root node. Then $\exists c > 0$ as constants that
% \begin{flalign}
% &\bP \bigg( a_k \neq a_{k^*} \bigg ) \leq  c n^{-\alpha_{0}}.
% \end{flalign}
% \end{manualtheorem}
% \begin{proof}
% At the root node $s_{0}$, let us define
% \begin{flalign}
% \widehat{\mu}_{k}(n) &= \widehat{Q}_{n}(s_{0}, a_k), \nonumber \\
% \mu_{k}(n) &= \widetilde{Q}_{n}(s_{0}, a_k).
% \end{flalign}
% Let us also define $a_{k^*}$ as a unique optimal action at the root node $s_{0}$.
% We have 
% \begin{flalign}
% &\bP \bigg(\exists k \in [K], \widehat{\mu}_{k}(T^a_{s_0}(n)) > \widehat{\mu}_{k^{*}}(T^{a_{k*}}_{s_0}(n)) \bigg) \leq \sum_{k \neq k^{*} } \underbrace{\bP \bigg(\widehat{\mu}_{k}(T^a_{s_0}(n)) > \widehat{\mu}_{k^{*}}(T^{a_{k*}}_{s_0}(n)) \bigg)}_{\text{$K_k$}}\nonumber
% \end{flalign},
% where $T^a_{s_0}(n)$ is the number of pulling arm $a$ at state $s_0$ after n timesteps.
% We will upper bound $K_k$. We study $\epsilon$ that satisfies
% \[2R > \mu_{k^{*}} - \mu_k > 2 \epsilon > 2 n^{-(1-\frac{b^{0}}{\alpha_{0}})}.\]
% Therefore, 
% \begin{flalign}
% K_k &\leq \bP \bigg(\widehat{\mu}_{k}(T^a_{s_0}(n)) - \mu_k + \mu_{k^{*}} - \widehat{\mu}_{k^{*}}(T^{a_{k*}}_{s_0}(n)) > \mu_{k^*} -\mu_{k} \bigg) \nonumber \\ 
% &\leq \bP \bigg( \widehat{\mu}_{k}(T^a_{s_0}(n)) - \mu_k + \mu_{k^{*}} - \widehat{\mu}_{k^{*}}(T^{a_{k*}}_{s_0}(n)) > 2\epsilon \bigg) \nonumber \\
% &\leq \bP \bigg( \widehat{\mu}_{k}(T^a_{s_0}(n)) - \mu_k > \epsilon \bigg) + \bP \bigg ( \mu_{k^{*}} - \widehat{\mu}_{k^{*}}(T^{a_{k*}}_{s_0}(n)) > \epsilon \bigg) \nonumber \\
% &\leq \sum^{A_k(n)+1}_{t=1} c_{0} t^{-\alpha_{0}} \epsilon^{-\beta_{0}} + \bP (T^a_{s_0}(n) > A_k(n)+1) + \sum^{n}_{t=1}  c_{0} t^{-\alpha_{0}} \epsilon^{-\beta_{0}}\nonumber \\ 
% &\leq 2c_{0}\epsilon^{-\beta_{0}} \int^{+\infty}_1 t^{-\alpha_{0}} dt + 2\frac{(A_k(n))^{-(b-1)}}{b-1} \nonumber\\
% &\leq \frac{2c_{0}}{\alpha_{0}-1}  \epsilon^{-\beta_{0}} + 2 \frac{\left(\left(\frac{2(c_{0})^{\frac{1}{\beta_{0}}}}{\Delta_a}\right)^{\frac{-\beta_{0}}{\alpha_{0}}}n^{-\frac{b_{0}}{\alpha}}\right)^{(b_{0}-1)}}{b_{0}-1} \nonumber \\
% &\leq \frac{2c_{0}}{\alpha_{0}-1}  n^{-\beta_{0}(1-\frac{b_{0}}{\alpha_{0}})} + 2 \frac{\left(\left(\frac{2(c_{0})^{\frac{1}{\beta_{0}}}}{\Delta_a}\right)^{\frac{-\beta_{0}}{\alpha_{0}}}n^{-\frac{b_{0}}{\alpha_{0}}}\right)^{(b_{0}-1)}}{b_{0}-1} \nonumber \text{( because $\epsilon \geq n^{(1 - \frac{b_{0}}{\alpha_{0}})}$)}
% \end{flalign}
% because
% \begin{flalign}
%  b_{0}-1 &< \beta_{0} \nonumber \\
%  \frac{b_{0}}{\alpha_{0}} & \leq 1 - \frac{b_{0}}{\alpha_{0}} \nonumber 
% \end{flalign}
% Therefore,
% \begin{flalign}
% &\bP \bigg( a_k \neq a_{k^*} \bigg ) = \bP \bigg(\exists k \in [K], \widehat{\mu}_{k}(n) > \widehat{\mu}_{k^{*}}(n) \bigg) \nonumber \\
% &\leq 4(K-1)  \frac{\left(\left(\frac{2(c_{0})^{\frac{1}{\beta_{0}}}}{\Delta_a}\right)^{\frac{-\beta_{0}}{\alpha_{0}}}n^{-\frac{b_{0}}{\alpha_{0}}}\right)^{(b_{0}-1)}}{b_{0}-1} \nonumber  \\
% &= c n^{-\alpha_{0}}, \nonumber
% \end{flalign}
% with $c = \sum_{k \neq k^{*} } (c_{0} (\eta \mu_k)^{-\beta_{0}} + c_{0} (\zeta \mu_{k^*})^{-\beta_{0}}) $
% which concludes the proof.
% \end{proof}