\subsection{Useful results}
\begin{lemma}[Expectation, Lemma D.3.6 in \cite{jin2021best}]\label{lemma: expectation on random variables}
    Suppose that a random variable $Z$ satisfies the following conditions:
    \begin{itemize}
        \item $Z < R$ where $R$ is a constant.
        \item $Z < Y$ conditioning on event $E$, where $Y\geq 0$ is a random variable.
    \end{itemize}
    Then, it holds that $\expect\sbr{Z}\leq \expect\sbr{Y} + \prob\sbr{E^c}\cdot R$ where $E^c$ is the complementary event of $E$.
\end{lemma}

\begin{lemma}[Concentration of Laplace variable]
\label{lemma: Concentration of Laplace variable}
    Suppose random variable $Z\sim\lap{\lambda}$, then for any $t>0$ we have, 
    $$\prob\sbr{\abr{Z}\geq\lambda\ln t} \leq \frac{1}{t}.$$
\end{lemma}

% \begin{lemma}[Hoeffding inequality]
% \label{lemma: Hoeffding inequality}
%     Suppose that for all $i\in[n]$, the random variables $Z_i$ with mean $\mu_i$ are independent, and $\abr{Z_i-\mu_i}\leq b$ for some $b>0$. Then for any $t>0$ we have 
%     $$\prob\sbr{\sum_{i=1}^n \rbr{Z_i-\mu_i}\geq t} \leq \exp\rbr{-\frac{t^2}{2n b^2}}.$$
% \end{lemma}
% \begin{corollary}[Uniform bound from Hoeffding inequality]
% \label{corl: Uniform bound from Hoeffding bound}
%     Suppose that for all $i\in[n]$, the random variables $Z_i$ with mean $\mu_i$ are independent, and $\abr{Z_i-\mu_i}\leq b$ for some $b>0$. Then for any $\delta>0$ we have,
%     $$\prob\sbr{\frac{1}{n} \sum_{i=1}^n \rbr{Z_i-\mu_i} \leq \sqrt{\frac{2b^2}{n}\log\rbr{\frac{2}{\delta}}}} \geq 1-\delta.$$
% \end{corollary}

% \begin{lemma}[Maxima of Laplace Variables]
% \label{lemma: Maxima of Laplace Variables}
%     Let $\bZ \triangleq \rbr{Z_1,\cdots,Z_n}$ where $\forall i\in\sbr{n}$, $Z_i$ follows the same Laplace distribution $\lap{\lambda}$, then
%     \begin{equation}
%     \begin{aligned}
%         \expect\rbr{\max_{i\in\sbr{n}} Z_i - \min_{i\in\sbr{n}} Z_i} \leq O\rbr{\lambda \ln n}.
%     \end{aligned}
%     \end{equation}
% \end{lemma}

\begin{definition}[Sub-Exponential variables] \label{def: Sub-Exponential variables}
A random variable $X$ with mean $\mu=\mathbb{E}[X]$ is sub-exponential if there are non-negative parameters $(\nu,b)$ such that
\begin{equation}\notag
\mathbb{E}\left[e^{\lambda(X-\mu)}\right] \leq e^{\frac{\lambda^2 \nu^2 }{2}} \quad  \forall |\lambda|<\frac{1}{b}.
\end{equation}
\end{definition}

\begin{lemma}[Laplace variable is sub-exponential] \label{lemma: Laplace variable is sub-exponential}
A Laplace variable with parameter $(\mu,\beta)$, i.e., its probability density function is $f(x)=\frac{1}{2\beta}\exp\rbr{-\frac{|x-\mu|}{\beta}}$, is also a sub-exponential variable with parameter $(2\beta,\beta)$.
\end{lemma}

\begin{lemma}[Linear combination rule for sub-exponential variables] \label{property: linear combination for sub-exponential variables}
Consider an independent sequence $\{X_k\}_{k=1}^n$ of random variables such that $X_k$ has mean $\mu_k$ and is sub-exponential with parameters $(\nu_k,b_k)$. Then the variable $Z=\sum_{k=1}^n X_k$ is sub-exponential with parameters $(\nu_*,b_*)$, where
\begin{equation}\notag
\quad \nu_*:=\sqrt{\sum_{k=1}^n \nu_k^2}   \quad  \quad  b_*:=\max _{k=1, \ldots, n} b_k.
\end{equation}
\end{lemma}

\begin{lemma}[Maxima of sub-exponential variables] \label{lemma: Maxima of sub-exponential variables}
    Let $\bZ \triangleq \rbr{Z_1,\cdots,Z_n}$, where $Z_1,\dots,Z_n$ are independent and identically distributed zero-mean sub-exponential random variables with parameters $(\nu,b)$.
    Then, if $b\sqrt{2\ln n}<\nu$, we have 
    \begin{equation}\notag
    \expect\sbr{\max_{i\in[n]} Z_i} \leq \nu\sqrt{2\ln n}.
    \end{equation}
\end{lemma}
\begin{proof}
With the definition of sub-exponential variables in Def.~\ref{def: Sub-Exponential variables}, the following inequality holds for all $\lambda$ with $0<\lambda<\frac{1}{b}$, by using Jensen's inequality, 
\begin{align}
    \exp\rbr{\lambda\expect\max_{i\in[n]} Z_i} &\leq \expect \exp\rbr{\lambda\max_{i\in[n]} Z_i} = \expect \max_{i\in[n]} e^{\lambda Z_i} 
    \leq \sum_{i\in[n]} \expect e^{\lambda Z_i} \leq n e^{\frac{\lambda^2\nu^2}{2}}.\notag
\end{align}
Taking logarithms on both sides and dividing by $\lambda>0$, we have 
$\expect\max_{i\in[n]} Z_i \leq \frac{\ln n}{\lambda} + \frac{\lambda\nu^2}{2}.$
The upper bound is minimized at $\lambda = \frac{\sqrt{2\ln n}}{\nu}$, which satisfies $\lambda < \frac{1}{b}$ by the assumption $b\sqrt{2\ln n}<\nu$, and this choice yields
$$\expect\sbr{\max_{i\in[n]} Z_i} \leq \nu\sqrt{2\ln n}.$$
\end{proof}

\begin{lemma}[Maxima of the sum of i.i.d. Laplace variables]
\label{lemma: Maxima of Laplace Variables}
    Let $\bZ \triangleq \rbr{Z_1,\cdots,Z_n}$ with $Z_i=\sum_{j=1}^{m} b_{ij}$ for all $i\in\sbr{n}$, where the variables $b_{ij}$, $i\in[n]$, $j\in[m]$, are independent and identically distributed Laplace variables $\lap{0,\beta}$ with parameters $(0,\beta)$. 
    Then, if $m > \frac{\ln n}{2} $, we have
    \begin{equation}
    \begin{aligned}
        \expect\sbr{\max_{i\in\sbr{n}} Z_i} = \expect\sbr{\max_{i\in\sbr{n}} \sum_{j\in[m]} b_{ij}} \leq 2\beta\sqrt{2m\ln n}.
    \end{aligned}
    \end{equation}
\end{lemma}
\begin{proof}
    Since a Laplace variable is sub-exponential (see Lemma~\ref{lemma: Laplace variable is sub-exponential}), the sum of $m$ i.i.d. Laplace variables is also a sub-exponential variable, with parameters $\rbr{2\beta\sqrt{m},\beta}$, according to the linear combination rule for sub-exponential variables in Lemma~\ref{property: linear combination for sub-exponential variables}. 
    Then, we obtain the lemma by applying the bound on the maxima of sub-exponential variables in Lemma~\ref{lemma: Maxima of sub-exponential variables}.
\end{proof}

\begin{lemma}[Revised Lemma 11 in \cite{bourel2020tightening}]\label{lemma: revised bourel lemma}
    Consider $x,y\in[0,1]$ and $\alpha,\beta\geq 0$ satisfying $\abr{x-y}\leq \alpha\sqrt{y(1-y)}+\beta$. Then
    $$\sqrt{y(1-y)}\leq \sqrt{x(1-x)} + 1.9\alpha + 1.5\sqrt{\beta}.$$
\end{lemma}
\begin{proof}
By Taylor's expansion, we have
\begin{equation}\notag
\begin{aligned}
    y(1-y) &= x(1-x) + (1-2x)(y-x) - (y-x)^2\\
    &=x(1-x) + (1-x-y)(y-x)\\
    &\leq x(1-x) + \abr{1-x-y}\rbr{\alpha\sqrt{y(1-y)}+\beta} \\
    & \leq x(1-x) + \alpha\sqrt{y(1-y)}+\beta
\end{aligned}
\end{equation}
where the last step uses $\abr{1-x-y}\leq 1$, which holds because $x$ and $y$ lie in $[0,1]$. Using the fact that $a\leq b\sqrt{a} +c$ implies $a\leq b^2 +b\sqrt{c} + c$ for non-negative numbers $a,b$ and $c$, we get
\begin{equation}\notag
\begin{aligned}
    y(1-y) & \leq \alpha^2 + \alpha\sqrt{x(1-x)+\beta} + x(1-x) + \beta \\
    & \leq \alpha^2 + \alpha\sqrt{x(1-x)} + \alpha\sqrt{\beta} + x(1-x) + \beta \\
    & = \rbr{\sqrt{x(1-x)} + \frac{\alpha}{2}}^2 + \frac{3\alpha^2}{4} + \alpha\sqrt{\beta} + \beta,
\end{aligned}
\end{equation}
where we use $\sqrt{a+b}\leq\sqrt{a}+\sqrt{b}$ for all $a,b\geq 0$.
Taking square roots on both sides, using $\sqrt{a+b}\leq\sqrt{a}+\sqrt{b}$ once more, and applying $\sqrt{ab}\leq \frac{a+b}{2}$ to the term $\sqrt{\alpha\sqrt{\beta}}$, we have the desired result:
\begin{equation}\notag
\begin{aligned}
    \sqrt{y(1-y)} &\leq \sqrt{x(1-x)} + \frac{\alpha}{2} + \frac{\sqrt{3}\alpha}{2} + \frac{\alpha}{2} + \frac{\sqrt{\beta}}{2} + \sqrt{\beta} \leq \sqrt{x(1-x)} + 1.9\alpha + 1.5\sqrt{\beta}.
\end{aligned}
\end{equation}
\end{proof}

\begin{lemma}[Lemma 19 in \cite{jaksch2010near}, Lemma 24 in \cite{talebi2018variance}] \label{lemma: jaksch sequence number lemma}
For any sequence of numbers $z_1,z_2,\dots,z_n$ with $0\leq z_k\leq Z_{k-1}:=\max\cbr{1,\sum_{i=1}^{k-1}z_i}$, it holds
\begin{equation}\notag
\begin{aligned}
    (1) &\quad \sum_{k=1}^n \frac{z_k}{\sqrt{Z_{k-1}}} \leq \rbr{\sqrt{2}+1} \sqrt{Z_n}, \\
    (2) &\quad \sum_{k=1}^n \frac{z_k}{Z_{k-1}} \leq 2\log\rbr{Z_n} + 1.
\end{aligned}
\end{equation}
\end{lemma}

The next lemma is a standard Bernstein-type concentration inequality for martingales (Theorem~1 in \cite{beygelzimer2011contextual}).
\begin{lemma}\label{lemma: concentration for martingale}
Let $Y_1,\dots,Y_\episodetotal$ be a martingale difference sequence with respect to a filtration $\cF_1,\dots,\cF_\episodetotal$.
Assume $Y_\episode\leq R$ a.s.\ for all $\episode$, for some constant $R>0$. 
Then for any $\delta\in(0,1)$ and $\lambda\in(0,1/R]$, with probability at least $1-\delta$, we have
$$\sum_{\episode=1}^\episodetotal Y_\episode \leq \lambda \sum_{\episode=1}^\episodetotal \expect_\episode\sbr{Y_\episode^2} + \frac{\ln(1/\delta)}{\lambda}.$$
\end{lemma}