\section{Proof of Theorem \ref{theorem: UB-SR}}\label{secappendix: proof of SRM}
\begin{comment}
Let $q'_1, q'_2, \dots, q'_{2N}$ be a permutation of $q_{i,x}$ for $1 \leq i \leq N$ and $x \in {0,1}$, and their corresponding size of c-components be $k'_1, k'_2, \dots k'_{2N}$ such that $(q'_1)^{k'_1} \leq (q'_2)^{k'_2} \leq \dots \leq (q'_{2N})^{k'_{2N}}$ and $q = \min_j q'_j$, $k = \max_j k'_j$. Let $p_{i,x,\mathbf{z}} = P(X_i = x, \mathbf{Pa}^c(X_i) = \mathbf{z})$, $\mathcal{Z}_i$ be the domain from which $\mathbf{Pa}^c(X_i)$ takes values. Note that $\mathcal{Z}_i \leq 2^{k_id_i+k_i}$ and assume $\mathcal{Z} = \max_i \mathcal{Z}_i$.
\end{comment}
\VIN{For the sake of analysis, we assume without loss of generality that $q_1, q_2, \ldots, q_{N}$ are arranged such that their corresponding c-component sizes $k_1, k_2, \ldots, k_{N}$ satisfy the following relation: $(q_1)^{k_1} \leq (q_2)^{k_2} \leq \ldots \leq (q_{N})^{k_{N}}$. Also, let $q = \min_{i \,:\, q_i > 0} q_i$ (if $q_i = 0$ for all $i \in [N]$ then $q = \frac{1}{N+1}$), $k = \max_i k_i$, and $p^{i,x}_{\mathbf{z}} = \mathbb{P}(X_i = x, \mathbf{Pa}^c(X_i) = \mathbf{z})$. We remark that $p^{i,x}_{\mathbf{z}}$ is different from $p^{i,x}_{\mathbf{z}}$ used in Section \ref{sec: cumulative regret} to denote $\mathbb{P}(X_i=x, \mathbf{Pa}(X_i) =\mathbf{z})$; note that $\mathbf{Pa}(X_i) \subseteq \mathbf{Pa}^c(X_i)$. Let $d$ be the maximum indegree of any node in $S_i$ for $i \in [N]$. Finally, let $Z_i$ be the size of the domain from which $\mathbf{Pa}^c(X_i)$ takes values, and note that $Z_i \leq 2^{k_id+k_i}$ and let $Z = \max_i Z_i$. Note that, by our assumption, $Z$ is $O(1)$. Also, in this section, let $m(\mathcal{C})$ be denoted by $m$.}
%\todo{Aurghya please use the above notation and rewrite the proof!}

%\AUR{The following lemmas show that when our estimations of $q_i$ for all $i \in [N]$ is good then the regret incurred by \ref{SR-algorithm} is small and the probability of our estimation of $q_i$s being bad is small. The following theorem finds the propability of making a bad estimate on $q_i$ for any $i \in [N]$.}
We begin by proving Lemmas \ref{q-estimate-lemma}, \ref{m-estimate-lemma}, and \ref{mu-estimation-lemma} which would be used to prove Theorem \ref{theorem: UB-SR}.  The following lemma bounds the probability of making a bad estimate of $q_i$ for any $i \in [N]$, at the end of $T/2$ rounds.

\VIN{\begin{lemma}
\label{q-estimate-lemma}
Let $F =\mathds{1}\{\text{At the end of } T/2 \textit{ rounds, there exists } i \textit{ such that } |\widehat{q}_i - q_i| \geq \frac{1}{4}(1-2^{-1/k})q \}$. Then $\mathbb{P}(F = 1) \leq 4NZe^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}$.
\end{lemma}}
\begin{comment}
\begin{lemma}
\label{q-estimate-lemma}
Let $\mathcal{F}=\mathds{1}\{\text{At the end of } T/2 \textit{ rounds, there exists } i \textit{ such that } |\widehat{q'}_j - q'_j| \geq (1-2^{-1/k})q \}$. Then, $\mathbb{P}(\mathcal{F} = 1) \leq 2Ne^{-\frac{q^2}{16}T}$.
\end{lemma}
\end{comment}
\begin{proof}
Let $F_{i,x} = \mathds{1}\{\textit{At the end of } T/2 \textit{ rounds there exists } \mathbf{z} \textit{ such that } |\widehat{p}_{\mathbf{z}}^{i,x} - p_{\mathbf{z}}^{i,x}| \geq \frac{1}{4}(1-2^{-1/k})q\}$. From Lemma \ref{lemma: chernoff-hoeffding inequality}, it follows that,
\begin{align*}
    \mathbb{P}\Big(|\widehat{p}_{\mathbf{z}}^{\ i,x} - p_{\mathbf{z}}^{i,x}| \geq \frac{1}{4}(1-2^{-1/k})q\Big) \leq 2e^{-2\frac{1}{16}(1-2^{-1/k})^2 q^2 \frac{T}{2}}
\end{align*}
By union bound,
\begin{align*}
    \mathbb{P}(F_{i,x} = 1) \leq 2Z_ie^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}
\end{align*}
By definition $q_i = \min_{x, \mathbf{z}} p_{\mathbf{z}}^{i,x}$ and $\widehat{q}_{i} = \min_{x, \mathbf{z}} \widehat{p}_{\mathbf{z}}^{\ i,x}$. Hence,
\begin{align*}
    \mathbb{P}\Big(|\widehat{q}_{i} - q_{i}| \geq \frac{1}{4}(1-2^{-1/k})q \Big) \leq 2\mathbb{P}(F_{i,x} = 1) \leq 4Z_ie^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}
\end{align*}
Taking union bound, we get $\mathbb{P}(F=1) \leq 4NZe^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}$.
\end{proof}

\VIN{The next lemma shows that with high probability the estimate of $m$ at Step 6 of \SRM\ is good.}
\VIN{
\begin{lemma}
\label{m-estimate-lemma}
Let $F$ be as defined in Lemma \ref{q-estimate-lemma} and let $J = \mathds{1}\{\text{At the end of $T/2$ rounds the following holds: } \widehat{m} \leq 2m \}$. Then $F = 0$ implies $J = 1$, and in particular, $\mathbb{P}(J = 1) \geq 1-4NZe^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}$.
\end{lemma}
}
\begin{proof}
Note that if $q_i = 0$ for all $i \in [N]$, then our proposition is trivially true. $F = 0$ implies after $T/2$ rounds for all $i \in [N]$, $|\widehat{q}_{i} - q_{i}| \leq \frac{1}{4}(1-2^{-1/k})q$. Now from definition of $m$ we know that there is an $l \leq m$ such that for $i > l$, $(q_i)^{k_i} \geq (\frac{1}{m})$. Hence, for $i > l$, since $q \leq q_i$ by definition,
\begin{align*}
(\widehat{q}_i)^{k_i} \geq \Big(q_i - \frac{1}{4}(1-2^{-1/k})q \Big)^{k_i} \geq\Big(q_i - (1-2^{-1/k})q_i \Big)^{k_i} \geq \frac{1}{2^{k_i/k} m} \geq \frac{1}{2m}
\end{align*}

Since $l \leq m$, we have $|\{j \mid \widehat{q}_j^{k_j} < \frac{1}{2m}\}| \leq 2m$. This implies $\widehat{m} \leq 2m$.

\end{proof}

%\todo{Check this with Arnab Bhattacharyya's result!} \\

The next lemma provides the confidence bound on the estimate of $\mu_{i,x}$ computed by Algorithm \ref{alg: estimating rewards from observations} for each $i,x$.
\VIN{
\begin{lemma}
\label{mu-estimation-lemma}
For an action $a_{i,x} \in \mathcal{A}$, at the end of $T/2$ rounds $\mathbb{P}(|\widehat{\mu}_{i,x} - \mu_{i,x}| > \epsilon) 
\leq \exp{\big({-\epsilon^2 \frac{q_{i}^{k_i} T}{K_{\mathcal{G}}}}\big)}$, where $K_{\mathcal{G}} \geq 1$ is a constant dependent on the structure of $\mathcal{G}$ but independent of $\mathbb{P}$.
\end{lemma}
}
\begin{proof}
Using \textbf{Theorem 2.5} and \textbf{Theorem A.1} in \cite{BhattacharyyaGKMV20}, it can be inferred that the learner can compute an estimate $\widehat{\mu}_{i,x}$ such that $|\widehat{\mu}_{i,x} - \mu_{i,x}| \leq \epsilon$ with probability at least $1-\delta_i$, using $O\big(2^{2u_i^2} \log 2^{2u_i^2} \log \frac{1}{\delta_i} /(q_{i}^{k_i} \epsilon^2)\big)$ samples, where $u_i = 1 + k_i(d+1)$. Hence, using $T = K'\frac{2^{2.2u_i^2}}{q_{i}^{k_i} \epsilon^2} \log \frac{1}{\delta_i}$ samples, where $K'$ is a constant independent of the problem instance, we get $\mathbb{P}(|\widehat{\mu}_{i,x} - \mu_{i,x}| \leq \epsilon) \geq 1-\delta_i$. Writing $\delta_i$ in terms of $T$ and $\epsilon$, and using $K_{\mathcal{G}} = \max \{1, K' 2^{2.2u_i^2}\}$,
\begin{align*}
    \mathbb{P}(|\widehat{\mu}_{i,x} - \mu_{i,x}| > \epsilon) \leq \exp{\bigg(-\frac{T}{K'} \frac{q_{i}^{k_i} \epsilon^2}{2^{2.2u_i^2}}\bigg)} \leq \exp{\bigg({-\epsilon^2 \frac{q_{i}^{k_i} T}{K_{\mathcal{G}}}}\bigg)}
\end{align*}
Also, by Lemma \ref{lemma: chernoff-hoeffding inequality}, for $a_0$ we have
\begin{align*}
    \mathbb{P}(|\widehat{\mu}_0 - \mu_0| \geq \epsilon) \leq 2\exp{\bigg(-2 \epsilon^2 \frac{T}{2}\bigg)}~.
\end{align*}
\end{proof}
Now we are ready to prove the theorem using the above lemmas. Let $K = 2^{k-1}K_{\mathcal{G}}$. \AUR{Let $L_1 = \min\big\{t \in \mathbb{N} : 4NZe^{-\frac{1}{16}(1-2^{-1/k})^2q^2t} \leq \sqrt{\frac{144Km}{t}\log\frac{Nt}{m}}\big\}$ and $L_2 = \min\big\{t \in \mathbb{N} : \frac{6}{N^3}(\frac{m}{t})^4 \leq \sqrt{\frac{16Km}{t}\log\frac{Nt}{m}}\big\}$; we assume throughout the proof that $T \geq \max\{L_1, L_2\}$}.
%\todo{Aurghya: Rewrite the below proof using the above lemmas and the notation}
%\begin{theorem}
%The simple regret $R_T$ of Algorithm \ref{SR-algorithm} is $O\bigg(\sqrt{\frac{m(\mathbf{q})}{T}\log \frac{NT}{m(\mathbf{q})}}\bigg)$.
%\end{theorem}
%\begin{proof}
%For ease of notation denote $m(\mathbf{q})$ by $m$.
Consider $a_{i,x} \in \mathcal{Q}$. By Lemma \ref{lemma: chernoff-hoeffding inequality}, and Lemma \ref{m-estimate-lemma},
\begin{align*}
    \mathbb{P}\big(|\widehat{\mu}_{i,x} - \mu_{i,x}| \geq \epsilon \mid F = 0\big) \leq 2\exp{\bigg({-\epsilon^2\frac{2T}{4\widehat{m}}}\bigg)} \leq 2\exp{\bigg({-\epsilon^2\frac{T}{4m}}\bigg)} \leq 2\exp{\bigg({-\epsilon^2\frac{T}{4K m}}\bigg)} 
\end{align*}
If $a_{i,x} \notin \mathcal{Q}$, and $q_i^{k_i} \geq \frac{1}{m}$, then given $F = 0$ we get,
\begin{align*}
    &\mathbb{P}\big(|\widehat{\mu}_{i,x} - \mu_{i,x}| > \epsilon \mid F = 0\big) \leq \exp{\bigg({-\epsilon^2 \frac{q_{i}^{k_i} T}{K_{\mathcal{G}}}}\bigg)} \leq  \exp{\bigg({-\epsilon^2 \frac{T}{4K m}}\bigg)}
\end{align*}

If $a_{i,x} \notin \mathcal{Q}$, and $q_i^{k_i} < \frac{1}{m}$, then given $F = 0$, from Lemma \ref{q-estimate-lemma} we have $q_i^{k_i} \geq (\widehat{q}_i - \frac{1}{4}(1-2^{-1/k})q)^{k_i} \geq ((\frac{1}{\widehat{m}})^{1/k_i} - \frac{1}{4}(\frac{1}{m})^{1/k_i})^{k_i} \geq ((\frac{1}{2m})^{1/k_i} - \frac{1}{4}(\frac{1}{m})^{1/k_i})^{k_i} \geq \frac{1}{2^{k+1}m}$, and hence we get,
\begin{align*}
    &\mathbb{P}\big(|\widehat{\mu}_{i,x} - \mu_{i,x}| > \epsilon \mid F = 0\big) \leq \exp{\bigg({-\epsilon^2 \frac{q_{i}^{k_i} T}{K_{\mathcal{G}}}}\bigg)} \leq \exp{\bigg({-\epsilon^2 \frac{T}{2^{k+1}K_{\mathcal{G}}m}}\bigg)} \leq \exp{\bigg({-\epsilon^2 \frac{T}{4Km}}\bigg)}
\end{align*}
\begin{align*}
    \mathbb{P}\{ \textit{There exists an action $a$ such that } |\widehat{\mu}_a - \mu_a| > \epsilon \mid F = 0 \} &\leq (4N + 2)\exp{\bigg({-\epsilon^2 \frac{T}{4K m}}\bigg)} \\
    &\leq 6N\exp{\bigg({-\epsilon^2 \frac{T}{4K m}}\bigg)}
\end{align*}
Substituting $\epsilon = \sqrt{\frac{16K m}{T}\log\frac{NT}{m}}$, we get,
\begin{align*}
E[r_T \mid F=0] \leq 2 \sqrt{\frac{16K m}{T}\log{\frac{NT}{m}}} + \frac{6}{N^3}\bigg(\frac{ m}{T}\bigg)^4 \leq \sqrt{\frac{144K m}{T}\log{\frac{NT}{m}}}
\end{align*}
Finally, the expected simple regret of Algorithm \ref{SR-algorithm} is as follows:
\begin{align*}
E[r_T] &= E[r_T \mid F = 0 ]\mathbb{P}(F = 0) + E[r_T \mid F = 1 ]\mathbb{P}(F = 1) \nonumber \\
&\leq E[r_T \mid F = 0 ] + \mathbb{P}(F = 1) \nonumber \\
&\leq \sqrt{\frac{144K m}{T}\log{\frac{NT}{m}}} + 4NZe^{-\frac{1}{16}(1-2^{-1/k})^2 q^2 T}
\end{align*}
Since $T \geq \max(L_1, L_2)$ the simple regret is $\mathcal{O}\bigg(\sqrt{\frac{m}{T} \log{\frac{NT}{m}}}\bigg)$.

%\end{proof}