\appendix
\begin{comment}
\section{Reproducibility Checklist}
\begin{enumerate}
\item This paper:
\begin{enumerate}
    \item Includes a conceptual outline and/or pseudocode description of AI methods introduced - \textbf{Yes}
\item Clearly delineates statements that are opinions, hypothesis, and speculation from objective facts and results - \textbf{Yes}
\item Provides well marked pedagogical references for less-familiar readers to gain background necessary to replicate the paper - \textbf{Yes}
\end{enumerate}


\item Does this paper make theoretical contributions? - \textbf{Yes}

\begin{enumerate}
% If yes, please complete the list below.
\item All assumptions and restrictions are stated clearly and formally. - \textbf{Yes}
\item All novel claims are stated formally (e.g., in theorem statements). - \textbf{Yes}
\item Proofs of all novel claims are included. - \textbf{Yes}
\item Proof sketches or intuitions are given for complex and/or novel results. - \textbf{Yes}
\item Appropriate citations to theoretical tools used are given. - \textbf{Yes}
\item All theoretical claims are demonstrated empirically to hold. - \textbf{No}
\item All experimental code used to eliminate or disprove claims is included. - \textbf{NA}
\end{enumerate}

\item Does this paper rely on one or more datasets? - \textbf{No}

% If yes, please complete the list below.

% A motivation is given for why the experiments are conducted on the selected datasets (yes/partial/no/NA)
% All novel datasets introduced in this paper are included in a data appendix. (yes/partial/no/NA)
% All novel datasets introduced in this paper will be made publicly available upon publication of the paper with a license that allows free usage for research purposes. (yes/partial/no/NA)
% All datasets drawn from the existing literature (potentially including authors’ own previously published work) are accompanied by appropriate citations. (yes/no/NA)
% All datasets drawn from the existing literature (potentially including authors’ own previously published work) are publicly available. (yes/partial/no/NA)
% All datasets that are not publicly available are described in detail, with explanation why publicly available alternatives are not scientifically satisficing. (yes/partial/no/NA)

\item Does this paper include computational experiments? - \textbf{No}

% If yes, please complete the list below.

% Any code required for pre-processing data is included in the appendix. (yes/partial/no).
% All source code required for conducting and analyzing the experiments is included in a code appendix. (yes/partial/no)
% All source code required for conducting and analyzing the experiments will be made publicly available upon publication of the paper with a license that allows free usage for research purposes. (yes/partial/no)
% All source code implementing new methods have comments detailing the implementation, with references to the paper where each step comes from (yes/partial/no)
% If an algorithm depends on randomness, then the method used for setting seeds is described in a way sufficient to allow replication of results. (yes/partial/no/NA)
% This paper specifies the computing infrastructure used for running experiments (hardware and software), including GPU/CPU models; amount of memory; operating system; names and versions of relevant software libraries and frameworks. (yes/partial/no)
% This paper formally describes evaluation metrics used and explains the motivation for choosing these metrics. (yes/partial/no)
% This paper states the number of algorithm runs used to compute each reported result. (yes/no)
% Analysis of experiments goes beyond single-dimensional summaries of performance (e.g., average; median) to include measures of variation, confidence, or other distributional information. (yes/no)
% The significance of any improvement or decrease in performance is judged using appropriate statistical tests (e.g., Wilcoxon signed-rank). (yes/partial/no)
% This paper lists all final (hyper-)parameters used for each model/algorithm in the paper’s experiments. (yes/partial/no/NA)
% This paper states the number and range of values tried per (hyper-) parameter during development of the paper, along with the criterion used for selecting the final parameter setting. (yes/partial/no/NA)
\end{enumerate}
\end{comment}

\newpage 
% \color{red} JP: Please remove equations number for those which are not referred \color{black}
\section{Proof of Lemma~\ref{lem:1/t+1}}
We first prove the result for one dimension and then extend it to $d$ dimensions by establishing the independence of the coordinates of the points across dimensions. 

For $d=1$, the points belong to a closed interval on the real line. 
Since the $X_t$ are drawn i.i.d.\ from a continuous distribution, ties occur with probability zero, so all inequalities between the points are strict almost surely. Hence for any two permutations $X_{j_1}, X_{j_2}, \ldots, X_{j_t}$ and $X_{k_1}, X_{k_2}, \ldots, X_{k_t}$ of the sequence $\mathbf{X}_t$, we have $$\P(X_{j_1} < X_{j_2} < \cdots < X_{j_t}) = \P(X_{k_1} < X_{k_2} < \cdots < X_{k_t}).$$ Since the events $\{X_{j_1} < X_{j_2} < \cdots < X_{j_t}\}$ corresponding to distinct permutations are mutually exclusive and there are $t!$ possible permutations, we have 
\begin{align}\label{eq1:1/t+1}
&\sum_{\{j_1,j_2,\ldots,j_t\}} \P\left(X_{j_1} < X_{j_2} < \cdots < X_{j_t} \right) = 1 \nonumber \\
\implies &\P\left(X_{j_1} < X_{j_2} < \cdots < X_{j_t} \right) = \frac{1}{t!}.
\end{align}
Given any realization of the sequence $\mathbf{X}_{t-1}$, for some permutation $j_1,j_2,\ldots,j_{t-1}$ of the indices we have $X_{j_1} < X_{j_2} < \cdots < X_{j_{t-1}}$. Let expert $i$ be the $k$th interval $(X_{j_{k-1}},X_{j_k})$. Then the event $\{X_t \in \text{expert } i\}$ is equivalent to $\{X_t \in (X_{j_{k-1}},X_{j_k})\}$, i.e., $X_t$ is the $k$th smallest value in the realization $\{\mathbf{X}_{t-1},X_t\}$. Therefore, we have
\begin{align}
        &\P\left(X_t \in \text{ expert } i \given[\big] X_{j_1} < X_{j_2} < \cdots < X_{j_{t-1}} \right) \nonumber \\
        =& \P\left(X_t \in (X_{j_{k-1}},X_{j_k}) \given[\big] X_{j_1} < X_{j_2} < \cdots < X_{j_{t-1}} \right) \nonumber\\
        =&\frac{\P\left(X_{j_1}< \cdots <X_{j_{k-1}}<X_{t}<X_{j_{k}}<\cdots <X_{j_{t-1}}\right)}{\P\left(X_{j_1} < X_{j_2} < \cdots < X_{j_{t-1}}\right)}\nonumber\\ 
        =&\dfrac{\dfrac{1}{t!}}{\dfrac{1}{(t-1)!}}=\frac{1}{t}.\nonumber
    \end{align}
Note that the conditional probability is independent of $k$ and thus it is true for any expert $i$. %Further, the independence of $k$ also implies that, if we fix a value for $i$, then expert $i$ will be a different interval for different realizations of $\mathbf{X}_{t-1}$ and the conditional probability is still $1/t$.
Finally, using total probability law over the permutations $j_1,j_2,\ldots,j_{t-1}$, we obtain $\P(X_t \in \text{ expert } i) = 1/t$, for all $i$.

For $d > 1$, let $X_t =  (Z^1_{t},\ldots, Z^d_{t})$, where $Z^r_{t}$ is the Euclidean coordinate of point $X_t$ in $r^{\text{th}}$ dimension.

\textbf{Claim:} $Z^r_{t}$ are i.i.d. across $t$ and $r$.

From the above claim and from \eqref{eq1:1/t+1}, for any permutation $k_1,k_2,\ldots,k_{t}$ in dimension $r$, we obtain
\begin{align}
    &\P\left(Z^r_{k_1} < Z^r_{k_2} < \cdots < Z^r_{k_t} \right) = \frac{1}{t!} \nonumber \\
    \implies &\P\left(\begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots  < Z^1_{m_t}\}, \ldots, & \nonumber \\  \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_t}\} \end{matrix}\right) = \frac{1}{(t!)^d}. \nonumber
\end{align}
Again, given any realization of $\mathbf{X}_{t-1}$, the event $\{X_t \in \text{ expert } i\}$ is equivalent to $\{Z^r_{t} \in (Z^r_{j_{k-1}},Z^r_{j_k})\}$ for some permutation $j^r_1,j^r_2,\ldots,j^r_{t-1}$ in each dimension $r$.
\begin{align}
        &\P\left(X_t \in \text{ expert } i \given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \nonumber \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix} \right)\\    &=\frac{\P\left(\begin{matrix}\{Z^1_{m_1}< \cdots <Z^1_{m_{k-1}}<Z^1_{t}<Z^1_{m_{k}}<\cdots <Z^1_{m_{t-1}}\},\ldots, & \\ \{Z^d_{j_1}< \cdots <Z^d_{j_{k-1}}<Z^d_{t}<Z^d_{j_{k}}<\cdots <Z^d_{j_{t-1}}\}\end{matrix}\right)}{\P\left(\begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots, & \\\{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\}\end{matrix}\right)}\nonumber\\ 
        &=\dfrac{\dfrac{1}{(t!)^d}}{\dfrac{1}{[(t-1)!]^d}}=\dfrac{1}{t^d}.\nonumber
    \end{align}

\newpage
\section{Proof of Lemma~\ref{lem:expYt}}
% \color{red} JP: Please make this proof consistent with the rest of the paper.
Let $\phi_i =  \frac{e^{-\eta L_{t-1}(i)}}{ \sum_{j \in \mathcal{B}_{t-1}}e^{-\eta L_{t-1} (j)}}$.
% Note that $e^{L_{t-1}(i)-L_{t-1}(j)} \geq 0$ for all $i$, $j$, and $L_{t-1}(i) \geq L_{t-1}(j)$ implies $e^{L_{t-1}(i)-L_{t-1}(j)} \geq 1$. Therefore,
% \begin{align}
% \phi_i & =\frac{1}{\sum_{j: L_{t-1}(j)>L_{t-1}(i)} e^{L_{t-1}(n_t)-L_{t-1} (j)} + \sum_{j: L_{t-1}(j) \leq L_{t-1}(i)} e^{L_{t-1}(i)-L_{t-1} (j)}} \nonumber \\
% & \leq \frac{1}{\sum_{j \in \mathcal{B}_{t-1}} \mathds{1}_{\{L_{t-1}(j) \leq L_{t-1}(i)\}}} =\delta_i.
% \label{eq:dominance}
% \end{align}
% Note that, for each $i \in \mathcal{B}_{t-1},$ $\delta_i$ takes a unique value from $\{1,\frac{1}{2},\ldots,\frac{1}{t^d}\}$. Specifically, $\delta_i = \frac{1}{k}$, if expert $i$ has the $k$th highest cumulative loss. Thus, $\delta_i$ only depends on the relation between the cumulative losses but not on their values.  In the following, we use $[k]$ to denote the expert with $k$th highest cumulative loss.
% \\\\
%In round $t-1$, let $H_t=Sequence\{X_1<X_2<\ldots X_{t-1}\}$ denote the set comprising the history of the losses of the experts and the sequence of arrivals $\mathbf{X}_{t-1}$. 
%Given a realization of $H_t$, 
\\\\
\jpcol{For any given sequence of arrivals $\mathbf{X}_{t-1}$, $Y_t$ takes $t^d$ possible values}, each corresponding to $X_t$ belonging to one of the $t^d$ partitions. From Lemma \ref{lem:1/t+1}, the latter event has probability $1/t^d$. For $i,j \in \mathcal{B}_{t-1}$, let $c_j(i)$ denote the number of partitions of expert $i$ caused by sampling $X_t$ from expert $j$, and let $C_i = \sum_{j \in \mathcal{B}_{t-1}}c_j(i)$. 
%The conditional expectation of $Y_t$ given $H_t$ is given by 
We compute the expectation of $Y_t$ by conditioning on the ordering of the coordinates in each dimension:
\begin{align}
\E[Y_t]=\sum_{(m_1,\ldots,m_t)}\ldots\sum_{(j_1,\ldots,j_t)}\E\left[Y_t \mid \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots  < Z^1_{m_t}\}, \ldots, & \nonumber\\  \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_t}\} \end{matrix}\right] \P \left(\begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots  < Z^1_{m_t}\}, \ldots, & \nonumber\\  \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_t}\} \end{matrix}\right)
% \E[Y_t]=&\E[Y_t|X_1<X_2\ldots X_{t-1}]\P(X_1<X_2<\ldots X_{t-1})+\ldots \nonumber \\
%     & + \E[Y_t|X_{t-1}<X_{t-2}\ldots X_1]\P(X_{t-1}<X_{t-2}<\ldots X_1)
\end{align}
We have, for every ordering along each of the $d$ dimensions,
$$\P \left(\begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots  < Z^1_{m_t}\}, \ldots, & \\  \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_t}\} \end{matrix}\right)=\dfrac{1}{(t!)^d}.$$
\begin{align}
    \text{$\P\left(X_t \in \text{ expert } j \given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix} \right)=\dfrac{1}{t^d}$ \; $\forall j \in \mathcal{B}_{t-1}$} \label{eq: probsubs}
\end{align}
\\
It follows that
% \color{red}
\begin{align}
    & \E\left[Y_t\given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix}\right] \nonumber\\ 
    & =\sum_{j \in \mathcal{B}_{t-1}}\P\left(X_t \in \text{ expert } j \given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix} \right)\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\phi(i)\nonumber \\
    & = \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\phi(i) \label{eq: prob}\\
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\phi(i)\sum_{j \in \mathcal{B}_{t-1}}c_j(i) \nonumber\\ 
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\phi(i) C_i \leq \dfrac{2^d}{t}\sum_{i \in \mathcal{B}_{t-1}}\phi(i)\leq \dfrac{2^d}{t}.  
    \label{eq:Y_t}
\end{align}
\color{black} We note that \eqref{eq: prob} follows from \eqref{eq: probsubs}.  In the upper bound in \eqref{eq:Y_t}, we have used $C_i = (t+1)^d-t^d$ (derived below) and the following inequality: \color{black}
\begin{align}
(1 + x)^r \leq 1 + (2^r-1)x ; \;x \in [0, 1] \text{ and } r \in \mathbb{R} \setminus (0, 1). \nonumber 
\end{align}
\color{black}

Note that $C_i$ is the total number of partitions of expert $i$ created due to sampling $X_t$ from all $t^d$ experts. We compute $C_i$ using the following counting argument. We say an expert $i$ \textit{shares} $k$ hyperplanes with expert $j$ if, for any point in $j$, exactly $k$ out of the $d$ orthogonal hyperplanes (parallel to the faces of $\mathbb{B}$) \color{black} that pass through that point will partition expert $i$. We compute the number of experts that share exactly $k$ hyperplanes with $i$ as follows. Choose any $k$ dimensions from $d$ in $\binom{d}{k}$ possible ways. Further, choose any orthogonal hyperplane passing through $i$ that is parallel to some dimension from the rest of $d-k$ dimensions. There will be $t-1$ basis hyperplanes, i.e., the hyperplanes that partitioned $\mathbb{B}$ by passing through $t-1$ points drawn by the environment, that are parallel to the chosen hyperplane and do not partition $i$. The $(t-1)^{d-k}$ partitions, which are formed by the intersection of the $t-1$ basis hyperplanes corresponding to each of the $d-k$ dimensions, do not share exactly $d-k$ hyperplanes with $i$, or they share exactly $k$ hyperplanes with $i$. Therefore, the total number of experts that share exactly $k$ hyperplanes with $i$ is $\binom{d}{k}(t-1)^{d-k}$, and each point drawn from those experts will result in $2^k$ partitions of expert $i$. Since index $i$ will be assigned to one of its children (sub-partitions), we have $2^k - 1$ new experts from partitioning $i$. It follows that
\begin{align*}
    C_i& = \sum_{k=1}^d \binom{d}{k}(2^k-1)(t-1)^{d-k} \\
    & =(t-1)^d \sum_{k=1}^{d} \binom{d}{k}\left(\frac{2}{t-1}\right)^k - \sum_{k=1}^d \binom{d}{k}(t-1)^{d-k} \\
    & = (t-1)^d \left(\frac{t+1}{t-1}\right)^d - t^d = (t+1)^d - t^d.
\end{align*}
Indeed $C_i$ is independent of $i$ and is equal to the total number of new experts revealed in slot $t$.
From \eqref{eq:Y_t} it follows that 
\begin{align}
    \sum_{t=1}^T \E[Y_t] \leq \sum_{t=1}^{T}\left(\dfrac{2^d}{t}\right)  \leq 2^{d} \left(\log{T}+1\right). \nonumber  
\end{align}
%and it turns out to be equal to the total number of new experts revealed in slot $t$. This can be attributed to the fact that additional weights in $\hat{W}_t$ is $(t+1)^d - t^d$. 

% \color{red}
% \begin{align}
%    \E[Y_t \mid H_t] =  \left(1+\dfrac{1}{t}\right)^d - 1.
% \end{align}
%\begin{align}
%    C_i = \frac{C}{t^d} = \frac{(t+1)^d - t^d}{t^d} \leq \frac{2^d}{t}
%\end{align}

\color{black}

\section{Proof of Theorem~\ref{thm:adaHedgeG}}
As above, let $\phi_i =  \frac{e^{-\eta L_{t-1}(i)}}{ \sum_{j \in \mathcal{B}_{t-1}}e^{-\eta L_{t-1} (j)}}$.
Note that $e^{\eta(L_{t-1}(i)-L_{t-1}(j))} \geq 0$ for all $i$, $j$, and $L_{t-1}(i) \geq L_{t-1}(j)$ implies $e^{\eta(L_{t-1}(i)-L_{t-1}(j))} \geq 1$ since $\eta > 0$. Therefore,
\begin{align*}
\phi_i & =\frac{1}{\sum_{j: L_{t-1}(j)>L_{t-1}(i)} e^{\eta(L_{t-1}(i)-L_{t-1} (j))} + \sum_{j: L_{t-1}(j) \leq L_{t-1}(i)} e^{\eta(L_{t-1}(i)-L_{t-1} (j))}} \\
& \leq \frac{1}{\sum_{j \in \mathcal{B}_{t-1}} \mathds{1}_{\{L_{t-1}(j) \leq L_{t-1}(i)\}}} =\delta_i.
% \label{eq:dominance}
\end{align*}
Note that, for each $i \in \mathcal{B}_{t-1},$ $\delta_i$ takes a unique value from $\{1,\frac{1}{2},\ldots,\frac{1}{t^d}\}$. Specifically, $\delta_i = \frac{1}{k}$, if expert $i$ has the $k^\text{th}$ lowest cumulative loss, i.e., it is the $k^\text{th}$ best expert. Thus, $\delta_i$ only depends on the relation between the cumulative losses but not on their values.  In the following, we use $[k]$ to denote the expert with $k^\text{th}$ lowest cumulative loss.
We have
\begin{align*}
 Y_t = \frac{\sum_{i = n_{t-1} + 1}^{n_t}w_t(i)}{W_t} =\frac{\sum_{i = n_{t-1} + 1}^{n_t} e^{-\eta L_{t-1} (i)}}{\sum_{j \in \mathcal{B}_{t-1}} e^{-\eta L_{t-1} (j)}}. 
\end{align*}
Note that $e^{\eta (L_{t-1}(i)-L_{t-1}(j))} \geq 0$ for all $i$, $j$, and $L_{t-1}(i) \geq L_{t-1}(j)$ implies $e^{\eta (L_{t-1}(i)-L_{t-1}(j))} \geq 1$, since $\eta > 0$. Therefore,
\begin{align}
Y_t &= \sum_{i = n_{t-1} + 1}^{n_t}\frac{1}{\begin{matrix}
    \sum\limits_{\substack{j: L_{t-1}(j) > L_{t-1}(i)}} e^{\eta (L_{t-1}(i) - L_{t-1}(j))} 
     + \sum\limits_{\substack{j: L_{t-1}(j) \leq L_{t-1}(i)}} e^{\eta (L_{t-1}(i) - L_{t-1}(j))}
    \end{matrix}} \nonumber \\
&\leq \sum_{i = n_{t-1} + 1}^{n_t}\frac{1}{\sum_{j \in \mathcal{B}_{t-1}} \mathds{1}_{\{L_{t-1}(j) \leq L_{t-1}(i)\}}}.
\label{eq:dominance2}
\end{align}

In round $t$, we define a random variable $Z_t$ such that $Z_t=k^{-1}$ if $X_t$ falls in the $k^{\text{th}}$ best expert, i.e., if the expert $i$ containing $X_t$ satisfies $\sum_{j \in \mathcal{B}_{t-1}} \mathds{1}_{\{L_{t-1}(j) \leq L_{t-1}(i)\}} = k$.
From \eqref{eq:dominance2}, we have $Y_t \leq Z_t$, for all $t$. 
{\allowdisplaybreaks
Further, we have
\begin{align*}
     \P\left(\sum_{t=1}^T Y_t - \sum_{t=1}^T \E[Y_t] > \delta\right) & \leq \P\left(\sum_{t=1}^T Z_t - \sum_{t=1}^T \E[Y_t] > \delta\right)  \nonumber \\
    & \leq \P\left(\sum_{t=1}^T Z_t - \sum_{t=1}^T \E[Z_t] > \delta- \sum_{t=1}^T \E[Z_t]+ \sum_{t=1}^T \E\left[Y_t\right]\right). 
    % & \leq \P\sum Z_i - \sum \E[Z_i] > \delta') \text{ where $\delta'=\delta- \sum \E[Z_i]+ \sum \E[Y_i]$}  
\end{align*}
}

% \begin{align}
% \E[Z_t]=&\E[Z_t|X_1<X_2\ldots X_{t-1}]\P(X_1<X_2<\ldots X_{t-1})+\ldots\\
%     & + \E[Z_t|X_{t-1}<X_{t-2}\ldots X_1]\P(X_{t-1}<X_{t-2}<\ldots X_1)
% \end{align}
% We have for every ordering $\P(X_1<X_2<\ldots X_{t-1})=\dfrac{1}{(t-1)!}$ 
% \\
\noindent Using the same argument as the one used for $Y_t$, we have 
\begin{align}
    \E[Z_t] & = \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\delta(i) \nonumber \\
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\delta(i)\sum_{j \in \mathcal{B}_{t-1}}c_j(i) \nonumber \\ 
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\delta(i) C_i \leq \dfrac{2^d}{t}\sum_{i \in \mathcal{B}_{t-1}}\delta(i)\leq \dfrac{2^d}{t}d\left(1+\log{t} \right).\label{eq:Z_t}
\end{align}
Further using similar manipulations as $Y_t$, we get
\begin{align}
    & \E\bigg[Z_t^2\given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix}\bigg] \nonumber\\
    & =\sum_{j \in \mathcal{B}_{t-1}}\P\left(X_t \in \text{ expert } j \given[\big] \begin{matrix}\{Z^1_{m_1} < Z^1_{m_2} < \cdots < Z^1_{m_{t-1}}\},\ldots,& \\ \{Z^d_{j_1} < Z^d_{j_2} < \cdots < Z^d_{j_{t-1}}\} \end{matrix} \right)\left(\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\delta(i)\right)^2 \nonumber \\
    & = \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\left(\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\delta(i)\right)^2 \nonumber\\
    & \leq \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}c_j(i)^2\delta(i)^2 + \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}\sum_{k \in \mathcal{B}_{t-1}}2c_j(i)c_j(k)\delta(i)\delta(k) \nonumber\\ 
    & \leq \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\delta(i)^2\sum_{j \in \mathcal{B}_{t-1}}c_j(i)^2 + \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}\sum_{k \in \mathcal{B}_{t-1}}2c_j(i)c_j(k)\delta(i)\delta(k). \label{main}
\end{align}
\color{black}
Using the logic to compute $C_i$ where $2^k-1$ are the new experts being formed, we have the following: 
\begin{align*}
    D_i=\sum_{j \in \mathcal{B}_{t-1}}c_j(i)^2& = \sum_{k=1}^d \binom{d}{k}(2^k-1)^2(t-1)^{d-k} \\
    & = \sum_{k=1}^d \binom{d}{k}(2^{2k}-2\cdot 2^k+1)(t-1)^{d-k} \\
    & = \sum_{k=1}^d \binom{d}{k}4^k(t-1)^{d-k}-2\sum_{k=1}^d \binom{d}{k}2^k(t-1)^{d-k} + \sum_{k=1}^d \binom{d}{k}(t-1)^{d-k}\\
    % & \leq (t-1)^d \sum \binom{d}{k}\left(\frac{4}{t-1}\right)^k -2(t-1)^d \sum \binom{d}{k}\left(\frac{2}{t-1}\right)^k + (t-1)^d \sum \binom{d}{k}\left(\frac{1}{t-1}\right)^k\\
    & = (t+3)^d-(t+1)^d -((t+1)^d-t^d)\\
    & \leq (t+3)^d-(t+1)^d. 
\end{align*}
% \color{red}
Indeed $D_i$ is independent of $i$ as all the experts are split with equal probability.\\
Simplifying \eqref{main} using the fact $\sum_{i=1}^{t^d} \delta(i)^2 = \sum_{i=1}^{t^d}\dfrac{1}{i^2} \leq \dfrac{\pi^2}{6}$, we get 
\begin{align}
    \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\delta(i)^2 D_i \leq \dfrac{(t+3)^d-t^d}{t^d} \cdot \dfrac{\pi^2}{6} \leq \dfrac{5\cdot 2^d}{t}.
\label{p1}\end{align}
The second term in \eqref{main} can be simplified as follows: 
\begin{align}
   \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}\sum_{k \in \mathcal{B}_{t-1}}2c_j(i)c_j(k)\delta(i)\delta(k)& \leq \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}\sum_{k \in \mathcal{B}_{t-1}}2^{d+1}c_j(i)\delta(i)\delta(k)  \label{s1}\\
    &\leq \dfrac{2^{d+1}\left(d\log{t}+1\right)}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\delta(i) \label{s2} \\
    & \leq \dfrac{2^{2d+1}\left(d\log{t}+1\right)^2}{t},  
\label{p2}\end{align}
where \eqref{s1} follows as $c_j(k)\leq 2^d$ and \eqref{s2} follows using the fact that $\sum_{k \in \mathcal{B}_{t-1}}\delta(k) \leq (\log{t^d}+1)$. Further, \eqref{p2} follows from \eqref{eq:Z_t}.
\color{black}
% \begin{align}
%    \E[Z_t^2]& =3\left(\sum_{i \in \mathcal{B}_{t-1}} \frac{(t+3)^d-t^d}{t^d}\delta(i)^2\right) = 3\cdot \frac{(t+3)^d-t^d}{t^d} \sum_{i \in \mathcal{B}_{t-1}} \delta(i)^2 = 3\left(\left(1+\frac{3}{t}\right)^d-1\right)\sum_{i \in \mathcal{B}_{t-1}} \delta(i)^2 \\
%    & \leq \dfrac{3\cdot 2^d}{t} \dfrac{\pi^2}{6} \leq 5\dfrac{2^d}{t}
% \end{align}
\noindent Substituting \eqref{p1} and \eqref{p2} in \eqref{main}, we get 
\begin{align*}
    \sum_{t=1}^T\E[Z_t^2] & \leq \sum_{t=1}^T \left(\dfrac{5\cdot 2^d}{t}+ \dfrac{2^{2d+1}\left(d\log{t}+1\right)^2}{t}\right) \\ 
& \leq 5\cdot 2^d \left(1+\log{T}\right)+ d^2\cdot 2^{2d+3}\sum_{t=1}^{T}\dfrac{\log^2{t}}{t} \\
& \leq 5\cdot 2^d \left(1+\log{T}\right) + d^2\cdot 2^{2d+3}\left(1+\log^3{T}\right).
\end{align*}
To get \eqref{eq:bern)}, we use \eqref{eq:expZ}, \eqref{eq:expZ1}, and the fact that $\sum_{t = 1}^T\E\left[Y_t\right] \geq \log T$. Since the $Z_t$s are independent and are upper bounded by one, using Bernstein's inequality, we get
\begin{align}\label{eq:Bernstein}
    \P\left(\sum_{t=1}^T Z_t - \sum_{t=1}^T \E[Z_t] > \delta'\right) \leq e^{-\frac{\delta'^2/2}{V_n+\delta'/3}},
\end{align}
where $\displaystyle V_n=\sum_{t=1}^T \text{Var}(Z_t)$, and $\delta'=\delta - \sum_{t=1}^T \E[Z_t] +  \sum_{t=1}^T \E[Y_t] \geq \delta-c_1\log^2{T}$ for some constant $c_1$ depending on dimension $d$. We also have
$\sum_{t=1}^T \text{Var}(Z_t) \leq  c_2 \log^3{T}$ for some constant $c_2$ depending on dimension $d$.
\\
Choosing $\delta' = c_3 \log^2{T}$ results in $\delta=O(\log^3{T})$. Substituting $\delta'$ and $\sum_{t=1}^T \text{Var}(Z_t) \leq  c_2 \log^3{T}$
 in \eqref{eq:Bernstein}, 
\begin{align*}
    \P\left(\sum_{i=1}^T Y_i - \sum_{i=1}^T \E[Y_i] > \delta\right) & \leq e^{-c\log{T}}=O\left(\dfrac{1}{T^{c}}\right).
\end{align*}

%\input{Bernstein}

% \section{Randomized arrivals}
% Consider the image classification task where every image is characterized by a softmax $p_{t}$. Consider a bag where we $T$ images and we randomly sample from the bag without replacement. The idea behind doing this is we are trying to avoid the worst-case instance of where every time we split the best expert interval. By doing such sampling we avoid this. For the time being lets assume whenever a $p_{t}$ arrival comes we assign a loss either 0 or 1 to all the experts less than it and loss either 0 or 1 to all experts to right independently. So there are 4 possible combinations $(0,0), (0,1), (1,0), (1,1)$. The adversary can choose the loss for every image beforehand and we only have the power to sample an image randomly from the bag of image.
% \\\\
% Now for the sake of understanding the idea lets fix some loss structure for the adversary for eg whenever a $p_{t}$ arrival comes it gives loss $(1,0)$ to the experts which is gives 1 to all the experts to the left of it and 0 to the right. So if the arrival is $p_{1},p_{2},p_{3},....$ every time the best expert is split and regret becomes linear.

% Remember in the proof of the hedge algorithm :
% \begin{align}
%     \log{\frac{W_{t+1}}{W_{t}}}=\log{\frac{W_{t+1}}{\hat{W_{t}}}}+\log{\frac{\hat{W_{t}}}{W_{t}}}
% \end{align}
% where $\hat{W_{t}}=\sum_{i \in N_{t+1}}e^{-\eta L_{i}(t)}$ 
%  Define $X_{t}=\log{\frac{\hat{W_{t}}}{W_{t}}}$
%  $=\log{1+x} \leq x$
%  $= \frac{e^{-\eta L_{j}(t)}{\sum_{i \in N_{t+1}}e^{-\eta L_{i}(t)}}}$ where $j$ is the new expert at time $t$
%  $\frac{1}{\sum_{i \in N_{t+1}}e^{-\eta(L_{i}(t)-L_{j)(t)}}}$
%  $\frac{1}{k}$ if $j$ is the $k^{th}$ best expert at time t  

%  So there are $T!$ sequences We need to find the $\sum_{t=1}^{T} \E[X_{t}]= \sum_{t=1}^{T}\E[B_{i}]$ where $B_{i}$ is the event $p_{i}$ arrives. Out of $T!$ sequences pick an expert $i$ and calculate the expectation of $X_{t}$ when that $p_{i}$ comes.

%  For the loss structure above for experts born out of $p_{i}$ and lets say out of  $p_{1},p_{2}...p_{i-1}$ exactly $k$ have already arrived before we will always have $X_{t}=\frac{1}{k}$ for all such sequences.

%  Using the inversion logic for expert $i$ we will have $P(B_{i}=1/j)=1/i,\FORALL{j=1,2,...i}$.

%  hence $\E[B_{i}]=\frac{1}{i}\sum_{j=1}^{i} \frac{1}{j} \leq \frac{\log{i}}{i}$
%  $\sum_{t=1}^{T} \E[X_{t}]=\sum_{i=1}^{T}\frac{\log{i}}{i} \leq (\log{T})^{2}$

% \section{Partial information}
% Define potential function $\phi_t=\frac{1}{\eta} \log \sum_{i \in \mathcal{B}_t} \exp \left(-\eta L_i(T)\right)$, hence 
% \begin{align}
% \phi_T-\phi_0&=\sum_{t=1}^T \phi_t-\phi_{t-1} =\sum_{t=1}^T \frac{1}{\eta} \log \left(\frac{\sum_{i \in \mathcal{B}_t}  \exp \left(-\eta L_t(i)\right.}{\sum_{i \in \mathcal{B}_{t-1}} \exp -\eta L_{t-1}(i)}\right) \\
% & =\sum_{t=1}^T \frac{1}{\eta} \log \frac{W_t}{\hat{W_t}}+\frac{1}{\eta} \log \frac{\hat{W_t}}{W_{t-1}}  =\sum_{t=1}^T \frac{1}{\eta} \log \sum_{i \in \mathcal{B}_t}  w_t(i) \exp \left(-\eta l_t(i)\right)+\frac{1}{\eta} Y_t \\
% & \leq \sum_{t=1}^T \frac{1}{\eta} \log \left(\sum_{i \in \mathcal{B}_t} w_t(i)\left[1-\eta l_t(i)+\frac{\eta^2 l_t(i)^2}{2} \right]\right) +\sum_{t=1}^T\frac{1}{\eta} Y_t \quad 
% \end{align}
% Using \eqref{eq2:thm2} and $(e^{-x} \leqslant 1-x+\frac{x^2}{2})$, Further substituting $\phi_0=0$ and $(\log (1+x) \leq x)$ 
% \begin{align}
% \phi_T-\phi_0 & \leq \sum_{t=1}^T\left[-w_t \cdot l_t+\eta \sum_{i \in \mathcal{B}_t}  w_t(i) l_t(i)^2\right] +\frac{1}{\eta} \sum_{t=1}^T Y_t \quad  \\
% \phi_T + &\sum_{t=1}^T w_t \cdot l_t \leq \frac{1}{\eta} \sum_{t=1}^T Y_t+\eta \sum_{t=1}^T \sum_{i \in \mathcal{B}_t} w_t(i) l_t(i)^2 \\
% \sum_{t=1}^T w_t \cdot l_t-& L_T^* (i) \leq \phi_T+\sum_{t=1}^T w_t \cdot l_t \quad (\phi_{T} \geq 0 \geq -L_T (i))
% \end{align}
% Taking expectation on both the sides 
% \begin{align}
% \E \left[\sum_{t=1}^T w_t \hat{l}_t-L_T^*(i)\right] & \leq \frac{\log (T)}{\eta}+\eta \sum_{t=1}^T \sum_{i \in \mathcal{B}_t}\E \left[w_t(i) \cdot \hat{l}_t(i)^2\right] \\
% \E \left[\sum_{t=1}^T w_t \cdot l_t-L_T^*(i)\right] & \leq \frac{\log (T)}{\eta}+\frac{\eta}{\varepsilon} \sum_{t=1}^T \sum_{i \in \mathcal{B}_t} \E \left[w_t(i).\hat{l}_t(i)\right] \\
% \left(1-\frac{\eta T}{\varepsilon}\right)L_{T} & \leq \frac{\log (T)}{\eta}+L_T^* \label{eq1:partialinfo}
% \end{align}
% Substituting paramters $\varepsilon =\frac{m}{T} \quad \eta =\frac{\sqrt{2 m \log (T)}}{T}$ in we get \eqref{eq1:partialinfo}
% \begin{align}
% \left(1-\frac{\sqrt{2 \log (T)}}{\sqrt{m}}\right)L_T & \leq \sqrt{\frac{\log (T)}{2 m}} T+L_T^*(i) \\
% L_T-L_T^* & \leq T \sqrt{\frac{\log (T)}{m}}+\sqrt{\frac{2\log (T)}{m}}  L_T \leq (\sqrt{2}+1)T \sqrt{\frac{\log (T)}{m}} 
% \end{align}