\documentclass[12pt]{colt2024} %% Anonymized submission
      \usepackage{times}
      \def\P{{\mathbb P}}  %%%   appears in many equations  Prob
\def\E{{\mathbb E}} 
      \begin{document}
      \title{Proof of Lemma 3}

\section{Proof of Lemma~\ref{lem:expYt}}
In round $t-1$, let $H_t$ denote the set comprising the history of the losses of the experts and the sequence of arrivals $\mathbf{X}_{t-1}$. Given a realization of $H_t$, $Y_t$ takes $t^d$ possible values, each corresponding to $X_t$ belonging to one of the $t^d$ partitions. From Lemma~\ref{lem:1/t+1}, each such event has probability $1/t^d$. For $i,j \in \mathcal{B}_{t-1}$, let $c_j(i)$ denote the number of partitions of expert $i$ caused by sampling $X_t$ from expert $j$, and let $C_i = \sum_{j \in \mathcal{B}_{t-1}}c_j(i)$. By the law of total expectation over the possible orderings of the first $t-1$ arrivals, the expectation of $Y_t$ is given by

\begin{align}
        \E[Y_t]=&\E[Y_t \mid X_1<X_2<\cdots<X_{t-1}]\,\P(X_1<X_2<\cdots<X_{t-1})+\cdots\\
    & + \E[Y_t \mid X_{t-1}<X_{t-2}<\cdots<X_1]\,\P(X_{t-1}<X_{t-2}<\cdots<X_1).
\end{align}

For every ordering, we have $\P(X_1<X_2<\cdots<X_{t-1})=\dfrac{1}{(t-1)!}$.

Let $\phi(i) =  \frac{e^{-\eta L_{t-1}(i)}}{ \sum_{j \in \mathcal{B}_{t-1}}e^{-\eta L_{t-1} (j)}}$.

\begin{align}
    \E[Y_t] & =\sum_{j \in \mathcal{B}_{t-1}}\P(X_t \in j \mid X_1<X_2<\cdots<X_{t-1})\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\phi(i)\\
    & = \dfrac{1}{t^d}\sum_{j \in \mathcal{B}_{t-1}}\sum_{i \in \mathcal{B}_{t-1}}c_j(i)\phi(i)\\
    & \text{(as $\P(X_t \in j \mid X_1<X_2<\cdots<X_{t-1})=\dfrac{1}{t^d}$ for all $j \in \mathcal{B}_{t-1}$)} \nonumber\\
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\phi(i)\sum_{j \in \mathcal{B}_{t-1}}c_j(i)\\
    & = \dfrac{1}{t^d}\sum_{i \in \mathcal{B}_{t-1}}\phi(i)C_i. \label{eq1:lem:expYt}
\end{align}
\color{blue}
Let $\phi_i = \phi(i) = \frac{e^{-\eta L_{t-1}(i)}}{ \sum_{j \in \mathcal{B}_{t-1}}e^{-\eta L_{t-1} (j)}}$.
Note that, for $\eta \geq 0$, $e^{\eta(L_{t-1}(i)-L_{t-1}(j))} \geq 0$ for all $i$, $j$, and $L_{t-1}(i) \geq L_{t-1}(j)$ implies $e^{\eta(L_{t-1}(i)-L_{t-1}(j))} \geq 1$. Therefore, dividing the numerator and the denominator of $\phi_i$ by $e^{-\eta L_{t-1}(i)}$,
\begin{align}
\phi_i & =\frac{1}{\sum_{j: L_{t-1}(j)>L_{t-1}(i)} e^{\eta(L_{t-1}(i)-L_{t-1} (j))} + \sum_{j: L_{t-1}(j) \leq L_{t-1}(i)} e^{\eta(L_{t-1}(i)-L_{t-1} (j))}} \nonumber \\
& \leq \frac{1}{\sum_{j \in \mathcal{B}_{t-1}} \mathbbm{1}_{\{L_{t-1}(j) \leq L_{t-1}(i)\}}} =: \delta_i.
\label{eq:dominance}
\end{align}

Note that, for each $i \in \mathcal{B}_{t-1},$ $\delta_i$ takes a unique value from $\{1,\frac{1}{2},\ldots,\frac{1}{t^d}\}$. Specifically, $\delta_i = \frac{1}{k}$ if expert $i$ has the $k$th lowest cumulative loss, since exactly $k$ experts then have cumulative loss at most $L_{t-1}(i)$. Thus, $\delta_i$ depends only on the relative order of the cumulative losses and not on their values. In the following, we use $[k]$ to denote the expert with the $k$th lowest cumulative loss,
so that $\delta_{[k]} = \frac{1}{k}$ takes values from the set $\{1,\frac{1}{2},\ldots,\frac{1}{t^d}\}$.
\color{black}


% \begin{align*}
%     \E[Y_t\mid H_t] 
%     &= \sum_{j \in \mathcal{B}_{t-1}} \P(X_t \in j|\mathbf{X}_{t-1}) \sum_{i \in \mathcal{B}_{t-1}} c_j(i)\phi_i \nonumber \\
%     &\leq \sum_{j \in \mathcal{B}_{t-1}} \P(X_t \in j|\mathbf{X}_{t-1}) \sum_{k \in \mathcal{B}_{t-1}} c_j(i) \delta_i \\
%     &= \sum_{j \in \mathcal{B}_{t-1}} \P(X_t \in j|\mathbf{X}_{t-1}) \sum_{k \in \mathcal{B}_{t-1}} \frac{c_j([k])}{t}
% \end{align*}

% Proof for $d = 1$.
% \begin{align*}
%     \E[Y_t\mid H_t] 
%     &= \sum_{j \in \mathcal{B}_{t-1}} \P(X_t \in j|\mathbf{X}_{t-1}) \phi_j \nonumber \\
%     &\leq \sum_{j \in \mathcal{B}_{t-1}} \P(X_t \in j|\mathbf{X}_{t-1}) \delta_i \\
%     &= \sum_{k \in \mathcal{B}_{t-1}} \P(X_t \in [k]|\mathbf{X}_{t-1}) \frac{1}{k}
% \end{align*}
% Note that the RHS in the above equation is independent of the sequences of losses. Therefore,
% \begin{align*}
%     \E[Y_t] &= \E_{H_t} [\E[Y_t\mid H_t]]\\
%     &\leq \sum_{k \in \mathcal{B}_{t-1}}\E_{\mathbf{X}_{t-1}}[\P(X_t \in [k]|\mathbf{X}_{t-1})] \frac{1}{k}\\
%     &= \sum_{k \in \mathcal{B}_{t-1}} \frac{1}{tk}\\
%     &\leq \frac{\log t + 1}{k}.    
% \end{align*}

% $\sum_{i \in \mathcal{B}_{t-1}}\delta_i \leq \sum_{i=1}^{|\mathcal{B}_{t-1}|}\frac{1}{i}\leq \log{t^d}+1\leq d\log{t}+1\leq d\log{T}+1$
% \begin{align*}
%     \E[Y_t|X_1,X_2,\ldots,X_{t-1}] & \leq \sum_{i \in \mathcal{B}_{t-1}} \P(X_t \in i|X_1,X_2,\ldots,X_{t-1}) C_i \delta_i
%     % \\
%     % & \leq \sum_{i \in \mathcal{B}_{t-1}} C_i \P(X_t \in i) \delta_i  \leq  \sum_{i \in \mathcal{B}_{t-1}} \frac{(t+1)^d-t^d}{t^d}\delta_i \\
%     % & \leq \frac{(t+1)^d-t^d}{t^d} \sum_{i \in \mathcal{B}_{t-1}} \delta_i \leq d(d\log{T}+1)
% \end{align*}

% \color{black}

Note that $C_i$ is the total number of partitions of expert $i$ created due to sampling $X_t$ from all $t^d$ experts. We compute $C_i$ using the following counting argument. We say an expert $i$ \emph{shares} $k$ hyperplanes with expert $j$ if, for any point in $j$, exactly $k$ out of the $d$ orthogonal hyperplanes (parallel to the faces of $\mathbb{B}$) that pass through that point will partition expert $i$. We compute the number of experts that share exactly $k$ hyperplanes with $i$ as follows. Choose any $k$ dimensions from $d$ in $\binom{d}{k}$ possible ways. Further, choose any orthogonal hyperplane passing through $i$ that is parallel to some dimension from the remaining $d-k$ dimensions. There will be $t-1$ basis hyperplanes, i.e., the hyperplanes that partitioned $\mathbb{B}$ by passing through the $t-1$ points drawn by the environment, that are parallel to the chosen hyperplane and do not partition $i$. The $(t-1)^{d-k}$ partitions, which are formed by the intersection of the $t-1$ basis hyperplanes corresponding to each of the $d-k$ dimensions, do not share exactly $d-k$ hyperplanes with $i$, or, equivalently, they share exactly $k$ hyperplanes with $i$. Therefore, the total number of experts that share exactly $k$ hyperplanes with $i$ is $\binom{d}{k}(t-1)^{d-k}$, and each point drawn from those experts will result in $2^k$ partitions of expert $i$. Since index $i$ will be assigned to one of its children (sub-partitions), we have $2^k - 1$ new experts from partitioning $i$. Hence,

\begin{align*}
    C_i& = \sum_{k=1}^d \binom{d}{k}(2^k-1)(t-1)^{d-k} = \sum_{k=0}^d \binom{d}{k}(2^k-1)(t-1)^{d-k} \\
    & =(t-1)^d \sum_{k=0}^d \binom{d}{k}\left(\frac{2}{t-1}\right)^k - \sum_{k=0}^d \binom{d}{k}(t-1)^{d-k} \\
    & = (t-1)^d \left(\frac{t+1}{t-1}\right)^d - t^d = (t+1)^d - t^d.
\end{align*}


Indeed $C_i$ is independent of $i$ and is equal to the total number of new experts revealed in slot $t$.
%and it turns out to be equal to the total number of new experts revealed in slot $t$. This can be attributed to the fact that additional weights in $\hat{W}_t$ is $(t+1)^d - t^d$. 
Substituting $C_i = (t+1)^d - t^d$ in \eqref{eq1:lem:expYt}, we obtain


\begin{align*}
   \E[Y_t]=\sum_{i \in \mathcal{B}_{t-1}} \frac{(t+1)^d-t^d}{t^d}\phi(i) = \frac{(t+1)^d-t^d}{t^d} \sum_{i \in \mathcal{B}_{t-1}} \phi(i) = \left(1+\frac{1}{t}\right)^d-1.
\end{align*}

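% NOTE(review): e^{d/x} is not integrable near x = 0, so the integral below with lower limit 0 diverges;
% the t = 1 term alone equals 2^d - 1, which can exceed 2d\log T for small T. The series expansion of
% \mathrm{Ei}(d/T) also contributes a -d\log d term that is not shown. The expansion is the large-T
% asymptotic form; the lower limit and the final constant should be re-checked.
\begin{align*}
    \sum_{t=1}^T \left(\left(1+\frac{1}{t}\right)^d-1\right) & \leq  \sum_{t=1}^T \left(e^{\frac{d}{t}}-1\right) \leq \int_0^{T}\left(e^{\frac{d}{x}}-1\right) dx=T\left(e^{\frac{d}{T}}-1\right)-d\,\mathrm{Ei}\left(\dfrac{d}{T}\right)\\
    & =\dfrac{1}{2}\left(2d\log{T}-2d\gamma+2d\right)-\dfrac{d^2}{2T}-\dfrac{d^3}{12T^2}-\dfrac{d^4}{72T^3}+O\left(\dfrac{1}{T^4}\right)\\
    & \leq 2d\log{T}.
\end{align*}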

\begin{align*}
    \sum_{t=1}^T \E[Y_t] \leq \sum_{t=1}^{T}\left(\left(1+\frac{1}{t}\right)^d-1\right) \leq 2d\log{T}.
\end{align*}
\end{document}