\section{Regret Analysis} \label{sec:regret_analysis}
In this section we provide proofs for the theorems in Sec.~\ref{ss:analysis}.

Following Eq.~\ref{eq:multiplicative_update}, at every step a probability distribution $\pi_t$ is induced over the coordinates:
\begin{align}
    \pi_{t,i} = \frac{w_{t,i}}{W_t} \quad \text{where} \quad W_t = \sum_{i=1}^D w_{t,i}
\end{align}
The probability $\hat{\pi}_{t,\mathcal{I}_t}$ of selecting a certain coordinate block $\mathcal{I}_t\subset \mathcal{I}=\{1,\cdots,D\}$ follows sampling according to $\pi_t$ without replacement, such that,
\begin{align}
    \hat{\pi}_{t,\mathcal{I}_t} 
    &= \sum_{p\in perm(\mathcal{I}_t)} \prod_{k\in p}{\frac{\pi_{t,k}}{1-\sum_{j\in p_{1:k}}\pi_{t,j}}} 
    \\&= \label{eq:common_numerator}
    \left(\prod_{i\in \mathcal{I}_t}\pi_{t,i}\right) \cdot \left(\sum_{p\in perm(\mathcal{I}_t)} \prod_{k\in p}\left(1-\sum_{j\in p_{1:k}}\pi_{t,j}\right)^{-1}\right)
    =
    \mathcal{P}(\mathcal{I}_t) \cdot \mathcal{R}(\mathcal{I}_t)
\end{align}
where $perm(\mathcal{I}_t)$ is the set of all permutations of $\mathcal{I}_t$ and $p_{1:k}$ denotes the first $k$ coordinates in the permutation $p$. Eq.~\ref{eq:common_numerator} is due to the common numerator of all permutations, where the left term $\mathcal{P}(\mathcal{I}_t)$ corresponds to the probability of sampling a subset of coordinates with replacement, and the right term $\mathcal{R}(\mathcal{I}_t)$ is associated with sampling without replacement.

% Denote the set of all possible coordinate blocks of all sizes $c\in\mathcal{C}$ by $\mathcal{S}$ and its size $|\mathcal{S}|=\prod_{c\in\mathcal C} {D \choose c}$.

% The expected cumulative loss following our update policy is
% $$L_T= \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i}$$

% Assume the best coordinate block is $C^*$ and the corresponding cumulative loss:
% $$L_T^*= \sum_{t=1}^T L_{t,\mathcal{I}_t}=\sum_{t=1}^T\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} = \sum_{t=1}^T\bar{\ell}_{t,\mathcal{I}_t}$$

% We hence aim at bounding the regret $Regret_T = L_T-L_T^*$ 
% For this purpose we bound the regret with respect to any arbitrary sequence of selected coordinate blocks.

% \textbf{Lemma 1} For $\eta >0$ and assume all losses $\ell_t,i \geq 0$ for all $t\in\{1,\cdots,T\}$ and $i\in \mathcal{I}$ the update rule in \ref{eq:multiplicative_weight_update} satisfies for any block of coordinates $C^*$:
% \begin{equation}\label{eq:lemma_1}
% % Regret_T=L_T-L^*_T =
% \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} - \sum_{t=1}^T \ell^*_t
% \leq \eta \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 + \frac{\log(D)}{\eta}
% \end{equation}

In addition, let us first modify the losses in Eq.~\ref{eq:alpha_beta_loss} to be non-negative by adding the same constant $\log(\tilde{\alpha})$ to all possible values:
\begin{align} \label{eq:alpha_beta_loss_shifted}
    \tilde{\ell}_{t,i} =
    \begin{cases}
		0 & \text{if } i \in C_t \text{ and } y_t > M_{t-1} \\
		\log(\tilde{\alpha}\tilde{\beta}) & \text{if } i \in C_t \text{ and } y_t \leq M_{t-1} \\
        \log(\tilde{\alpha}) & \text{if } i \notin C_t
    \end{cases}
\end{align}
This modification does not change the resulting distribution $\pi_t$ induced over the coordinates, as it is invariant to shifts of the losses:
\begin{align}
\notag
\pi_{t,i} = \frac{w_{t,i}}{W_t} &= \frac{e^{-\eta\sum_{\tau=1}^t\tilde{\ell}_{\tau,i}}}{\sum_{j=1}^D e^{-\eta\sum_{\tau=1}^t\tilde{\ell}_{\tau,j}}} =
\frac{e^{-\eta\sum_{\tau=1}^t(\ell_{\tau,i}+\log(\tilde{\alpha}))}}{\sum_{j=1}^D e^{-\eta\sum_{\tau=1}^t(\ell_{\tau,j}+\log(\tilde{\alpha}))}}
=
\frac{e^{-\eta t\log(\tilde{\alpha})}e^{-\eta\sum_{\tau=1}^t\ell_{\tau,i}}}{e^{-\eta t\log(\tilde{\alpha})}\sum_{j=1}^D e^{-\eta\sum_{\tau=1}^t\ell_{\tau,j}}}
\\&=
\frac{e^{-\eta\sum_{\tau=1}^t\ell_{\tau,i}}}{\sum_{j=1}^D e^{-\eta\sum_{\tau=1}^t\ell_{\tau,j}}}
\end{align}
Thus, for simplicity, we refer to $\tilde{\ell}$ as $\ell$ in the following.

% \textbf{Proof} 
\subsection{Proof of Lemma 1}
Starting with a uniform distribution over the coordinates, $w_{0,i}\equiv \frac{1}{D}$, such that $W_0 = 1$, we have:
\begin{align}
    W_{t+1} &= \sum_{i\in \mathcal{I}} w_{t+1,i} 
    \\ & \leq \label{eq:sum_of_products}
    \sum_{\mathcal{I}_t\in \mathcal{S}}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
    \\&=
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}W_t^{-1}\prod_{i\in\mathcal{I}_t}w_{t,i}e^{-\eta\ell_{t,i}} 
    \\&\leq \label{eq:sum_leq_1}
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}W_t^{-|\mathcal{I}_t|}\prod_{i\in\mathcal{I}_t}w_{t,i}e^{-\eta\ell_{t,i}} \cdot|perm(\mathcal{I}_t)|
    \\&=
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}\prod_{i\in\mathcal{I}_t}\frac{w_{t,i}}{W_t}e^{-\eta\ell_{t,i}}\cdot \sum_{p\in perm(\mathcal{I}_t)}1 
    \\&=
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}\prod_{i\in\mathcal{I}_t}\pi_{t,i}e^{-\eta\ell_{t,i}}\cdot \sum_{p\in perm(\mathcal{I}_t)}\prod_{k\in p}1 
    \\&\leq 
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}e^{-\eta\sum_{i\in\mathcal{I}_t}\ell_{t,i}}\prod_{i\in\mathcal{I}_t}\pi_{t,i}\cdot\sum_{p\in perm(\mathcal{I}_t)}\prod_{k\in p}\left(1-\sum_{j\in p_{1:k}}\pi_{t,j}\right)^{-1}
    \\&= \label{eq:back_to_pi_hat}
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}e^{-\eta\sum_{i\in\mathcal{I}_t}\ell_{t,i}}
    \\&\leq 
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}e^{-\frac{\eta}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}}
    \\&\leq \label{eq:exp_ineq}
    W_t\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\left(1-\frac{\eta}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}+\eta^2\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2\right)
    \\&= \label{eq:pi_hat_sums_to_1}
    W_t\left(|\mathcal{C}|+\sum_{\mathcal{I}_t\in \mathcal{S}}\eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)
    \\&=
    W_t\left(1+ (|\mathcal{C}|-1)+\sum_{\mathcal{I}_t\in \mathcal{S}}\eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)
    \\&\leq \label{eq:exp_ineq_2}
    W_t e^{\sum_{\mathcal{I}_t\in \mathcal{S}} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}+(|\mathcal{C}|-1)} 
\end{align}
where
\begin{itemize}
    \item (\ref{eq:sum_of_products}) is since $\mathcal{C}$ always contains a block size of $1$, i.e. $1\in\mathcal{C}$ and thus: 
    \begin{align*}
        \sum_{\mathcal{I}_t\in \mathcal{S}}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
        &=
        \sum_{c\in\mathcal{C}}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
        \\ &=
        \sum_{\mathcal{I}_t\in\mathcal{S}_1}\prod_{i\in\mathcal{I}_t}w_{t+1,i}+\sum_{c\in\mathcal{C}\setminus \{1\}}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
        \\&=
        \sum_{i\in \mathcal{I}} w_{t+1,i} + \sum_{c\in\mathcal{C}\setminus \{1\}}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i} 
        \\ & \geq 
        \sum_{i\in \mathcal{I}} w_{t+1,i}
    \end{align*}
    \item (\ref{eq:sum_leq_1}) holds since $W_0=1$ and $W_t$ is monotonically non-increasing following the update rule (\ref{eq:multiplicative_weight_update}) with non-negative losses, thus $W_t\leq 1$ for all $t$, so that $W_t^{-1}\leq W_t^{-|\mathcal{I}_t|}$, and since $|perm(\mathcal{I}_t)|\geq 1$.
    \item (\ref{eq:back_to_pi_hat}) follows from (\ref{eq:common_numerator}).
    \item (\ref{eq:exp_ineq}) holds since $e^{-x}\leq 1-x+x^2$ for all $x\geq 0$.
    \item (\ref{eq:pi_hat_sums_to_1}) holds since $\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}=\sum_{c\in\mathcal{C}}\sum_{\mathcal{I}_t\in\mathcal{S}_c }\hat{\pi}_{t,\mathcal{I}_t}=\sum_{c\in\mathcal{C}}1=|\mathcal{C}|$.
    \item (\ref{eq:exp_ineq_2}) holds since $1+x\leq e^x$.
\end{itemize}

Given that the sum of weights of a certain coordinate block $C^*$ is less than the total sum of weights, together with Eq.~\ref{eq:exp_ineq_2}, $w_{0,i}\equiv\frac{1}{D}$ and $W_0=1$ we have:
\begin{align}
   \frac{1}{D}\sum_{i\in C^*}e^{-\eta\sum_{t=1}^T\ell_{t,i}}
   = \sum_{i\in C^*} w_{T,i} \leq W_T 
   \leq 
    e^{\sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i} + (|\mathcal{C}|-1)} 
\end{align}
Taking the $\log$ of both sides, we have:
\begin{align} \label{eq:recursive_ineq}
  \log&\left(\sum_{i\in C^*}e^{-\eta\sum_{t=1}^T\ell_{t,i}}\right) 
  -\log(D) 
  \notag \\ &\leq \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i} 
  + T(|\mathcal{C}|-1)
\end{align}
When the same block $C^*$ is followed at every time step, all of its coordinates suffer the same loss
 $\ell^*_t$ at each step, as follows from Eq.~\ref{eq:alpha_beta_loss}; hence:
 \begin{align} \notag
     \log\left(\sum_{i\in C^*}e^{-\eta\sum_{t=1}^T\ell_{t,i}}\right)
     &=
     \log\left(\sum_{i\in C^*}e^{-\eta\sum_{t=1}^T\ell^*_t}\right)
     \\ & = \notag
     \log\left(|C^*|e^{-\eta\sum_{t=1}^T\ell^*_t}\right)
     \\&=  \notag
     \log(|C^*|) -\eta\sum_{t=1}^T\ell^*_t
     \\ &\geq \label{eq:common_loss_ineq}
     -\eta\sum_{t=1}^T\ell^*_t
 \end{align}
 Thus Eq.~\ref{eq:recursive_ineq} and \ref{eq:common_loss_ineq}, together with the fact that $-\eta\sum_{t=1}^T\ell^*_t-\log(D)\leq 0$ and $|\mathcal{C}|\geq 1$ (so that multiplying this non-positive quantity by $|\mathcal{C}|$ can only decrease it), yield:
 \begin{align}
     -\eta|\mathcal{C}|\sum_{t=1}^T\ell^*_t 
     -|\mathcal{C}|\log(D) 
     \leq \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}
     + T(|\mathcal{C}|-1)
 \end{align}
 And since $\sum_{\mathcal{I}_t\in\mathcal{S}}$ is equivalent to $\sum_{c\in\mathcal{C}}\sum_{\mathcal{I}_t\in\mathcal{S}_c}$, dividing both sides by $\eta|\mathcal{C}|$ and rearranging, we have:
 \begin{align}
    \sum_{t=1}^T\sum_{c\in\mathcal{C}}&\frac{1}{|\mathcal{C}|}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} - \sum_{t=1}^T \ell^*_t
    \notag \\ & \leq 
    \eta \sum_{t=1}^T\sum_{c\in\mathcal{C}}\frac{1}{|\mathcal{C}|}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 + \frac{\log(D)+T(1-1/|\mathcal{C}|)}{\eta} 
 \end{align}
 
 
%  \textbf{Theorem 1}  Apply the update rule in \ref{eq:multiplicative_weight_update}, with a modified $\eta=\log(\alpha\beta)^{-1}\sqrt{\frac{\log(D)}{T}}$, then:
%  \begin{equation*}
%      Regret_t = \mathcal{O}(\log(\alpha\beta)\sqrt{T\log(D)})
%  \end{equation*}
 
%  \textbf{Proof} 
\subsection{Proof of Theorem 1}
 Since $\ell_{t,i} \leq \log(\tilde{\alpha}\tilde{\beta})$ then: 
 \begin{align}
     \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2
     \leq
     \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\log(\tilde{\alpha}\tilde{\beta})\right)^2
     \leq
     \log(\tilde{\alpha}\tilde{\beta})^2
 \end{align}
 
 Thus since $\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}=|\mathcal{C}|$ we have,
 \begin{align}
     \sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2
     \leq
     \sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\log(\tilde{\alpha}\tilde{\beta})^2
     =
     |\mathcal{C}|\log(\tilde{\alpha}\tilde{\beta})^2
 \end{align}
 And Eq.~\ref{eq:lemma_1} yields,
 \begin{equation}
     Regret_t \leq \eta T \log(\tilde{\alpha}\tilde{\beta})^2 
     + \frac{\log(D)+T(1-1/|\mathcal{C}|)}{\eta} 
    %  = 2 \log(\tilde{\alpha}\tilde{\beta})\sqrt{T\log(D)}
 \end{equation}