\subsection{Regret analysis for sampling coordinates without replacement} \label{sec:regret_analysis_without_replacement}
Denote by $p_c$ the probability of choosing a certain block size $c\in\mathcal{C}$, such that $p_c> 0$ and $\sum_{c\in\mathcal{C}}p_c=1$, e.g., for a uniform sampling of the block size $p_c=1/|\mathcal{C}|$ for all $c\in\mathcal{C}$.

The probability $\hat{\pi}_{t,\mathcal{I}_t}$ of selecting a certain coordinate block $\mathcal{I}_t\subset \mathcal{I}=\{1,\cdots,D\}$ of size $|\mathcal{I}_t|=c\in\mathcal{C}$ follows sampling according to $\pi_t$ (Eq.~(2)) without replacement, such that
\begin{align}
    \hat{\pi}_{t,\mathcal{I}_t} 
    &= \sum_{p\in perm(\mathcal{I}_t)} \prod_{k\in p}{\frac{\pi_{t,k}}{1-\sum_{j\in p_{1:k}}\pi_{t,j}}} \nonumber
    \\&= \label{eq:common_numerator}
    \left(\prod_{i\in \mathcal{I}_t}\pi_{t,i}\right) \cdot \left(\sum_{p\in perm(\mathcal{I}_t)} \prod_{k\in p}\left(1-\sum_{j\in p_{1:k}}\pi_{t,j}\right)^{-1}\right)
    =
    \mathcal{P}(\mathcal{I}_t) \cdot \mathcal{R}(\mathcal{I}_t)
\end{align}
where $perm(\mathcal{I}_t)$ denotes the set of all permutations of $\mathcal{I}_t$ and $p_{1:k}$ are the first $k$ coordinates in the permutation $p$. Eq.~(\ref{eq:common_numerator}) holds due to the common numerator of all permutations, where the left term $\mathcal{P}(\mathcal{I}_t)$ corresponds to the probability of sampling a subset of coordinates with replacement, and the right term $\mathcal{R}(\mathcal{I}_t)$ is associated with sampling without replacement. Of course, summing over the set $\mathcal{S}_c$ of all possible blocks of size $c$ results in $\sum_{\mathcal{I}_t\in\mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t} = 1$ for all $c\in\mathcal{C}$.

Thus $\tilde{\pi}_{t,\mathcal{I}_t}=p_c\cdot\hat{\pi}_{t,\mathcal{I}_t}$, and the probabilities of sampling the blocks of coordinates of all sizes sum up to $1$ as well:
\begin{equation}\label{eq:prob_sum_to_1}
    \sum_{c\in\mathcal{C}}\sum_{\mathcal{I}_t\in\mathcal{S}_c}\tilde{\pi}_{t,\mathcal{I}_t} 
    =
    \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in\mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t} 
    = 
    \sum_{c\in\mathcal{C}}p_c
    = 
    1
\end{equation}

% Denote the set of all possible coordinate blocks of all sizes $c\in\mathcal{C}$ by $\mathcal{S}$ and its size $|\mathcal{S}|=\prod_{c\in\mathcal C} {D \choose c}$.

% The expected cumulative loss following our update policy is
% $$L_T= \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i}$$

% Assume the best coordinate block is $\mathcal{I}^*$ and the corresponding cumulative loss:
% $$L_T^*= \sum_{t=1}^T L_{t,\mathcal{I}_t}=\sum_{t=1}^T\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} = \sum_{t=1}^T\bar{\ell}_{t,\mathcal{I}_t}$$

% We hence aim at bounding the regret $Regret_T = L_T-L_T^*$ 
% For this purpose we bound the regret with respect to any arbitrary sequence of selected coordinate blocks.

% \textbf{Lemma 1} For $\eta >0$ and assume all losses $\ell_t,i \geq 0$ for all $t\in\{1,\cdots,T\}$ and $i\in \mathcal{I}$ the update rule in \ref{eq:multiplicative_weight_update} satisfies for any block of coordinates $\mathcal{I}^*$:
% \begin{equation}\label{eq:lemma_1}
% % Regret_T=L_T-L^*_T =
% \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} - \sum_{t=1}^T \ell^*_t
% \leq \eta \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 + \frac{\log(D)}{\eta}
% \end{equation}

% \textbf{Proof} 
%\subsubsection{Proof of Lemma~\ref{lem:regret_without_replacement}}
\begin{lemma} \label{lem:regret_without_replacement}
 Sample a block size $c\in\mathcal{C}$ with probability $p_c>0$ and $c$ coordinates without replacement according to $\pi_t$. Assume $\mathcal{C}\supset\{1\}$, $\eta >0$ and non-negative losses $\ell_{t,i}\geq 0$. Then the update rule in (3) satisfies for any block of coordinates $\mathcal{I}^*$:
\begin{align}\label{eq:lemma_1}
% Regret_T
% =L_T-L^*_T 
% =
\sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot&\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} 
 -\sum_{t=1}^T\frac{1}{|\mathcal{I}^*|}\sum_{i\in \mathcal{I}^*}\ell_{t,i}
% \sum_{t=1}^T\sum_{\mathcal{I}_t\in \mathcal{S}}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} - \sum_{t=1}^T \ell^*_t
%  \sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot\frac{1}{|\mathcal{I}_t|} \sum_{i\in\mathcal{I}_t}\ell_{t,i} - \sum_{t=1}^T \ell^*_t
\notag \\ & \leq 
\eta\sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 + \frac{\log(D)}{\eta} -\frac{T\log(p_1)}{\eta}
\end{align}
\end{lemma}

%\begin{proof}
\noindent \textit{Proof}: 
Starting with a uniform distribution over the coordinates, $w_{0,i}\equiv \frac{1}{D}$, such that $W_0 = 1$, we have:
\begin{align}
    p_1\cdot W_{t+1} &= p_1\cdot\sum_{i\in \mathcal{I}} w_{t+1,i} 
    \nonumber\\ & \leq \label{eq:sum_of_products}
     \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
    \\&=
    W_t\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}W_t^{-1}\prod_{i\in\mathcal{I}_t}w_{t,i}e^{-\eta\ell_{t,i}} 
    \nonumber \\&\leq \label{eq:sum_leq_1}
    W_t\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}W_t^{-|\mathcal{I}_t|}\prod_{i\in\mathcal{I}_t}w_{t,i}e^{-\eta\ell_{t,i}} \cdot|perm(\mathcal{I}_t)|
    \\&=
    W_t \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\prod_{i\in\mathcal{I}_t}\frac{w_{t,i}}{W_t}e^{-\eta\ell_{t,i}}\cdot \sum_{p\in perm(\mathcal{I}_t)}1 
    \nonumber\\
    &=
    W_t\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\prod_{i\in\mathcal{I}_t}\pi_{t,i}e^{-\eta\ell_{t,i}}\cdot \sum_{p\in perm(\mathcal{I}_t)}\prod_{k\in p}1 
    \nonumber\\
    %      \end{align}
    %  \begin{align}
    &\leq 
    W_t\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}e^{-\eta\sum_{i\in\mathcal{I}_t}\ell_{t,i}}\prod_{i\in\mathcal{I}_t}\pi_{t,i}\cdot\sum_{p\in perm(\mathcal{I}_t)}\prod_{k\in p}\left(1-\sum_{j\in p_{1:k}}\pi_{t,j}\right)^{-1}
    \nonumber
    \\&= \label{eq:back_to_pi_hat}
    W_t \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}e^{-\eta\sum_{i\in\mathcal{I}_t}\ell_{t,i}}
    \\&\leq 
    W_t \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}e^{-\frac{\eta}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}}
    \nonumber 
    \\&\leq \label{eq:exp_ineq}
    W_t\sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\left(1-\frac{\eta}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}+\eta^2\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2\right)
    \\&\leq \label{eq:pi_hat_sums_to_1}
    W_t\left(1+
    \sum_{c\in\mathcal{C}}p_c\cdot\left(\sum_{\mathcal{I}_t\in \mathcal{S}_c} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)
    \right)
    \\&\leq \label{eq:exp_ineq_2}
    W_t e^{
    \sum_{c\in\mathcal{C}}p_c\cdot\left(\sum_{\mathcal{I}_t\in \mathcal{S}_c} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)
    } 
\end{align}
where
\begin{itemize}
     \item (\ref{eq:sum_of_products}) holds since $\mathcal{C}\supset\{1\}$ always contains a block size of $1$ and thus
    \begin{align*}
        \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
        &=
        p_1\sum_{\mathcal{I}_t\in\mathcal{S}_1}\prod_{i\in\mathcal{I}_t}w_{t+1,i}+\sum_{c\in\mathcal{C}\setminus \{1\}}p_c\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i}
        \\&=
        p_1\sum_{i\in \mathcal{I}} w_{t+1,i} + \sum_{c\in\mathcal{C}\setminus \{1\}}p_c\sum_{\mathcal{I}_t\in\mathcal{S}_c}\prod_{i\in\mathcal{I}_t}w_{t+1,i} 
       % \\ & 
       \geq 
        p_1\sum_{i\in \mathcal{I}} w_{t+1,i}
    \end{align*}
    \item (\ref{eq:sum_leq_1}) holds since $W_0=1$ and $W_t$ is monotonically non-increasing following the update rule (3) with non-negative losses, thus $W_t\leq 1$ for all $t$, and hence $W_t^{-1}\leq W_t^{-|\mathcal{I}_t|}$ with $|perm(\mathcal{I}_t)|\geq 1$; (\ref{eq:back_to_pi_hat}) follows from (\ref{eq:common_numerator}); (\ref{eq:exp_ineq}) holds since $e^{-x}\leq 1-x+x^2$ for $x\geq 0$; (\ref{eq:pi_hat_sums_to_1}) holds due to Eq.~(\ref{eq:prob_sum_to_1}); (\ref{eq:exp_ineq_2}) holds since $1+x\leq e^x$.
\end{itemize}

Given that the sum of weights of a certain coordinate block $\mathcal{I}^*$ is at most the total sum of weights, together with Eq.~(\ref{eq:exp_ineq_2}), $w_{0,i}\equiv\frac{1}{D}$ and $W_0=1$, we have
\begin{align}
   \frac{1}{D}\sum_{i\in \mathcal{I}^*} & e^{-\eta\sum_{t=1}^T\ell_{t,i}}
   = \sum_{i\in \mathcal{I}^*} w_{T,i} \leq W_T 
   \notag \\ &
   \leq 
    p_1^{-T} e^{
  \sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\cdot\left(\sum_{\mathcal{I}_t\in \mathcal{S}_c} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right) 
  },\nonumber
  \end{align}
Taking the $\log$ of both sides, we have
\begin{align} \label{eq:recursive_ineq}
  \log&\left(\sum_{i\in \mathcal{I}^*}e^{-\eta\sum_{t=1}^T\ell_{t,i}}\right) -\log(D) 
  \notag \\ 
  & \leq
  \sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\cdot\left(\sum_{\mathcal{I}_t\in \mathcal{S}_c} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right) - T\log(p_1)
  \end{align}
For any fixed block $\mathcal{I}^*$, all the participating coordinates suffer the same loss 
 $\ell^*_t$ at every time step as follows from Eq.~(3), %\ref{eq:alpha_beta_loss}, 
 hence
 \begin{align} 
%  \notag
     \log\left(\sum_{i\in \mathcal{I}^*}e^{-\eta\sum_{t=1}^T\ell_{t,i}}\right)
     =
     \log\left(\sum_{i\in \mathcal{I}^*}e^{-\eta\sum_{t=1}^T\ell^*_t}\right)
     &= 
     \notag
     \log\left(|\mathcal{I}^*|e^{-\eta\sum_{t=1}^T\ell^*_t}\right)
     \\&
     =  %\notag
     \log(|\mathcal{I}^*|) -\eta\sum_{t=1}^T\ell^*_t
     \geq %\label{eq:common_loss_ineq}
     -\eta\sum_{t=1}^T\ell^*_t, \nonumber
 \end{align}
 which, together with Eq.~(\ref{eq:recursive_ineq}),  yields
 %and (\ref{eq:common_loss_ineq}) yield
 \begin{align}
     -\eta\sum_{t=1}^T\ell^*_t &-\log(D) 
     \notag \\ & 
     \leq 
     \sum_{t=1}^T\sum_{c\in\mathcal{C}}p_c\cdot\left(\sum_{\mathcal{I}_t\in \mathcal{S}_c} \eta^2\hat{\pi}_{t,\mathcal{I}_t}\left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2 -\frac{\eta}{|\mathcal{I}_t|}\hat{\pi}_{t,\mathcal{I}_t}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right) -T\log(p_1), \nonumber
 \end{align}
which finishes the proof.
 %\end{proof}
 
%  \textbf{Theorem 1}  Apply the update rule in \ref{eq:multiplicative_weight_update}, with a modified $\eta=\log(\alpha\beta)^{-1}\sqrt{\frac{\log(D)}{T}}$, then:
%  \begin{equation*}
%      Regret_t = \mathcal{O}(\log(\alpha\beta)\sqrt{T\log(D)})
%  \end{equation*}
 
%  \textbf{Proof} 
\noindent \textbf{Proof of Theorem~2:}%\ref{theo:regret_without_replacement}}:
%\begin{proof}
Since $\ell_{t,i} \leq \log(\tilde{\alpha}\tilde{\beta})$ then
 \begin{align}
     \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2
     \leq
     \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\log(\tilde{\alpha}\tilde{\beta})\right)^2
     \leq
     \log(\tilde{\alpha}\tilde{\beta})^2. \nonumber
 \end{align}
 Thus, due to Eq.~(\ref{eq:prob_sum_to_1}), we have
 \begin{align}
    \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\cdot \left(\frac{1}{|\mathcal{I}_t|}\sum_{i\in\mathcal{I}_t}\ell_{t,i}\right)^2
     \leq
     \sum_{c\in\mathcal{C}}p_c\sum_{\mathcal{I}_t\in \mathcal{S}_c}\hat{\pi}_{t,\mathcal{I}_t}\log(\tilde{\alpha}\tilde{\beta})^2
     =
     \log(\tilde{\alpha}\tilde{\beta})^2. \nonumber
 \end{align}
Eq.~(\ref{eq:lemma_1}) reads
 \begin{equation} \label{eq:final_lemma_1}
     Regret_t \leq \eta T \log(\tilde{\alpha}\tilde{\beta})^2 + \frac{\log(D)}{\eta} - \frac{T\log(p_1)}{\eta}.
    %  = 
    %  2 \log(\tilde{\alpha}\tilde{\beta})\sqrt{T\log(D)}
 \end{equation}
Choosing $\eta\geq 1$, we have
 \begin{equation}
     Regret_t \leq \eta T \log(\tilde{\alpha}\tilde{\beta})^2 + \frac{\log(D)}{\eta} - \eta T\log(p_1) = 
     \eta T (\log(\tilde{\alpha}\tilde{\beta})^2 -\log(p_1)) + \frac{\log(D)}{\eta}. \nonumber
    %  = 
    %  2 \log(\tilde{\alpha}\tilde{\beta})\sqrt{T\log(D)}
 \end{equation}
 Thus, setting $\eta=\sqrt{\frac{\log(D)}{T(\log(\tilde{\alpha}\tilde{\beta})^2 -\log(p_1))}}\geq 1$, we finally have
  \begin{equation}
     Regret_t 
     \leq 
     \mathcal{O}\left(\sqrt{
     (\log(\tilde{\alpha}\tilde{\beta})^2 -\log(p_1)) \cdot T\log(D)}
     \right). \nonumber
    %  2 \log(\tilde{\alpha}\tilde{\beta})\sqrt{T\log(D)}
 \end{equation}
 %\end{proof}
 
 \emph{Remark:} Note that the condition $\eta\geq 1$ can be replaced by setting an appropriate $p_1=\sqrt[T]{\epsilon}$ for $0<\epsilon\leq 1$. Thus Eq.~(\ref{eq:final_lemma_1}) reads
 \begin{equation}
     Regret_t 
     \leq 
     \eta T \log(\tilde{\alpha}\tilde{\beta})^2 + \frac{\log(D)-\log(\epsilon)}{\eta}.\nonumber
 \end{equation}
 Thus, setting $\eta=\frac{1}{\log(\tilde{\alpha}\tilde{\beta})}\sqrt{\frac{\log(D)-\log(\epsilon)}{T}}$ yields
     $Regret_t 
     \leq 
     \mathcal{O}\left(\log(\tilde{\alpha}\tilde{\beta})\sqrt{
      T(\log(D)-\log(\epsilon))} \right)$.