% \documentclass{uai2025} % Keep the base UAI template
% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 

% Additional packages from CVPR template
\usepackage{multirow}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage[normalem]{ulem}
\usepackage{bbding}
\usepackage{pifont}
% \usepackage{color}
\usepackage{kotex}
% \usepackage{adjustbox}
\usepackage{multicol}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{booktabs}

\usepackage{amsthm}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}


\usepackage{algorithm}
\usepackage{algpseudocode}

% Keep the rest of your original UAI preamble below
\usepackage[american]{babel}
\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

\title{Sparse Structure Exploration and Re-optimization for Vision Transformer}

% Author block using UAI format
\author[1]{\href{mailto:do753951@kookmin.ac.kr}{Sangho An}}
\author[1]{\href{mailto:acasia@kookmin.ac.kr}{Jinwoo Kim}}
\author[2]{\href{mailto:keonho.lee@hyundai.com}{Keonho Lee}}
\author[2]{\href{mailto:jingang4394@hyundai.com}{Jingang Huh}}
\author[2]{\href{mailto:kcw4875@hyundai.com}{Chanwoong Kwak}}
\author[2]{\href{mailto:yjlee28@hyundai.com}{Yujin Lee}}
\author[2]{\href{mailto:jinms@hyundai.com}{Moonsub Jin}}
\author[1]{\href{mailto:jangho.kim@kookmin.ac.kr}{Jangho Kim\thanks{Corresponding Author}}}

\affil[1]{
    % Computer Science Department\\
    Kookmin University\\
    Seoul, Korea
    % \par
    % \texttt{\{do753951, acasia, jangho.kim\}@kookmin.ac.kr}
}
\affil[2]{
    RoboticsLab \\
    Hyundai Motor Company 
    % Uiwang, Korea
    % \par
    % \texttt{\{keonho.lee, jingang4394, kcw4875, yjlee28, jinms\}@hyundai.com}
}


\begin{document}
\maketitle

\begin{abstract}
Vision Transformers (ViTs) achieve outstanding performance by effectively capturing long-range dependencies between image patches (tokens). However, the high computational cost and memory requirements of ViTs present challenges for model compression and deployment on edge devices. In this study, we introduce a new framework, Sparse Structure Exploration and Re-optimization (SERo), specifically designed to maximize pruning efficiency in ViTs. Our approach focuses on (1) hardware-friendly pruning that fully compresses pruned parameters instead of zeroing them out, (2) separating the exploration and re-optimization phases in order to find the optimal structure among various possible sparse structures, and (3) using a simple gradient magnitude-based criterion for pruning a pre-trained model. SERo iteratively refines pruning masks to identify optimal sparse structures and then re-optimizes the pruned structure, reducing computational costs while maintaining model performance. Experimental results indicate that SERo surpasses existing pruning methods across various ViT models in both performance and computational efficiency. For example, SERo achieves a 69\% reduction in computational cost and a 2.4$\times$ increase in processing speed for the DeiT-Base model, with only a 1.55\% drop in accuracy. Implementation code: \url{https://github.com/Ahnho/SERo}.
\end{abstract}


\section{Introduction}
\label{sec:intro}


Vision Transformers (ViTs), in particular, have shown success in numerous computer vision tasks by tokenizing image patches and utilizing them in various innovative ways~\citep{liu2021swintransformerhierarchicalvision,touvron2021trainingdataefficientimagetransformers}. Despite their successes, however, the excessive parameter count and computational cost of transformers have become significant barriers to the efficiency of ViTs. Consequently, extensive research has focused on pruning parameters to enhance the computational efficiency of ViTs.


\begin{figure}[t]
  \centering
  \begin{subfigure}{0.4\linewidth}
    \centering
    \includegraphics[width=\linewidth]{Images/value_ratio_accuracy.png}
    \caption{Value Ratio and Accuracy}
    \label{fig:valueratio_and_acc}
  \end{subfigure}
  \begin{subfigure}{0.55\linewidth}
    \centering
    \includegraphics[width=\linewidth]{Images/weight_grad.png}
    \caption{Weight and Gradient distributions}
    \label{fig:weight_and_grad_distributions}
  \end{subfigure}
  \caption{Figure~\ref{fig:valueratio_and_acc} shows the accuracy and remaining value ratio after weight and gradient magnitude pruning according to sparsity ratio on CIFAR-100 dataset of DeiT-Tiny model, and Figure~\ref{fig:weight_and_grad_distributions} illustrates the weight and gradient distributions of Q, K, V, and FFN layers in the pre-trained model with 128 batch size.}
 \label{Vit_fig1}
\end{figure}


According to \citet{paul2022unmasking}, well-explored sparse structures of networks often exhibit a flatter error landscape, which provides a favorable structure for training and thus offers optimization advantages. Notably, dynamic pruning methods~\citep{lin2020dynamic,kim2023finding} emphasize that exploring various sparse structures during training can lead to finding better sparse structures. Due to these benefits, the importance of exploration in sparse structures becomes more pronounced. \citet{zimmer2021learned} highlight that re-optimizing models obtained through one-shot pruning can lead to better-performing sparse models. Considering previous research, finding an optimal sparse structure is essential. A well-explored structure enables re-optimization to begin from a more advantageous landscape, and instead of merely fine-tuning the sparse model, re-optimizing it is crucial for recovering any lost performance.
 
\begin{figure*}[t]
  \centering
   \includegraphics[width=1\linewidth]{Images/vit_p.png}  
   \caption{The overall process of our proposed SERo method consists of three main steps. \textbf{First}, we explore the sparse structure by dynamically adjusting masks while considering the dimensions of multi-head self-attention and the loss of the value matrix. \textbf{Second}, we compress the discovered structure to achieve actual computational speed improvements. \textbf{Finally}, we reset the learning rate and re-optimize the identified structure for a few epochs.}
   \label{fig:main}
\end{figure*}

However, in the case of ViT pruning, there are many considerations due to the need to fine-tune a pre-trained model with an attention mechanism for downstream tasks. Although existing methods provide effective solutions, they still have limitations. For example, if pruning is performed without considering dimensionality and simply zeros out inactive channels, it may fall short in achieving practical gains in the computational cost and the latency~\citep{yu2022unified,yu2022topology}. When selecting channels (neurons) to prune, criteria with high computational costs, such as importance or Hessian, are often used, and the model is fixed in the early epochs rather than focusing on exploring various sparse structures~\citep{shim2025snp,chen2021chasing}. Additionally, the identified sparse structure is learned through fine-tuning rather than re-optimization~\citep{yu2022width,yu2023x,yu2022combinatorial,he2017channel,he2019filter}.

As shown in Figure~\ref{Vit_fig1}, we observe that the weights in the Q and K matrices of the ViT's pre-trained model generally have higher magnitudes compared to those in the V matrix. Consequently, when applying a typical weight magnitude-based pruning approach~\citep{han2015deep,yu2022unified} to derive sparse structures, the value matrix tends to be pruned more extensively, leading to a proportional decline in performance. In the attention mechanism, even if Q and K matrices are heavily pruned, information is still preserved across tokens. However, significant pruning of the value matrix results in considerable information loss. This effect becomes more pronounced as the pruning ratio increases, demonstrating that even a simple criterion based on gradient magnitude can effectively guide channel selection without requiring complex criteria. 

Based on our observation, to address these challenges while leveraging the strengths of structured exploration and re-optimization suited for ViT, this work introduces a novel ViT pruning methodology from three perspectives:

\textbf{(1)} Real compression for computational costs and latency efficiency: We fully compress the pruned parameters instead of merely zeroing them out, leading to tangible improvements in both computational costs and the inference speed.
\textbf{(2)} Separation of exploration and re-optimization: Unlike previous methods, we separate the exploration phase to find an optimal sparse model from the re-optimization phase. After identifying an appropriate sparse network structure, we compress and perform re-optimization by resetting learning rate instead of fine-tuning.
\textbf{(3)} Simple criterion considering transfer learning: By analyzing the characteristics of pre-trained models, we propose a simple criterion for effectively exploring sparse models in the context of transfer learning.

We propose a gradient-based sparse structure exploration and re-optimization (SERo) that incorporates three perspectives. Figure~\ref{fig:main} shows the overall process of SERo. To select an optimal sparse model based on a pre-trained model, we employ a simple gradient magnitude criterion. For broader exploration, we use gradual pruning with dynamic pruning that periodically updates the pruning mask. Since weight magnitude is known to significantly impact model performance in most pruning methods, we design our approach to prioritize updates to parameters with higher weight magnitudes among those identified as important by the gradient criterion. This enables effective exploration and discovery of high-quality sparse structures. Once the target sparsity is reached, exploration is halted, and compression is applied to establish the sparse structure. Unlike typical fine-tuning, we separate exploration from re-optimization, allowing us to re-optimize from a well-structured sparse foundation and thereby maximize the performance of the sparse model. Our contributions are as follows:
\begin{itemize}
    \item We introduce a novel sparse structure exploration and re-optimization method (SERo) based on three perspectives, incorporating both gradient and weight magnitude in the exploration phase and applying re-optimization on the compressed sparse structure.
    \item We provide a theoretical analysis of the convergence properties of our exploration update method and investigate the advantages of re-optimizing the explored sparse structure, analyzing differences from other exploration techniques.
    \item We demonstrate the effectiveness of our approach across various benchmarks and validate its latency and accuracy through extensive experiments. In particular, DeiT-Base with SERo runs 2.4$\times$ faster than the original model with a minimal performance drop.
\end{itemize}

The proposed method maintains ViT performance while achieving practical computational costs and speed optimizations suitable for deployment. This new approach to ViT pruning demonstrates the potential to maximize model efficiency and transfer learning performance, making it highly applicable to real-world scenarios.


\section{Related work}
\label{sec:relatedwork}


%-------------------------------------------------------------------------
\subsection{Pruning}


{Unstructured pruning~\citep{lin2020dynamic,frankle2018lottery,liu2019rethinkingvaluenetworkpruning} removes less important weights at the individual level. While this approach can achieve higher compression rates, it is difficult to accelerate with hardware.}
% However, some unstructured pruning methods, such as N:M~\citep{zhou2021learning} pruning, are specifically designed to fit hardware structures.
{On the other hand, structured pruning~\citep{luo2017thinet,he2018soft,lin2020channel,lin2020hrank} removes larger structural units, such as channels, filters, or layers, effectively altering the network's architecture. As a result, it can directly improve hardware acceleration. Our proposed SERo adopts this structured pruning approach to effectively achieve performance improvements on actual hardware.}


%--------------------------------------------------------------------------------
\subsection{Vision transformer model pruning}


Vision Transformers (ViTs)~\citep{dosovitskiy2021imageworth16x16words, liu2021swintransformerhierarchicalvision,touvron2021trainingdataefficientimagetransformers} are models that convert image patches into tokens, embed them, and utilize only the encoder structure of transformers~\citep{vaswani2023attentionneed}. While ViTs demonstrate exceptional performance in image processing tasks, they face significant limitations due to their high computational complexity and large memory requirements. Various research efforts are underway to address these challenges by reducing computational costs.
One approach involves proposing lightweight ViT architectures through Knowledge Distillation techniques, as demonstrated in DeiT~\citep{touvron2021trainingdataefficientimagetransformers}. For model pruning approaches, SSViTE~\citep{chen2021chasing} proposes a method to optimize model parameters and explore connectivity during training, dynamically extracting and learning sparse sub-networks of ViT. WDPruning~\citep{yu2022width} introduces compression by simultaneously reducing both the width and depth dimensions using trainable parameters and shallow classifiers. X-pruner~\citep{yu2023x} proposes an end-to-end explainability-aware mask to measure each prunable unit's contribution. UVC~\citep{yu2022unified} presents an integrated framework combining pruning, layer skipping, and knowledge distillation. More recently, SNP~\citep{shim2025snp} introduces structured neuron-level pruning, which prunes the queries and keys with the least information while maintaining the overall attention scores. All these approaches share the common goal of improving ViT efficiency while maintaining or enhancing their performance.


\section{Proposed method}

In this section, we describe the basic structure of the ViT and explain the pruning granularity, criterion, and the exploration of the granular sparse structure used in the proposed method and the Re-optimization.


\subsection{Preliminary}
\label{Preliminary}
A ViT block consists of a self-attention layer, projection layer and the feed-forward network in the form of a multi-layer perceptron. The self-attention layer transforms the input tokens \( X \in \mathbb{R}^{N \times E} \) into query, key, and value representations using weight matrices \( W^q, W^k, W^v \in \mathbb{R}^{E \times E} \), resulting in \( Q, K, V \in \mathbb{R}^{N \times E} \). To capture a diverse set of attentions, multi-head self-attention (MSA) is applied by splitting \( Q, K, V \) into \( H \) heads. Each head performs a self-attention operation defined as \( \text{Attn}^l(Q_l, K_l, V_l) = \text{softmax}\left(\frac{Q_l K_l^T}{\sqrt{E/H}}\right) V_l \), where \( l \) is the index of the head, and there are \( H \) heads in total. Here, \( Q_l, K_l, V_l \in \mathbb{R}^{N \times E/H} \). The resulting attention outputs \( \text{Attn}^l \) from each head are concatenated and restored to the original dimension \( \mathbb{R}^{N \times E} \) before being passed through the feed-forward network. The MSA operation is expressed as follows:
\[
\text{MSA}(X) = \text{concat}(\text{Attn}^1, \text{Attn}^2, \dots, \text{Attn}^H).
\]

The values obtained through the diverse attentions of MSA are concatenated, passed through a projection layer $W^{P}$, followed by layer normalization, and then processed through Feed Forward Network (FFN) with weights \( W^{f1} \) and \( W^{f2} \). 


\subsection{Pruning unit, granularity and criterion}
\label{criterion}
To perform structured pruning in a hardware-friendly manner, we divide the pruning units into three groups based on their functional components (Q\&K, V\&proj, FFN). Therefore, we share the pruning mask \( M \) at the level where actual computations occur. \( W^q \) and \( W^k \) are pruned using \( M^{q,k} \), \( W^v \) and \( W^p \) are pruned using \( M^{v} \), and finally, \( W^{f1} \) and \( W^{f2} \) are pruned using \( M^{f} \).


To prune \( W^q \) and \( W^k \) by columns based on the average \( L_1 \) norm of their gradients, we define a column-wise pruning mask \( M \in \mathbb{R}^{E} \), applied uniformly across all elements in each column of \( W^q \) and \( W^k \). This results in:

\[
\overline{W}^q = M^{q,k} \odot W^q, \quad \overline{W}^k = M^{q,k} \odot W^k
\]
where $M^{q,k} = \{ m^{q,k}_{i,j} \mid m^{q,k}_{i,j} \in \{0,1\},\ i = 1, \dots, E,\ j = 1, \dots, E \}$. Each entry \( M^{q,k}_j \) in the mask vector \( M^{q,k} \) is determined by the average \( L_1 \) norm of the gradients for the \( j \)-th column of \( W^q \) and \( W^k \). Specifically, the gradient of the loss \( \mathcal{L} \) is computed with respect to the full weight matrices \( W^q \) and \( W^k \), and we then select the \( j \)-th column of the resulting gradient matrices:
\begin{equation}
M^{q,k}_j = \begin{cases} 
1 & \text{if } \frac{1}{2} \left( \left\| \nabla_{W^q} \mathcal{L}[:,j] \right\|_1 + \left\| \nabla_{W^k} \mathcal{L}[:,j] \right\|_1 \right) \geq \tau \\
0 & \text{otherwise}
\end{cases}
\label{eq:qk_pruning}
\end{equation}

where  \( \nabla_{W^q} \mathcal{L}[:,j] \) and \( \nabla_{W^k} \mathcal{L}[:,j] \) denote the gradients of the loss \( \mathcal{L} \) with respect to the full weight matrices \( W^q \) and \( W^k \), indexed by the \( j \)-th column. The threshold \( \tau \) is used to decide which columns are retained or pruned, with columns below this threshold pruned by setting \( M^{q,k}_j = 0 \). Each entry \( M^{q,k}_j \) applies uniformly across all rows in the \( j \)-th column of \( W^q \) and \( W^k \), such that the entire column is either pruned or retained.

To prune \( W^v \), we use the \( L_1 \) norm of the gradient of \( W^v \), and we prune by removing columns, reducing its dimension. To account for the deleted dimensions in the next layer, we also apply pruning to the weights in \( W^p \) that correspond to the pruned input dimensions. This results in:
\[
\overline{W}^v = M^{v} \odot W^v, \quad \overline{W}^p = (M^{v})^T \odot W^p
\]
where \( (M^{v})^T \) is the transpose operation. The pruning mask \( M^{v}_j \) is determined as follows:

\begin{equation}
M^{v}_j = \begin{cases} 
1 & \text{if } \left\| \nabla_{W^v} \mathcal{L}[:,j] \right\|_1 \geq \tau \\
0 & \text{otherwise}
\end{cases}
\label{eq:v_pruning}
\end{equation}

Similarly to \( W^v \), we perform pruning in the feed-forward network based on \( W^{f1} \), and the corresponding dimensions are removed in \( W^{f2} \). This results in:
\[
\overline{W}^{f1} = M^{f} \odot W^{f1}, \quad \overline{W}^{f2} = (M^{f})^T \odot W^{f2}
\]
where the pruning mask \( M^{f}_j \) is determined as follows:

\begin{equation}
M^{f}_j = \begin{cases} 
1 & \text{if } \left\| \nabla_{W^{f1}} \mathcal{L}[:,j] \right\|_1 \geq \tau \\
0 & \text{otherwise}
\end{cases}
\label{eq:ffn_pruning}
\end{equation}


\subsection{Structure exploration and re-optimization}

The dynamic sparse structure update rule at the $t$-th iteration is as follows:
\begin{equation}
W_{t+1} = W_t - \eta \cdot \nabla_{\overline{W}_t} \mathcal{L}
\label{eq:f weight_update}
\end{equation}

This update rule has the advantage of allowing dynamic exploration of sparse structures through the Straight Through Estimator (STE)~\citep{lin2020dynamic,guo2016dynamic}. However, STE introduces instability because it estimates the non-differentiable part \( \nabla_{W_t} \overline{W}_t \) as 1 during the update process~\citep{kim2023finding,zhou2021learning}. To ensure more stable sparse structure exploration, we use a soft pruning mask such that \( \nabla_{W_t} \overline{W_t} = M_t \), leading to the update rule:

\begin{equation}
W_{t+1} = W_t - \eta \cdot \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W^{|\cdot|}_t
\label{eq:prun_weight_update}
\end{equation}

We denote $|\cdot|$ as an element-wise absolute-value function. Note that \( W \) consists of \( W^q, W^k, W^v, W^p, W^{f1}, W^{f2} \), which are the weight matrices included in the ViT block as described in Section~\ref{Preliminary}, and \( M \) corresponds to the masks applied to each weight matrix as explained in Section~\ref{criterion}. In Figure~\ref{Vit_fig1}, we illustrated the issue of imbalance in \( W \) of the pre-trained model. To resolve this issue, we conduct a gradient-based pruning. Since the magnitude of \( W \), which reflects the contribution to the computation, was not considered during the gradient-based pruning, we apply the non-uniform scaling of \( W^{|\cdot|} \) to the gradient. This approach reflects the intention of updating more heavily for weights with larger magnitudes, as they are more important. Gradual pruning is employed to progressively increase sparsity, influencing the selection of filters to be removed in future iterations of sparse structure discovery~\citep{lin2020dynamic}.

\noindent \textbf{Convergence analysis :}
We provide a convergence analysis for the proposed dynamic pruning method with weight magnitude alignment in convex functions. The update rule for the weights is defined as:

\[
W_{t+1} = W_t - \eta \cdot \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W^{|\cdot|}_t
\]

where \( M \) is the pruning mask, \( W^{|\cdot|} \) represents the element-wise magnitude of the weights, and \( \nabla_{\overline{W_t}} \mathcal{L} \) is the gradient with respect to the pruned weights.

\begin{theorem}
Let \( W_* \) be the optimal solution for a convex loss function \( \mathcal{L}(W) \), and assume that the squared gradient norm of every pruned model is bounded in expectation by \( G^2 \), i.e.,
$\mathbb{E} \left[ \| \nabla_{\overline{W_t}} \mathcal{L} \|^2 \mid W_t \right] \leq G^2$, and that the loss difference satisfies $\mathcal{L}(\overline{W_t})- \mathcal{L}(W_t) \leq \epsilon$ at every $t$-th iteration.
Then, the convergence of the pruned model with weight magnitude alignment is upper-bounded as follows:
\begin{equation}
\resizebox{\columnwidth}{!}{
\begin{math}
\begin{aligned}
\mathbb{E}\left[\frac{1}{T} \sum_{t=1}^{T} \mathcal{L}(\overline{W}_t)\right] - \mathcal{L}(W_*) \leq \frac{\| W_1 - W_* \|^2}{2 \eta T} + 2\eta T \epsilon  + \frac{\eta G^2}{2} \mathbb{E}\left[\| M_t \cdot W^{|\cdot|}_t \|^2 \right]
\end{aligned}
\end{math}
}
\end{equation}
\label{th1}
\end{theorem}
\begin{proof}
The update rule for the pruned weight is:
\[
W_{t+1} = W_t - \eta \cdot \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W^{|\cdot|}_t
\]
Let \( W_t \) be fixed, and take the expectation conditioned on \( W_t \). Using the convexity of \( \mathcal{L} \), we can express the difference between the current weights and the optimal solution as:
\begin{equation}
\resizebox{\columnwidth}{!}{
    \begin{math}
    \begin{aligned}
\mathbb{E} \left[ \| W_{t+1} - W_* \|^2 \mid W_t \right] = \| W_t - W_* \|^2 + \eta^2 \mathbb{E}\left[\| \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W^{|\cdot|}_t \|^2 \right] \\ - 2 \eta \mathbb{E}\left[\langle 
\nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W^{|\cdot|}_t, W_t - W_* \rangle \mid W_t \right].
    \end{aligned}
    \end{math}
}
\end{equation}
By bounding the gradient term and using the assumption that \( \mathbb{E} \left[ \| \nabla_{\overline{W_t}} \mathcal{L} \|^2 \mid W_t \right] \leq G^2 \), we obtain:
\begin{equation}
\resizebox{\columnwidth}{!}{
    \begin{math}
    \begin{aligned}
&\mathbb{E} \left[ \| W_{t+1} - W_* \|^2 \mid W_t \right] \leq \| W_t - W_* \|^2 + \eta^2 G^2 \mathbb{E}\left[\| M_t \cdot W^{|\cdot|}_t \|^2 \right] \\ &- 2 \eta (\mathcal{L}(\overline{W_t}) - \mathcal{L}(W_*)-\epsilon).
    \end{aligned}
    \end{math}
}
\end{equation}
Summing over \( t = 1 \) to \( T \), we arrive at the final result:

\begin{equation}
\resizebox{\columnwidth}{!}{
\begin{math}
\begin{aligned}
\mathbb{E}\left[\frac{1}{T} \sum_{t=1}^{T} \mathcal{L}(\overline{W}_t)\right] - \mathcal{L}(W_*) \leq \frac{\| W_1 - W_* \|^2}{2 \eta T} + 2\eta T \epsilon + \frac{\eta G^2}{2} \mathbb{E}\left[\| M_t \cdot  W^{|\cdot|}_t \|^2 \right].
    \end{aligned}
    \end{math}
}
\end{equation}
Details are in the Appendix.
\end{proof}


Once the exploration of the sparse structure is complete, we modify the structure of \( W \) using the mask \( M \). Since pruning was performed at the column level, making it hardware-friendly, we can physically remove the columns for which \( M_j = 0 \) from the matrix through structured pruning. We define the newly generated lightweight network, after the exploration is complete, as \( \tilde{W} \).
Then, we optimize $\tilde{W}$ with the conventional gradient descent as depicted in Figure~\ref{fig:main}.
\begin{equation}
\tilde{W}_{t+1} = \tilde{W}_t - \eta \cdot \nabla_{\tilde{W}_t} \mathcal{L} 
\label{optimization}
\end{equation}

By initializing \( \tilde{W}_{1} \) with the parameters found in the exploration phase ($\tilde{W}_{1}\coloneqq\overline{W}_{T}$, also represented as $W_1^{\overline{W}} $), the convergence bound is tighter than when starting from randomly initialized parameters of the same structure. The resulting gap between the two bounds is:

\begin{equation}
\Delta = \frac{\|W_1^{\text{random}} - W_*\|^2 - \|W_1^{\overline{W}} - W_*\|^2}{2 \eta T}
\label{initialization}
\end{equation}


\begin{table*}[t]
\centering
\caption{Evaluation of pruning methods for DeiT models on ImageNet-1K dataset: accuracy (\%), computational cost (GFLOPs), and number of parameters (M).}
\resizebox{0.8\textwidth}{!}{
\renewcommand{\arraystretch}{0.9}
\begin{tabular}{c|c|cc|c|c}
\hline
& Method & Top-1 (\%) & Top-5 (\%) & GFLOPs & Params (M) \\
\hline
\multirow{7}{*}{\scalebox{0.9}{\rotatebox{0}{DeiT-Tiny}}} 
& Original~\citep{touvron2021trainingdataefficientimagetransformers} & 72.20 & 91.10 & 1.3 & 5.7 \\
& SSViTE~\citep{chen2021chasing} & 70.12 & - & 0.9 & 4.2 \\
& WDPruning~\citep{yu2022width} & 70.34 & 89.82 & 0.7 & 3.5 \\
& X-Pruner~\citep{yu2023x} & 71.10 & 90.11 & 0.6 & - \\
& UVC~\citep{yu2022unified} & 70.60 & - & 0.5 & - \\
& SNP~\citep{shim2025snp} & 70.29 & 90.01 & 0.6 & 3.0 \\
& \textbf{SERo (ours)} & \textbf{72.30} & \textbf{91.00} & \textbf{0.8} & \textbf{3.4} \\
\hline
\multirow{9}{*}{\scalebox{0.9}{\rotatebox{0}{DeiT-Small}}}
& Original~\citep{touvron2021trainingdataefficientimagetransformers} & 79.85 & 95.00 & 4.6 & 22.1 \\
& SSViTE~\citep{chen2021chasing} & 79.22 & - & 3.1 & 14.6 \\
& WDPruning~\citep{yu2022width} & 78.38 & 94.05 & 2.6 & 13.3 \\
& X-Pruner~\citep{yu2023x} & 78.93 & 94.24 & 2.4 & - \\
& UVC~\citep{yu2022unified} & 78.82 & - & 2.3 & - \\
& SNP~\citep{shim2025snp} & 78.52 & 94.37 & 2.0 & 10.0 \\
& SNP~\citep{shim2025snp} & 73.32 & 91.66 & 1.3 & 6.4 \\
& \textbf{SERo (ours)} & \textbf{79.30} & \textbf{94.60} & \textbf{2.8} & \textbf{13.5} \\
& \textbf{SERo (ours)} & \textbf{74.84} & \textbf{92.42} & \textbf{1.5} & \textbf{7.2} \\
\hline
\multirow{7}{*}{\scalebox{0.9}{\rotatebox{0}{DeiT-Base}}}
& Original~\citep{touvron2021trainingdataefficientimagetransformers} & 81.80 & 95.59 & 17.6 & 86.6 \\
& SSViTE~\citep{chen2021chasing} & 82.22 & - & 11.8 & 56.8 \\
& WDPruning~\citep{yu2022width} & 80.76 & 95.36 & 9.9 & 55.3 \\
& X-Pruner~\citep{yu2023x} & 81.02 & 95.38 & 8.5 & - \\
& UVC~\citep{yu2022unified} & 80.57 & - & 8.0 & - \\
& SNP~\citep{shim2025snp} & 79.63 & 94.37 & 6.4 & 31.6 \\
& \textbf{SERo (ours)} & \textbf{80.25} & \textbf{94.98} & \textbf{5.4} & \textbf{27.0} \\
\hline
\end{tabular}
}
\label{tab:comparison}
\end{table*}


\begin{algorithm}[t]
\caption{Sparse Structure Exploration and Re-optimization (SERo)}
\textbf{Require:} Total epochs $E=E_e + E_o$, Pruning frequency $F$, Total iterations per epoch $T$, Binary mask ${M}\in\{0, 1\}^N$, Dense network ${W} \in \mathbb{R}^N$, Pruned network ${\overline{{W}}={M} \odot W}$\\
\hrulefill
\textbf{Initialize:} Model weights $W$

\textbf{[ Sparse Structure Exploration phase ]}
\begin{algorithmic}[1]
\State  Warming-up for 1 epoch
\For{epoch = 2, ..., $E_e$}
    \State calculate gradual sparsity $p$ 
    \For{Iter = 1, ..., $T$}
        \State \textbf{if} Iter $\%$ $F$ == 0 \textbf{then}
            \State \hspace{5mm} calculate pruning mask $M$ with Eqs.~\ref{eq:qk_pruning}, \ref{eq:v_pruning}, and~\ref{eq:ffn_pruning}
            \State \hspace{5mm} update sparse weights $\overline{W} = W \odot M$
        \State update weights $W$ with gradient descent Eq.~\ref{eq:prun_weight_update}
        % \State $t$ = $t$ + 1
    \EndFor
\EndFor \\
\textbf{[Model compression by deleting zero weights]} \\
\textbf{[ Re-optimization Phase ]}
\For{epoch = 1, ..., $E_o$}
    \State Pruned network training
    \State update weights $\tilde{W}$ with gradient descent Eq.~\ref{optimization}
    
    % \State Update sparse weights $W_t$ with fixed mask $M$
%     \State Apply gradient descent on non-zero weights only
%     \State Keep pruned weights at zero: $W_t = W_t \odot M$
\EndFor
% \State \Return Final pruned model with weights $W$ and mask $M$
\end{algorithmic}
\label{alg:overlif}
\end{algorithm}

\section{Experiment}

We applied our proposed method to perform pruning and compression on DeiT~\citep{touvron2021trainingdataefficientimagetransformers} models (Tiny, Small and Base), which are pretrained models trained on the ImageNet-1K~\citep{5206848} and CIFAR~\citep{krizhevsky2009learning} datasets. We followed the same setting as SNP~\citep{shim2025snp}. We did not add additional epochs for the re-optimization phase; instead, we split the total epochs into the exploration and re-optimization phases, ensuring the same training cost as SNP. Implementation details are provided in the Appendix and the implemented code. 

\subsection{Main result}

Table~\ref{tab:comparison} demonstrates the superior performance of our proposed SERo method on the ImageNet-1K dataset. While existing pruning methods struggled to maintain performance with higher pruning ratios, our Sparse Structure Exploration and Re-optimization (SERo) approach shows remarkable efficiency.
For the DeiT-Base model, SERo achieves an approximately 69\% reduction in computational cost (GFLOPs) with only a 1.55\% accuracy drop compared to the dense model. Notably, when compared to SNP~\citep{shim2025snp}, which achieved the highest pruning ratio of about 63\% among existing methods, SERo achieves 6\% more pruning while maintaining 0.62\% higher accuracy.

For the smaller DeiT-Small model, SERo with 39\% pruning achieves higher accuracy while pruning 5\% more than SSViTE~\citep{chen2021chasing}, which previously showed the best performance. SERo shows only a 0.55\% accuracy drop compared to the dense model. Furthermore, when compared to SNP, which had the highest compression ratio, SERo achieves similar pruning levels while showing 1.52\% higher accuracy.

For DeiT-Tiny, SERo's effective structure exploration capability has also been proven. Despite applying about 39\% pruning, it achieved a 0.08\% performance improvement compared to the unpruned original model. This demonstrates that SERo can improve performance through exploring and re-optimizing more efficient network structures beyond simple pruning. These results validate SERo's effective structure exploration and optimization capabilities across different model scales. Additional experimental results and detailed analysis can be found in Appendix.

\begin{figure}[t]
\centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=1\linewidth]{Images/accuracy_latency.png}
    \end{subfigure}
\caption{Relationship between model accuracy and inference latency across different sparsity levels.}
\label{acc_latency}
\end{figure}




\begin{table}[t]
\caption{Performance comparison of various compression methods on the DeiT-Base model using the ImageNet dataset, averaged over 1,000 iterations with batch size 64 on an NVIDIA A6000 GPU (excluding the first 200 iterations as warm-up). Here, Dense represents the original model, Zeroing indicates the model with pruned weights replaced by zeros, while FFN Compression, Attention Compression, and All Compression refer to compression applied to Feed-Forward Network blocks only, Self-Attention blocks only, and the entire network architecture, respectively.}
\label{tab:performance_comparison}
\begin{center}
\begin{adjustbox}{width=1\linewidth}
\begin{tabular}{cccccc}
\hline
\toprule
\multicolumn{6}{c}{\textit{Performance Metrics}} \\
\textbf{Metric} & \textbf{Dense} & \textbf{Zeroing} & \textbf{FFN} & \textbf{Attention} & \textbf{All} \\
& & & \textbf{Compression} & \textbf{Compression} & \textbf{Compression} \\
\midrule
Throughput (fps) & 401.4 & 410.8 (1.0×) & 652.6 (1.6×) & 514.3 (1.3×) & \textbf{960.3} (2.4×) \\
Latency (ms) & 159.5 & 155.8 (1.0×) & 98.1 (1.6×) & 124.5 (1.3×) & \textbf{66.6} (2.4×) \\
GFLOPs & 17.6 & 17.6 (+0.0\%) & 9.3 (-47.2\%) & 13.7 (-22.2\%) & \textbf{5.4} (-69.3\%) \\
Parameters (M) & 86.6 & 86.6 (+0.0\%) & 44.5 (-48.6\%) & 69.1 (-20.2\%) & \textbf{27.0} (-68.8\%) \\
\bottomrule
\hline
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}



\begin{figure}[t]
\centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=1\linewidth]{Images/pruning_accuracy.png}
    \end{subfigure}
\caption{Accuracy comparison between the dense model and selective attention component pruning (Value \& Proj vs. Query \& Key) at varying sparsity levels.}
\label{val_qk}
\end{figure}



\subsection{Computational efficiency}
Table~\ref{tab:performance_comparison} compares the performance of various compression approaches applied to the DeiT-Base model trained with SERo. The experimental results show that the Zeroing model maintains the same number of parameters and GFLOPs as the dense model, with similar processing speed. FFN Compression achieved a 48.6\% reduction in parameters while increasing processing speed by 1.6 times, whereas Attention Compression resulted in a 20.2\% parameter reduction with a 1.3 times speed improvement. Finally, the fully compressed model (All Compression) demonstrated the most significant improvements, reducing parameters by 68.8\% while achieving a 2.4 times speed increase.



To further analyze the computational efficiency, we evaluated the relationship between accuracy and latency across different sparsity levels using DeiT-Tiny model trained on CIFAR-100 dataset, as shown in Figure~\ref{acc_latency}. The results reveal that while latency decreases linearly with increased sparsity, model accuracy remains stable up to 0.3 sparsity before showing significant degradation beyond 0.4.






\begin{figure}[t]
  \centering
  \includegraphics[width=1.0\linewidth]{Images/tsb.png}  
  \caption{Unit-wise analysis per block of active connection ratios (\%) (1 - pruning rate (\%)) in DeiT models using SERo pruning: Comparison of Query \& Key, Value \& Proj, and FFN unit across DeiT-Tiny, Small, and Base variants at 0.4 sparsity level.}
  \label{Vit_fig2}
\end{figure}


\subsection{Unit-wise analysis}
% In this section, we conduct a detailed analysis of layer-wise pruning effects in the DeiT model.
In this section, we analyze in detail how each pruning unit (Q\&K, V\&proj, FFN) as stated in Section~\ref{criterion} affects the performance of DeiT model pruned with SERo.
\subsubsection{Comparative analysis of unit-wise pruning}

Figure~\ref{val_qk} compares the performance between the dense model and two pruning cases: pruning only the Value \& Proj unit and pruning only the Query \& Key unit. The experimental results show that pruning only the Value \& Proj unit leads to performance degradation when sparsity exceeds 0.3, while pruning only the Query \& Key unit maintains stable performance even at relatively high sparsity levels. This suggests that the Value \& Proj unit plays a more crucial role in maintaining model performance.

\subsubsection{Sparse structural analysis of active connections} 

To analyze the structural cause of this phenomenon, Figure~\ref{Vit_fig2} shows the active connection ratios (i.e., 100\% minus the pruning rate) for each block at the 0.4 sparsity level of the DeiT models. Comparing the active connection patterns across Query \& Key, Value \& Proj, and FFN units, we observe that the Value \& Proj unit consistently maintains a high ratio of active connections. Notably, in the middle blocks (4--8), the Value \& Proj unit maintains over 90\% of active connections.

Table~\ref{exp: layerwise} shows sparse structures and accuracy at various sparsity levels (0.3, 0.5, 0.7). A notable observation is the difference in the Value \& Proj unit. SERo achieves higher accuracy while preserving more weights in the Value \& Proj unit across all sparsity levels (e.g., only 12.54\% of this unit is pruned at sparsity 0.5, versus 17.06\% for gradient magnitude). Additionally, we conducted mask similarity analysis by training both gradient-based methods and SERo under identical initialization conditions and environments. Although the resulting masks are highly similar (over 90\%), there are significant performance differences between them, suggesting that SERo finds better network structures than conventional gradient magnitude methods.




\begin{table}[t]
\caption{Comparison of sparse structures, mask similarities, and accuracies among different pruning methods (weight magnitude, gradient magnitude, and SERo) on DeiT-Tiny model with CIFAR-100 dataset.}
\label{exp: layerwise}
\begin{center}
\begin{adjustbox}{width=1.0\linewidth}
\begin{tabular}{c c c c c}
\noalign{\smallskip}\noalign{\smallskip}\hline 
\toprule
\multicolumn{5}{c}{\colorbox{gray!20!white}{\textit{Pruning ratio (\%) \& Accuracy (\%)}}} \\
& \textbf{Unit Type} & \textbf{Weight mag} & \textbf{Gradient mag} & \textbf{SERo (Ours)} \\
\midrule
\multirow{3}{*}{\textbf{Sparsity=0.3}} & Query \& Key & 20.31 & 34.51 & 31.9 \\
& Value \& Proj & 11.81 & 8.20 & \textbf{4.82} \\
& FFN & 36.97 & 34.32 & 35.82 \\
\cmidrule{2-5}
& Accuracy & 79.97 & 81.38 & \textbf{82.09}\\
\midrule
\multirow{3}{*}{\textbf{Sparsity=0.5}} & Query \& Key & 28.04 & 70.06 & 64.63 \\
& Value \& Proj & 28.86 & 17.06 & \textbf{12.54} \\
& FFN & 60.77 & 53.07 & 55.71 \\
\cmidrule{2-5}
& Accuracy & 75.35 & 78.99 & \textbf{79.58} \\
\midrule
\multirow{3}{*}{\textbf{Sparsity=0.7}} & Query \& Key & 45.27 & 91.84 & 89.24 \\
& Value \& Proj & 57.64 & 35.76 & \textbf{25.30} \\
& FFN & 79.28 & 73.10 & 76.37\\
\cmidrule{2-5}
& Accuracy & 65.29 & 73.49 & \textbf{74.25} \\
\midrule
\multicolumn{5}{c}{\colorbox{gray!20!white}{\textit{Mask Similarity}}} \\
\textbf{Sparsity} & \textbf{Sparsity=0.3} & \textbf{Sparsity=0.5} & \textbf{Sparsity=0.7} \\
Gradient vs SERo & 0.94 & 0.91 &  0.92 \\
% Weight vs SERo & 0.xx & 0.xx & 0.xx \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}



\subsubsection{Model-specific connection pattern analysis}
For more detailed analysis, Figure~\ref{small_mask_dist} shows the block-wise active connection ratios in DeiT-Small model trained on the ImageNet dataset. While the FFN unit maintains relatively low active connection ratios in both early and later blocks, the Value \& Proj unit maintains high active connection ratios across all blocks. This suggests that the Value \& Proj unit plays a crucial role in maintaining model performance. These analysis results experimentally demonstrate the importance of the Value \& Proj unit in transformer architecture and indicate that unit-wise characteristics should be considered when establishing effective pruning strategies.


\begin{figure}[t]
\centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=1\linewidth]{Images/mask_distribution.png}
    \end{subfigure}
\caption{Active connection ratios per block of DeiT-Small (40\% Sparsity) trained on ImageNet dataset.}
\label{small_mask_dist}
\end{figure}


\begin{figure}[t]
\centering
    \begin{subfigure}{\linewidth}
        \centering
        \includegraphics[width=0.7\linewidth]{Images/loss_surface_comparison (13).png}
    \end{subfigure}
\caption{Loss landscapes visualized with PyHessian~\citep{yao2020pyhessian} for gradient magnitude pruning (orange) and SERo (blue, ours).}
\label{fig:loss_surface}
\end{figure}

\subsection{Method validation}

\subsubsection{Loss landscape analysis}
The correlation between loss landscape flatness and model generalization performance has been demonstrated in several studies~\citep{10.1162/neco.1997.9.1.1, keskar2016large}. In this section, we conducted loss landscape analysis to compare the performance of our proposed method with conventional gradient magnitude pruning. Specifically, after re-optimizing the structures found by each method, we visualized the loss landscapes using PyHessian~\citep{yao2020pyhessian}. The experiments were conducted on the CIFAR-100 dataset using DeiT-Tiny model. As shown in Figure~\ref{fig:loss_surface}, the model obtained using our proposed method exhibits a flatter loss landscape, suggesting better generalization performance.

\subsubsection{Weight initialization impact and Re-optimization}
To evaluate the effect of re-optimization on the weights corresponding to the structure found in Theorem~\ref{th1}, we compared various initialization methods. Table~\ref{tab:initialization_comparison} analyzes the impact of different weight initialization methods after structure exploration and compression. As demonstrated in Eq.~\ref{initialization}, the experimental results show that weight inheritance (No Init; $W_1^{\overline{W}}$) achieves the highest accuracy of 79.58\%, significantly outperforming both Gaussian~\citep{he2015delvingdeeprectifierssurpassing} and Xavier~\citep{pmlr-v9-glorot10a} initialization methods. This suggests the importance of maintaining weights during the re-optimization process. Table~\ref{tab:imagenet_phases} additionally compares the accuracy obtained without re-optimization (Exploration) against that obtained with it (Exploration + Re-optimization). The results demonstrate that, once a sparse structure has been found, re-optimizing the model within that structure is more effective than exploration alone.




\begin{table}[t]
\caption{Accuracy comparison of different weight initialization methods (DeiT-Tiny model on CIFAR-100 with 50\% Sparsity). No Init refers to using compressed parameters directly without re-initialization, preserving the original weight distribution.}
\label{tab:initialization_comparison}
\begin{center}
\begin{adjustbox}{width=0.90\linewidth}
\begin{tabular}{c c c c }
\noalign{\smallskip}\noalign{\smallskip}\hline
\toprule
\multicolumn{4 }{c}{\colorbox{gray!20!white}{\textit{Initialization Methods}}} \\
\textbf{Metric} & \textbf{No Init} & \textbf{Gaussian} & \textbf{Xavier}  \\
\midrule
Top-1 Accuracy (\%) & \textbf{79.58} & 48.34 & 45.55 \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}

\begin{table}[t]
\caption{Comparison of model accuracies between exploration and exploration + re-optimization phases of SERo on ImageNet dataset across different ViT architectures.}
\label{tab:imagenet_phases}
\begin{center}
\begin{adjustbox}{width=1.0\linewidth}
\begin{tabular}{c c c c c}
\noalign{\smallskip}\noalign{\smallskip}\hline
\toprule
\multicolumn{5}{c}{\colorbox{gray!20!white}{\textit{Accuracy (\%) on ImageNet}}} \\
& \textbf{Model} & \textbf{Sparsity} & \textbf{Exploration} & \textbf{Exploration + Re-optimization} \\
\midrule
\multirow{3}{*}{\textbf{SERo}} 
& DeiT-Tiny & 40\% & 68.14 & \textbf{72.30} \\
& DeiT-Small & 70\% & 69.18 & \textbf{74.84} \\
& DeiT-Base & 70\% & 77.33 & \textbf{80.25} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}

 
\section{Conclusion}

In this paper, we propose Sparse Structure Exploration and Re-optimization (SERo), a novel pruning framework for vision transformers. Our approach presents a simple yet effective method that systematically analyzes the properties of pre-trained models to explore efficient sparse structures. In particular, instead of the conventional pruning approach of setting parameters to zero, we demonstrate significant improvements in computational cost and inference speed by completely removing and compressing unnecessary parameters. Our experimental results show superior performance on various vision transformer models, suggesting that the proposed framework could be extended beyond vision tasks to other domains, including NLP tasks with transformer-based models.


\acknowledgements{
This work was supported by Hyundai Motor Company and Kia, and partially supported by the Institute of Information \& Communications Technology Planning \& Evaluation (IITP) grant funded by the Korea government (MSIT) (No.~RS-2025-02219317, AI Star Fellowship (Kookmin University)).
}


% References
\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Sparse Structure Exploration and Re-optimization for Vision Transformer\\(Supplementary Material)}
% \maketitle

\appendix
\section{Convergence analysis}
The update rule of SERo in the exploration phase is as follows:
\[
W_{t+1} = W_t - \eta \cdot \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|}
\]
Consider the squared distance between \( W_{t+1} \) and \( W_* \):
\[
\|W_{t+1} - W_*\|^2 = \|W_t - \eta \cdot \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|} - W_*\|^2.
\]
Taking the expectation conditioned on \( W_t \):
\[
\mathbb{E}[\|W_{t+1} - W_*\|^2 \mid W_t] = \|W_t - W_*\|^2 - 2\eta \mathbb{E}[\langle \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|}, W_t - W_* \rangle \mid W_t] + \eta^2 \mathbb{E}[\|\nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|}\|^2 \mid W_t].
\]

Using the assumption that \( \mathbb{E}[\|\nabla_{\overline{W_t}} \mathcal{L}\|^2 \mid W_t] \leq G^2 \) and noting that \( \|a \cdot b\|^2 \leq \|a\|^2 \|b\|^2 \) for element-wise multiplication, we have:
\[
\mathbb{E}[\|\nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|}\|^2 \mid W_t] \leq G^2 \|M_t \cdot W_t^{|\cdot|}\|^2.
\]

By convexity and the loss difference condition,
\[
\langle \nabla_{\overline{W_t}} \mathcal{L} \cdot M_t \cdot W_t^{|\cdot|}, W_t - W_* \rangle \geq \mathcal{L}(\overline{W_t}) - \mathcal{L}(W_*) - \epsilon.
\]
Plugging these bounds back:
\[
\mathbb{E}[\|W_{t+1} - W_*\|^2 \mid W_t] \leq \|W_t - W_*\|^2 - 2\eta (\mathcal{L}(\overline{W_t}) - \mathcal{L}(W_*) - \epsilon) + \eta^2 G^2 \|M_t \cdot W_t^{|\cdot|}\|^2.
\]

Summing over \( t = 1 \) to \( T \), the left-hand side telescopes:
\[
\|W_{T+1} - W_*\|^2 \leq \|W_1 - W_*\|^2 - 2\eta \sum_{t=1}^T (\mathcal{L}(\overline{W_t}) - \mathcal{L}(W_*) - \epsilon) + \eta^2 G^2 \sum_{t=1}^T \|M_t \cdot W_t^{|\cdot|}\|^2.
\]

Dropping the non-negative term \( \|W_{T+1} - W_*\|^2 \), rearranging, and dividing by \( 2\eta T \):
\[
\frac{1}{T} \sum_{t=1}^T \mathcal{L}(\overline{W_t}) - \mathcal{L}(W_*) \leq \frac{\|W_1 - W_*\|^2}{2 \eta T} + \epsilon + \frac{\eta G^2}{2T} \sum_{t=1}^T \|M_t \cdot W_t^{|\cdot|}\|^2.
\]

Taking the expectation over all randomness:
\[
\mathbb{E}\left[\frac{1}{T} \sum_{t=1}^T \mathcal{L}(\overline{W_t})\right] - \mathcal{L}(W_*) \leq \frac{\|W_1 - W_*\|^2}{2 \eta T} + \epsilon + \frac{\eta G^2}{2} \mathbb{E}[\|M_t \cdot W_t^{|\cdot|}\|^2].
\]

\subsection{Convergence bound of Re-optimization phase}

With a fixed explored sparse structure, the convergence bound for standard gradient descent over \( T \) iterations is given by:
\[
\mathbb{E}\left[\frac{1}{T} \sum_{t=1}^{T} \mathcal{L}(W_t)\right] - \mathcal{L}(W_*) \leq \frac{\| W_1 - W_* \|^2}{2\eta T} + \frac{\eta G^2}{2}.
\]

When initializing \( \tilde{W}_1 \) with the parameters found in the exploration phase (\( W_1^{\overline{W}} \)), the distance to the optimal solution is smaller compared to random initialization (\( W_1^{\text{random}} \)). Specifically, since:
\[
\|W_1^{\text{random}} - W_*\|^2 > \|W_1^{\overline{W}} - W_*\|^2,
\]
the improvement in the convergence bound is given by:
\[
\Delta = \frac{\|W_1^{\text{random}} - W_*\|^2 - \|W_1^{\overline{W}} - W_*\|^2}{2 \eta T}.
\]
Thus, initializing from \( W_1^{\overline{W}} \) leads to faster convergence compared to random initialization.


\section{Comparative analysis of gradient update methods: standard vs weight-scaled approaches}

Figure~\ref{mean_compa} shows the average weights per block and accuracy after applying different gradient update methods in the re-optimization phase. The blue line represents our standard gradient update method, while the orange line shows the $\text{gradient} \times |W|$ approach used in the exploration phase. The results indicate that multiplying by $|W|$ leads to overall smaller weight values and lower accuracy. This occurs because the weight magnitudes $|W|$ are typically less than 1, effectively reducing the update magnitude. Such reduced updates essentially mimic a lower learning rate, potentially slowing down optimization or increasing the risk of getting trapped in local minima.
 
\begin{figure}[]
\centering
    \begin{subfigure}{\linewidth}
        \centering 
        \includegraphics[width=\linewidth]{Images/weight_mean_comparison.png}
    \end{subfigure}
\caption{Mean absolute weights per block with different update methods in re-optimization phase using DeiT-Small model with 40\% sparsity on ImageNet dataset}
\label{mean_compa}
\end{figure}

\section{Hardware Friendly pruning}

When masks are not shared across query/key, value/proj, and FFN layers, dimension mismatch problems similar to Zeroing Pruning may arise. The left side of Figure~\ref{hard} illustrates that removing the third column from the input and the second row from the weight matrix results in unnecessary computations such as 1a + 2d + 3g in the final output. Upon actual compression of such a model, as shown in the bottom left, the computational results deviate from pre-compression values. This presents a significant obstacle to effective hardware optimization. Therefore, as demonstrated in the right image, we need a hardware-friendly pruning approach that maintains consistent input/output dimensions.


\begin{figure*}[]
\centering
    \begin{subfigure}{\linewidth}
        \centering 
        \includegraphics[width=\linewidth]{Images/hardware.png}
    \end{subfigure}
\caption{Illustration of dimension mismatch caused by non-shared weight masking (left) versus Hardware Friendly Pruning with consistent dimensions through shared masking (right).}
\label{hard}
\end{figure*}


\section{Analysis of weight distributions}

\begin{figure}[t]
\centering
% Top row with two plots side by side
\subfloat[Block (1--4) distribution]{
  \includegraphics[width=0.48\textwidth]{Images/weight_distributions_part1.png}
}
\hfill
\subfloat[Block (5--8) distribution]{
  \includegraphics[width=0.48\textwidth]{Images/weight_distributions_part2.png}
}
% Bottom single plot centered
\begin{center}
\subfloat[Block (9--12) distribution]{
  \includegraphics[width=0.48\textwidth]{Images/weight_distributions_part3.png}
}
\end{center}
\caption{Weight distribution comparison of DeiT-Tiny model trained on CIFAR-100: block-wise Query/Key/Value/FFN1 distributions of SERo (blue) and Gradient magnitude (red). Yellow regions indicate overlapping distributions.}
\label{fig:distributions}
\end{figure}

%%%%

% \usepackage{multicol}

\begin{multicols}{2}
\begin{figure}[H]
\centering
    \begin{subfigure}{\linewidth}
        \includegraphics[width=1.0\linewidth]{Images/weight_block_distributions.png}
    \end{subfigure}
\caption{Weight distribution comparison of DeiT-Tiny model trained on CIFAR-100: Weight distributions per block of SERo (blue) and Gradient magnitude (red). Yellow regions indicate overlapping distributions.}
\label{weight_block_dist}
\end{figure}

\begin{figure}[H]
\centering
    \begin{subfigure}{\linewidth}
        \centering  % add \centering inside the subfigure
        \includegraphics[width=0.83\linewidth]{Images/mean_weights.png}
    \end{subfigure}
\caption{Weight average comparison of Query/Key/Value/FFN1 per block in DeiT-Tiny model trained on CIFAR-100: SERo (orange) and Gradient magnitude (blue).}
\label{mean_weight}
\end{figure}
\end{multicols}

%%%%%


As shown in Figures~\ref{fig:distributions} and~\ref{weight_block_dist}, while SERo (blue) and Gradient magnitude (red) exhibit similar pruning patterns (yellow overlapping regions), SERo's superior performance can be attributed to subtle differences in non-overlapping regions and particularly effective weight preservation in the Value layer.

Figure~\ref{mean_weight} further supports this analysis, showing that Query and Key layers display very similar patterns with peaks at block 5 followed by gradual decrease, while the Value layer maintains consistently high mean weights after block 1, with a notable difference at block 12. SERo (orange) maintains higher mean absolute weight values throughout the network, indicating better preservation of important weights. Notably, the Value layer plays a particularly crucial role in this preservation, demonstrating SERo's effectiveness in maintaining essential weight information for model performance.

\begin{figure}[]
\centering
    \begin{subfigure}{\linewidth}
        \centering 
        \includegraphics[width=\linewidth]{Images/gradcam_comparison.png}
    \end{subfigure}
\caption{Comparison of combined and individual block GradCAM between original (left) and 70\% pruned (right) DeiT-Base model.}
\label{gradcam}
\end{figure}



\section{Implementation details}
We conduct experiments using the officially released DeiT models, maintaining consistent hyperparameters for both sparse structure exploration and re-optimization phases. The sparse structure exploration runs for 50 epochs and re-optimization for 150 epochs, with both phases using a learning rate of 5e-5 and a weight decay of 0.05. The AdamW optimizer with a cosine learning rate scheduler is applied in all experiments. The batch size is set to 256 or 512 for the ImageNet-1K dataset and 128 for CIFAR datasets. All ImageNet-1K experiments are conducted on NVIDIA A6000 GPU, while CIFAR experiments are run on NVIDIA RTX 3090 GPU. For more implementation details, please refer to our released code.


\section{Model and Task Generalization}

To verify that SERo's applicability is not limited to classification tasks on DeiT models, we conducted experiments as shown in Tables~\ref{tab:swin_comparison}, \ref{tab:damo_comparison}, and~\ref{tab:faceptor_performance_results}.

First, in Table~\ref{tab:swin_comparison}, we performed classification task experiments on the Swin Transformer~\citep{liu2021swintransformerhierarchicalvision} model rather than the DeiT model. As specified in the training details of the paper, we applied the hyperparameters used for DeiT training to perform 50\% pruning on this model and measured the computational cost (FLOPs) and accuracy. The experimental results showed that despite significantly improving efficiency by reducing FLOPs by approximately 50\%, the accuracy drop was only about 1.9\%.

In Table~\ref{tab:damo_comparison}, to evaluate the effectiveness of SERo in non-classification tasks such as object detection, we applied SERo to the DAMO~\citep{xu2023damoyoloreportrealtime} model architecture, which incorporates attention mechanisms, and conducted evaluation on the COCO 2017~\citep{lin2015microsoftcococommonobjects} dataset. When SERo was applied to the DAMO-NL variant achieving approximately 48\% parameter reduction, FLOPs decreased by 48.4\% while mAP dropped by only 2.07 points (from 40.5 to 38.43). Similarly, other DAMO model variants (NS, NM) also showed only limited mAP performance degradation despite significant reductions in FLOPs and parameters.

Additionally, Table~\ref{tab:faceptor_performance_results} shows the experimental results on the Faceptor~\citep{qin2024faceptorgeneralistmodelface} model, a more complex transformer model that combines encoder-decoder architecture, using six datasets~\citep{7477558, zhang2017ageprogressionregressionconditionaladversarial, liu2015deeplearningfaceattributes, lee2020maskgandiverseinteractivefacial, Liu_Shi_Shen_Si_Wang_Mei_2020, 6755925}. SERo maintained strong performance across various face-related tasks even at high compression rates (achieving 62.59M parameters with approximately 40\% compression), with minimal performance degradation and even performance improvements in some tasks such as Face ID (CFP\_FP) and Face Alignment (300W, LaPa).

\begin{table}[!t]
    \centering
    \caption{Comparison of Swin Transformer Models with SERo}
    \label{tab:swin_comparison}
    \begin{tabular}{lcccc}
        \toprule
        \textbf{Model} & \textbf{Parameters} & \textbf{Pruning Rate (\%)} & \textbf{FLOPs} & \textbf{Accuracy} \\
        \midrule
        Swin-T (Original) & 29M & - & 4.5G & 81.3\% \\
        Swin-S (Original) & 50M & - & 8.7G & 83.0\% \\
        Swin-S + SERo & 25M & 50\% & 4.3G & 81.1\% \\
        \bottomrule
    \end{tabular}
\end{table}


\begin{table*}[!t]
    \centering
    \caption{Performance Comparison of DAMO Models with SERo}
    \label{tab:damo_comparison}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccccccc}
        \toprule
        \textbf{Model} & \textbf{Pruning Method} & \textbf{Compression} & \textbf{FLOPs} & \textbf{Params} & \textbf{mAP @} & \textbf{Latency} & \textbf{Latency} \\
        & & \textbf{Ratio} & \textbf{(G)} & \textbf{(M)} & \textbf{0.5:0.95} & \textbf{Orin batch 1} & \textbf{Nuc batch 1} \\
        \midrule
        DAMO NS & Original & - & 1.62 & 1.41 & 32.3 & 3.06 & 110.78 \\
        DAMO NS & SERo (ours) & 33\% & 1.09 & 0.93 & 30.94 & 2.29 & 66.67 \\
        \midrule
        DAMO NM & Original & - & 3.8 & 2.71 & 38.2 & 2.56 & 72.72 \\
        DAMO NM & SERo (ours) & 49\% & 1.99 & 1.37 & 35.19 & 2.19 & 49.59 \\
        \midrule
        DAMO NL & Original & - & 6.16 & 5.69 & 40.5 & 1.90 & 38.05 \\
        DAMO NL & SERo (ours) & 48\% & 3.18 & 2.87 & 38.43 & 1.61 & 28.9 \\
        \bottomrule
    \end{tabular}%
    }
\end{table*}

\begin{table*}[!t]
    \centering
    \caption{Performance Comparison of Faceptor Models with SERo}
    \label{tab:faceptor_performance_results}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llcccccccc}
        \toprule
        \textbf{Model} & \textbf{Parameters} & \textbf{FLOPs} & \textbf{CFP\_FP} & \textbf{UTK\_FACE} & \textbf{CelebA} & \textbf{CelebAMask-HQ} & \textbf{LaPa} & \textbf{LaPa} & \textbf{300W} \\
        & \textbf{(M)} & \textbf{(G)} & \textbf{(Val Acc $\uparrow$)} & \textbf{(MAE $\downarrow$)} & \textbf{(mACC)} & \textbf{(F1-mean $\uparrow$)} & \textbf{(F1-mean $\uparrow$)} & \textbf{(Inter-Ocular $\downarrow$)} & \textbf{(Inter-Pupil $\downarrow$)} \\
        \midrule
        Faceptor & 105.2 & 108.46 & 96.14 & 4.21 & 23.12 & 88.22 & 91.94 & 4.63 & 6.67 \\
        \midrule
        SERo & 62.59 & 67.55 & 96.46 & 4.32 & 23.13 & 88.00 & 91.69 & 4.52 & 6.51 \\
        \midrule
        SERo & 42.57 & 46.97 & 92.69 & 4.41 & 23.13 & 86.74 & 90.95 & 4.68 & 6.74 \\
        \bottomrule
    \end{tabular}%
    }
\end{table*}


\section{Hyperparameter Sensitivity Analysis}

In this section, we conduct an analysis of hyperparameter sensitivity. These studies utilized the DeiT-small model pruned to 70\% sparsity, with performance evaluated based on CIFAR-100 accuracy.

Table~\ref{tab:learning_rate} examines the effect of learning rate in the re-optimization phase. When set higher than the exploration phase learning rate (5e-4 vs. 5e-5), convergence problems occurred, resulting in a low accuracy of 78.8\%. In contrast, using the same learning rate (5e-5) for both exploration and re-optimization achieved the best performance of 83.1\% accuracy. This is because it can effectively maintain the sparse structure discovered during training. Using a lower learning rate (5e-6) than the exploration phase resulted in insufficient optimization, achieving only 80.69\% performance. Therefore, we confirmed that using the same learning rate for both phases is the most effective approach.

Table~\ref{tab:pruning_frequency} examines the impact of pruning mask update frequency on model performance. The frequency of 8 (mask update every 8 epochs) used in the main experiments achieved the best performance of 83.1\%. When updating the mask every epoch (frequency=1), performance decreased to 82.24\% due to structural instability. In contrast, lower update frequencies (frequency 16, 32, 64) maintained stable performance in the range of 82.7--82.9\%. This demonstrates that excessively frequent mask updates negatively affect performance, while stability is secured above a certain threshold.


Table~\ref{tab:epoch_allocation} analyzes the impact of epoch allocation between the exploration and re-optimization phases on performance. When configured with 50 epochs for exploration and 150 epochs for re-optimization, the optimal balance was achieved, recording the highest accuracy of 83.1\%. This demonstrates the importance of maintaining an appropriate balance between exploration and re-optimization. When exploration was insufficient (2 epochs), performance was limited to 82.4\%, while conversely, excessive exploration with minimal re-optimization (199 exploration epochs, 1 re-optimization epoch) significantly degraded performance to 78.8\%.

\begin{multicols}{2}
\begin{table}[H]
\caption{Effect of learning rate in the re-optimization phase on model performance. DeiT-small model pruned to 70\% sparsity evaluated on CIFAR-100.}
\label{tab:learning_rate}
\centering
\begin{tabular}{cccc}
\toprule
& \textbf{Step} & \textbf{LR} & \textbf{Acc (\%)} \\
\midrule
\multirow{4}{*}{\textbf{SERo}} & Exploration & 5e-5 & 79.67 \\
& Exploration + Re-optimization & 5e-4 & 78.8 \\
& Exploration + Re-optimization & 5e-5 & \textbf{83.1} \\
& Exploration + Re-optimization & 5e-6 & 80.69 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[H]
\caption{Impact of pruning mask update frequency on model performance. DeiT-small model pruned to 70\% sparsity evaluated on CIFAR-100.}
\label{tab:pruning_frequency}
\centering
\begin{tabular}{ccc}
\toprule
& \textbf{Frequency} & \textbf{Acc (\%)} \\
\midrule
\multirow{5}{*}{\textbf{SERo}} & 1 & 82.24 \\
& 8 & \textbf{83.1} \\
& 16 & 82.75 \\
& 32 & 82.71 \\
& 64 & 82.9 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[H]
\caption{Impact of epoch allocation between exploration and re-optimization phases on model performance. DeiT-small model pruned to 70\% sparsity evaluated on CIFAR-100.}
\label{tab:epoch_allocation}
\centering
\begin{tabular}{cccc}
\toprule
& \textbf{Exploration} & \textbf{Re-optimization} & \textbf{Accuracy (\%)} \\
\midrule
\multirow{7}{*}{\textbf{SERo}} & 2 & 198 & 82.4 \\
& 25 & 175 & 82.9 \\
& 50 & 150 & \textbf{83.1} \\
& 100 & 100 & 82.5 \\
& 150 & 50 & 82.39 \\
& 175 & 25 & 81.85 \\
& 199 & 1 & 78.8 \\
\bottomrule
\end{tabular}
\end{table}
\end{multicols}


As shown in Table~\ref{tab:gflops_comparison}, SERo significantly reduces the computational operations in the re-optimization phase compared to the fine-tuning phase. Notably, for the DeiT-Base model with 70\% sparsity, the computational operations (GFLOPs) decrease from 17.6 to 5.4, showing approximately 70\% reduction, which demonstrates that our proposed re-optimization strategy can effectively alleviate the computational burden of the model.
Furthermore, Table~\ref{tab:cifar_strategies} demonstrates that global pruning consistently outperforms layer-wise pruning across all model variants on CIFAR-100. The performance gap is particularly notable for DeiT-Small, where global pruning achieves 83.34\% accuracy compared to 78.28\% with layer-wise pruning, suggesting that maintaining the flexibility to redistribute sparsity across layers is beneficial for model performance.

\begin{multicols}{2}
\begin{table}[H]
\caption{Comparison of GFLOPs between fine-tuning and re-optimization phases of SERo across different DeiT architectures.}
\label{tab:gflops_comparison}
\begin{center}
\begin{adjustbox}{width=1.0\linewidth}
\begin{tabular}{c c c c c}
\noalign{\smallskip}\noalign{\smallskip}\hline
\toprule
\multicolumn{5}{c}{\colorbox{gray!20!white}{\textit{GFLOPs}}} \\
& \textbf{Model} & \textbf{Sparsity} & \textbf{Fine-tuning} & \textbf{Re-optimization} \\
\midrule
\multirow{3}{*}{\textbf{SERo}}
& DeiT-Tiny & 40\% & 1.3 & \textbf{0.8} \\
& DeiT-Small & 70\% & 4.6 & \textbf{1.5} \\
& DeiT-Base & 70\% & 17.6 & \textbf{5.4} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}

\begin{table}[H]
\caption{Comparison of model accuracies between layer-wise and global pruning strategies of SERo on CIFAR-100 dataset (Sparsity = 50\%).}
\label{tab:cifar_strategies}
\begin{center}
\begin{adjustbox}{width=1.0\linewidth}
\begin{tabular}{c c c c c}
\noalign{\smallskip}\noalign{\smallskip}
\toprule
\multicolumn{5}{c}{\colorbox{gray!20!white}{\textit{Accuracy (\%) on CIFAR-100}}} \\
& \textbf{Model} & \textbf{Sparsity} & \textbf{Layer-wise} & \textbf{Global} \\
\midrule
\multirow{3}{*}{\textbf{SERo}} 
& DeiT-Tiny & 50\% & 76.95 & \textbf{79.58} \\
& DeiT-Small & 50\% & 78.28 & \textbf{83.34} \\
& DeiT-Base & 50\% & 83.02 & \textbf{85.76} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{center}
\end{table}
\end{multicols}


\section{Attention pattern analysis before and after pruning}

\begin{figure}
\centering
    \begin{subfigure}{\linewidth}
        \centering  
        \includegraphics[width=\linewidth]{Images/attention_comparison4.png}
    \end{subfigure}
\caption{Comparison of combined and individual head attention maps between the original (left) and 70\% pruned (right) DeiT-Base models at block 4.}
\label{attention_comparison4}
\end{figure}


\begin{figure}
\centering
    \begin{subfigure}{\linewidth}
        \centering 
        \includegraphics[width=\linewidth]{Images/attention_comparison10.png}
    \end{subfigure}
\caption{Comparison of combined and individual head attention maps between the original (left) and 70\% pruned (right) DeiT-Base models at block 10.}
\label{attention_comparison10}
\end{figure}




Figure~\ref{gradcam} shows the per-block GradCAM visualization comparing the original and pruned models. The pruned model demonstrates reduced overall noise and shows improved focus on key features such as the chimpanzee's face and form. This suggests that the pruning process effectively preserved the model's essential feature extraction capabilities while eliminating unnecessary activations.

The attention map analysis at blocks 4 and 10 (Figures~\ref{attention_comparison4} and~\ref{attention_comparison10}) reveals interesting pattern differences. In Figure~\ref{attention_comparison4}, the original model shows distributed attention patterns across wider regions, while the pruned model exhibits more focused attention on key features of the peacock (head, tail). In Figure~\ref{attention_comparison10}, both models demonstrate sparse attention patterns, reflecting the tendency to focus on specific feature points in higher blocks. These findings demonstrate that model compression can achieve efficient processing while maintaining key feature recognition capabilities.




\end{document}