\documentclass{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amssymb, amsthm}
\usepackage{subcaption}
\usepackage{relsize}
\usepackage{multirow}
% \usepackage[ruled,vlined,linesnumbered]{algorithm2e}
\usepackage{algorithm}
\usepackage{algorithmic}
% \newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage[justification=Justified,singlelinecheck=off]{caption}
\usepackage[justification=centering,singlelinecheck=off]{subcaption}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{wrapfig}
\usepackage{bbm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newcommand{\lnorm}[1]{\left\lVert#1\right\rVert_1}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert_2}
\newcommand{\fnorm}[1]{\left\lVert#1\right\rVert_F}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\newcommand{\mask}{\mathcal{M}^i_j}
\usepackage{float}
\DeclareMathOperator*{\argmin}{arg\,min}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{SPvR: Structured Pruning via Ranking}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
\maketitle

\begin{abstract}
Deep neural networks have achieved state-of-the-art performance in multiple domains but are increasingly resource-intensive, limiting their deployment on constrained devices.
We introduce Structured Pruning via Ranking (SPvR), a novel structured pruning approach to address this challenge for classification tasks. 
SPvR prunes pre-trained networks in terms of function composition and network width while adhering to a user-specified parameter budget. 
Our method leverages local grouping and global ranking modules to generate smaller yet effective networks tailored to a given dataset and model. 
Finally, we train the pruned networks from scratch, instead of fine-tuning.
Our evaluations demonstrate that SPvR significantly surpasses existing state-of-the-art pruning methods on benchmark datasets,
% namely, CIFAR10, Tiny ImageNet, ImageNet1K and IMDB 50K, 
using standard architectures. 
Even with a $90\%$ reduction in size, SPvR's sub-networks experience a minimal drop in test accuracy $(<1\%)$, while on ImageNet1K, we outperform all baselines by achieving a $<1\%$ Top-5 accuracy drop when pruning $70\%$ of ResNet50 parameters. 
Additionally, when compared to MobileNetV3, an SPvR pruned network improves the Top-1 accuracy by $3.3\%$ with $20\%$ fewer parameters.
Furthermore, we empirically show that SPvR achieves reduced inference latency, underscoring its practical benefits for deploying neural networks on resource-constrained devices.
\end{abstract}

\section{Introduction}\label{sec:intro}
\label{introduction}
Highly over-parameterized, deep neural networks have shown remarkable proficiency in learning effective representations in diverse domains such as computer vision \citep{wang2023yolov7,yuan2021florence,he2016deep}, natural language processing \citep{wu2023brief,radford2019language,devlin2018bert} and speech \citep{DBLP:conf/icml/RadfordKXBMS23,DBLP:conf/nips/BaevskiZMA20}. 
However, their deployment on commercial, especially low-end, hardware is impeded by their substantial size, leading to large memory requirements and extended inference times. 
Consequently, recent deep-learning research has pivoted toward methods for reducing model size. These include network pruning \citep{fang2023depgraph,blalock2020state,li2016pruning,molchanov2016pruning,lecun1989optimal}, low-rank weight approximation \citep{li2023losparse,swaminathan2020sparse,denton2014exploiting}, weight quantization \citep{li2023model,gong2020vecq,courbariaux2016binarized}, and knowledge distillation \citep{liang2023less,pan2020meta,hinton2015distilling}, with pruning receiving notable attention for its effective balance between size reduction and performance.

Network pruning is generally classified into two approaches: unstructured and structured. 
The former involves masking individual weights, leading to sparse models \citep{frankle2018lottery}, while the latter prunes entire neurons or channels, resulting in dense sub-networks \citep{li2016pruning}. 
Sparse models often necessitate specialized hardware for efficiency \citep{han2016eie}, whereas dense sub-networks can reduce both inference time and storage requirements on conventional hardware. 
However, most structured pruning techniques are model-specific and tailored to particular network architectures such as Convolutional Neural Networks \citep{li2022revisiting,sui2021chip,luo2020neural} or language models \citep{ma2023llm,NEURIPS2020_6f5216f8,mccarley2019structured}. 
Many of these methods either overlook the dataset's role in pruning \citep{He_2019_CVPR,li2016pruning} or require training the original large model to identify the optimal sub-network \citep{xia2022structured,NEURIPS2020_6f5216f8,fan2019reducing}. 
Furthermore, pruning algorithms need to generate optimal sub-networks tailored to user-defined parameter budgets, considering the varying sizes of the target deployment devices \citep{tiwari2021chipnet,dupont2021weight}.

In response to these challenges, we introduce SPvR (Structured Pruning via Ranking), a novel, structured pruning approach tailored for classification tasks. 
SPvR is applicable to any pre-trained network, without the need to train the original extensive model. 
It employs a local grouping module that partitions similar neurons layer by layer for efficient pruning alongside a global ranking module that assesses the overall importance of these groups. 
Based on a predefined parameter budget, the least significant groups are eliminated, resulting in a dense, smaller sub-network with reduced depth and parameter count. 
This layer reduction, which typically occurs at high pruning rates, is vital for lowering inference latency, as structured pruning often leads to irregular layer widths, which are not optimal for GPU utilization. 
After pruning, the resulting dense sub-network is re-initialized and trained from scratch.
We choose to retrain instead of fine-tuning as the observations made in the seminal work by \citet{liu2018rethinking} demonstrate that for structured pruning, the resultant architecture is more important than the retained weights. 
% We further empirically validate this hypothesis through the use of Geometric Complexity (GC) \citep{dherin2022neural}, a recently proposed. 

Extensive evaluations on benchmark datasets, including CIFAR10 \citep{krizhevsky2009learning}, Tiny ImageNet \citep{le2015tiny}, ImageNet \citep{deng2009imagenet}, and CityScapes \citep{cordts2016cityscapes}, using distinct architectures, namely, VGG16 \citep{simonyan2014very}, ResNet34 \citep{he2016deep}, ResNet50 \citep{he2016deep}, and SegNet \citep{badrinarayanan2017segnet}, demonstrate that SPvR's recommended shallower sub-networks significantly outperform other methods across all datasets and pruning rates, while also achieving notably lower inference latency due to their reduced depth. 
Furthermore, we exhibit the applicability of our method to resource-constrained devices by demonstrating that an SPvR pruned network with $20\%$ fewer parameters significantly outperforms MobileNetV3 \citep{howard2019searching}, a model specifically designed for mobile phone CPUs.

\subsection{Our Contributions}
\begin{itemize}
    \item \textbf{Development of an Efficient Pruning Algorithm:} Our primary contribution is the development of Structured Pruning via Ranking (SPvR), a novel, structured pruning approach. 
    SPvR is capable of effectively generating shallow and dense sub-networks tailored to specific datasets, pre-trained models, and user-defined parameter budgets. 
    Unlike some state-of-the-art methods such as OTOv2~\citep{chen2023otov2}, SPvR only necessitates backpropagation on the pruned model (re-training) rather than the original, massive network.

    % \item \textbf{Empirical Justification for Re-training Strategy:} We provide robust empirical evidence supporting the efficacy of re-initialization and training from scratch for structurally pruned networks. 
    % This is substantiated by leveraging Geometric Complexity, a recently proposed model complexity measure, in a novel setting. 
    % Our findings indicate that this approach is a more effective strategy than traditional fine-tuning methods, especially in the context of structurally pruned networks.

    \item \textbf{Comprehensive Evaluation Demonstrating Enhanced Performance and Efficiency:} Through extensive experimental evaluations, we demonstrate that sub-networks generated by SPvR, when trained from scratch, consistently outperform existing pruning methods as well as hand-crafted architectures. 
    This superior performance is observed across a range of benchmark datasets. 
    Furthermore, a significant contribution of our work is the achievement of reduced model inference latency. 
    This aspect is crucial for deploying neural networks in resource-constrained environments, aligning with the growing need for efficient and fast computational models.
\end{itemize}

\section{Related Work}
\label{sec:related}
Structured pruning strategies for convolutional neural networks have seen a variety of approaches \citep{he2023structured}. 
Weight-dependent methods, including norm-based filter pruning ($\ell_1$ and $\ell_2$ norms), focus on filter removal based on their norms \citep{li2016pruning}, while Filter Pruning via Geometric Median (FPGM) targets filters near the geometric median of a layer \citep{He_2019_CVPR}. 
However, these methods overlook the data's influence on the final architecture. 
Activation-based pruning methods, such as HRank \citep{lin2020hrank} and CHIP \citep{sui2021chip}, remove filters by analyzing activation ranks or cross-channel correlations. Techniques like ThiNet \citep{luo2017thinet} and NISP \citep{yu2018nisp} determine filter importance through reconstruction error or feature ranking, respectively. CURL \citep{luo2020neural} employs KL-divergence for channel masking and global filter removal. These methods, however, suffer from being time-consuming (e.g., HRank), neglecting output layer changes (CHIP), or using suboptimal ranking (ThiNet, CURL).
Regularization approaches like Network Slimming \citep{liu2017learning} target filters with minimal scaling factors in batch normalization layers, whereas optimization-based methods, such as those using Taylor Expansion \citep{molchanov2019importance}, rank filters by weight and gradient impacts. Random Channel Pruning (RCP) integrates the Lottery Ticket Hypothesis, selecting pruned models for further training \citep{li2022revisiting, frankle2018lottery}. Both regularization and optimization require initial full model training.
OTOv2 (Only Train Once) \citep{chen2023otov2} stands out by pruning both vision and language models.
However, these techniques necessitate training the original, large network from scratch to identify pruned sub-networks.
Unlike most structured pruning methods, our proposed technique focuses on depth reduction to lower inference latency, offering a significant advantage in resource efficiency by retraining only the pruned networks.

\section{SPvR: An Efficient Model Pruning Algorithm}
Structured Pruning via Ranking (SPvR) is a novel, network pruning algorithm that assesses the global importance of neurons/filters through forward passes on the original network, targeting the least important ones for pruning. 
This approach reduces parameter count and the network's depth, leading to faster sub-networks during training and inference compared to those from other pruning methods. 
SPvR comprises two key components: a local grouping module that clusters similar neurons/filters within layers and a global ranking module that prioritizes these groups for efficient pruning.


\subsection{Ranking Module}
\label{sec:ranking}
Let $f_\theta$ be an $L$ layer neural network parameterized by $\theta$ where $\theta=\{\theta^1, \theta^2, \cdots, \theta^L\}$. 
Here, $\theta^i$ represents the parameters of layer $i$ while  $\theta^i_j$ denotes the $j$-th neuron/filter at layer $i$. 
Given a dataset $\mathcal{D}=\{(x_1,y_1), \cdots, (x_n,y_n)\}$ composed of input and output pairs $x_k$ and $y_k$, respectively, the task of training $f_\theta$ is solving the following minimization problem:
\begin{equation}
    \min\limits_\theta \frac{1}{n}\sum_{k=1}^nE(y_k,f_\theta(x_k))
\end{equation}
where $E$ is the error function, $f_\theta(x_k) \in \mathbb{R}^c$ is the softmax final output of $f_\theta$ for a given input $x_k$ and $c$ is the number of classes. 
% A neuron is important if its removal changes the class labels of the input samples compared to the original net. 
A neuron is important if its removal significantly changes the output of $f_\theta$. 
Let $f_{\mask(\theta)}$ denote the network after masking $j$-th neuron/filter in the $i$-th layer from the original network. More precisely, $\mask$ is defined as follows:

\[
\left[\mask(\theta)\right]^{m}_{r} =\begin{cases}
            \theta^{m}_r & \text{ if } m \neq i \vee r \neq j\\
            0 & \text{ otherwise}
\end{cases}
\]

Masking $\theta^i_j$ may lead to one of the following cases:

% \noindent\begin{minipage}{.5\linewidth}
% \begin{equation}
%     \begin{split}
%         &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 < \epsilon \\
%         &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p = \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
%     \end{split}
%     \nonumber
% \end{equation}
% \end{minipage}
% \noindent\begin{minipage}{.5\linewidth}
% \begin{equation}
%     \begin{split}
%         &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 < \epsilon \\
%         &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p \neq \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
%     \end{split}
%     \nonumber
% \end{equation}
% \end{minipage}
    \begin{enumerate}[label*=\arabic*)]
    \item $\begin{aligned}[t]
        &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 < \epsilon \\
    &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p = \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
    \end{aligned}$

    \item  $\begin{aligned}[t]
        &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 \geq \epsilon  \\
    &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p = \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
    \end{aligned}$
    % \nolinenumbers
    \item $\begin{aligned}[t]
        &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 < \epsilon \\
    &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p \neq \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
    \end{aligned}$

    \item $\begin{aligned}[t]
        &\lVert f_\theta(x_k) - f_{\mask(\theta)}(x_k)\rVert_2 \geq \epsilon \\
    &\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p \neq \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p
    \end{aligned}$
\end{enumerate}

Here, $\epsilon > 0$ is an arbitrarily small threshold ($\epsilon\to 0$).
If removing $\theta^i_j$ results in case $1$, it is considered the least important neuron/filter. On the other hand, if the removal of $\theta^i_j$ leads to case $4$, it is considered the most important neuron/filter. 
Following the above cases, under the i.i.d. assumption \citep{molchanov2019importance}, the importance of the $j$-th neuron in the $i$-th layer is determined by:
{\small
\begin{gather}
        \mathcal{I}^i_j = \sum_{k=1}^n L\left(f_\theta(x_k), f_{\mask(\theta)}(x_k)\right)\label{eqn:scoring_function}\\
        \begin{aligned}
        \text{where, }&L = I + \left\lvert f_\theta(x_k)_q - f_{\mask(\theta)}(x_k)_q\right\rvert\nonumber\\
        &I = \begin{cases}
            1 & \text{ if } ~\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p\neq \underset{p}{\mathrm{argmax}}~f_{\mask(\theta)}(x_k)_p\\
            0 & \text{ otherwise}
        \end{cases}
        \end{aligned}
\end{gather}
}

Here, $q=\underset{p}{\mathrm{argmax}}~f_\theta(x_k)_p$ indicates the class predicted by $f_{\theta}(x_k)$. 
The inclusion of $I$ in the scoring function $L$ is motivated by the idea that if masking a neuron/filter leads to a misclassification, it should be considered crucial for the task and assigned a higher importance value. 
Given that $\left\lvert f_\theta(x_k)_q - f_{\mask(\theta)}(x_k)_q\right\rvert \leq 1$, assigning a value less than $1$ diminishes the significance of misclassification, while any value above $1$ has the same effect on the final ranking (the scores may vary, but the rank remains consistent). 
Therefore, in our experiments, we assign $1$ for misclassifications. 
If two different neurons/filters result in the same number of misclassifications including no misclassifications, as indicated by $I$, the tie is resolved by $\left|f_\theta(x_k)_q - f_{\mask(\theta)}(x_k)_q\right|$, which assesses the deviation in the predicted class's probability before and after masking. 





\subsubsection{SPvR Ranking Function vs KL Divergence} 
Sub-optimal ranking criteria have previously been employed to gauge the impact of masking a neuron/filter on a network's final output, thereby estimating its importance. 
CURL \citep{luo2020neural}, a state-of-the-art method, uses a criterion based on KL-divergence. 
We demonstrate the advantage of our approach over KL-divergence with an example, further supported by a detailed empirical evaluation in Appendix~\ref{sec:appendix-kl-vs-us}.

Consider a three class classification task where a pre-trained network's final layer has three neurons representing classes $0$, $1$ and $2$, respectively, with a softmax function applied to the output. 
    For a sample input $x$ yielding $f_\theta(x)=[0.1, 0.3, 0.6]^T$ from the model, the sample is classified as belonging to class $2$. 
    Now, for three neurons indexed by $j=1$, $j=2$ and $j=3$ in the $i$-th layer, let $f_{\mathcal{M}^{i}_{1}(\theta)}(x)=[0.1, 0.6, 0.3]^T$, $f_{\mathcal{M}^{i}_{2}(\theta)}(x)=[0.01, 0.1, 0.89]^T$ and $f_{\mathcal{M}^{i}_{3}(\theta)}(x)=[0.1, 0.8, 0.1]^T$. 
    The neuron at $j=3$ is the most crucial, since its removal leads to misclassification along with a large change in output, followed by $j=1$ (misclassification) and $j=2$ (no effect on the network's classification accuracy). 
    Let $\theta' = \mathcal{M}^{i}_{1}(\theta)$, $\theta'' = \mathcal{M}^{i}_{2}(\theta)$ and $\theta''' = \mathcal{M}^{i}_{3}(\theta)$. 
    We calculate the importance of neurons at $j=1$, $j=2$ and $j=3$ using both KL divergence and our proposed method.

\begin{center}
    \noindent\begin{minipage}{.5\linewidth}
    \begin{equation}
        \begin{split}
             &\text{KL}\left(f_\theta(x)\parallel f_{\theta'}(x)\right)=0.21\nonumber\\
             &\text{KL}\left(f_\theta(x)\parallel f_{\theta''}(x)\right)=0.32\nonumber\\
             &\text{KL}\left(f_\theta(x)\parallel f_{\theta'''}(x)\right)=0.78\nonumber\\
        \end{split}
    \end{equation}
\end{minipage}%
\begin{minipage}{.5\linewidth}
    \begin{equation}
      \begin{split}
         &L\left(f_\theta(x),f_{\theta'}(x)\right)=1.3\nonumber\\
         &L\left(f_\theta(x),f_{\theta''}(x)\right)=0.29\nonumber\\
         &L\left(f_\theta(x),f_{\theta'''}(x)\right)=1.5\nonumber\\
        \end{split}
  \end{equation}
\end{minipage}
\end{center}


In this scenario, the KL divergence criterion incorrectly assigns greater importance to the neuron at $j=2$ in comparison to $j=1$, whereas our proposed ranking function accurately identifies the correct ranking.

\subsection{Grouping Module}
Determining $\mathcal{I}^i_j$ for individual neurons/filters in a wide and deep network is computationally expensive. 
In order to make SPvR more compute efficient, we group layerwise similar neurons/filters so that $\mathcal{I}^i_j$ estimates the importance of the $j$-th group in the $i$-th layer where the size of the group is a hyperparameter denoted by $d$. 
Let $S\in \mathbb{R}^{n\times m}$ be a network's intermediate layer output. 
The similarity between two neurons/filters is measured by the correlation between their output activations/channels given by the following correlation matrix, $C\in\mathbb{R}^{m\times m}$:
\begin{equation}
    C = \hat{S}^T\hat{S}
\label{eqn:correlation}
\end{equation}
\begin{equation}
\begin{split}
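    % Diag(\bar{S}^T\bar{S}) already contains the factor n, so no extra 1/\sqrt{n} is needed for C to have unit diagonal.
    \text{ where, }~&\hat{S} = \Bar{S}\,\mathrm{Diag}\left(\Bar{S}^T\Bar{S}\right)^{-1/2} \\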
    ~\text{ and, }~ &\Bar{S} = S-\frac{1}{n}\mathbbm{1}_n\mathbbm{1}_n^TS\\
\end{split}
    \nonumber
\end{equation}

\begin{algorithm}[ht]
   \caption{Grouping Module}
   \label{alg:grouping}
\begin{algorithmic}
   \STATE {\bfseries Input:} Correlation Matrix $C$, group-size $d$
   \STATE Let $\mathcal{G}=\{\}$ (a set of sets) and $\mathcal{Q} = \{1,\cdots,m\}$
   % \REPEAT
   \FOR{$j \in \mathcal{Q}$}
   \STATE $\mathcal{T}=$ Indices corresponding to top $d$ values in $C_{(j,:)}$
   \STATE $\mathcal{G}.\mathrm{append}(\mathcal{T})$
   \STATE $C_{(:,\mathcal{T})}=-\infty$
   \STATE $\mathcal{Q} = \mathcal{Q}\setminus \mathcal{T}$
   % \STATE $C = C[p,q] ~\forall p=(1\text{ to }m),\forall q=(1\text{ to }m)$ such that $(j\notin g) \wedge (k\notin g)$
   % \STATE $C = C[p,q] ~\forall p,q \in \mathcal{Q}$
   
   % \ENDIF
   \ENDFOR
   \STATE {\bfseries Output:} $\mathcal{G}$
   % \UNTIL{$noChange$ is $true$}
\end{algorithmic}
\end{algorithm}

Here, Diag$\left(\Bar{S}^T\Bar{S}\right)$ is a diagonal matrix where the diagonal entries are equal to the diagonal entries of $\Bar{S}^T\Bar{S}$. 
Based on $C$, the neurons are partitioned into mutually exclusive groups using Algorithm~\ref{alg:grouping}, where $C_{(j,:)}$ denotes the $j$-th row of $C$ and $C_{(:,\mathcal{T})}$ denotes all columns indexed by $\mathcal{T}$. 
In the case of CNNs, the output of the $i$-th layer is denoted by a tensor $S^i\in\mathbb{R}^{n\times m\times w\times h}$ where $n$ is the number of samples, $m$ is the number of output channels, and $w$ and $h$ are the width and height of each output channel, respectively. 
In such a scenario, the tensor is first reduced to a matrix, $S_{pq}^i\in\mathbb{R}^{n\times m}$ as follows:

\begin{equation}
    \begin{gathered}
    S^i_{pq} = \sum_{r=1}^w\sum_{t=1}^{h}\lvert S^i_{pqrt}\rvert\\
    \forall p\in\{1,2,\cdots,n\} \text{ and } \forall q\in\{1,2,\cdots,m\}
    \label{eqn:reduce}
\end{gathered}
\end{equation}
Applying Eqn.~\ref{eqn:correlation} and Algorithm~\ref{alg:grouping} to $S^i_{pq}$ yields the groups for the current layer. 
Although Eqn.~\ref{eqn:correlation} describes a linear correlation, we find that it works well in practice while being lightweight to compute.

Once the neurons/filters are grouped, the importance of each group across all layers is assessed using the ranking module. These groups are globally sorted throughout the model, and the least important ones are pruned away until the desired parameter count is reached. 
At high pruning rates, this process may result in the pruning of entire layers, as we do not impose a minimum threshold for layer pruning. 
Such a scenario, known as layer collapse, leads to entire layers being removed, rendering a network untrainable \citep{NEURIPS2020_46a4378f}. 
This poses a challenge for methods that rely on preserved weights for subsequent fine-tuning. 
In contrast, our approach views the pruned sub-network as a new model and initializes (re-initialization) its parameters through standard initialization techniques. 
After re-initialization, the pruned networks undergo training from scratch, a strategy recommended by \citet{liu2018rethinking}. 
Consequently, layer collapse is a beneficial feature of our method contributing significantly to reducing inference latency in the pruned models at high pruning rates. 
Ultimately, SPvR autonomously identifies the optimal number of layers and the precise layerwise width of the final pruned architecture for the specific dataset, model, and parameter budget.

\textbf{SPvR Pruning Time Complexity:} 
The grouping module requires only a single forward pass through the original large model to compute layerwise groups across the entire network, while the ranking module requires $\sum_{i=1}^{L}\left\lceil \frac{m_i}{d}\right\rceil$ forward passes, where $m_i$ is the number of neurons/filters at the $i$-th layer of an $L$-layer neural network and $d$ is the group size, a hyper-parameter. Hence, the time complexity of our pruning algorithm, similar to most other pruning algorithms, is $O(n)$, where $n$ is the number of samples.


\begin{table*}[ht]
\small
\caption{Comparison of the \textbf{Top-1} and \textbf{Top-5} accuracy scores on the CIFAR10, Tiny ImageNet and ImageNet1K datasets, for multiple pruning methods at different levels of pruning for the VGG16, ResNet34 and ResNet50 networks, respectively. 
Higher values are better. 
Bold values indicate the best score. 
The \textit{Param} column indicates the percentage of parameters removed from the original model.
}
\label{tab:accuracy}
\begin{center}
% \setlength\tabcolsep{4.8pt}
\begin{tabular}{cccccccccccccc}
\toprule
 &  &  & \multicolumn{11}{c}{Methods}\\
 \cmidrule{4-14}
Dataset & Model & Param & Base & \multicolumn{1}{c}{$\ell_1$} & \multicolumn{1}{c}{$\ell_2$} & \multicolumn{1}{c}{Taylor} & \multicolumn{1}{c}{FPGM} & \multicolumn{1}{c}{RCP} & \multicolumn{1}{c}{HRank} & \multicolumn{1}{c}{CURL} & \multicolumn{1}{c}{NISP} & \multicolumn{1}{c}{OTOv2} & \multicolumn{1}{c}{SPvR} \\
\midrule
\multirow{6}{*}{\makecell{CIFAR\\ 10}} & \multirow{6}{*}{\makecell{VGG16 \\ (Top-1)}} & 00\% & 94.25 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} \\
 &  & 70\% & - & 93.53 & 93.45 & 93.23 & 92.72 & 86.68 & 92.84 & 94.18 & 92.51 & 93.20 & \textbf{94.21} \\
 &  & 80\% & - & 92.60 & 92.99 & 92.87 & 91.63 & 85.90 & 93.09 & 93.89 & 91.48 & 92.70 & \textbf{94.33} \\
 &  & 90\% & - & 92.05 & 91.64 & 91.55 & 90.85 & 84.00 & 92.36 & 93.49 & 90.28 & 91.07 & \textbf{94.43} \\
 &  & 95\% & - & 90.20 & 90.36 & 90.37 & 88.41 & 84.09 & 91.44 & 92.14 & 88.77 & 91.03 & \textbf{93.64} \\
 &  & 98\% & - & 87.32 & 87.64 & 87.22 & 87.09 & 83.78 & 91.10 & 91.66 & 87.00 & 87.88 & \textbf{92.60}\\
\midrule
\multirow{6}{*}{\makecell{Tiny\\ Image\\ Net}} & \multirow{6}{*}{\makecell{Res \\ Net34 \\ (Top-1)}} & 00\% & 63.02 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} \\
 &  & 70\% & - & 60.95 & 60.50 & 60.65 & 60.73 & 58.18 & 57.90 & 58.38 & 57.06 & 58.72 & \textbf{62.89} \\
 &  & 80\% & - & 58.73 & 58.51 & 59.40 & 59.14 & 56.52 & 55.30 & 56.85 & 54.76 & 59.05 & \textbf{62.55} \\
 &  & 90\% & - & 56.42 & 56.78 & 57.06 & 56.22 & 54.21 & 52.85 & 55.13 & 52.62 & 55.94 & \textbf{60.36} \\
 &  & 95\% & - & 55.07 & 54.96 & 55.16 & 52.88 & 50.65 & 50.55 & 48.33 & 50.82 & 52.10 & \textbf{58.95} \\
 &  & 98\% & - & 52.15 & 51.18 & 51.59 & 50.42 & 45.46 & 46.83 & 39.49 & 45.10 & 47.73 & \textbf{55.84}\\
\midrule
\multirow{6}{*}{\makecell{Tiny\\ Image\\ Net}} & \multirow{6}{*}{\makecell{Res \\ Net34 \\ (Top-5)}} & 00\% & 83.22 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} \\
 &  & 70\% & - & 81.59 & 81.28 & 81.68 & 81.68 & 80.51 & 80.27 & 80.68 & 79.39 & 82.04 & \textbf{83.08} \\
 &  & 80\% & - & 80.65 & 80.58 & 80.61 & 80.39 & 79.26 & 78.85 & 80.22 & 77.56 & 82.18 & \textbf{82.73} \\
 &  & 90\% & - & 79.30 & 79.40 & 79.79 & 78.70 & 78.08 & 77.02 & 79.81 & 76.78 & 81.00 & \textbf{82.34} \\
 &  & 95\% & - & 79.16 & 78.32 & 78.86 & 76.46 & 75.57 & 75.68 & 74.80 & 75.35 & 78.66 & \textbf{81.59} \\
 &  & 98\% & - & 76.68 & 77.42 & 77.42 & 76.13 & 72.19 & 73.26 & 67.46 & 71.89 & 74.66 & \textbf{80.72}\\
\midrule
\multirow{4}{*}{\makecell{Image\\ Net1K}} & \multirow{4}{*}{\makecell{Res\\ Net50 \\ (Top-1)}} & 00\% & 76.32 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} \\
 &  & 40\% & - & 73.82 & 73.89 & 71.69 & 74.83 & 75.13 & 74.98 & - & 75.43 & - & \textbf{75.58} \\
 &  & 70\% & - & 70.07 & 70.91 & - & - & - & 69.10 & 73.39 & - & 72.20 & \textbf{73.70} \\
 &  & 80\% & - & 68.11 & 69.00 & - & - & - & - & - & - & 70.10 & \textbf{72.18} \\
 \midrule
 \multirow{4}{*}{\makecell{Image\\ Net1K}} & \multirow{4}{*}{\makecell{Res\\ Net50 \\ (Top-5)}} & 00\% & 92.89 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} \\
 &  & 40\% & - & 91.76 & 91.82 & 91.01 & 92.32 & 92.52 & 92.33 & - & 92.45 & - & \textbf{92.69} \\
 &  & 70\% & - & 89.23 & 89.57 & - & - & - & 89.58 & 91.46 & - & 90.70 & \textbf{91.91} \\
 &  & 80\% & - & 88.06 & 88.43 & - & - & - & - & - & - & 89.30 & \textbf{90.58} \\
 \bottomrule
\end{tabular}
\end{center}
\end{table*}

\section{Experimental Setup}
\label{sec:experiments}
% \subsection{Setup}
% \label{sec:exp_setup}
\subsection{Datasets and Models} 
We evaluate our method on three datasets ranging from small to large scale: CIFAR10 \citep{krizhevsky2009learning} ($50K$ training samples, $10K$ test samples and $10$ classes), Tiny ImageNet \citep{le2015tiny} ($100K$ training samples, $10K$ test samples and $200$ classes) and ImageNet1K \citep{deng2009imagenet} ($1200K$ training samples, $50K$ validation samples and $1000$ classes). 
For CIFAR10, Tiny ImageNet and ImageNet1K, we utilize VGG16 with batch normalization \citep{simonyan2014very}, ResNet34 and ResNet50 \citep{he2016deep} models, respectively.
The selection of datasets, models, and their specific combinations is based on the structured pruning literature \citep{hoang2023revisiting,pmlr-v119-goyal20a,krishnan2019structural}. 

\subsection{Baselines} 
For CIFAR10 and Tiny ImageNet, we compare against $\ell_1$ norm, $\ell_2$ norm, Taylor expansion, HRank, FPGM, CURL, Random Channel Pruning (RCP), NISP, and OTOv2. 
For ImageNet1K, we compare against the same methods but report the results as detailed in their respective publications.


\begin{figure*}[t]
\centering
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/inf_time_vgg16.png}
  \caption{}
  \label{fig:inf_time_vgg16}
\end{subfigure}%
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/inf_time_resnet34.png}
  \caption{}
  \label{fig:inf_time_resnet34}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/inf_time_resnet50.png}
  \caption{}
  \label{fig:inf_time_resnet50}
\end{subfigure}
\caption{Inference latency for (a) VGG16, (b) ResNet34 and (c) ResNet50. 
The results are reported in milliseconds, averaged over $1500$ runs, with a standard deviation between $0.001$ and $0.01$. 
Smaller values are better. }
\label{fig:inf_time}
\end{figure*}


\subsection{Implementation Details} 
Our experiments were conducted using PyTorch v$2.3.1$ \citep{paszke2019pytorch} on an NVIDIA A100 GPU. 
Since no pre-trained models are available for CIFAR10 and Tiny ImageNet, we generate our own ``pre-trained'' versions of VGG16 and ResNet34 by training them from scratch to achieve the maximum reported accuracy on their respective datasets. 
On the other hand, we use an ImageNet1K pre-trained ResNet50 model for pruning.
For training VGG16, ResNet34 and ResNet50, we employed SGD with a momentum of $0.9$ as the optimizer along with the cosine annealing scheduler \citep{loshchilov2016sgdr}. 
The learning rate for each experiment was determined through a grid search within the range $[0.0001,1.0]$. 
Training durations were set at $200$ epochs for VGG16 with a batch size of $128$, $100$ epochs for ResNet34 with a batch size of $512$ and $100$ epochs for ResNet50 with a batch size of $256$. 
The group size, $d$, was set to $2$, $4$ and $8$ for the CIFAR10, Tiny ImageNet and ImageNet1K experiments, respectively. 
The pruning library by \cite{fang2023depgraph} was utilized to implement the baseline methods, except for OTOv2. 
On the ImageNet1K dataset, we compare SPvR's performance against the results reported by the baseline methods. 
Further implementation information is provided in Section \ref{sec:appendix_impl} in the Appendix. 
Details on the architectures recommended by SPvR and their corresponding size on disk at various pruning percentages are available in Section \ref{sec:pruned_archs} in the Appendix while architecture-specific pruning implementations are provided in Section \ref{sec:pruning_strategies} in the Appendix.


\section{Results}
\label{section:results}
\subsection{Performance against Baselines}
Table \ref{tab:accuracy} showcases the top-1 accuracy for CIFAR10 and both top-1 and top-5 accuracies for Tiny ImageNet and ImageNet1K. 
The results clearly indicate that SPvR surpasses all other pruning methods at every pruning stage across all three models by a significant margin. 
Notably, our method enhances the performance of the VGG16 model when $90\%$ of its parameters are pruned while maintaining a minimal accuracy drop of less than $1\%$ even after a $95\%$ reduction in parameters. 
% For the CIFAR10 dataset, the $\ell_1$ and $\ell_2$ norm pruning methods emerge as strong contenders, while HRank and CURL present significant competition to PvR. 
On the Tiny ImageNet dataset, our pruned ResNet34 networks exhibit less than a $1\%$ drop in top-5 accuracy, even with $90\%$ of the model pruned, far surpassing every other pruning baseline. 
Similarly, our $70\%$ pruned ResNet50 achieves less than $1\%$ drop in Top-5 accuracy on the ImageNet1K dataset while also displaying the least drop in Top-1 accuracy with increasing pruning rate.
% At the $98\%$ pruning level, $\ell_1$ norm pruning closely matches our top-1 accuracy, whereas $\ell_2$ norm and Taylor expansion approaches are competitive in top-5 accuracy. 
% On the IMDB 50K movie reviews dataset, SPvR's sub-network at the $90\%$ pruning level outshines TinyBert, a comparably sized handcrafted model. 
% Remarkably, the Bert-base-uncased sub-networks generated through our approach exceed the performance of the original model at various pruning levels, particularly notable at the $98\%$ pruning threshold. 
% Surprisingly, $\ell_1$ and $\ell_2$ norms perform comparably against more complex pruning methods
% which is in line with some earlier reports \cite{li2022revisiting,lin2020hrank,liu2018rethinking,wang2022trainability}. 
As observed by \cite{li2022revisiting}, $\ell_1$ and $\ell_2$ pruning methods are incredibly tough-to-beat baselines when trained using the correct set of hyper-parameters.

\subsection{Retraining vs Fine-tuning} 
\label{sec:appendix_retrain_vs_finetune}
% To further drive home the point that training from scratch for structure pruned networks is a superior option to fine-tuning, 
Is retraining critical for structured pruning? 
Although \cite{liu2018rethinking} answer in the affirmative, we further investigate this phenomenon by comparing the performance of training from scratch against fine-tuning on all chosen datasets and models. 
For fair evaluation, we choose the highest pruning rate where depth reduction has not occurred since layer removal, in the case of fine-tuning, requires learning new connections from scratch.
Table \ref{tab:fine_tune_vs_tfs} demonstrates that fine-tuning performs worse than training from scratch for all networks when trained for the same number of epochs with the best learning rates selected using grid-search. 
The gap is more pronounced for higher pruning rates which is in line with the observations of \cite{liu2018rethinking}. 
% Although fine-tuning requires fewer epochs to achieve the results reported in Table \ref{tab:fine_tune_vs_tfs}, the performance stagnates and does not improve upon further training. 
% In contrast, training from scratch yields better performance with increased training. 
% Since pruning and re-training is an offline, one-time job, it does not hurt to train a model longer to achieve better results which is the underlying objective of pruning.
% Thus, training from scratch is a better scheme given that we want to maximize performance through offline training.

\begin{table}[H]
\small
\caption{A comparison between Fine-Tuning (FT) a pruned model versus Training From Scratch (TFS). 
% Here, FT denotes Fine-Tuning while TFS denotes Training From Scratch. 
All accuracy scores are reported in $\%$.}
\label{tab:fine_tune_vs_tfs}
\begin{center}
\setlength\tabcolsep{5pt}
\begin{tabular}{llccc}
    \toprule
        Dataset & Model & Params & FT & TFS\\
    \toprule
        CIFAR10 & VGG16 & 60\% & 93.40 & \textbf{94.25}\\
        TinyImageNet (Top-1) & ResNet34 & 60\% & 61.60 & \textbf{62.91} \\
        TinyImageNet (Top-5) & ResNet34 & 60\% & 82.70 & \textbf{83.11}\\
        ImageNet1K (Top-1) & ResNet50 & 90\% & 67.77 & \textbf{69.37} \\
        ImageNet1K (Top-5) & ResNet50 & 90\% & 87.91 & \textbf{89.08}\\
    \bottomrule
\end{tabular}
\end{center}
\end{table}

\begin{table*}[t]
\small
    \caption{Comparison of the \textbf{Top-1} and \textbf{Top-5} accuracy scores on the CIFAR10, Tiny ImageNet and ImageNet1K datasets, for multiple pruning methods at a single level of pruning for the VGG16, ResNet34 and ResNet50 networks, respectively. 
    Each pruned model is trained from scratch.
    Higher values are better. 
    Bold values indicate the best score. 
    The \textit{Param} column indicates the percentage of parameters removed from the original model.
    }
    \label{tab:secret_ingredient}
    \begin{center}
    \setlength\tabcolsep{6pt}
    \begin{tabular}{cccccccccccc}
    \toprule
     &  &  & \multicolumn{9}{c}{Methods}\\
     \cmidrule{4-12}
    Dataset & Model & Param & \multicolumn{1}{c}{$\ell_1$} & \multicolumn{1}{c}{$\ell_2$} & \multicolumn{1}{c}{Taylor} & \multicolumn{1}{c}{FPGM} & \multicolumn{1}{c}{RCP} & \multicolumn{1}{c}{HRank} & \multicolumn{1}{c}{CURL} & \multicolumn{1}{c}{NISP} & \multicolumn{1}{c}{SPvR} \\
    \toprule
    \makecell{CIFAR10} & \makecell{VGG16 \\ (Top-1)} & 90\% & 92.70 & 92.12 & 91.65 & 91.00 & 88.07 & 93.10 & 93.69 & 90.01 & \textbf{94.43} \\
    \midrule
    \multirow{2}{*}{\makecell{\\TinyImageNet}} & \makecell{ResNet34 \\ (Top-1)} &  90\% & 56.20 & 56.88 & 57.16 & 56.23 & 54.25 & 52.35 & 55.43 & 52.93 & \textbf{60.36} \\
    \cmidrule{2-12}
     & \makecell{ResNet34 \\ (Top-5)} & 90\% & 79.10 & 79.30 & 79.89 & 78.60 & 78.28 & 77.34 & 80.00 & 76.97 & \textbf{82.34} \\
    \midrule
    \multirow{2}{*}{\makecell{\\ImageNet1K}} & \makecell{ResNet50 \\ (Top-1)} & 80\% & 71.50 & 71.45 & - & - & - & - & - & - & \textbf{72.18} \\
     \cmidrule{2-12}
     & \makecell{ResNet50 \\ (Top-5)} & 80\% & 90.30 & 90.40 & - & - & - & - & - & - & \textbf{90.58} \\
     \bottomrule
    \end{tabular}
    \end{center}
    % \vspace{-20pt}
\end{table*}

\subsection{Inference Latency}
We also examine model inference latency, defined as the time it takes for a model to make a prediction for a single sample. The inference latency is significantly affected by the network's layerwise width, where non-standard layer structures (not a power of two) minimally impact the prediction time for a single sample. 
We chose not to focus on FLOPs count since models of similar sizes can have identical FLOPs but vastly different inference latencies \citep{liu2021latency}. 
The inference latency, measured in milliseconds and averaged over 1500 runs, is displayed in Fig. \ref{fig:inf_time}. 
Techniques that globally rank and remove neurons or filters tend to create more irregular layer widths compared to layerwise pruning methods, leading to slower inference times such as in the CURL-based sub-networks compared to the original, unpruned networks. 
Conversely, SPvR-generated sub-networks exhibit significantly reduced inference latency, benefiting from decreased depth even with irregular layer widths.
Specifically, at the $98\%$ pruning level for VGG16, SPvR-generated sub-networks demonstrate remarkably low latency. 
For ResNet34 and ResNet50, our recommended sub-networks consistently show lower inference latency than those generated by other methods across all pruning stages. 
% Bert-base-uncased network experiments confirm that SPvR achieves the most favourable balance between accuracy and inference latency. 


\begin{table}[ht]
\small
\caption{A $94\%$ pruned ResNet50 against MobileNetV3 on ImageNet1K. 
Param (M) denotes the number of model parameters in millions.
}
\label{tab:mobilenet}
\begin{center}
\setlength\tabcolsep{10pt}
\begin{tabular}{lcc}
    \toprule
        Model & Param (M) & Top-1 Acc.\\
    \toprule
        MobileNetV3-small & 2.4 & 65.40\\
        MobileNetV3-minimal & 2.0 & 61.90\\
        ResNet50-pruned & 1.6 & 65.20\\
    \bottomrule
\end{tabular}
\end{center}
\vspace{-10pt}
\end{table}

\subsection{Application to mobile phones} 
The MobileNet \citep{howard2017mobilenets} family of networks are highly efficient models specifically designed to run on mobile phone CPUs with MobileNetV3 \citep{howard2019searching} being the latest model. 
These networks have been developed through hours of careful research combined with automated architecture search methods such as Neural Architecture Search. 
Instead, we advocate that SPvR can be used to quickly find an efficient architecture for a given dataset and parameter budget. 
To demonstrate this, we prune a ResNet50 model down from $25.6$M parameters to $1.6$M, train it on the ImageNet dataset and compare the results against the MobileNetV3 networks. 
Table \ref{tab:mobilenet} shows that our pruned network performs at par with MobileNetV3-small while having $33\%$ fewer parameters and achieves $3.3\%$ better Top-1 accuracy than MobileNetV3-minimal while having $20\%$ fewer parameters.
% It should be noted that our training time was considerably less than what was used for MobileNetV3 networks meaning that the result shown in Table \ref{tab:mobilenet} can be further improved.

\subsection{Application to Image Segmentation}
We trained a SegNet[1] model having a VGG16 encoder and decoder backbone on the CityScapes dataset [2]. 
The CityScapes dataset contains $5000$ annotated images with $20$ labels. 
We consider $20$ samples per class and a group size, $d=4$ to generate the groupings and rankings. 
Since a semantic segmentation task is essentially a classification task over each pixel, we sum the $L$ term in Eqn. 2 over all pixels, i.e.,
\begin{equation}
    L = \sum_{u=1}^{w\times h} \Bigl( I + \left\lvert f_\theta(x_k)_q - f_{\mask(\theta)}(x_k)_q\right\rvert \Bigr),\nonumber
\end{equation}
where $w$ and $h$ are the width and height of the image.
The original SegNet model achieves $51.2\%$ IoU whereas its $80\%$ pruned version using SPvR achieves $46.5\%$ IoU. 

\begin{figure*}[t]
\centering
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/ablation_study_vgg.png}
  \caption{}
  \label{fig:ablation_study_vgg16}
\end{subfigure}%
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/ablation_study_resnet.png}
  \caption{}
  \label{fig:ablation_study_resnet34}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
  \centering
  \includegraphics[width=1.0\linewidth]{uai2025-template/images/ablation_study_resnet50.png}
  \caption{}
  \label{fig:ablation_study_resnet50}
\end{subfigure}
\caption{(a) Change in Top-1 accuracy and time required to prune $98\%$ of the VGG16 network with increasing group size, (b) change in Top-1, Top-5 accuracy and time required to prune $98\%$ of the ResNet34 network with increasing group size, and (c) change in Top-1, Top-5 accuracy and time required to prune $98\%$ of the ResNet50 network with increasing group size.}
\label{fig:ablation_study}
\end{figure*}

\section{Ablation Study}\label{sec:impact_of_group_size}
\subsection{Retraining vs Architecture}
Does SPvR's success stem from the architecture of the pruned sub-networks or retraining them from scratch?
To answer this question and disentangle the performance benefits of training from scratch from our generated sub-networks, we re-run all experiments for each baseline, training them from scratch except for OTOv2 which already re-trains the original pre-trained model. 
We produce the results on a single but high pruning rate for all datasets as it should be enough to demonstrate the efficacy of our sub-networks.

As per Table \ref{tab:secret_ingredient}, the sub-networks generated by each method show a slight improvement in accuracy when trained from scratch except for RCP on the CIFAR10 dataset where its score increases by $4\%$. 
On the ImageNet1K dataset, we report results for all baselines as made available by the corresponding authors. 
For this particular experiment, we only train the models generated by $\ell_1$ and $\ell_2$ norm methods as they form one of the strongest baselines. 
We find that both sub-networks benefit from retraining by up to $3\%$ but are still unable to match the performance of SPvR.
In general, none of the methods are able to achieve comparable performance to SPvR indicating the importance of the sub-networks generated by our method.

\subsection{Impact of Group Size on Performance and Pruning Time} 
To understand the impact of group size $d$, we observe the change in the accuracy of pruned networks and the time required for pruning under varying values of $d$. 
We perform this ablation study for only the maximum level of pruning as it is the worst-case scenario regarding both accuracy and time. 
Hence, we ablate the VGG16, ResNet34 and ResNet50 networks at a pruning rate of $98\%$ for $d\in\{2,4,8,16,32\}$, $d\in\{4,8,16,32\}$ and $d\in\{8,16,32\}$, respectively. 
It is expected that values of $d$ closer to $1$ produce fine-grained pruning results but at the cost of slower rankings. 
According to Figs.~\ref{fig:ablation_study_vgg16}, \ref{fig:ablation_study_resnet34} and \ref{fig:ablation_study_resnet50}, our hypothesis is indeed validated with SPvR generally being robust to group size as the top-1 accuracy drop for $d$ ranging from $2$ to $32$ for VGG16 is about $1\%$, the top-5 accuracy drop for $d$ ranging from $4$ to $32$ for ResNet34 is about $2\%$ and the top-5 accuracy drop for $d$ ranging from $8$ to $32$ for ResNet50 is about $1\%$. 
At the same time, for $d=32$, SPvR can prune VGG16, ResNet34 and ResNet50 under $\mathbf{6}$ \textbf{seconds}, $\mathbf{8}$ \textbf{minutes} and $\mathbf{30}$ \textbf{minutes}, respectively.


\section{Conclusion}
We introduced Structured Pruning via Ranking (SPvR), an efficient algorithm that prunes vision and language models without requiring backpropagation on the original, pre-trained models. 
Our approach leverages a novel combination of local layerwise grouping and global ranking to prune less significant neuron or filter groups guided by user-defined parameter budgets. This process results in the generation of compact sub-networks with reduced depth and parameter counts.
Key highlights of our contribution include the empirical validation of the re-initialization strategy through the lens of Geometric Complexity, demonstrating its effectiveness for structurally pruned networks. 
Furthermore, our comprehensive evaluation across various benchmark datasets and models confirms SPvR's superior performance. 
The algorithm outperforms both existing pruning methods and hand-crafted architectures in terms of accuracy and achieves significant reductions in inference latency. 
Our findings underscore the potential of SPvR in addressing the deployment challenges of large neural networks on resource-constrained devices.

% References
\bibliography{uai2025-template}

\newpage

\onecolumn
\title{SPvR: Structured Pruning via Ranking\\(Supplementary Material)}
\maketitle



% This Supplementary Material should be submitted together with the main paper.

\appendix
\section{SPvR Ranking Function vs KL Divergence: Empirical Evaluation}
\label{sec:appendix-kl-vs-us}
In order to show the superiority of our ranking function in a more practical setting, we train a single hidden layer feed-forward neural network on a binary classification synthetic dataset and compare the neuron rankings produced by the KL divergence criterion and our method against the ground truth. 
Specifically, the data is generated as per the algorithm in \citep{Guyon2003DesignOE} using the scikit-learn library \citep{scikit-learn}. 
The number of features and samples are set to $100$ and $1000$, respectively. 
To introduce noise into the dataset, the binary labels corresponding to each sample are flipped to either $1$ or $0$ with a probability of $0.02$ which also introduces imbalance into the dataset. 
A single hidden layer feed-forward neural network with $64$ neurons in the hidden layer and ReLU activation along with $2$ neurons in the output layer and softmax activation is trained on the synthetic dataset using the Adam optimizer \citep{kingma2014adam} with a learning rate of $0.001$. 
Once the network is trained, the ground truth ranked list is generated by measuring the number of misclassifications that occur when masking individual neurons with higher misclassifications being attributed to more important neurons. 
We choose this as the ground truth since the main philosophy of pruning is to remove parameters that do not hurt the final accuracy of a model.
Two more ranked lists are generated using our ranking function and the KL divergence criterion, respectively. 
The Kendall Tau rank correlation metric \citep{kendall1938new}, a non-parametric rank similarity measure, is used to evaluate the rank performance of both methods in comparison to the ground truth where a value of $1$ indicates exact rank association while a value of $0$ indicates no association. 
Table \ref{tab:rank} demonstrates that our ranking criterion is much better suited for the task of computing neuron importance for pruning in comparison to the KL divergence criterion.

\begin{table}[h]
    \centering
    \caption{Evaluation of the neuron ranks produced by our proposed ranking function and the KL divergence criterion against the generated ground truth ranked list in terms of the Kendall Tau rank correlation metric. 
    Bold values indicate the best performance with $1$ being the highest achievable score.}
    % \vspace{10pt}
    \begin{tabular}{lc}
        \toprule
        Ranking Methods & Kendall Tau Rank Correlation\\
        \midrule
        SPvR Ranking criterion & \textbf{0.861} \\
        KL Divergence criterion & 0.586\\
        \bottomrule
    \end{tabular}
    \label{tab:rank}
\end{table}

\begin{remark}
    Measuring only the number of misclassifications does not provide the complete picture of a neuron's importance as the change induced in the final output layer needs to be taken into account in order to correctly rank neurons with the same number of misclassifications.
\end{remark}

\section{Pruning Strategies}
\label{sec:pruning_strategies}
\textbf{Pruning VGG16:} 
Pruning networks without skip connections, such as VGG16, is relatively straightforward. 
The ranking module provides a sorted list of groups that must be pruned. One can iterate over the list and discard the least important groups until the user-supplied target parameter budget is reached. 
The remaining groups form the smaller sub-network.

\textbf{Pruning ResNets:} The ResNet type architectures have two sets of skip connections, known as identity and projection shortcuts \citep{he2016deep}. Layers with the same number of filters share the identity shortcut, while layers with different numbers of filters require a projection. When iterating over the sorted list provided by the ranking module, if a group from a particular layer is discarded then the least significant group from each subsequent layer with an identity shortcut is discarded until a group from a layer after a projection shortcut is encountered.

\section{Pruned Architectures}\label{sec:pruned_archs}
\subsection{VGG16}
    \begin{table}[!h]
    \small
        \caption{The number of channels per layer for each pruning percentage. Here, \textit{"M"} denotes the position of the max-pooling layer.
        The \textit{Param} column indicates the percentage of parameters removed from the original model.
        The \textit{Size} column denotes the actual size of the model on disk in megabytes.}
        \label{tab:vgg16_pruned_arch}
        \centering
        \setlength\tabcolsep{4.3pt}
        \begin{tabular}{ccl}
            \toprule
            Param & \makecell{Model \\ Size} & \multicolumn{1}{c}{Architecture} \\
            \midrule
            00\% & 112 & 64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512, "M"\\
            70\% & 33.7 & 58, 64, "M", 126, 128, "M", 238, 224, "M", 192, 192, 94, "M", 56, 482, 512, "M"\\
            80\% & 22.5 & 58, 64, "M", 126, 128, "M", 238, 224, "M", 192, 192, 94, "M", 56, 196, 512, "M"\\
            90\% & 11.2 & 54, 64, "M", 124, 128, "M", 224, 220, "M", 174, 110, "M"\\
            95\% & 5.56 & 42, 64, "M", 110, 126, "M", 170, 138, 92, "M"\\
            98\% & 2.25 & 36, 62, "M", 78, 112, "M", 92, 44, 46, "M"\\
            \bottomrule
        \end{tabular}
    \end{table}
% \newpage
\subsection{ResNet34}
    \begin{table}[!h]
        \caption{The number of channels per layer per block with the number of blocks being denoted by \textit{$\times$} and each block being denoted by \textit{[.]}.
        The \textit{Param} column indicates the percentage of parameters removed from the original model.
        The \textit{Size} column denotes the actual size of the model on disk in megabytes.}
        \label{tab:resnet34_pruned_arch}
        \centering
        \setlength\tabcolsep{22pt}
        \begin{tabular}{ccl}
            \toprule
            Param & Size & \multicolumn{1}{c}{Architecture} \\
            \midrule
            00\% & 163 & [64, 64]$\times$3, [128, 128]$\times$4, [256, 256]$\times$6, [512, 512]$\times$3\\
            70\% & 46.8 & [60, 60]$\times$3, [112, 112]$\times$4, [328, 328]$\times$3\\
            80\% & 30.6 & [60, 60]$\times$3, [104, 104]$\times$4, [256, 256]$\times$3\\
            90\% & 15.2 & [60, 60]$\times$3, [100, 100]$\times$4, [152, 152]$\times$3\\
            95\% & 8.03 & [60, 60]$\times$3, [76, 76]$\times$4, [88, 88]$\times$3\\
            98\% & 2.72 & [60, 60]$\times$3, [36, 36]$\times$4, [44, 44]$\times$3\\
            \bottomrule
        \end{tabular}
    \end{table}
    \newpage
\subsection{ResNet50}
    \begin{table}[!h]
        \caption{The number of channels per layer per block with the number of blocks being denoted by \textit{$\times$} and each block being denoted by \textit{[.]}.
        The \textit{Param} column indicates the percentage of parameters removed from the original model.
        The \textit{Size} column denotes the actual size of the model on disk in megabytes.}
        \label{tab:resnet50_pruned_arch}
        \centering
        \setlength\tabcolsep{9pt}
        \begin{tabular}{ccl}
            \toprule
            Param & Size & \multicolumn{1}{c}{Architecture} \\
            \midrule
            0\% & 195 & [64, 64, 64]$\times$3, [128, 128, 128]$\times$4, [256, 256, 256]$\times$6, [512, 512, 512]$\times$3\\
            40\% & 118 & [64, 64, 64]$\times$3, [120, 120, 120]$\times$4, [240, 240, 240]$\times$6, [328, 328, 328]$\times$3\\
            70\% & 60.3 & [64, 64, 64]$\times$3, [112, 112, 112]$\times$4, [200, 200, 200]$\times$6, [152, 152, 152]$\times$3\\
            80\% & 41.5 & [56, 56, 56]$\times$3, [112, 112, 112]$\times$4, [168, 168, 168]$\times$6, [96, 96, 96]$\times$3\\
            95\% & 12 & [56, 56, 56]$\times$3, [88, 88, 88]$\times$4, [64, 64, 64]$\times$6\\
            \bottomrule
        \end{tabular}
    \end{table}
% \subsubsection{SegNet}
%     \begin{table}[!h]
%     \small
%         \caption{The first value in the \textit{architecture} column indicates the hidden size, the second value (or list of values) indicates the intermediate size (or the blockwise intermediate sizes), the third value is the number of attention heads and the final value is the number of hidden layers. 
%         These values are provided in the form of the HuggingFace config.json file.
%         The \textit{Param} column indicates the percentage of parameters removed from the original model.
%         The \textit{Size} column denotes the actual size of the model on disk in megabytes.}
%         \label{tab:bert_pruned_arch}
%         \centering
%         % \setlength\tabcolsep{1pt}
%         \begin{tabular}{ccl}
%             \toprule
%             Param & Size & \multicolumn{1}{c}{Architecture} \\
%             \midrule
%             0\% & 1220 & 768, 3072, 12, 12\\
%             70\% & 339 & 256, [2944, 3008, 3008, 2944, 3072, 2944, 3008, 2944, 3072, 3008, 3008, 2944], 4, 12\\
%             80\% & 246 & 192, [2944, 3008, 3008, 2944, 3072, 2944, 3008, 2944, 3072, 2880, 3008, 2944], 3, 12\\
%             90\% & 130 & 128, [2944, 3008, 2944, 3008, 2944, 3008, 2560, 2944, 2944], 2, 9\\
%             95\% & 73.4 & 128, [2880, 2880, 2880], 2, 3\\
%             98\% & 36.1 & 64, [2880, 2880, 2880], 2, 3\\
%             \bottomrule
%         \end{tabular}
%     \end{table}

\section{Implementation Details}
\label{sec:appendix_impl}
Hyper-parameters for each method were adopted from their respective publications, except RCP, where we sampled $20$ sub-architectures instead of $100$ due to resource constraints. Consistency was maintained across all methods, including SPvR, in terms of optimizer, scheduler, batch size, and training duration. 
% Each experiment's learning rate was independently selected through a grid search in the range $[0.0001,1.0]$. 
In the ranking phase of SPvR, similar to CURL, only a subset of the dataset was used. 
Specifically, $50$ training samples per class were randomly selected for both CIFAR10 and Tiny ImageNet and $20$ samples per class for the ImageNet1K dataset. 

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%     F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


\end{document}
