%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\newcommand{\sm}{\textsc{Appendix}}
\newcommand\kl[1]{\textcolor{purple}{#1}}
\newcommand\cmt[1]{\textcolor{red}{#1}}
\newcommand\bmt[1]{\textcolor{blue}{#1}}
\newcommand\sy[1]{\textcolor{purple}{#1}}
\newcommand\op[1]{\operatorname{#1}}
\newcommand{\eg}{\textit{e.g., }}
\newcommand{\ie}{\textit{i.e., }}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\input{math_commands.tex}
\usepackage{amsthm}
\usepackage{amsmath}

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newtheorem{theorem}{Theorem}[section]
\newtheorem{conjecture}{Conjecture}
%\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
%\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}{Assumption}
%\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newtheorem{question}{Question}


\usepackage{hyperref}
\definecolor{mydarkblue}{rgb}{0,0.08,0.45}
\hypersetup{ %
pdftitle={},
pdfkeywords={},
pdfborder=0 0 0,
pdfpagemode=UseNone,
colorlinks=true,
linkcolor=mydarkblue,
citecolor=mydarkblue,
filecolor=mydarkblue,
urlcolor=mydarkblue,
}


\title{Memorization Capacity for Additive Fine-Tuning with Small ReLUs}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Jy-yong Sohn\thanks{Equal Contribution}}
\newcommand\CoAuthorMark{\footnotemark[\arabic{footnote}]}
\author[2,3]{Dohyun Kwon\protect\CoAuthorMark}
\author[1]{Seoyeon An}
\author[4]{Kangwook Lee}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics and Data Science\\
    Yonsei University\\
    Republic of Korea
}
\affil[2]{%
    Department of Mathematics\\
    University of Seoul\\
    Republic of Korea
}
\affil[3]{%
    Center for AI and Natural Sciences\\
    Korea Institute for Advanced Study\\
    Republic of Korea
}
\affil[4]{%
    Department of Electrical and Computer Engineering\\
    University of Wisconsin-Madison\\
    WI, USA
  }

  
  \begin{document}
\maketitle

\begin{abstract}
  Fine-tuning large pre-trained models is a common practice in machine learning applications, yet its mathematical analysis remains largely unexplored.
In this paper, we study fine-tuning through the lens of memorization capacity. Our new measure, the Fine-Tuning Capacity (FTC), is defined as the maximum number of samples a neural network can fine-tune, or equivalently, as the minimum number of neurons ($m$) needed to arbitrarily change $N$ labels among $K$ samples considered in the fine-tuning process. 
In essence, FTC extends the memorization capacity concept to the fine-tuning scenario. We analyze FTC for the \textit{additive} fine-tuning scenario where the fine-tuned network is defined as the summation of the frozen pre-trained network $f$ and a neural network $g$ (with $m$ neurons) designed for fine-tuning. 
When $g$ is a ReLU network with either 2 or 3 layers, we obtain tight upper and lower bounds on FTC; 
we show that $N$ samples can be fine-tuned with $m=\Theta(N)$ neurons for 2-layer networks, and with $m=\Theta(\sqrt{N})$ neurons for 3-layer networks, no matter how large $K$ is.  
Our results recover the known memorization capacity results when $N = K$ as a special case. 
\end{abstract}

\vspace{-3mm}
\section{Introduction}
\vspace{-3mm}
As a branch of machine learning theory, the expressive power of neural networks has been investigated for several decades. 
By using the concept of universal approximation, it has been shown that neural networks can approximate a large class of functions, either in the depth-bounded scenarios~\citep{cybenko1989approximation,funahashi1989approximate,hornik1989multilayer,barron1993universal} or width-bounded scenarios~\citep{lu2017expressive,hanin2017approximating,kidger2020universal,park2020minimum}. 
Another line of research focused on the memorization capacity of neural networks~\citep{baum1988capabilities,huang1998upper,huang2003learning,yun2019small,vershynin2020memory,rajput2021exponential,vardi2021optimal}, exploring the capability of neural networks for memorizing finite samples. 

Meanwhile, with the advent of large language models~\citep{brown2020language,openai2023gpt,ouyang2022training,chowdhery2022palm,zhang2022opt,touvron2023llama} and foundation models~\citep{bommasani2021opportunities,radford2021learning,ramesh2022hierarchical}, the paradigm of pre-training followed by fine-tuning is dominating the machine learning communities. Various empirical results show that a gigantic model pre-trained on a large amount of data can be easily fine-tuned to perform well on downstream tasks, given only a small amount of additional data for the target task. 
Compared with the extensive empirical results, mathematical analysis on fine-tuning large pre-trained models remains largely unexplored.

In this paper, we take the first step in understanding the fine-tunability of pre-trained networks through the lens of memorization capacity. 
We focus on the scenario where we fine-tune a pre-trained neural network $f$ on dataset $D = \{(\vx_i, y_i)\}_{i=1}^K$ with $K$ samples; here, $\vx_i \in \sR^{d}$ and $y_i \in \sR$ for all $i \in [K]$ where $[K] =\{1,2,\cdots, K\}$, and we assume $\vx_i \ne \vx_j$ for all $i \ne j$.
Let 
%\begin{align*}
    $T := \{ i \in [K]: f(\vx_i) \ne y_i \}$
%\end{align*}
be the set of indices of samples that the pre-trained network $f$ does not fit. The cardinality of this set is denoted by $N:= |T| \leq K$.
%In other words, the pre-trained network already fits (\ie $f(\vx_i) = y_i$) for $K-N$ samples, while it does not necessarily fit for $N$ samples. 
In other words, %we have
\begin{align}
%f(\vx_i) &\ne y_i \quad \forall i \in T, \label{eqn:new_dataset_label1}\\
f(\vx_i) &= y_i %\quad \forall i \in [K] \setminus T. 
\label{eqn:new_dataset_label2}
\end{align}
holds for all $i \in [K] \setminus T$, while not guaranteed for $i \in T$.
%we change the label during fine-tuning.
%\cmt{TODO: remove the comments on  general operation... (moved to the discussion section)}
Our aim is to add a neural network $g_\theta$ (parameterized by $\theta$) to the pre-trained network $f$ in a way that the fine-tuned network $f + g_{\theta}$ satisfies
\begin{align}
\label{eq:fine}
    (f + g_{\theta}) (\vx_i) = y_i, \quad \forall i \in [K].
\end{align}
See Fig.~\ref{fig:prob-formulation} for the visualization of the \textit{additive} fine-tuning scenario we focus on.
%where $\oplus$ is an operation, such as addition and function composition. 
%\sy{
This scenario is motivated by recently proposed additive fine-tuning methods~\citep{zhang2020side, fu2021learn, cao2022attention}, and especially by side-tuning~\citep{zhang2020side}, where a side network $g_{\theta}$ is added to the pre-trained network $f$. Since our model does not cover other popular fine-tuning methods including LoRA~\citep{hu2021lora}, extending our theoretical results to such methods is left as future work.
%}
% This scenario is motivated by a recently proposed fine-tuning method called side-tuning~\citep{zhang2020side}, where a \emph{side} network $g_{\theta}$ is added to the pre-trained network $f$. Moreover, our fine-tuning model reflects the spirit of recently proposed \emph{modular} fine-tuning methods (e.g., LoRA~\citep{hu2021lora}) where the main pre-trained part is fixed, and a small module is attached for adaptation.
%Although our \textit{initial} work matches with the side-tuning scenario, we are planning to extend our work to general modular fine-tuning scenarios in the near future.
%various parameter-efficient fine-tuning methods, e.g., BitFit~\citep{zaken2021bitfit}, LoRA~\citep{hu2021lora}, Adaptor~\citep{houlsby2019parameter}
%We focus on the case where we \emph{add} $f$ and $g_{\theta}$, but one can consider a more general setting discussed in Sec.~\ref{sec:disc}.
Under this setting, we define the fine-tuning capacity (FTC) of a neural network $g_{\theta}$ as below. 

\begin{definition}[FTC]
\label{def:ftc}
    The fine-tuning capacity of a given neural network $g_{\theta}$ is the maximum number $N$ satisfying the following property: for all $\vx_i \in \sR^d$, $y_i \in \sR$, for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$, and for any choices of function $f$ satisfying $f(\vx_i) = y_i$ for all $i \in [K]\setminus T$, 
    there exists a parameter $\theta$ such that $(f + g_{\theta})(\vx_i) = y_i$ for all $i \in [K]$.
\end{definition}


\begin{figure}[t!]
    \centering
    \includegraphics[width=7cm]{fig/side_tuning_concept_v3.pdf}
    \caption{
    Additive fine-tuning scenario where the pre-trained network $f$ is fine-tuned to $f+g_{\theta}$,
    in order to fit the dataset $D=\{(\vx_i, y_i)\}_{i=1}^K$. Here, 
    the pre-trained network already fits $K-N$ samples $\{(\vx_i, y_i) \}_{i \in [K]\setminus T}$, where $T \subseteq [K]$ is the set of indices where $y_i \ne f(\vx_i)$.
    We use $g_{\theta}$ to fill the gap between $f(\vx_i)$ and $y_i$, for $i \in T$.
    }
    \label{fig:prob-formulation}
\end{figure}







Under such a setting, we establish the upper/lower bounds on FTC, when $g$ is a 2-layer ReLU network or a 3-layer ReLU network. %The detailed statements can be found in Theorem~\ref{thm:ftc_add_bound}, Corollary~\ref{coro:ftc}, Theorem~\ref{thm:ftc_add_bound_3layer}, and Corollary~\ref{coro:ftc3NN}.
% \textcolor{blue}{Trivial bound: the memorization capacity $\leq$  the fine-tuning capacity}
% \cmt{I added this in Remark~\ref{remark:ftc_vs_mc}. Please check.}
%\cmt{Do we want to add the trivial bound here? or at the end of Sec.3? Note that memo cap is formally defined in Sec.3}
% \begin{theorem}(Informal)\label{thm:informal} 
% \cmt{This will be done after finishing up Thm 5.2. Probably use the notation $N^{\star}$... follow the guideline in our email...}
% For $K \gg N$ and given $L = 2$ or $L=3$, there exists an $L$-layer neural network $g_\theta$ with $m$ neurons, such that Eq.~\ref{eq:fine} holds and $m = m(N)$ does not depend on $K$. In particular, if $K \gg m$, the fine-tuning capacity $N$ of $g_\theta$ is given by $$N \approx (\hbox{the memorization capacity of } g_\theta)/3.$$
% For example, when $L=3$, the memorization capacity is $m^2$, while the fine-tuning capacity is $m^2/3$.
% %= f(m)$ where $f(m)$ is the memorization capacity of $g_\theta$.
% %For a given $m$-neuron neural network $g_\theta$ parameterized by $\theta$, let $f(m)$ be the memorization capacity of $g_\theta$. Then, 
% \end{theorem}
Our main contributions are summarized below:
\begin{itemize}
    \item We define a new metric called Fine-Tuning Capacity (FTC), which measures the maximum number of samples $N^{\star}$ a neural network with $m$ neurons can fine-tune. Equivalently, we define the minimum number of neurons $m^{\star}$ needed to arbitrarily change $N$ labels among $K$. FTC can be considered as an extension of memorization capacity, tailored for the fine-tuning scenario. 
    \item 
    %For 2-layer network, we characterized $m$ in terms of 
    For 2-layer ReLU networks, we establish tight upper and lower bounds on $m^{\star}$, in Theorem~\ref{thm:ftc_add_bound}. 
    The upper bound is obtained by a novel neural network construction for fine-tuning.
    Our construction requires fewer neurons than conventional constructions developed in the memorization capacity literature when $K \ge 3N+2$. 
    By using our bounds on $m^{\star}$, we also provide an equivalent statement in Corollary~\ref{coro:ftc}, showing the tight bounds on the fine-tuning capacity $N^{\star}$.
    \item For 3-layer ReLU networks, we obtain tight upper and lower bounds on $m^{\star}$ in Theorem~\ref{thm:ftc_add_bound_3layer}. Our results imply that 
    $N$ samples can be fine-tuned with $m = \Theta(\sqrt{N})$ neurons without any dependence on $K$.%which does not depend on the number of samples. %, $K$. 
    %no matter how large $K$ is. 
    We also provide an equivalent statement in Corollary~\ref{coro:ftc3NN}, showing the tight upper and lower bounds on $N^{\star}$.
\end{itemize}

% \begin{remark}
%     Note that our problem formulation assumes the pre-trained network $f$ fully fits the data $D=\{(\vx_i, y_i)\}_{i=1}^K$, thus having $y_i = f(\vx_i)$. Actually, this setting can be fit into more broader scenarios. For instance, consider the $K$ original labels as merely initial prediction values of the pre-trained network, without necessarily viewing them as perfectly fitting a pre-training dataset. We then posit that fine-tuning necessitates the adjustment of $N$ labels, either to address task differences or to correct the pre-trained network's errors.
% \end{remark}
% \cmt{somewhere mention that our main contribution is new problem formulation, not the new proof technique? (but we also have some new proof technique as well...!)}

% \cmt{Can we consider an arbitrary neural network with given memorization capacity?}


% \begin{remark}\label{rmk:fully-fit-assumption}
% \cmt{tbe..}
%     The above setting can be fit into various scenarios. For instance, consider the $K$ original labels as merely initial prediction values of the pre-trained network, without necessarily viewing them as perfectly fitting a pre-training dataset. We then posit that fine-tuning necessitates the adjustment of $N$ labels, either to address task differences or to correct the pre-trained network's errors. %Under this interpretation, provided the pre-trained network has a small number of label errors, we can assume small $N$.
% \end{remark}



\section{Related Works}
\vspace{-3mm}

\paragraph{Fine-Tuning}

Various methods for efficient fine-tuning have been introduced in recent years~\citep{houlsby2019parameter,zhang2020side,zaken2021bitfit,he2021towards}, which fine-tune only a small part of pre-trained models to adapt them to target tasks. 
There are some mathematical analyses of fine-tuning~\citep{wu2022power,zeng2023expressive,giannou2023expressive,englert2022adversarial,oymak2023role,du2020few,malladi2023kernel} or more broadly of transfer learning~\citep{tripuraneni2020theory,maurer2016benefit}, but none of them analyze the fine-tunability of large pre-trained models through the lens of memorization capacity.
%\cmt{cite~\citep{zhang2020side}}


\paragraph{Memorization}

One concept relevant to FTC is \emph{memorization capacity} which measures the ability of memorizing given feature-label pairs $\{(\vx_i, y_i)\}_{i=1}^K$. 
Finding the bounds on the memorization capacity is considered in recent works on various networks~\citep{zhang2016understanding,yun2019small,vershynin2020memory,rajput2021exponential,nguyen2018optimization,hardt2016identity,kim2023provable}. 
%improve the dependency on minimum distance between data points than Vershynin~\citep{rajput2021exponential},
%~\citep{bubeck}
%~\citep{bartlett2017nearlytight}
%\kl{where is Yun et al?}
%\kl{Always "T"ransformers}
%Other than memorization capacity, 
The effect of memorization in large language models is explored in recent works, both for pre-training~\citep{carlini2022quantifying,ippolito2022preventing} and for fine-tuning~\citep{zeng2023exploring}.

%\paragraph{Theoretical Analysis on Fine-Tuning}



\section{Fine-Tuning Capacity}
\vspace{-3mm}
% \cmt{two types of equivalent definitions:}


%\dk{Definition of memorization, compare}

% In this work, we focus on the case where the operation is addition.

% \begin{definition}[Additive FTC]
%     We say that $N$ is the additive fine-tuning capacity if $N$ is the fine-tuning capacity given in Definition~\ref{def:ftc}, where the operation  $\oplus$ is replaced with the addition operation.
%     %For a given neural network $g$ parameterized by $\theta$, \emph{the fine-tuning capacity of $g$} is the maximum number $N$ satisfying the following property: for any choices of function $f$, for all $\vx_i \in \bR^d$ $y_i \in \mathbb{R}, \tilde{y}_i \in \mathbb{R}$ %\cmt{Do we need [-1, +1] instead of $\mathbb{R}$ for the range of $y_i$? Why does Prof.Yun's paper have range [-1,+1]?} 
%     %and for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$, there exists $\theta$ such that $(f + g_\theta)(\vx_i) = \tilde{y}_i$ all $i \in [K]$.
% \end{definition}

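Note that the notion of additive FTC given in Definition~\ref{def:ftc} involves the pre-trained network $f$, but one can confirm that FTC does not depend on $f$ since $f(\vx_i)  = y_i$ holds for all $i \in [K]\setminus T$, for every pre-trained network $f$ we are considering. Indeed, writing $z_i := y_i - f(\vx_i)$, the requirement $(f + g_{\theta})(\vx_i) = y_i$ is equivalent to $g_{\theta}(\vx_i) = z_i$, where $z_i = 0$ for $i \in [K]\setminus T$ and $z_i$ can be arbitrary for $i \in T$. Below we provide an equivalent simpler definition. 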

% \begin{definition}[FTC, equivalent form]
% \label{def:ftc2}
%     For a given neural network $g$, \emph{the fine-tuning capacity of $g$} is the maximum number $N$ satisfying the following property: for all $T \subseteq [K]$ with $\lvert T \rvert = N$ and for all $y_i, \tilde{y}_i \in \mathbb{R}$, %satisfying $y_i, \tilde{y}_i \in [-1, +1]$, 
%     there exists $\theta$ such that 
%     \begin{align}
%     \label{eqn:finetune_fit_well}
%         g_\theta(\tilde{\vx}_i) = \tilde{y}_i - y_i
%     \end{align}
%     for all $i \in [K]$.
% \end{definition}

%\dk{
\begin{definition}[FTC, equivalent form]
    For a given positive integer $K$, the fine-tuning capacity (FTC) of a given neural network $g_\theta$ is 
    \begin{align}
        N_{\op{FTC}}^{\star}(g,K) &:= \max_{N \in \{0, 1,\cdots, K\}} N \emph{ such that } \nonumber\\
        &\forall T \subseteq [K] \emph{ with } \lvert T \rvert = N, \forall \vx_i \in \mathbb{R}^d, \forall z_i \in \mathbb{R}, \nonumber\\
         \exists \theta &\emph{ satisfying }  \label{eqn:finetune_fit_well}  %\end{align*}
    %the maximum number $N$ satisfying the following property: for all $T \subseteq [K]$ with $\lvert T \rvert = N$ and for all $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$,
    %there exists $\theta$ such that
    %\begin{align}
        \begin{cases}
            g_\theta(\vx_i) = z_i & \forall  i \in T,\\
            g_\theta(\vx_i) = 0 &\forall i \in [K]\setminus T.  
    \end{cases}    
    \end{align}
\end{definition}
This definition generalizes the conventional memorization capacity shown below, which corresponds to the special case $N = K$, i.e., $T = [K]$. 

\begin{definition}[Memorization Capacity~
\citep{yun2019small}]
    The memorization capacity of a neural network $g_\theta$ is 
    \begin{align}
        N_{\op{MC}}^{\star}(g) &:= \max_{N \ge 0} N 
        \emph{ such that } \nonumber \\
        \forall \vx_i \in \mathbb{R}^d, &\forall z_i \in \mathbb{R}, 
         \exists \theta \emph{ with }
         g_\theta(\vx_i) = z_i \quad \forall i \in [N]
    \end{align}
    % the maximum number $K$ satisfying the following property: for all $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, 
    % there exists $\theta$ such that
    %  $       g_\theta(\vx_i) = z_i$ for all $i \in [K]$.
\end{definition}

\begin{remark}\label{remark:ftc_vs_mc}
    The memorization capacity and the fine-tuning capacity satisfy a trivial bound: for any neural network $g$ and for arbitrary $K > 0$, 
    \begin{align}
        N_{\op{FTC}}^{\star}(g,K) \le N_{\op{MC}}^{\star}(g).
    \end{align}
    % Compared with this trivial result, our statement in Theorem~\ref{thm:informal} 
    % shows a non-trivial relationship between two quantities: $N_{\op{FTC}}^{\star}(g,K) \approx N_{\op{MC}}^{\star}(g)/3$ holds if $K \gg m$, where $m$ is the number of neurons in the network $g$. 
\end{remark}

Note that FTC is defined as the maximum number of samples $N$ we can fine-tune using a given network $g$. One can also consider an equivalent definition: the minimum number of neurons $m$ contained in $g$ to successfully fine-tune $N$ samples, which is formally stated below. 

\begin{definition}[FTC, equivalent form, in terms of \# neurons]
    The minimum number of neurons required for fine-tuning arbitrary $N$ out of $K$ samples, is defined as  
    \begin{align*}
        m_{\op{FTC}}^{\star}(N,K) &:= \min_{m \ge 0} m 
     \emph{ such that } \\
     &\forall T \subseteq [K] \emph{ with } \lvert T \rvert = N, \forall \vx_i \in \mathbb{R}^d, \forall z_i \in \mathbb{R}, \\
        & \exists \emph{ neural network } g_{\theta} \emph{ with } m \emph{ neurons satisfying }   \\
        &\begin{cases}
            g_\theta(\vx_i) = z_i &\hbox{ for all }  i \in T,\\
            g_\theta(\vx_i) = 0 &\hbox{ for all } i \in [K]\setminus T.  
    \end{cases}   
    \end{align*}
\end{definition}

Throughout the paper, we use $N^{\star}$ as a short-hand notation for $N^{\star}_{\op{FTC}}$, and use $m^{\star}$ as a short-hand notation for $m^{\star}_{\op{FTC}}$. Our theoretical results provide bounds on $m^{\star}$ and $N^{\star}$, \eg Theorem~\ref{thm:ftc_add_bound} is bounding $m^{\star}$, while an equivalent result in Corollary~\ref{coro:ftc} is bounding $N^{\star}$.




%}

%\dk{\begin{remark}
%    The above definition can be generalized to any ``group'' that has two operations, addition and scalar multiplication.
%\end{remark}
%}


\begin{figure}[t!]
    \centering
    \includegraphics[width=6cm]{fig/fig_3N.pdf}
    \caption{
    Proving Theorem~\ref{thm:ftc_add_bound} for $K=14, N=4$. }
    \label{fig:proof_num_pieces}
\end{figure}


\begin{figure}[t!]
\vspace{-3mm}
    \centering
    \includegraphics[width=6cm]{fig/fig_3N_M9.pdf}
    \caption{
    Proving Theorem~\ref{thm:ftc_add_bound} for $K=9, N=4$. }
    \label{fig:proof_num_pieces_K9}
\vspace{-3mm}
\end{figure}
\section{FTC of 2-layer FC ReLU Networks}\label{sec:two_layer}
\vspace{-3mm}
A 2-layer fully-connected neural network $g_{\theta}:\mathbb{R}^d \rightarrow \mathbb{R}$ with ReLU activation can be represented as
\begin{align}
\label{eq:2l}
    g_{\theta}(\vx) = \mW_2\sigma(\mW_1 \vx + \vb_1) + \vb_2,
\end{align}
which is parameterized by $\theta = [\mW_1, \mW_2, \vb_1, \vb_2]$ where $\mW_1 \in \sR^{m \times d}$, $\mW_2 \in \sR^{1 \times m}$, $\vb_1 \in \sR^{m}$, and $\vb_2 \in \sR$. Here, $m$ is the number of hidden neurons, and $\sigma$ is the ReLU activation. 
% In such setting, we aim at finding the \emph{fine-tuning capacity},  defined as the maximum $N$  such that for all $\vx_i, \tilde{\vx}_i \in \bR^d$, for all $y_i, \tilde{y}_i \in [-1, +1]$ and for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$, there exists a parameter $\theta$ for the fine-tuned network such that $\tilde{f}_{\theta}(\tilde{\vx}_i) = \tilde{y}_i$ all $i \in [K]$. 
% \dk{The above definition is not too different from the original memorization capacity. We need to define the fine-tuned network, as well.}
% \begin{itemize}
%     %\item 
%     %Let $N_{k}$ be a set of neural networks with at most $k$ neurons.
%     %\item 
%     %For a given neural network $f$, we say that $g$ is \emph{the $k$-neuron fine-tuned network of $f$} if there exists a subnetwork of $g$ such that the subnetwork is equivalent to $f$ and the number of neurons of $g \setminus f$ is at most $k$.
%     %the difference between $f$ and $g$ is at most $k$ \dk{in which sense?}. 
%     %We denote a set of $k$-neuron fine-tuned networks of $f$ by $F_k(f)$.
%     %\item 
%     %For given $k\in \mathrm{N}$ and a neural network $f$, we say that $N$ is \emph{the $k$-neuron fine-tuning capacity of $f$} if $N$ is the maximum number satisfying the following property: for any choices of $\vx_i, \tilde{\vx}_i \in \bR^d$, for all $y_i, \tilde{y}_i \in [-1, +1]$ and for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$, there exists $g \in F_k(f)$ such that $g(\tilde{\vx}_i) = \tilde{y}_i$ all $i \in [K]$.
%     \item 
%     For a given neural network $g$ parameterized by $\theta$, we say that $N$ is \emph{the fine-tuning capacity of $g$} if $N$ is the maximum number satisfying the following property: for any choices of $f$, $\vx_i, \tilde{\vx}_i \in \bR^d$, for all $y_i, \tilde{y}_i \in [-1, +1]$ and for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$, there exists $\theta$ such that $(f+g_\theta)(\tilde{\vx}_i) = \tilde{y}_i$ all $i \in [K]$.
%     \item 
%     For a given neural network $g$, we say that $N$ is \emph{the fine-tuning capacity of $g$} if $N$ is the maximum number satisfying the following property: for all $T \subseteq [K]$ satisfying $\lvert T \rvert = N$ and for all $\{y_i, \tilde{y}_i\}_{i \in [K]}$, %satisfying $y_i, \tilde{y}_i \in [-1, +1]$, 
%     there exists $\theta$ such that $g_\theta(\tilde{\vx}_i) = \tilde{y}_i - y_i$ all $i \in [K]$.
%     \end{itemize}
%Sections~\ref{sec:two_layer_lower} and~\ref{sec:two_layer_upper} provide the lower/upper bounds on the fine-tuning capacity $N$, 
The below result states the bounds on $m$ for 2-layer FC ReLU networks.
%the results of which are summarized as an informal theorem as below:
\begin{theorem}
%[informal]
\label{thm:ftc_add_bound}
%\cmt{Suppose $K > 2N$.}
%\cmt{we need to discuss how to phrase it better}
Let $K \ge 3$. %and $g$ be a 2-layer fully-connected ReLU network with $m$ neurons. %as in \eqref{eq:2l}.
\begin{enumerate}
    \item 
    For all $T \subseteq [K]$, $|T|=N$, $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, there exists a 2-layer fully-connected ReLU network $g$ with $m$ neurons satisfying \eqref{eqn:finetune_fit_well} 
    and 
    $$ m \leq \min \{3N+1, K-1\}.$$
    \item 
    For given $T \subseteq [K]$, $|T|=N$,  $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, suppose that \eqref{eqn:finetune_fit_well} holds for some 2-layer fully-connected ReLU network $g$ with $m$ neurons. Then, $$\min \{3N, K-2\}  \leq m.$$
\end{enumerate}
%Let $N$ be the fine-tuning capacity of 
 %Then, 
%$m$ is bounded as 
   % \begin{align}
   %     \min \{3N, K-2\}  \leq m \leq \min \{3N+1, K-1\}.
    %\end{align} 
    Thus, 
    \begin{align}
        \min \{3N, K-2\} \le m^{\star} \le \min\{3N+1, K-1\}.
    \end{align}
\end{theorem}
The proof of this theorem is given in Sec.~\ref{sec:two_layer_lower} and Sec.~\ref{sec:two_layer_upper}.


\begin{remark}%\dk{Need to polish}
    If $N=K$ (i.e., fine-tuning changes all labels), then the result of Theorem~\ref{thm:ftc_add_bound} reduces to 
    \begin{align*}
        m^{\star} + 1 \leq N = K \leq m^{\star}+2.
    \end{align*}
    The upper bound of $N$ coincides with the upper bound of memorization capacity studied in the result of~\citep{yun2019small}. On the other hand, the lower bound is consistent with the one given in \citep{zhang2016understanding}. %The gap between the lower and the upper bound 
    %\kl{$m$ vs $m$? why is it slightly better?}
    %\dk{Indeed, it is $m+1$ in our construction while the one in $\citep{zhang2016understanding}$ is $m$}
\end{remark}

    %the neural network was constructed the lower bound $m$ 

    %\begin{remark}%\dk{Need to polish}
    %    Interestingly, when $K \ge 3N+2$ (i.e., changing the labels for a sufficiently small number of samples), we have 
    %\begin{align}
    %    \frac{m-1}{3} \leq N \leq \frac{m}{3}.
    %\end{align}
    %\dk{Do we allow non-integer $N$? Otherwise, a floor function may be necessary here?}
    %In other words, for sufficiently large $K$, the fine-tuning capacity $N$ does not depend on the size $K$ of the underlying dataset $D$ but the number of neurons $m$ in $g_\theta$. 
    %\end{remark}

    %As a consequence of Theorem~\ref{thm:ftc_add_bound}, we compare memorization  and fine-tuning  described in Figure~\ref{fig:memocap_vs_ftc}. 
    Below we state the  bounds for the fine-tuning capacity of a 2-layer fully-connected ReLU, directly obtained from the above theorem.

\begin{corollary} [FTC of 2-layer FC ReLU]\label{coro:ftc}
    Suppose $K \ge 3$. For given $m \in \mathbb{N}$, let $N^{\star}$ be the fine-tuning capacity of a 2-layer fully-connected ReLU network $g$ given in \eqref{eq:2l} with $m$ neurons. 
    \begin{enumerate}
        \item 
        If $K \geq m+2$, then
        \begin{align}
        \label{eq:1ftc}
        \left\lfloor \frac{m-1}{3} \right\rfloor \leq  N^{\star} \leq \frac{m}{3}.
    \end{align}
        \item 
        If $K \leq m+1$, then
        $N^{\star} = K$.
    \end{enumerate}
    %Then, 
%$m$ is bounded as 
   % \begin{align}
  %      \left\lfloor \frac{m-1}{3} \right\rfloor \leq \min \left\{N, \frac{K-2}{3}\right\}   \leq \frac{m}{3}.
   % \end{align}
   % In particular, if $K \geq 3N+2$, then 
\end{corollary}
\begin{proof}
    Suppose that $K \geq m+2$. By Theorem~\ref{thm:ftc_add_bound}.1, for $|T|= \left\lfloor \frac{m-1}{3} \right\rfloor$, there exists a 2-layer fully-connected ReLU network $g$ where the number of neurons of $g$ is less than or equal to
    \begin{align*} 
     \min \left\{3\left\lfloor \frac{m-1}{3} \right\rfloor+1, K-1\right\}= 3\left\lfloor \frac{m-1}{3} \right\rfloor+1 \leq m.
    \end{align*}
    Here, $K \geq m+2$ yields the first equality while the second inequality follows from the definition of the floor function. This yields the lower bound of \eqref{eq:1ftc}. On the other hand, due to Theorem~\ref{thm:ftc_add_bound}.2 and $K \geq m+2$, we conclude the upper bound of \eqref{eq:1ftc}.

    Suppose that $K \leq m+1$. Similarly, by Theorem~\ref{thm:ftc_add_bound}.1 for $|T| = K$, there exists a 2-layer fully-connected ReLU network $g$ where (the number of neurons of $g$) $\leq K-1 \leq m$. This yields $N^{\star} \geq K$. As $N^{\star} \leq K$ always holds by definition, we conclude $N^{\star}=K$.
\end{proof}

In other words, for sufficiently large $K$, the fine-tuning capacity $N^{\star}$ does not depend on the size $K$ of the underlying dataset $D$, but only on the number of neurons $m$. % in $g_\theta$.
%Note that in the lower bound, we need the floor function. For instance, if $N = \left\lfloor \frac{m-1}{3} \right\rfloor + 1$, then $3N+1 > m$ and thus Theorem~\ref{thm:ftc_add_bound} cannot be applied.
%from Theorem~\ref{thm:ftc_add_bound} may not be smaller than or equal to $m$.



%\dk{Draw some figures: comparison between memorization capacity and fine-tuning capacity}
% \begin{figure}[t!]
%     \centering
%     \vspace{-2mm}
%     \includegraphics[width=4cm]{fig/Memocap_vs_FTC_v4.pdf}
%     \vspace{-2mm}
%     \caption{Comparison between memorization capacity and fine-tuning capacity (FTC), in terms of the required number of neurons $m$ for memorization/fine-tuning.
%     As the number of pre-trained samples $K$ increases, $m$ scales linearly for \emph{memorizing} $K$ samples. On the other hand, $m$ saturates in $K \ge 3N+2$ regime for \emph{fine-tuning} $N$ out of $K$ samples.
%     }
%     \vspace{-2mm}
%     \label{fig:memocap_vs_ftc}
% \end{figure}
%\dk{Add remarks: no dependency of $N$ on $K$.}




%\dk{Explain the gap between the upper bound and the lower bound.}

%Note that the obtained upper bound $m_U$ and lower bound $m_L$ are illustrated in Fig.~\ref{fig:bounds_m}, for varying $K$.

% \subsection{Lower Bound on $m$}\label{sec:two_layer_lower}


% %We use the technique used for proving Theorem 3.3 of~\citep{yun2019small}. 
% We first prove when $K \ge 2N+1$. Define $T = \{2, 4, 6, ..., 2N\}$, and define $\vx_i = i\vu$ for all $i \in [N]$ for arbitrary vector $\vu \in \sR^d$. Let $z_i = 2$ if $i = 4k$ for some $k \in \sN$ and $z_i = -1$ if $i = 4k-2$ for some $k \in \sN$. See Fig.~\ref{fig:proof_num_pieces} when $K=14$ and $N=4$.  
% Then, $\bar{g}_{\theta}(t) := g_{\theta}(t\vu)$ is a piecewise affine function with at least $2N+1$ pieces. Recall that using Theorem 3.3 of~\citep{yun2019small} for 2-layer ReLU network, $\bar{g}_{\theta}(t)$ is having $m+1$ pieces, since ReLU activation is a piecewise linear function with two pieces. Thus, $m \ge 2N$ holds. 

% Note that given $K$ (the total number of samples used during pre-training) and $N$ (the number of samples we want to change the label), the above proof scheme specifies $T, \{\vx_i, z_i\}_{i=1}^K$ and counts the number of pieces for the piecewise-linear function $\bar{g}_{\theta}(t) = g_{\theta}(t\vu)$, under the setting of  $\vx_i = i\vu$. 

% We use similar technique for proving the lower bound on $m$ when $N \le K < 3N+2$. Consider revising $(T, x_i, z_i)$ triplet (defined for $K \ge 3N+2$ case),
% %by removing some indices $i \in T$ in Fig.~\ref{fig:proof_num_pieces} 
% so that $K < 3N+2$ condition is satisfied. For example, when $K = 2N+1$, the triplet is revised as $T = \{2, 4, \cdots, 2N\}$, $z_i  = 2$ if $i=4k$ for some $k \in \mathbb{N}$ and $z_i  = -1$ if $i=4k-2$ for some $k \in \mathbb{N}$, as illustrated in Fig.~\ref{fig:proof_num_pieces_K9}, which has $K-1=8$ pieces.
% For general $K$ and $N$ satisfying $K < 3N+2$, one can choose $(T, x_i, z_i)$ triplet such that the corresponding 
% $\bar{g}_{\theta}(t) = g_{\theta}(t\vu)$ has $K-1$ pieces. 
% % When $K < 3N+2$, we can follow similar proof technique for specifying $T$ and $\{\vx_i, z_i\}$ to get a tight bound on the number of required pieces of $\bar{g}_{\theta} (t)$. 
% Thus, the number of pieces $p(K)$ of $\bar{g}_{\theta}(t)$ we constructed can be represented as
% \begin{align}
%     p(K) = \begin{cases}
%     3N+1, & \text{ if } K \ge 3N+2 \\
%     K-1, & \text{ if } N \le K < 3N+2     
%     \end{cases}
% \end{align}
% This completes the proof. 


% \
\vspace{-3mm}
\subsection{Proof of Lower Bound on $m^{\star}$}\label{sec:two_layer_lower}
\vspace{-3mm}
%We use the technique used for proving Theorem 3.3 of~\citep{yun2019small}. 
We first prove the lower bound in Theorem~\ref{thm:ftc_add_bound} when $K \ge 3N+2$. 
% \kl{This is misleading -- it reads like you're proving under what conditions $K \ge 3N+2$ holds true.
% What you're proving is the inequality in Theorem 4.1 when $K \ge 3N+2$}
Define $T = \{3, 6, 9, \dots, 3N\}$, and define $\vx_i = i\vu$ for all $i \in [K]$ for an arbitrary nonzero vector $\vu \in \sR^d$. Let $z_i = 2$ if $i = 6k$ for some $k \in \sN$ and $z_i = -1$ if $i = 6k-3$ for some $k \in \sN$. See Fig.~\ref{fig:proof_num_pieces} for the case $K=14$ and $N=4$.
Then, $\bar{g}_{\theta}(t) := g_{\theta}(t\vu)$ is a piecewise affine function with at least $3N+1$ pieces. Recall that, using Theorem 3.3 of~\citet{yun2019small} for a 2-layer ReLU network with $m$ neurons, $\bar{g}_{\theta}(t)$ has at most $m+1$ pieces, since the ReLU activation is a piecewise linear function with two pieces. Thus, $m \ge 3N$ holds.
% \kl{We need to rewrite Theorem 3.3 in our paper for completeness. At least in the appendix. Using our own language.}
% \kl{For this case, do we need to choose '2' or '1' also works? For the second case, yes, I see why '2' is needed.}
% \kl{}
Note that given $K$ 
%(the total number of samples used during pre-training) 
and $N$,
%(the number of samples we want to change the label), 
the above proof scheme specifies $T, \{\vx_i, z_i\}_{i=1}^K$ and counts the number of pieces for the piecewise-linear function $\bar{g}_{\theta}(t) = g_{\theta}(t\vu)$, under the setting of  $\vx_i = i\vu$. 

We use a similar technique for proving the lower bound on $m$ when $N \le K < 3N+2$. Consider revising the triplet $(T, \vx_i, z_i)$ (defined for the $K \ge 3N+2$ case),
%by removing some indices $i \in T$ in Fig.~\ref{fig:proof_num_pieces} 
so that $K < 3N+2$ condition is satisfied. For example, when $K = 2N+1$, the triplet is revised as $T = \{2, 4, \cdots, 2N\}$, $z_i  = 2$ if $i=4k$ for some $k \in \mathbb{N}$ and $z_i  = -1$ if $i=4k-2$ for some $k \in \mathbb{N}$, as illustrated in Fig.~\ref{fig:proof_num_pieces_K9}, which has $K-1=8$ pieces.
For general $K$ and $N$ satisfying $K < 3N+2$, one can choose a triplet $(T, \vx_i, z_i)$ such that the corresponding 
$\bar{g}_{\theta}(t) = g_{\theta}(t\vu)$ has $K-1$ pieces. 
% When $K < 3N+2$, we can follow similar proof technique for specifying $T$ and $\{\vx_i, z_i\}$ to get a tight bound on the number of required pieces of $\bar{g}_{\theta} (t)$. 
Thus, the number of pieces $p(K)$ of $\bar{g}_{\theta}(t)$ we constructed can be represented as
\begin{align}\label{eqn:p_K}
    p(K) = \begin{cases}
    3N+1, & \text{ if } K \ge 3N+2 \\
    K-1, & \text{ if } N \le K < 3N+2     
    \end{cases}
\end{align}
This completes the proof. 


% \begin{figure}[t!]
%     \centering
%     \includegraphics[width=7cm]{fig/mU_mL.pdf}
%     \caption{
%     Upper/lower bounds on $m$ for different $K$. }
%     \label{fig:bounds_m}
% \end{figure}


% Note that \eqref{eqn:finetune_fit_well} is satisfied if and only if $\Delta f = \tilde{f} - f$ satisfies 
% \begin{align}%\label{eqn:new_dataset}
% \Delta f(\vx_i) & = \tilde{y}_i - y_i \quad \forall i \in T \\
% \Delta f(\vx_i)  &= 0 \quad \forall i \in [K] \setminus T
% \end{align}
% We prove $N \le  m$ by specifying the original dataset $\{(\vx_i, y_i) \}_{i=1}^K$, the new dataset $\{(\tilde{\vx}_i, \tilde{y}_i) \}_{i=1}^K$, and the fine-tuning index set $T$ with $\lvert T \rvert = \Delta m + 1$ such that there does not exist $\theta$ such that $\tilde{f}_{\theta}(\vx_i) = \tilde{y}_i$ holds for all $i \in [K]$. 


\vspace{-3mm}
\subsection{Proof of Upper Bound on $m^{\star}$}\label{sec:two_layer_upper}
\vspace{-3mm}
%\cmt{probably we can remove the below theorem? (since it is already merged in Thm.4.1)}

%\begin{theorem}
%\label{thm:ftc_add_upp_bound}
%    Under the same setting as in Theorem~\ref{thm:ftc_add_bound}, we have
%    \begin{align}
        %m \leq N + 2\min\{K-N, N\}.
%        m \leq \min \{3N, \max\{K-2, 1\}\}
%    \end{align}
%\end{theorem}



%\dk{Add high-level idea}

%\dk{Draw figures about removing points}

We here establish the upper bound in Theorem~\ref{thm:ftc_add_bound}. To be specific,  in Theorem~\ref{lem:ftc_upp}, we establish the upper bound on $m^{\star}$ in terms of the partition of $[K]\setminus T$. The key idea is to remove all points of  $[K]\setminus T$ except for the endpoints of each block as in Figure~\ref{fig:proof_remove}. More discussion will be provided below.

%See Figure~\ref{}

%The key idea is to remove all  

Let us introduce some terminology. For a given set $I$, a partition $\mathcal{P}$ of $I$ is a set of nonempty subsets $P$ of $I$ such that every element in $I$ is in exactly one of these subsets. We call each $P \in \mathcal{P}$ a \emph{block} of $\mathcal{P}$.



%the number of unchanged labels in each cluster. 


\begin{definition}
    %Let $W$ be a set of $K$ real numbers and 
    For $I \subset [K]$, we say that $\mathcal{P}(I)$ is the \emph{consecutive partition} of $I$ if its blocks are the maximal runs of consecutive integers in $I$, i.e., for $i \le j$ in $I$, $i$ and $j$ are in the same block of $\mathcal{P}(I)$ if and only if $\{i, i+1, \dots, j\} \subset I$.
    %$I$ is divided into the blocks of consecutive integers. 
    %In other words, all consecutive integers in $I$ are included in the same subset.
\end{definition}


\begin{example}
\label{ex:l2}
    If $I = \{1, 2, 3, 5, 6, 8, 10, 11, 12, 13\},$ then the \textit{consecutive partition} is defined as
\begin{align}\label{eqn:consecutive-partition}
    \mathcal{P}(I) = \{ \{1, 2, 3\}, \{5, 6\}, \{8\}, \{10, 11, 12, 13\}\},
\end{align}
and the blocks of $\mathcal{P}(I)$ are $\{1, 2, 3\}, \{5, 6\}$, $\{8\}$, and $\{10, 11, 12, 13\}$.
\end{example}



The following theorem shows that an upper bound on $m^{\star}$ is given in terms of the sizes of the blocks in $\mathcal{P}([K]\setminus T)$. For proving Theorem~\ref{thm:ftc_add_bound}, we find the uniform bound for general datasets using this bound and Lemma~\ref{lem:num2}.



%starting from $w_1$, we group 




%depends on the number of transitions between the unchanged labels and the changed ones.

\begin{figure}[t!]
    \centering
    \vspace{-3mm}
    \includegraphics[width=6cm]{fig/fig_3N_remove.pdf}
    \vspace{-2mm}
    \caption{
    Illustration of the neural network constructed in Theorem~\ref{lem:ftc_upp} with $K=14$ and $T = \{4, 7, 9, 14\}$. $\mathcal{P}([K]\setminus T)$ is the partition $\mathcal{P}(I)$ given in Example~\ref{ex:l2}. The gray points are the removed ones. %A new set $J = \{1,3,4,5,6,7,8,9,10,13,14\}$. 
    }
    %Proving Theorem~\ref{thm:ftc_add_bound} for $K=14, N=4$. }
    \label{fig:proof_remove}
\end{figure}

\begin{theorem}
\label{lem:ftc_upp}
%\dk{In complete}
Consider the same setting as in Theorem~\ref{thm:ftc_add_bound}, and suppose that 
\begin{align}
\label{eq:up0}
    \va^T\vx_1  < \va^T\vx_2 < \cdots < \va^T\vx_K    
\end{align}
holds for some $\va \in \sR^d$. Then, there exists an $m$-neuron network 
     $g_{\theta}(x) = \mW_2\sigma(\mathbf{1}_{m} \va^T \vx + \vb_1) + \vb_2,$
    such that \eqref{eqn:finetune_fit_well} holds, and 
    \begin{align}
    \label{eq:up1}
        m^{\star} \leq K-1 - \sum_{P \in \mathcal{P}([K]\setminus T)} \max\{ |P|-2,0 \}.
    \end{align}
    Here, $\mW_2 \in \sR^{1 \times m}$, $\vb_1 \in \sR^{m}$, and $\vb_2 \in \sR$.
    %$\vv, \vb \in \sR^m$, and 
    %$W = \{\vx_i^T \va : i \in [K] \}$.
    %For given $\va \in \sR^d$, suppose that $y_1 < y_2 < y_3 < \cdots < y_K$ where $y_i:= \vx_i^T \va$ for $i \in [K]$. Let $l$ be the number of clusters of $[K]\setminus T$, \dk{Need to define this appropriately} and $d_i$ be the size of $i$th cluster for $i \in [l]$. Then,
    %it holds that
     
    %Let $l$ 
    %be the number of $i$'s in $[K]$ such that
    %\begin{align}
    %    (i, i+1) \in T \times ([K]\setminus T) \hbox{ or } (i, i+1) \in ([K]\setminus T) \times T.
    %\end{align}
%    Then, $m \leq N+ l$.
\end{theorem}



%$g_{\theta}:\mathbb{R}^d \rightarrow \mathbb{R}$ with ReLU activation. Note that this 2-layer network $$g_{\theta}(x) = \mW_2\sigma(\mW_1 \vx + \vb_1) + \vb_2$$ is parameterized by $\theta = [\mW_1, \mW_2, \vb_1, \vb_2]$ where $\mW_1 \in \sR^{m \times d}$, $\mW_2 \in \sR^{1 \times m}$, $\vb_1 \in \sR^{m}$, and $\vb_2 \in \sR$.

\begin{remark}
\label{rem1}
    To minimize the width $m$ of the network, a smaller number of blocks of bigger sizes would be ideal. If we only have one block (with at least two elements) in the partition $\mathcal{P}([K]\setminus T)$, then \eqref{eq:up1} is given as
    \begin{align*}
        m^{\star} \leq K-1 - | [K]\setminus T | +2 = N + 1. 
    \end{align*}
    This happens when $\va^T \vx_i$s for $i \in [K]\setminus T$ are segregated for some $\va$, \eg when $[K]\setminus T = \{i, i+1, \cdots, j\}$ for some $i < j$ in $[K]$.
    %$[K]\setminus T = \{i, i+1, \cdots, j\}$
    %for some $i,j$.
    %$\va^T \vx_i$'s are perfectly separated:
    %\begin{align}
    %    \va^T \vx_i < \va^T \vx_j 
    %\end{align}
    %for any $i \in [K]\setminus T,    j \in T$, and for some $\va \in \mathbb{R}^d$. 
    Thus, some appropriate projection yields a smaller number of neurons required.
    %\kl{I don't understand. Can't we have $I = \{3,4,5,..,N+2\}$? In this case, there is one block, but the above ordering doesn't hold.}
    %a good projection can give 
    %In other words, if ch
    %an appropriate projection of $\vx_i$ helps us to obtain the sharper bound.
    %so that two sets $\{\va^T \vx_i : i \in [K]\setminus T\}$ and $\{\va^T \vx_i : i \in T\}$ are clustered, then we can get the sharper bound.
    %\begin{align}
    %\label{eq:up2}
    %    m &\leq K-2 - \sum_{P \in \mathcal{P}([K]\setminus T)} |P|-2,\\
    %    &
    %\end{align}
    %The inequality \eqref{eq:up1} shows that  
    %\dk{Add some comments about $|P|$ dependence}
\end{remark}

%\begin{remark}
    %In general, the best scenario in Remark~\ref{rem1} may not happen. 
    
    %The general upper bound in Theorem~\ref{thm:ftc_add_bound}, we find the upper bound
%\end{remark}

%\begin{remark}
%    It is worth pointing out that the idea described in Remark~\ref{rem1} can be applied in 3-layer networks. Adding one more layer, we may obtain the desired cluster and obtain the sharper bound. See Section~\ref{sec:three_layer_upper} for more details.
%    \cmt{Need to be revised?}
    %we describe how  can help to minimize the width by ing two sets as in .
%\end{remark}


\begin{remark}
    The worst scenario is when every block of $\mathcal{P}([K]\setminus T)$ has at most $2$ elements, as in Figures~\ref{fig:proof_num_pieces} and \ref{fig:proof_num_pieces_K9}. Note that if $K - N$ is much larger than $N$, then this scenario cannot occur. This is because the number of blocks cannot be larger than $\min\{K-N, N+1\}$ by Lemma~\ref{lem:num}.
    %this can only happens 
    %then the inequality \eqref{eq:up1} is given as
\end{remark}

\begin{proof}[Proof of Theorem~\ref{lem:ftc_upp}]
    First, we consider the case when %\sy{sy: why $W$ is in $\mathcal{P}$?} \dk{Thanks, updated.}
    \begin{align}
    \label{eq:ftc_upp1}
        |P| \leq 2 \hbox{ for all } P \in \mathcal{P}( [K]\setminus T).    
    \end{align}
     In this case, the sum in \eqref{eq:up1} vanishes, so it suffices to prove $m^{\star} \leq K-1.$
    As $K-1$ neurons suffice to fit $K$ data points, the inequality directly follows from the standard argument as in \cite{zhang2016understanding}, which is also shown in Lemma~\ref{lem:linear}.
    %For later purpose, 

    Let us consider general cases. The main strategy is to remove data points so that \eqref{eq:ftc_upp1} holds. Specifically, except for two endpoints of each block $P \in \mathcal{P}([K]\setminus T)$, we remove $i \in [K]\setminus T$ from $[K].$ Let us denote this new subset of $[K]$ by 
    \begin{align}\label{eqn:J-index-set}
     J = \{j_1 < j_2 < \cdots < j_{|J|}\}   
    \end{align}
    For example, when $K=14$ and $T = \{4,7,9,14\}$ as in Fig.~\ref{fig:proof_remove}, the consecutive partition $\mathcal{P}(I)$ is given in Eq.~\ref{eqn:consecutive-partition}, and thus we remove $\{2,11,12\}$ from $[K] = \{1,2, \cdots, 14\}$, which gives us $J = \{1,3,4,5,6,7,8,9,10,13,14\}$.  
    Note that the number of data points in $J$ is 
    \begin{align}
    \label{eq:j}
      |J| = K - \sum_{P \in \mathcal{P}( [K]\setminus T)} \max\{ |P|-2,0 \}.  
    \end{align}
    Applying Lemma~\ref{lem:linear} with $A:= \{(\va^T\vx_j, z_j)\}_{j \in J}$ and $m = |J|-1$, there exist $\mW_2 \in \sR^{1 \times m}$, $\vb_1 \in \sR^{m}$, and $\vb_2 \in \sR$ such that 
    \begin{align*}
        h_\theta(\va^T\vx_j) = \mW_2\sigma(\mathbf{1}_{m} \va^T \vx_j + \vb_1) + \vb_2 = z_j 
    \end{align*}
    for all $j \in J$. Thanks to $h_\theta(\va^T\vx) = g_\theta(\vx)$, we conclude $g_\theta(\vx_j) = z_j$ for all $j\in J$.
    
    Lastly, for $i \in [K]\setminus J$, there exist two consecutive elements $j_{l} < j_{l+1}$ of $J$ such that $[j_{l}, j_{l+1}] \cap ([K]\setminus T)$ is a block in $\mathcal{P}([K]\setminus T)$, and $j_l < i < j_{l+1}$. Note that $h_\theta$ constructed in Lemma~\ref{lem:linear} is affine on $[\va^T \vx_{j_{l}}, \va^T \vx_{j_{l+1}}]$. Since $g_\theta(\vx_{j_{l}}) = g_\theta(\vx_{j_{l+1}}) = 0$, we conclude that $g_\theta(\vx_i) = 0$ as desired. %for $x \in [\vx_{i_{j}}, \vx_{i_{j+1}}]$.
    %of  such that $j_i, i, j_{i+1} in [K]\setminus T$.
    %with $|J|$ neurons by using .
    %Notice that 
    %\dk{The last step will be included.} 
    %The proof 
    %We first assume that $1, K \in [K]\setminus T$. Then, we have $l/2$ connected components of the unchanged labels. Denote the number of index in each connected components by $c_1, c_2, \cdots, c_{l/2}$.
\end{proof}



%To complete the last step of 



%In \cite{zhang2016understanding}, it has been shown that $m$


\begin{lemma} 
\label{lem:linear}
    Let $m\geq1$ and $A = \{(w_i, z_i)\}_{i=1}^{m+1}$, where $w_1 < w_2 < \cdots < w_{m+1}$, $w_i \in \sR$ and $z_i \in \sR$. Then, there exist $\mW_2 \in \sR^{1 \times m}$, $\vb_1 \in \sR^{m}$, and $\vb_2 \in \sR$ such that 
     %\sy{sy: $z_i$ means $y_i$?}
    \begin{align}
    \label{eq:linear}
        h_\theta(x) = \frac{z_{i}-z_{i+1}}{w_i-w_{i+1}}(x - w_i) + z_i
    \end{align}
    for $x \in [w_i, w_{i+1}]$, $i=1,2,\cdots, m-1$
    and \eqref{eq:linear} with $i=m$ holds in $[w_{m}, \infty)$
    where $h_\theta(x) = \mW_2\sigma(\mathbf{1}_{m} x + \vb_1) + \vb_2.$
\end{lemma}

\begin{proof} %\dk{Incomplete}
    We prove this by induction. For $m=1$, choose $\vb_1 = -w_1$ and $\vb_2  = z_1$, then $h_\theta(w_1) = z_1$. Take $\mW_2 = \frac{z_{1}-z_{2}}{w_1-w_{2}}$ and we get \eqref{eq:linear} with $i=1$ in $[w_1, \infty)$.

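    Suppose that the above result holds for $m = k$. Then, there exist $\mW_2 \in \sR^{1 \times k}$, $\vb_1 \in \sR^{k}$, and $\vb_2 \in \sR$ satisfying \eqref{eq:linear} in $[w_i, w_{i+1}]$ for $i = 1,2, \cdots, k-1$ and $[w_k, \infty)$. Using this, we choose $\widetilde{\mW}_2 = \left(\mW_2, \frac{z_{k+1}-z_{k+2}}{w_{k+1}-w_{k+2}} - \frac{z_{k}-z_{k+1}}{w_{k}-w_{k+1}}\right)$ and $\widetilde{\vb}_1 = (\vb_1, -w_{k+1})$. Since the added neuron vanishes on $(-\infty, w_{k+1}]$, the function $\widetilde{h}_\theta(x) = \widetilde{\mW}_2\sigma(\mathbf{1}_{k+1} x + \widetilde{\vb}_1) + \vb_2$ agrees with $h_\theta$ there, so \eqref{eq:linear} still holds in $[w_i, w_{i+1}]$ for $i = 1,2, \cdots, k$. Moreover, the slope of $\widetilde{h}_\theta$ on $[w_{k+1}, \infty)$ equals $\frac{z_{k+1}-z_{k+2}}{w_{k+1}-w_{k+2}}$, and thus $\widetilde{h}_\theta$ satisfies \eqref{eq:linear} with $i=k+1$ in $[w_{k+1}, \infty)$.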
    %$\vb_1= (-w_1, -w_2)^T$ and $b_1  = z_1$, then $h_\theta(w_1) = z_1$. On the other hand, let $W_2 = (\frac{z_{i}-z_{i+1}}{w_i-w_{i+1}}$
    %we get  such that $h_\theta(\vx) = \vv \sigma(\vx + \vb_1) + b_2$ satisfies the above condition.
\end{proof}


\begin{lemma}
\label{lem:num}
    For $I \subset [K]$ and the consecutive partition, $\mathcal{P}(I)$, it holds that
    \begin{align*}
        |\mathcal{P}(I)| \leq \min \{ |I|, |[K]\setminus I| + 1\}.
    \end{align*}
\end{lemma}
\begin{proof}
    First, $|\mathcal{P}(I)| \leq |I|$ directly follows from the definition of the partition. 
    
    On the other hand, let $L = |\mathcal{P}([K]\setminus I)|$, and let $a_i \leq b_i$ be the endpoints of the $i$-th block in $\mathcal{P}([K]\setminus I)$, ordered so that $b_i < a_{i+1}$:
    \begin{align*}
        \mathcal{P}([K]\setminus I) &= \{[a_i, b_i] \cap [K]: 1\leq i \leq L\}.
    \end{align*}
    Then, every block of $\mathcal{P}(I)$ is one of the nonempty sets among
    \begin{align*}
        &\{[b_i+1, a_{i+1}-1] \cap [K] :1\leq i \leq L-1\}\\& \cup \{ [1, a_1-1] \cap [K], [b_{L}+1, K] \cap [K] \},
    \end{align*}
    and thus we conclude that
    $|\mathcal{P}(I)| \leq L + 1 \leq |[K]\setminus I| + 1.$
\end{proof}

\begin{lemma}
\label{lem:num2}
    Under the same setting as in Theorem~\ref{lem:ftc_upp}, we have
    \begin{align*}
        \sum_{P \in \mathcal{P}([K]\setminus T)} \max\{ |P|-2,0 \} \geq \max\{K-3N-2, 0 \}.
    \end{align*}
    In particular, $J$ given in \eqref{eq:j} satisfies
    \begin{align*}
        |J| \leq \min\{3N+2, K \}.
    \end{align*}
\end{lemma}

\begin{proof}
    Recall that $\sum_{P \in \mathcal{P}([K]\setminus T)} |P| = |[K]\setminus T| =  K-N.$
    Using this, we have
    \begin{align*}
        \sum_{P \in \mathcal{P}([K]\setminus T)} \max\{ |P|-2,0 \} &\geq \sum_{P \in \mathcal{P}([K]\setminus T)} (|P| - 2 )\\
        &\geq K-N - 2 |\mathcal{P}( [K]\setminus T)|.
    \end{align*}
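    By Lemma~\ref{lem:num}, the number of blocks in the partition cannot be larger than $\min{\{K-N, N+1\}} \leq N+1$, and hence the right-hand side above is at least $K-3N-2$. Since the left-hand side is also nonnegative, the first claim follows; the bound on $|J|$ then follows from \eqref{eq:j}.%, we conclude.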
    %\sy{sy: $\min{\{K-N, |T|+1\}}=N+1$ always?}
\end{proof}

%\begin{lemma}
%    Suppose that 
    
%    For given 
    
%    There exists a network 
%\end{lemma}

%\dk{I think we may assume $\vx_i \ne \vx_j$ for all $i \ne j$ at the beginning.}
%\js{yeah, sounds good. Added it in Sec.3.1 and removed in Lemma 1 statement}


%\begin{lemma}
%    For $x \in \mathbb{R}^d$ and $y \in \mathbb{R}$, there exists $\mW \in \sR^{3 \times d}$, $\vv \in \sR^{3}$ and $\vb \in \sR^{3}$ such that
    
%\end{lemma}

%We construct a 





%\begin{proof}[Proof of Theorem~\ref{thm:ftc_add_upp_bound}]
%\cmt{Do we need to put it here?}

\textbf{Proof of upper bound in Theorem~\ref{thm:ftc_add_bound}:}
    Since $\vx_i \ne \vx_j$ for all $i \ne j$, there exists $\va \in \sR^d$ satisfying $\vx_i^T \va \ne \vx_j^T \va$ for all $i \ne j$. Without loss of generality (relabeling the indices if necessary), we assume that \eqref{eq:up0} holds. Then, using Theorem~\ref{lem:ftc_upp} and choosing $\mW_1 = \mathbf{1}_{m} \va^T$, there exists
    \begin{align*}
        g_{\theta}(x) = \mW_2\sigma(\mW_1 \vx + \vb_1) + \vb_2, 
    \end{align*}
    satisfying \eqref{eqn:finetune_fit_well} and 
    \begin{align*}
         m \leq K - 1 - \sum_{P \in \mathcal{P}([K]\setminus T)} \max\{ |P|-2,0 \}.
    \end{align*}
    By Lemma~\ref{lem:num2}, we conclude that
    %\begin{align*}
        $m \leq K-1 - \max\{K-3N-2, 0 \} = \min \{3N+1, K-1\}.$ 
    %\end{align*}
    %\sy{sy: why not $\min{\{3N+2, K\}}$?}
    %\hfill$\Box$
    %We claim that 
    %\begin{align}
    %    l \leq 2 \min\{K-N, N\}
    %\end{align}
    %where $l$ is the number of transitions given in Proposition~\ref{prop:ftc_upp}.
    %Then, we conclude by Proposition~\ref{prop:ftc_upp}.
%\end{proof}

% \kl{}
% \kl{Use `qedhere' to move the square box}


% \dk{
% \begin{remark}
%     The above bound may be improved. %For given $a \in \mathbb{R}^d$, let 
%     \begin{align}
%         m \leq N + 2\min\{K-N, N\}.
%     \end{align}
%     %Indeed, let $l$ be the number of boundaries between  and 
%     Let $l$ be the number of transitions between the unchanged labels and the changed ones. Then, $m \leq N+ l$.
% \end{remark}
% }


% \dk{Let us add some motivation to the above proof}


\vspace{-3mm}
\section{FTC of 3-layer ReLU Network}
\label{sec:ftc3}
\vspace{-3mm}

%As extension of section ~\ref{sec:two_layer}, 
Now we analyze the FTC of a 3-layer fully-connected neural network $g_{\theta}:\mathbb{R}^d \rightarrow \mathbb{R}$ with ReLU activation. Note that a 3-layer network can be represented as 
\begin{align}
\label{eq:3l}
    g_{\theta}(x) = \mW_3 \sigma(\mW_{2} \sigma(\mW_{1} \vx + \vb_{1}) + \vb_{2}) + b_3.
\end{align} 
%\sy{(sy: I edited here to refer this equation in corollary 5.2)} 
As before, $\sigma$ is the ReLU activation and the network is parameterized by $\theta = [ \mW_{1}, \mW_{2}, \mW_3, \vb_{1}, \vb_{2}, b_3]$, where $\mW_{1} \in \sR^{d_{1} \times d}$, $\mW_{2} \in \sR^{d_{2} \times d_{1}}$, $\mW_{3} \in \sR^{1\times d_{2}}$, $\vb_{1} \in \sR^{d_{1}}$, $\vb_{2} \in \sR^{d_{2}}$, and $b_3 \in \sR$. 
Following the setting considered in the memorization capacity of 3-layer neural network~\citep{yun2019small}, we consider the scenario when $z_i \in [-1, +1]$ for all $i \in [K]$.
For notational simplicity, we denote $z_i = 0$ for $i \in [K]\setminus T$.
The theorem below summarizes the upper/lower bounds on the FTC of a 3-layer network. 
%\sy{sy: 1) $\sigma$ notation is not needed to be explained?} \sy{sy: 2) instead of $i \in [K] \setminus T$, using $i \in [K]\setminus T$ is better for consistency}

%\cmt{Need additional assumption: $z \in [-1, +1]$. Normalized..?}
%Sections 3.2.1 and 3.2.2 provide the lower/upper bounds on the fine-tuning capacity N, and the summary of these results can be written as the theorem below:

\begin{theorem}
[FTC of 3-layer FC ReLU]\label{thm:ftc_add_bound_3layer}
Let $K \ge 3$, $T \subseteq [K]$, $|T| = N$, and $g$ be a 3-layer FC ReLU network with $m$ neurons.
\begin{enumerate}
    \item 
    For all $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, there exists $g$ with $m$ neurons satisfying \eqref{eqn:finetune_fit_well} 
    where \[ m \leq \min\{ 2\sqrt{K} + \min\{ 2\sqrt{K}, 3N \}, 6 \sqrt{3N+2} \}. \]
    \item 
    For given $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, suppose that \eqref{eqn:finetune_fit_well} holds for some $g$ with $m$ neurons. Then, \[ \sqrt{2\min \{3N, K-2\}+\frac{1}{4}} - \frac{1}{2}  \leq m. \]
\end{enumerate}
%. The minimum number of neurons $m$ required in 3-layer FC ReLU network for fine-tuning $N$ samples is bounded as
Thus, the minimum number of neurons $m^{\star}$ is bounded as
%The fine-tuning capacity $N$ of a 3-layer FC ReLU network $g_{\theta}$ with $d_{1}$ neurons in layer 1, $d_{2}$ neurons in layer 2 is bounded as
\begin{align*}
    &\sqrt{2\min \{3N, K-2\}+\frac{1}{4}} - \frac{1}{2} \\
    & \quad \quad \le m^{\star} \le \min\{ 2\sqrt{K} + \min\{ 2\sqrt{K}, 3N \}, 6 \sqrt{3N+2} \}
    % \min \{3N, \max(K-2, 0)\} \le 2d_{1}d_{2}+d_{2} %\le ???.
\end{align*}
\end{theorem}

\begin{remark}
%\sy{
The upper bound in Theorem~\ref{thm:ftc_add_bound_3layer} directly shows that $m^{\star} = O(\sqrt{N})$.
    The lower bound in Theorem~\ref{thm:ftc_add_bound_3layer} indicates two facts: 
    \begin{itemize}
        \item If $3N \le K-2$, then $m^{\star} = \Omega(\sqrt{N})$,
        \item If $3N > K-2$, then $m^{\star} = \Omega(\sqrt{K})$. Combining this with $K \ge N$, we have $m^{\star} = \Omega(\sqrt{N})$.
    \end{itemize} 
    All in all, $m^{\star} = \Theta(\sqrt{N})$.
%}
% The upper/lower bounds in Theorem~\ref{thm:ftc_add_bound_3layer} have the following orders, depending on the relationship between $N$ (the number of samples we fine-tune) and $K$ (the number of samples used for pre-training):
% \begin{align*}
%     \begin{cases}
%         \text{ If } N = o(\sqrt{K}), & \text{ then } m = \Theta(\sqrt{N}) \\
%         \text{ If } N = \Theta(\sqrt{K}), &  \text{ then } \Theta( \sqrt[4]{K} ) \le m \le \Theta(\sqrt{K}) \\
%         \text{ If } N = \Theta(K), &  \text{ then } m = \Theta(\sqrt{K}) 
%     \end{cases}
% \end{align*}


% \cmt{Add Corollary 5.2 (representing Thm 5.1 in terms of $N^{\star}$ somewhere here... Check Corollary 4.2 as a reference...}

\begin{corollary} [FTC of 3-layer FC ReLU]\label{coro:ftc3NN}
    For given $m \in \mathbb{N}$, let $N^{\star}$ be the fine-tuning capacity of a 3-layer fully-connected ReLU network $g$ given in \eqref{eq:3l} with $m$ neurons.
    Then, $N^{\star}$ is bounded as below, for different range of $K$:
    \begin{enumerate}
        \item 
            If $K \le \left\lfloor \frac{m^2}{16} \right\rfloor$, then
            \begin{align}
            \label{eq:ftc3l1}
            N^{\star} = K.
            \end{align}
        \item 
            If $\left\lfloor \frac{m^2}{16} \right\rfloor + 1 \le K < \frac{m^2+m+4}{2}$, then
            \begin{align}
            \label{eq:ftc3l2}
            \left\lfloor \frac{m^2}{108} - \frac{2}{3} \right\rfloor \le N^{\star} \le K.
            \end{align}
        \item 
            If $K \ge \frac{m^2+m+4}{2}$, then
            \begin{align}
            \label{eq:ftc3l3}
            \left\lfloor \frac{m^2}{108} - \frac{2}{3} \right\rfloor \le N^{\star} \le \frac{m^2+m}{6}.
            \end{align}
    \end{enumerate}
\end{corollary}
\begin{proof}
    % From Theorem ~\ref{thm:ftc_add_bound_3layer}.1, we know the existence of a neural network $g$ with $m$ neurons, satisfying
    % \begin{align}
    %     m &\le \min\{ 4\sqrt{K} , 2\sqrt{K}+3N, 6 \sqrt{3N+2} \} \nonumber\\
    %     &\le \min\{ 4\sqrt{K} , 6 \sqrt{3N+2} \}.
    % \end{align} 
    %\cmt{asdf}
    % First of all, we need to divide the interval of K based on Theorem ~\ref{thm:ftc_add_bound_3layer}.1. In other words, we have to control the size of $K$ to ensure that the desired outcome is obtained from $\min\{ 4\sqrt{K} , 2\sqrt{K}+3N, 6 \sqrt{3N+2} \}$. But for simplicity of our statement and proof, we will simplify our approach by focusing solely on $ \min\{ 4\sqrt{K}, 6 \sqrt{3N+2} \}$ during our analysis. This approach is still valid because we are only giving up a small amount of precision, specifically for cases where $N \le \frac{2}{3}\sqrt{K}$. In these cases, the loss in tightness is just $2\sqrt{K}-3N$, which is not critically large.
    Suppose $K \le \left\lfloor \frac{m^2}{16} \right\rfloor$. By Theorem \ref{thm:ftc_add_bound_3layer}.1, for $|T|= K$, there exists a 3-layer fully-connected ReLU network $g$ where the number of neurons of $g$ is less than or equal to
    \begin{align*}
        \min\{ 4\sqrt{K}, 2\sqrt{K}+3K, 6\sqrt{3K+2} \} = 4\sqrt{K} \le m.
    \end{align*} This implies that $N^{\star} \ge K$, and since $N^{\star} \le K$ always holds, we can conclude $N^{\star} = K$.

    Suppose $K \ge \left\lfloor \frac{m^2}{16} \right\rfloor + 1$. 
    %\sy{sy: I think the interval of $K$ don't have to be expressed with floor function, so it might be okay to replace with $K > \frac{m^2}{16}$} 
    By Theorem \ref{thm:ftc_add_bound_3layer}.1, for $|T|=\left\lfloor 
    \frac{m^2}{108}-\frac{2}{3} \right\rfloor$, there exists a 3-layer fully-connected ReLU network $g$ where the number of neurons of $g$ is less than or equal to
    \begin{align}%\label{eqn:}
        &\min\{ 4\sqrt{K}, 6\sqrt{3\left\lfloor 
        \frac{m^2}{108}-\frac{2}{3} \right\rfloor+2} \} \\
        &= 6\sqrt{3\left\lfloor 
        \frac{m^2}{108}-\frac{2}{3} \right\rfloor+2} \le m,
    \end{align}
    where the equality follows from the fact that $4\sqrt{K} \ge m$ for 
    $K \ge \left\lfloor \frac{m^2}{16} \right\rfloor + 1$, and the inequality follows from $6\sqrt{3\left\lfloor 
    \frac{m^2}{108}-\frac{2}{3} \right\rfloor+2} \le 6\sqrt{3 \left( 
    \frac{m^2}{108}-\frac{2}{3} \right)+2} = m$.
    Now we can conclude that $N^{\star} \ge \left\lfloor 
    \frac{m^2}{108}-\frac{2}{3} \right\rfloor$.

    Now we derive the upper bound on $N^{\star}$. %, when $K \ge \left\lfloor \frac{m^2}{16} \right\rfloor + 1$. 
    % Recall that from Theorem \ref{thm:ftc_add_bound_3layer}.2,
    % \begin{align*}
    %     m \ge \sqrt{2\min \{3N, K-2\}+\frac{1}{4}} - \frac{1}{2}.
    % \end{align*}
    If $K \ge \frac{m^2+m+4}{2}$, then
    \begin{align*}
        m &\ge \sqrt{2\min \{3N, K-2\}+\frac{1}{4}} - \frac{1}{2} \\ &\ge \sqrt{\min{\{ 6N, m^2+m \}} + \frac{1}{4}} - \frac{1}{2}
        \\ &= \min{\left\{ \sqrt{6N+\frac{1}{4}}, \, m+ \frac{1}{2}  \right\}} - \frac{1}{2} 
        \\ &= \min{\left\{ \sqrt{6N+\frac{1}{4}} - \frac{1}{2}, \, m \right\}},
    \end{align*}
    where the first inequality is from Theorem \ref{thm:ftc_add_bound_3layer}.2, and the second inequality is from $K \ge \frac{m^2+m+4}{2}$.
    We can simplify the above inequalities as $m \ge \sqrt{6N+\frac{1}{4}} - \frac{1}{2}$. Since this holds for any number of fine-tunable samples $N$, we have $N^{\star} \le \frac{m^2+m}{6}$.

    If $\left\lfloor \frac{m^2}{16} \right\rfloor + 1 \le K < \frac{m^2+m+4}{2}$, we use a trivial upper bound 
    % Suppose $K < \frac{m^2+m+4}{2}$. Then, we cannot determine a specific upper bound for $N$ because we do not have enough information to ascertain which value will come out from $\min{\{3N, K-2\}}$ given only the conditions provided. Hence we can only get the trivial upper bound 
    $N^{\star} \le K$, which completes our proof. 
\end{proof}


 \begin{figure}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=8cm]{fig/FTC_3layer_corollary5_2.pdf}
    \vspace{-2mm}
    \caption{
    Visualization of FTC for 3-layer network in Corollary~\ref{coro:ftc3NN}. In this figure, $A = \left\lfloor\frac{m^2}{108}-\frac{2}{3}\right\rfloor$ and $B = \frac{m^2+m}{6}$.
    }
    \vspace{-2mm}
    \label{fig:bound}
    \end{figure}

% \begin{align}
%     \bullet \ N &= o(\sqrt{K})    &\rightarrow \quad \   m &= \Theta(\sqrt{N})  \\
%     \bullet \ N &= \Theta(\sqrt{K}) &\rightarrow \ \ \Theta(K^{1/4}) \le & \ m \le \Theta(K^{1/2}) \\
%     \bullet \  N &= \Theta(K) &\rightarrow \ m = \Theta(K^{1/2}) 
% \end{align}
\end{remark}
%\kl{align equations better?}
% But, to clarify the relationship between $m$ and $N$, we can express the above theorem as follows:

% \begin{theorem}
% [Additive FTC bound of 3-layer, equivalent form]\label{thm:ftc_add_bound_3layer_equivalent}
% The fine-tuning capacity $N$ of a 3-layer FC ReLU network $g_{\theta}$ with $m$ neurons, where $d_{1}=d_{2}=\frac{m}{2}$ neurons in each layer, is bounded as
% \begin{align}
%     \min \{3N, \max(K-2, 0)\} \le \frac{m^2}{2} + \frac{m}{2} %\le ???.
% \end{align}
% \end{theorem}

% \begin{remark}
% With Simple modification, we can also reformulate Theorem ~\ref{thm:ftc_add_bound_3layer_equivalent} as below:
% \begin{align}
%     \sqrt{2\min \{3N, \max(K-2, 0)\}+\frac{1}{4}} - \frac{1}{2} \le m %\le ???.
% \end{align}
% \end{remark}

% \dk{1. I thought we didn't decide whether to interpret this thm from the perspective of $m$ or from the perspective of $N$, so I just write both versions with remark.}

% \dk{2. Do we need to explain why we set $d_{1}=d_{2}=m/2$?}
Fig.~\ref{fig:bound} illustrates the results in Corollary~\ref{coro:ftc3NN}. For different ranges of $K$, we have either a constant $N^{\star} = K$, or upper/lower bounds on $N^{\star}$. 

\subsection{Proof of Lower Bound on $m$}\label{sec:three_layer_lower}
\vspace{-3mm}

We follow the proof technique used in Sec.~\ref{sec:two_layer_lower}. Since the minimum number of pieces $p(K)$ provided in Eq.~\ref{eqn:p_K} is still valid for 3-layer neural networks, we just need to check how many pieces $\bar{g}_{\theta}(t)$ has. As stated in the proof of Theorem 3.3 of~\citep{yun2019small}, 
%for 3-layer ReLU network
%, we can verify that 
$\bar{g}_{\theta}(t)$ has $2d_1d_2+d_2+1$ pieces, where $d_1$ and $d_2$ are the number of neurons in layer 1 and 2, respectively. %The only difference from the 2-layer ReLU network is the representation of pieces of $\bar{g}_{\theta}(t)$, thus we can complete this proof with similar idea. 
Thus, we have %$p(K) \le 2d_1d_2+d_2+1$, 
\begin{align*}
    \min \{3N, K-2\} \le 2d_1d_2+d_2 \le \frac{m^2}{2} + \frac{m}{2},
    % \min \{3N, \max(K-2, 0)\} \le 2d_1d_2+d_2+1 \le \frac{m^2}{2} + \frac{m}{2},
\end{align*} 
%\sy{sy: since $\min{\{3N+1, K-1\}} \le 2d_1 d_2 + d_2 + 1, \quad \min{\{3N, K-2\}} \le 2d_1 d_2 + d_2$ is correct}
where the last inequality is from the fact that, under the constraint $d_1 + d_2 = m$, the quantity $2d_1d_2+d_2$ attains its maximum value when $d_1 = d_2 = m/2$. 
Reformulating the above inequality with respect to $m$ completes the proof.


% \cmt{asdf}

% Finally we get the similar inequality with Theorem ~\ref{thm:ftc_add_bound} for upper bound on $N$, and note that m of Theorem ~\ref{thm:ftc_add_bound} was substituted with expressions for $d_1$ and $d_2$.

\subsection{Proof of Upper Bound on $m$}\label{sec:three_layer_upper}

We can prove $m \le U$ for an upper bound $U$ by constructing a 3-layer neural network $g_{\theta}$ having $U$ neurons, which satisfies Eq.~\ref{eqn:finetune_fit_well} for given $N$ and $K$. Below we construct different types of neural networks satisfying the condition, where each construction gives different upper bounds $U_1 = 4\sqrt{K}, U_2 = 2\sqrt{K} + 3N$ and $U_3  = 6\sqrt{3N+2}$. This completes the proof of $m \le \min\{U_1, U_2, U_3\}$. 
%Due to the space limitation, we defer the proof for $m \le U_1$ in Appendix.
%, specified in each step 1,2 and 3, respectively.

% \begin{theorem}
% [Upper bound on $m$ for 3-layer, ]\label{thm:ftc_add_upper_bound_3layer}
% The fine-tuning capacity $N$ of a 3-layer FC ReLU network $g_{\theta}$ with $m$ neurons, where $d_{1}=d_{2}=\frac{m}{2}$ neurons in each layer, is bounded as
% \begin{align}
% m \le 2\sqrt{K} + \min\{ 2\sqrt{K}, 4N \}
% \end{align}
% \end{theorem}
% \begin{proof}
    % \cmt{plz put the proof here}    



 
    \begin{figure}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=8cm]{fig/3layer_U1.pdf}
    \vspace{-2mm}
    \caption{
    Construction of 3-layer network that achieves upper bound $U_1 = 4\sqrt{K}$ in Sec.~\ref{sec:three_layer_upper}. This construction is directly from~\citep{yun2019small}, and each neuron uses hard-tanh activation in Eq.~\ref{eqn:hard-tanh}.
    This illustration gives an example when $T= \{7\}$, \ie we change the label for $\vx_7$, while maintaining the label for other samples.
    }
    \vspace{-2mm}
    \label{fig:3layer2sqrtK}
    \end{figure}
       
    \paragraph{Proof of $m \le  4\sqrt{K}$:}

    Recall the neural network $g_{\theta}$ constructed in the proof of Theorem 3.1 of~\citep{yun2019small}, containing $\sqrt{K}$ neurons in each of the 1st and 2nd layers. See Fig.~\ref{fig:3layer2sqrtK} for an illustration of such a network when $K=16$ and $N=1$, where the index set of fine-tuning samples is $T = \{7\}$. In this case, $z_7 \ne 0$ from the definition of $T$.
    Each box (node) in the figure is a neuron, where the curve inside the box represents the activation function of the neuron. In this figure, each neuron uses hard-tanh activation defined as 
    %$\sigma_H (x) = -1 \cdot \mathbbm{1}_{t \le -1} + t \cdot \mathbbm{1}_{-1 < t \le 1} + 1 \cdot \mathbbm{1}_{t > 1}$.
    \begin{align}\label{eqn:hard-tanh}
        \sigma_H(t) = \begin{cases}
            -1, & \quad t \le -1 \\
            t, & \quad -1 < t < +1 \\
            +1, & \quad t \ge +1,
        \end{cases}
    \end{align}
    where $[-1, 1]$ is the \textit{non-clipping} region of $\sigma_H$, and $[-1, 1]^c$ is the \textit{clipping} region of $\sigma_H$.
    %, where 
    %$\mathbbm{1}_{A}$ is the indicator defined as $\mathbbm{1}_{A} = 1$ if condition $A$ holds and $\mathbbm{1}_{A} = 0$ otherwise.
    Note that the digit $i$ in the box represents the location where the feature $\vx_i$ for the $i$-th sample is mapped to. 
    For example, for the first neuron of layer 1, we have $\alpha_1^1 (\vx_i)  \in [-1,1]$ for $i \in \{1,2,3,4\}$ and $\alpha_1^1 (\vx_i) >  1$ for $i \in \{5, 6, \cdots, 16\}$, where %\sy{sy: not critical, but I think non-clipping region should be unification, [-1, 1] or [-1, 1]}
    \begin{align}\label{eqn:alpha}
        \alpha_j^l (\vx) = \mW_{l,j} \vx + b_{l,j}
    \end{align} 
    is the input value of node $j$ in layer $l$, when the input for the network $g_{\theta}$ is given as $\vx$. Here, the weight matrix and the bias for layer 1 are denoted by $\mW_1 = [\mW_{1,1}^T; \cdots; \mW_{1,\sqrt{K}}^T]$ and $\vb_1 = [b_{1,1}, \cdots, b_{1,\sqrt{K}}]$, respectively. 
  %  \sy{sy: dimension? transpose, and why $N$? } \cmt{it should be $\sqrt{K}$}
    Given target $\{z_i\}_{i=1}^K$, the proof of Theorem 3.1 of~\citep{yun2019small} specified parameters $\theta = [ \mW_{1}, \mW_{2}, \mW_3, \vb_{1}, \vb_{2}, b_3]$ satisfying 
    %\begin{align*}
        $g_{\theta}(\vx_i) = z_i$ for all $i \in [K].$
    %\end{align*}
    Assigning $z_i = 0$ for $i \in [K] \setminus T$ 
    %\sy{sy: also should be $i\in[K]\setminus T$ }
    and reusing these parameters is enough to satisfy the desired condition for fine-tuning in Eq.~\ref{eqn:finetune_fit_well}. Note that this network uses $2\sqrt{K}$ neurons with hard-tanh activations, which can be converted to a ReLU neural network with $4\sqrt{K}$ neurons, using the fact that one hard-tanh neuron can be expressed with two ReLU neurons. This directly proves that $4\sqrt{K}$ ReLU neurons are sufficient for changing the labels of $N \le K$ samples. 


    
    \begin{figure}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=8cm]{fig/3layer_U2.pdf}
    \vspace{-2mm}
    \caption{
    Construction of 3-layer network that achieves upper bound $U_2 = 2\sqrt{K} + 3N$ in Sec.~\ref{sec:three_layer_upper}. 
    This illustration gives an example when $T= \{7\}$.
    % \\
    % \sy{i) $z_2, z_7, z_{10}, z_{15}$ do not have order.. i don't think it is critical problem(because we can set relu with very small gap) but I have some confusion about how to visualization(if I draw like current figure, then I'm worried that it might be confusing that $z_2 < z_7 ...$}\\
    % \sy{ii) change the digit in the 2nd layer}
    }
    \vspace{-3mm}
    \label{fig:3layer3N}
    \end{figure}



\paragraph{Proof of $m \le  2\sqrt{K} + 3N$:}
    We construct a 3-layer neural network with $U_2 = 2\sqrt{K} + 3N$ neurons which successfully fine-tunes $N$ samples. 
    Fig.~\ref{fig:3layer3N} illustrates an example of such a construction when $K=16$ and $N=1$. Here, we assume $T = \{7\}$, \ie the label for the 7-th sample is fine-tuned, but a similar proof can be applied to arbitrary $T$ with $|T| = N =1$. 
    Here, our goal is to construct $g_{\theta}$ satisfying   $g_{\theta} (\vx_7) = z_7 \ne 0$ and  $g_{\theta} (\vx_i) = 0$ for all $i \ne 7$.
    Our basic idea is to follow the construction in Fig.~\ref{fig:3layer2sqrtK},
    except for the activation used in layer 2. The activation in layer 2 is defined as
    \begin{align*}
        \sigma_B (t) = \begin{cases}
            0, & t \notin [z_7 - \delta, z_7 + \delta] \\
            \frac{z_7}{\delta} (t - z_7 + \delta),  & t \in [z_7 - \delta, z_7) \\
            - \frac{z_7}{\delta} (t - z_7 - \delta),  & t \in [z_7, z_7 + \delta]
        \end{cases}
    \end{align*} 
    for arbitrary $\delta < \min_{i \ne j} \frac{\lvert z_i - z_j  \rvert}{2}$, which has a small bump near $z_7$ as in Fig.~\ref{fig:3layer3N}. %\sy{sy: how about saying ``hard-tanh neuron'', ``ReLU neuron''}
    Among $\sqrt{K}$ hard-tanh neurons in layer 2 in Fig.~\ref{fig:3layer2sqrtK}, we only choose the neuron containing $z_7$ (the non-zero target label) in the non-clipping region of the activation, \eg the second neuron of layer 2. 
    Given that $\mW_2$ is the weight matrix for 2nd layer in the construction of Fig.~\ref{fig:3layer2sqrtK}, the weight matrix for 2nd layer in the construction of Fig.~\ref{fig:3layer3N} is defined as $\widetilde{\mW_2}  := \mW_{2,2}$, the 2nd row of $\mW_2$. 
    Then, the overall network looks like $g_{\theta}(\vx) = \sigma_B ( \mW_{2,2} \sigma_H (\mW_1 \vx + \vb_1) + b_{2,2}) $, which satisfies $g_{\theta}(\vx_7) = z_7$ and $g_{\theta}(\vx_i)=0$ for all $i \ne 7$.
    % \begin{align*}
    %     g_{\theta}(\vx_7) &= z_7, \\
    %     g_{\theta}(\vx_i) &= 0 \quad \forall i \ne 7
    % \end{align*}
    Note that the activation $\sigma_{B}$ with 4 pieces can be constructed by 3 ReLU neurons. Consider constructing $g_{\theta}$ by adding new neurons (with $\sigma_{B}$ activation) in layer 2 for each sample we want to fine-tune, which allocates a total of $3N$ ReLU neurons in layer 2. Since layer 1 (as in Fig.~\ref{fig:3layer2sqrtK}) contains $2\sqrt{K}$ ReLU neurons, one can confirm that our construction contains $2\sqrt{K} + 3N$ ReLU neurons.
    % Assume that $d_1$ is even, and \textcolor{green}{$d_1d_2=K$}.
    % Since we assumed $\vx_i \ne \vx_j$ for all $i \ne j$, we can find $\vu$ such that $\vu\vx_i \ne \vu\vx_j$ for all $i \ne j$ and without loss of generality, we can set samples in ascending order $\vu\vx_1<\vu\vx_2<\dotsi<\vu\vx_K$. 
    % By using techniques of Theorem 3.1 of ~\citep{yun2019small}, we can divide K samples into $d_1$ groups with $d_2$ in layer 1. In layer 2, we can define a set for $k$-th neuron: $\mathcal{I}_k:=\{k, 2d_2+1-k, 2d_2+k, 4d_2+1-k, \dotsi, d_1d_2+1-k\}$ and samples having such indices are placed on $k$-th neuron. 
    % In our FTC case, we can set some labels to 0 and it might allow us to get tighter bound.  If we think of case like $K=N$ (i.e., changing all labels), we cannot get tighter bound than Theorem 3.1 of ~\citep{yun2019small}. But we can get tighter bound if there exist neurons in layer 2 such that all samples in that neurons are in $[K]\setminus T$. 

    % \textcolor{green}{put figure here}

    % We define above activation function by using 3 ReLU. In ~\citep{yun2019small}, they(??) used 2 ReLU for each neuron. We can define the number of neurons which having at least one sample needed to be fine-tuned as q, then if $2m > 4q$, $q < \frac{m}{2}$ we can get tighter bound. Note that this is upper bound on $m$, so we need to consider worst case. Our worst case is $N$ samples in $T$(need to re-fitting) are spread on as many as possible neurons in layer 2(think of case that we need to change only 4 samples and they are in 4 neurons, all samples are in different neurons). Now we find that we can consider $q = N$ and this completes the proof. 
%\end{proof}
% \subsection{Remarks on the dependency on $T$}
% Note that the upper bound on $m$ we obtained above is considering the worst-case $T$ that maximizes the required number of neurons to fine-tune the model. Here we provide some comments on how the required number of neurons can vary depending on $T$.
% As a toy example, consider when $K=16$ and $N=4$. If $T = \{1,8,9,16\}$, we require only $11$ ReLU activations for fine-tuning, as proved below.
% \cmt{add what happens}
% \subsection{Updated Upper Bound on $m$}
% \begin{theorem}
% [Upper bound on $m$ for 3-layer]\label{thm:ftc_add_upper_bound_3layer_updated}
% The fine-tuning capacity $N$ of a 3-layer FC ReLU network $g_{\theta}$ with $m$ neurons is bounded as
% \begin{align}
%     m \le 4N + 4
% \end{align}
% \end{theorem}
% \begin{proof}

%     \cmt{merge with above}


\paragraph{Proof of $m \le 6 \sqrt{3N+2}$:}



    \begin{figure}[t!]
    \centering
    \includegraphics[width=8cm]{fig/3layer_sqrtN.pdf}
    \caption{
    The construction of 3-layer network achieving the upper bound $U_3 = 6 
    \sqrt{3N+2}$ in Sec.~\ref{sec:three_layer_upper}. Here, we consider the case when $K=16$, $N=3$ and $T = \{ 2,4,9 \}$.
    }
    \label{fig:3layersqrt6N}
    \end{figure}
It is worth noting that some techniques developed for showing $m \le 4N+4$ (shown in the Appendix) are used to prove $m \le 6\sqrt{3N+2}$. Thus, we refer to some equations in the Appendix during the proof.

We construct a neural network $g_{\theta}$ with $6\sqrt{3N+2}$ ReLU neurons, which fine-tunes $N$ samples. For simplicity, here we provide an example construction when $K = 16$, $N = 3$, and $T = \{2, 4, 9\}$. By using the definition of $J$ in Eq.~\ref{eqn:J-index-set}, we have $J = \{ 1, 2, 3, 4, 5, 8, 9, 10, 16 \}$. 
We will now construct a 3-layer neural network $g_{\theta}$ satisfying
%Let $g_{\theta}$ be the neural network fitting the labels $z_i$ for $i \in J$, using the technique used in~\citep{yun2019small} with $\sqrt{J}=3$ neurons in both layer 1 and layer 2, as illustrated in Fig.~\ref{fig:3layersqrt6N}. This network is guaranteed to have 
\begin{align}
g_{\theta}(\vx_i) = z_i, \quad i \in J, \label{eqn:3-layer-goal-1}\\
g_\theta(\vx_i) = 0, \quad i \notin J, \label{eqn:3-layer-goal-2}
\end{align}
%for $i \in J$, as well as $g_\theta(\vx_i) = 0$ for $i \notin J$.
which is illustrated in Fig.~\ref{fig:3layersqrt6N}. 
Note that Eq.~\ref{eqn:3-layer-goal-1} can be easily proved by fitting the network using the samples $\vx_i$ with $i \in J$. In the rest of the proof, we will show that for such $g_{\theta}$ satisfying Eq.~\ref{eqn:3-layer-goal-1}, we can prove Eq.~\ref{eqn:3-layer-goal-2}. As an example, we will only show $g_\theta(\vx_6) = 0$, but a similar proof technique can be applied to arbitrary $\vx_i$ with $i \notin J$.


%For simplicity, we focus on $\vx_6$
%while we do not know whether this holds for $i \notin J$. Here, as an example, we show that $g_\theta(x_6) = 0$ holds; with similar technique, one can show $g_\theta(x_i) = 0$ for $i \notin J$.

% In this proof, we will briefly check whether $g_\theta(x_6) = 0$ in our network. \cmt{This proof can be easily generalized to any $x_i$ for $i \notin J$}
 
We construct the first layer by only using samples $\vx_i$ with $i \in J$. We partition 
the $|J|$ samples  into $\sqrt{|J|}$ groups, following the trick used  in \citep{yun2019small} for 3-layer networks. 
Let us denote the first index of the $j$-th group as $s_j^{\min}$ and the last index of the $j$-th group as $s_j^{\max}$. In other words, $J$ is decomposed into $\sqrt{|J|}$ groups as $J = \{s_1^{\min}, \cdots, s_1^{\max} \} \cup \cdots \cup \{ s_{\sqrt{|J|}}^{\min}, \cdots, s_{\sqrt{|J|}}^{\max} \} $. 


Without loss of generality, we can assume that samples are ordered as $\vv^T\vx_1<\vv^T\vx_2<\dotsi<\vv^T\vx_K$ for some $\vv$. Let $c_i := \vv^T\vx_i$ and define $c_0 = c_1-\epsilon$, $c_{K+1} = c_K+\epsilon$ for arbitrary $\epsilon > 0$. Then, we choose $\mW_1, \vb_1$ as in Eq.~\ref{eqn:firstlayerweights}, using $\vv$ and $\epsilon$ defined above.

Here, we focus on the relationship between the outputs of layer 1, when the neural network inputs are $\vx_5, \vx_6, \vx_8$, respectively. 
From the definition of $c_i$, we have $c_6 \in (c_5, c_8)$. Using Eq.~\ref{eqn:alpha} and Eq.~\ref{eqn:firstlayerweights}, we have 
$\alpha_j^1 (\vx_6) \in (\alpha_j^1 (\vx_5), \alpha_j^1 (\vx_8))$, meaning that the input $\alpha_j^1$ of layer 1 (for $\vx_6$) is bounded by $\alpha_j^1$ for $\vx_5$ and $\alpha_j^1$ for $\vx_8$. After passing it through the hard-tanh activation $\sigma_H$, we also have 
\begin{align}\label{eqn:beta1_sandwich}
 \beta_j^1 (\vx_6) \in (\beta_j^1 (\vx_5), \beta_j^1 (\vx_8)),
\end{align}
for all $j \in \{1, \cdots, \sqrt{|J|}\}$
using the monotonicity of $\sigma_H$. More precisely, 
we have 
$\beta_1^1(\vx_6)=+1$, $-1\le\beta_2^1(\vx_6)\le+1$, and $\beta_3^1(\vx_6)=-1$. This is illustrated in the first layer in   Fig.~\ref{fig:3layersqrt6N}.
% Recall that these parameters ($\mW_1, \vb_1$) came from the idea about lining up the samples by their order and clipping out(to <-1 or >+1) the samples outside the group by using the above $\mW_1, b_1$. Then we can say that the samples not in $J$ can also be placed in its order. For example, for the first neuron of the first layer, $\alpha_1^1(x_5)\le\alpha_1^1(x_6)\le\alpha_1^1(x_8)$ is trivial \cmt{from Eq.~\ref{eqn:alpha}}. As the same logic, $\alpha_2^1(x_5)\ge\alpha_2^1(x_6)\ge\alpha_2^1(x_8)$, and $\alpha_3^1(x_5)\le\alpha_3^1(x_6)\le\alpha_3^1(x_8)$. After passing activation function, we have 
Now we move to the construction of the second layer. Recall that the main idea of constructing $\mW_2, \vb_2$ is using the linear system in Eq.~\ref{eqn:linear_system}. 
Using Eq.~\ref{eqn:alpha_1^2} and the fact that we can set all elements of $\mW_2$ as positive (as shown in~\citep{yun2019small}), Eq.~\ref{eqn:beta1_sandwich} implies %\begin{align*}
 $   \alpha_j^2 (\vx_6) \in (\alpha_j^2 (\vx_5), \alpha_j^2 (\vx_8))$
%\end{align*}
for all $j \in \{1, \cdots, \sqrt{|J|}\}$. Finally, we set the activation function of layer 2 as
\begin{align*}
    \sigma_L(t) = \begin{cases}
        t, & \text{ if } |t| \le 1, \\
        0, & \text{ if } |t| > 1
    \end{cases}
\end{align*}
and arbitrarily increase $\lambda$ such that $\beta_j^2(\vx_6) = 0$. 
Then, the output of the network 
$g_{\theta}(\vx) = \sigma_L(\mW_2(\sigma_H(\mW_1 \vx + \vb_1)) + \vb_2)$
satisfies $g_{\theta}(\vx_6) = 0$.
% Note that we can only compare the order within first layer group, so we always have to compare only with $z_5, z_8$. By controlling $\lambda$, we can always clipping out $z_6$ to <-1, >+1 region. then by our activation, $\beta_j^2(x_6)=0$.

Now the question is, what is the upper bound on $|J|$? Recall that Lemma~\ref{lem:num2} guarantees that $|J|\le3N+2$. %\sy{need to check 2-layer proof again. 3N+1? 3N+2?}. 
Since $\sigma_H$ and $\sigma_L$ can be represented by 2 ReLUs and 4 ReLUs, respectively, $2\sqrt{3N+2}+4\sqrt{3N+2}=6\sqrt{3N+2}$ ReLU neurons are sufficient for our 3-layer network construction.
    
%\cmt{TBE...}

    
%\end{proof}



% \paragraph{Toy example}

% Here we provide some toy examples for $N=16$, where naively using conventional wisdom~\citep{yun2019small} will require $m = 16$ ReLU neurons, while our new approach requires less than that. \cmt{We need to explain how this is related with the upper bound on $m$}

% \dk{sy: From here, I wrote quite roughly, sorry for bad sentences and grammatical errors :(}

% 1) Using 3 ReLU at first neuron of 2nd layer (thus, total used 2x4 + 3 = 11 ReLU)

% Toy examples: 

% Let $T = \{1, 8, 9, 16\}$

% We can set $\delta$ as following:
% \begin{align}
%     \delta < \frac{\min_{i, j \in [n]} | \vx_i^T \va - \vx_j^T \va | }{2}
% \end{align} 
% \dk{More precisely, for odd $j$, $| \vx_{T_j}^T \va - \vx_{T_{j+1}}^T \va |$, for even $j$, $| \vx_{T_j}^T \va - \vx_{T_{j-1}}^T \va |$. But nothing changed, more complicated..}

% We will fix the settings same as \citep{yun2019small}, except for activation function of layer 2.

% Instead of using hard-tanh, we can use 3 neurons with ReLU activation.

% $1$st neuron: $ReLU(y_i + 1)$\\
% $2$nd neuron: $ReLU(y_i - 1)$\\
% $3$rd neuron: $ReLU(y_i - (1+\delta))$

% $v_1 = 1, v_2 = -\frac{1}{\delta}, v_3 = \frac{1}{\delta} - 1 ,b_3 = -1$



% 2) Using 4 ReLU(Total used 12 ReLU)

% Toy examples: 

% Let $T = \{1, 8, 9, 16\}$

% 3) Using modified equation(Total used 10 ReLU)



% Toy examples: 

% Let $T = \{1, 8, 9, 16\}$
% We will fix the settings same as \citep{yun2019small}, B.2 input to layer 1.  

% We need to get $y_i$ with label-changed samples and $0$ with not changed samples. So if we change some settings of B.3(layer 1 to layer 2), we can fit toy examples with 2 ReLU.

% \citep{yun2019small} used following linear equation to define parameters such that match the input values to the exact label values.
% \begin{align}
% \mK_k
% \begin{bmatrix}
% (\mW_{k,:}^2)^T\\
% \vb_k^2    
% \end{bmatrix} 
% =
% \begin{bmatrix}
% y_{i_{k,1}}\\
% \vdots\\
% y_{i_{k,p}}\\
% \end{bmatrix} 
% \end{align}

% For our toy example case, we just need to fit samples in $k=1$ neuron to their labels, and make other samples are just $0$. 

% Instead of above linear equation, we will use following modified linear equation:
% \begin{align}
% \mK_1
% \begin{bmatrix}
% (\mW_{1,:}^2)^T\\
% \vb_1^2    
% \end{bmatrix} 
% =
% \begin{bmatrix}
% y_{i_{1,1}}+1\\
% \vdots\\
% y_{i_{1,4}}+1\\
% \end{bmatrix} 
% \end{align}
% \dk{We need to check Lemma B.1 in Yun can be applied in this case(cross check needed)}

% Then by Lemma B.1. of \citep{yun2019small}, there are infinitely many solutions for above equation and the form of solutions is $\mu + \alpha\nu$. We can change the value of $\alpha$ to satisfy our purpose.

% Define an index set $\mathcal{I}_1 := \{ 1, 8, 9, 16 \}$. For odd $j$, samples in $j$-th group are sorted in ascending order. More explicitly, indices of samples in $1$st group: $\{1, 2, 3, 4\}$ and their output values of $1$st node in layer $1$ have ascending order: $a^1_1(x_1)<a^1_1(x_2)<a^1_1(x_3)<a^1_1(x_4)$, and output values of $l$-th($l \neq 1$) node in layer $1$ have same values: $a^1_l(x_1)=a^1_l(x_2)=a^1_l(x_3)=a^1_l(x_4)$. These results are same for $3$rd group.

% Likewise, for even $j$, samples in $j$-th group are sorted in descending order, thus they have opposite outcomes: for $2$nd group, output values of $2$nd node in layer $1$: $a^1_2(x_{5})>a^1_2(x_{6})>a^1_2(x_{7})>a^1_2(x_{8})$ and output values of $l$-th($l \neq 2$) node in layer $1$ have same values: $a^1_2(x_{5})=a^1_l(x_{6})=a^1_l(x_{7})=a^1_l(x_{8})$ and same for $4$th group.

% By choosing sufficiently large $\alpha$, we can make $z^2_1(x_i) > +2$ for all $i \notin \mathcal{I}_1$, and it makes outcomes of layer $2$ satisfy $a^2_1(x_i) = +1$ for all $i \notin \mathcal{I}_1$. 
% \dk{We can take 2 ReLU such that $ReLU(x+2)=1$ and $-ReLU(x)$, or re-setting bias $\tilde{b^2_1} = b^2_1 - 1$ and apply hard-tanh}

% Finally, we can summarize outcomes of layer $2$ as following:
% \begin{align}
% a^2_1(x_i) = y_{i}+1, \forall i \in \mathcal{I}_1\\
% a^2_1(x_i) = +1, \forall i \notin \mathcal{I}_1
% \end{align}

% Set $\mW^3 = 1$ and $\vb^3 = -1$, then this completes the proof.

% To generalize this results, we can think such case: two neurons in layer $2$. Then results will become slightly different from toy example case:

% \begin{align}
% a^2_1(x_i) = y_{i}+1, \forall i \in T \\
% a^2_1(x_i) = 0, \forall i \in [K] \setminus T
% \end{align}

% So this case can be resolved by using another modified linear equation:
% \begin{align}
% \mK_k
% \begin{bmatrix}
% (\mW_{k,:}^2)^T\\
% \vb_k^2    
% \end{bmatrix} 
% =
% \begin{bmatrix}
% y_{i_{k,1}}-1\\
% \vdots\\
% y_{i_{k,4}}-1\\
% \end{bmatrix} 
% \end{align}
% and setting $\mW^3 = 1$ and $\vb^3 = 0$.

% Note that toy example case can be generalized to odd number of neurons in layer $2$ cases(this implies the number of label-changed samples is between $p*(q-1)$(odd $q$) and $p*q$(odd $q$)).

% Likewise, second example (two neurons in layer $2$) case can be generalized to even number of neurons in layer $2$ cases(this implies the number of label-changed samples is between $p*(q-1)$(even $q$) and $p*q$(even $q$)).

% \dk{we might be
% needed to rewrite by using $z_i$ instead of $y_i$ for matching notations with Def 3.3. And other notations like $\mW^3$ are also needed to be changed to match with our notations}


% \

% \newpage

% 
% \section{Other Extensions}

% In the above, we considered a specific model for fine-tuning, which is changing the label of some samples.  

% \begin{question}
% Given this pre-trained network $f$, we add $m$ new data points $\{\vx_i, y_i\}_{i=n+1}^{n+m}$ and set $\tilde{D} = \{\vx_i, y_i\}_{i=1}^{n+m}$. 
% %satisfying
% %\begin{align}\label{eqn:new_dataset}
% %\tilde{\vx}_i &= \vx_i \quad \forall i \in [n], \\
% %\tilde{y}_i &= y_i \quad \forall i \in [n].
% %\tilde{y}_i &= y_i \quad \forall i \in [K] \setminus T
% %\end{align}
% Suppose that $\vx_1 < \vx_2 < \vx_3 < \cdots < \vx_n < \vx_{n+1} < \vx_{n+2}< \cdots \vx_{n+m}$. Then, $N_{\op{ft}} \leq d+2m$? %\cmt{why?}
% \end{question}

% \begin{question}
%     If this is true, then the next case would be $\vx_1 < \vx_{n+1} < \vx_{n+2}< \cdots \vx_{n+m}< \vx_2 < \vx_3 < \cdots < \vx_n $.
%     After that, one may generalize this by using the number of new data points between $(x_i, x_{i+1})$ for $i = 1, \cdots, n-1$.
% \end{question}

% \begin{question}
% The neural network constructed in Lemma~\ref{lemma:N_ft_upper} has $\op{rank}(\Delta \mW) = 1$. Does using $\Delta \mW$ with low-rank + sparse matrix give us better upper bound on $N_{\op{ft}}(T)$? What happens when we use sparse matrix $\Delta \mW$?
% \end{question}

% % \section{Ours: when we do NOT allow adding more neurons (2-layer FC)}

% % \subsection{Problem setup for binary classification, fine-tuning single label $y_n$}\label{sec:2_layer_cls}

% % Consider a two-layer neural network $f(\vx) = \op{sign}(\vv^T \sigma(\mW \vx - \vb))$ used for binary classification where $\sigma$ is the ReLU activation. 
% % This network is parameterized by $\mW \in \sR^{n \times d}$, $\vv \in \sR^{n}$ and $\vb \in \sR^{n}$, where $d$ is the dimension of input feature and $n$ is the number if hidden neuron. 
% % Let $D = \{\vx_i, y_i\}_{i=1}^n$ be the given dataset with $n$ samples where $\vx_i \in \sR^{d}$ and $y_i \in \{+1, -1\}$. 

% % Suppose we pre-train the network with dataset $D$, \ie find a network $f$ that perfectly fits $D$:
% % \begin{equation}
% % f(\vx_i) = \op{sign}(\vv^T \sigma(\mW \vx_i - \vb)) =  y_i \quad \forall i \in [n].
% % \end{equation}
% % Given this pre-trained network $f$, we fine-tune it with a new dataset $\tilde{D} = \{\tilde{\vx}_i, \tilde{y}_i\}_{i=1}^n$ satisfying
% % \begin{align}
% % \tilde{\vx}_i &= \vx_i \quad \forall i \in [n] \\
% % \tilde{y}_i &= y_i \quad \forall i \in [n]\setminus \{n\} \\
% % \tilde{y}_n &= 1 - y_n,
% % \end{align}
% % and let $\tilde{f}$ be the fine-tuned network (having weights $\tilde{\mW}$, $\tilde{\vv}$ and $\tilde{\vb}$) that perfectly fits the new dataset, \ie
% % \begin{align}
% % \tilde{f}(\tilde{\vx}_i) = \op{sign}(\tilde{\vv}^T \sigma(\tilde{\mW} \tilde{\vx}_i - \tilde{\vb})) = \tilde{y}_i \quad \forall i \in [n].
% % \end{align}
% % %In other words, only the label of the first sample is flipped. 
% % We define $\Delta \mW = \tilde{\mW} - \mW$, $\Delta \vv = \tilde{\vv} - \vv$ and $\Delta \vb = \tilde{\vb} - \vb$. 

% % \subsection{Problem setup for regression, fine-tuning single label $y_n$}\label{sec:2_layer_reg}

% % We use the notation defined above for classification; the only difference is that the network is given as
% % \begin{align}
% % f(\vx) &= \vv^T \sigma(\mW \vx - \vb),
% % \end{align}
% % the label is real number, \ie $y_i \in \sR$,
% % and the target dataset for fine-tuning is 
% % $\tilde{D} = \{\tilde{\vx}_i, \tilde{y}_i\}_{i=1}^n$ where
% % \begin{align}%\label{eqn:reg_single_label}
% % \tilde{\vx}_i &= \vx_i \quad \forall i \in [n] \\
% % \tilde{y}_i &= y_i \quad \forall i \in [n]\setminus \{n\} \\
% % \tilde{y}_n &\in \sR \setminus \{y_n\}
% % \end{align}


% % \subsection{Key Question}

% % Under this setting, we have the following questions.
% % \begin{enumerate}
% %     \item Suppose we allow sparse update, \ie minimize $s := \lVert \Delta \mW \rVert_{0} + \lVert \Delta \vv \rVert_{0} + \lVert \Delta \vb \rVert_{0}$, the number of nonzero elements in the update. What is the minimum $s$ that allows $\tilde{f}$ to perfectly fit the new data? Can we find lower/upper bounds on this quantity?    
% %     \item Suppose we allow low-rank update, \ie $\Delta \mW = \mL \mR$ where $\mL \in \sR^{n \times r}$ and $\mR \in \sR^{r \times d}$. What is the minimum $r$ that allows $\tilde{f}$ to perfectly fit the new data? Can we find lower/upper bounds on this quantity?
% % \end{enumerate}





% % \subsection{Mathematical Results for Regression}

% % %\paragraph{Basic idea} 
% % % The basic idea is as below. If we follow the construction of neural networks (used for finding the upper bound on the memorization capacity), we can easily find $\tilde{\mW}$, $\tilde{\vv}$ and $\tilde{\vb}$ given that the number of neurons is $n$, from~\citep{zhang2016understanding}. Our goal is, to find upper/lower bound on the minimum $r$ or $s$ defined in the key question.
% % %minimize the amount of fine-tuned parts, $\Delta \mW$ and $\Delta \vv$, in terms of the rank $r$ or the sparsity $s$.  

% % \subsubsection{When the pre-trained network is constructed by the technique in~\citep{zhang2016understanding}}

% % Here we assume $\vx_i \ne \vx_j$ for all $i \ne j$. 
% % According to~\citep{zhang2016understanding}, the ReLU network $f(\vx) = \vv^T \sigma(\mW \vx - \vb)$ that memorizes $n$ data samples in $D = \{(\vx_i, y_i)\}_{i=1}^n$ can be constructed by setting $\mW = \mathbf{1}_n \va^T$ and choosing $\va, \vb, \vv$ as below:
% % \begin{itemize}
% %     \item Since $\vx_i \ne \vx_j$ for all $i \ne j$, we can find $\va$ satisfying $\va^T (\vx_i - \vx_j) \ne 0$ for all $i \ne j$. Thus, we have $z_i \ne z_j$ for all $i \ne j$, where $z_i = \va^T \vx_i$.
% %     \item Choose $\vb = [b_1, \cdots, b_n]$ such that $b_1 < z_1 < b_2 < z_2 < \cdots < b_n < z_n$.
% %     \item Define lower-triangle matrix $\mA \in \sR^{n \times n}$ as $A_{ij} := \sigma(z_i - b_j)$, and choose $\vv = \mA^{-1} \vy$ where $\vy = [y_1, \cdots, y_n]$. 
% % \end{itemize}
% % \dk{The essential part here is the invertibility of $A$.}

% % Now, how can we tune the parameters 
% % %(\ie add $\Delta \mW, \Delta \vb$ and $\Delta \vv$) 
% % such that $\tilde{\vv}^T \sigma (\tilde{\mW} \tilde{\vx}_i - \tilde{\vb}) = \tilde{y}_i$ for all $i \in [n]$ where the new dataset $\tilde{D} = \{(\tilde{\vx}_i, \tilde{y}_i)\}_{i=1}^n$ is defined in~\eqref{eqn:reg_single_label}? Below proposition provides the answer for this question.

% % \begin{proposition}
% % \label{prop:fine}
% % Consider a two-layer neural network having $n$ neurons, pre-trained with $n$ samples $D = \{(\vx_i, y_i)\}_{i=1}^n$ by the construction rule given in~\citep{zhang2016understanding}. Then, fine-tuning only one weight element is sufficient to fit the model to the new dataset $\tilde{D}$ in \eqref{eqn:reg_single_label}.
% % %which changes one label from $D$. 
% % \end{proposition}
% % \begin{proof}
% % Suppose we fine-tune the model using the following rule: 
% % \begin{align}
% % \Delta \mW &= \bold{0}_{n \times d}, \\
% % \Delta \vb &= \bold{0}_{n}, \\
% % \Delta v_i &= 
% % \begin{cases}
% % 0, & i \ne n \\
% % \frac{1}{z_n - b_n} (\tilde{y}_n - y_n), & i=n
% % \end{cases}
% % \end{align}
% % Then, the fine-tuned network is represented as 
% % \begin{align}
% % \tilde{f}(\vx) = (\vv + \Delta \vv)^T \sigma (\mW \vx - \vb) = f(\vx) + \frac{1}{z_n - b_n} (\tilde{y}_n - y_n) \sigma( \va^T \vx - b_n),
% % \end{align}
% % which satisfies
% % \begin{align}
% % \tilde{f}(\vx_i) = 
% % \begin{cases}
% % f(\vx_i) = y_i = \tilde{y}_i, & \quad i \ne n \\
% % f(\vx_i) + \frac{1}{z_n - b_n} (\tilde{y}_n - y_n) (z_n - b_n) = y_n + (\tilde{y}_n - y_n) = \tilde{y}_n, & \quad i = n
% % \end{cases}
% % \end{align}
% % This completes the proof.
% % \end{proof}

% % \begin{remark}
% % Here we showed for changing the label of $n$-th sample, \ie the sample having the largest $z = \va^T \vx$. What if we change the label for arbitrary $i$-th sample? For $i=1$, we can easily change the label (from $y_1$ to $\tilde{y}_1$) by changing one element of $\vv$:
% % \begin{align}
% % \Delta v_i = 
% % \begin{cases}
% % 0, & i \ne 1 \\
% % [\mA_{\op{shuff}}^{-1}]_{nn} (\tilde{y}_1 - y_1), & i = 1
% % \end{cases}
% % \end{align}
% % where $\mA_{\op{shuff}} = [\mA[:,n], \cdots, \mA[:,1]]$ is an upper triangle matrix with nonzero diagonal elements, thus its inverse exists and is also an upper triangle matrix.
% % For $i \notin \{1, n\}$, it is not trivial whether modifiying a single weight element is enough for changing $\tilde{y}_i$ from $y_i$. \cmt{check whether we can also solve for $i \notin \{1,n\}$} %\dk{Not sure about a single weight, but one simple way is to use $\vv = \mA^{-1} \vy$.}
% % \end{remark}

% % \begin{remark}[Extend to changing all $n$ labels]
% % What if we want to change the label of all $n$ samples, from $\vy$ to $\vy + \Delta \vy$? It is sufficient to change $n$ elements in $\vv$ using $\Delta \vv = \mA^{-1} \Delta \vy$. Note that $\mA$ is lower triangle matrix with non-zero diagonal elements, thus invertible and the inverse is also lower triangle matrix.
% % \end{remark}

% % \begin{question}
% %     For $k \in \{1,2, \cdots, n\}$, consider the new data set $\tilde{D}_k = \{\tilde{\vx}_i, \tilde{y}_i\}_{i=1}^n$ where
% % \begin{align}\label{eqn:reg_single_label}
% % \tilde{\vx}_i &= \vx_i \quad \forall i \in [n] \\
% % \tilde{y}_i &= y_i \quad \forall i \in [n]\setminus \{k\} \\
% % \tilde{y}_n &\in \sR \setminus \{y_n\}
% % \end{align}
% %     Under the same assumptions as in Proposition~\ref{prop:fine} and $k \notin \{1,n\}$, can we show that fine-tuning only one weight element is not sufficient to fit the model to the new dataset $\tilde{D}_k$? If then, how many weights do we need to change? 
% % \end{question}

% % \begin{question}
% %     What happens if the network is over-parametrized? For instance, if the row or column dimension of $A$ is bigger than $n$, then is fine-tuning only one weight element sufficient to fit the model to the new dataset $\tilde{D}_k$?
% % \end{question}



% \section{Noisy data}


% \begin{question}
%     Apply the analysis in LASSO to our model.
% \end{question}

% Input: $x_1, \cdots, x_{20}$

% Output: $y_1, \cdots, y_{20}$

% Find $f$ such that $f(x_i) = y_i$.

% \begin{question}
%     \begin{enumerate} $\,$
%         \item 
%         For $x_{i,j} = x_i + w_j$, find $f$ such that $f(x_{i,j}) = y_i$ where $f(\vx) = \vv^T \sigma(\mW \vx + \vb)$.
%         \item
%         For $x_{i,j} = x_i + w_j$, find $f$ such that $\|f(x_{i,j}) - y_i\|$ is small where $f(\vx) = \vv^T \sigma((\mW + S) \vx + \vb)$.
%     \end{enumerate}  
% \end{question}



% % Recall that the ReLU network $f(\vx) = \vv^T \sigma(\mW \vx - \vb)$ fitting  is provided in~\citep{zhang2016understanding} which is specified as below. 




% \subsubsection{When the pre-trained network is arbitrarily constructed}

% % \cmt{TBE...}


% \begin{question}
%     Can we define the equivalence class of the weights, which gives the same output? Do they have some geometric structure (metric)? How can we say that finetuning is better than solving a given problem from scratch?
% \end{question}


% % \subsection{Mathematical Results for Classification}

% % \cmt{check the problem setup in Sec.~\ref{sec:2_layer_cls} and add results}


% \section{Discussions}\label{sec:disc}
% \vspace{-3mm}
% Here we add several discussion topics regarding the fine-tuning capacity we defined in our work.
% %\cmt{Other things to add here?}
% \vspace{-1mm}
% \paragraph{Extensions}
% This paper focuses on the case when the fine-tuned network is a summation of pre-trained network $f$ and another network $g_{\theta}$. One can consider a more general setting where the fine-tuned network is obtained by $f \oplus g_{\theta}$ for an arbitrary operation $\oplus$. Mathematically modeling the operation $\oplus$ for various parameter-efficient fine-tuning methods, e.g., BitFit~\citep{zaken2021bitfit}, LoRA~\citep{hu2021lora}, Adaptor~\citep{houlsby2019parameter}, and finding the fine-tuning capacity for such operations is an interesting further research topic.

% \vspace{-1mm}
% \paragraph{Practical Implications}

% According to the theoretical results summarized in Theorems~\ref{thm:ftc_add_bound} and~\ref{thm:ftc_add_bound_3layer}, we can fine-tune $N$ samples by using ReLU network $g_{\theta}$ with $\Theta(N)$ neurons, irrespective of (1) the size of pre-trained network $f$ and (2) the number of samples $K$ used for pre-training. This implies that the difficulty of fine-tuning solely depends on the data we want to fine-tune.
% %, in the case when the fine-tuned network $f+g_{\theta}$ is addition of pre-trained network and 
% We expect our theoretical results open up discussions on the efficiency of existing fine-tuning methods.
% %which gives some insight on practical 

% \vspace{-1mm}
% \paragraph{Tighter Bounds for 3-layer Networks}

% Our results for 3-layer ReLU networks showed $\Theta(\sqrt{N}) \le m \le \Theta(N)$ when $N=o(\sqrt{K})$.
% %, the practical scenarios when the number of samples to fine-tune is much smaller than the total number of samples. 
% It is expected that the upper bound on FTC for 3-layer ReLU networks can be tightened by using similar techniques for 2-layer ReLU networks. We leave this as a future work. \sy{can be erased}
% %Developing tighter bounds on $m$ remains as
% %so that $N = \theta(m^2)$.
% %for such cases, by constructing network $g_{\theta}$ with $\sqrt{N}$ neurons \cmt{how?}.
% %\dk{Write}

% \
\section{Extension to other neural networks}

For deeper neural networks, a lower bound on $N$ can be obtained in terms of the maximum width $d$, the number of layers $L$, and the number of pre-trained samples $K$. 

\begin{proposition}
\label{prop:ext}
For $L \geq 4$, $K \ge 3$, $T \subseteq [K]$, $|T|=N$, there exists an $L$-layer ReLU network with maximum width $d$ satisfying \eqref{eqn:finetune_fit_well} and \[ d \leq   4\min\left\{\sqrt{\frac{3N}{\sqrt{\left\lfloor\frac{L-1}{2}\right\rfloor}}+5}, \sqrt{K}\right\} + 2. \]
%Let $L \geq 4$, $K \ge 3$, $T \subseteq [K]$, $|T|=N$, and $g$ be a $L$-layer fully-connected ReLU network with $m$ neurons. %as in \eqref{eq:2l}.
%Let $M$ be the number of pre-training samples, and $N$ be the fine-tuning capacity. 
%\begin{enumerate}
%    \item 
    %
%    For all $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, if $$ m \leq   6\sqrt{\left\lfloor\frac{L-1}{2}\right\rfloor}\min\{\sqrt{3N+2}, \sqrt{K}\}  + 2L,$$
%    then there exists $g$ with $m$ neurons satisfying \eqref{eqn:finetune_fit_well}. 
    %where 
    %\item 
    %For given $\vx_i \in \mathbb{R}^d$, $z_i \in \mathbb{R}$, $i \in [K]$, suppose that \eqref{eqn:finetune_fit_well} holds for some $g$ with $m$ neurons. Then, $$\min \{3N, K-2\}  \leq m.$$
%\end{enumerate}
%    Thus, 
%    \begin{align}
%        \min \{3N, K-2\} \le m^{\star} \le \min\{3N+1, K-1\}.
%    \end{align}
\end{proposition}

%Note that without the restriction on the maximum width, the bound is indeed similar to the ones we have in Theorem 5.1, for 3-layer networks. 


The main challenge in proving Proposition~\ref{prop:ext} lies in constructing a suitable neural network, which can be addressed by utilizing our 3-layer network from Section~\ref{sec:ftc3} and the construction idea provided in Figure 2 of \citet{yun2019small}. See Appendix~\ref{sec:proof-ext} for the proof of Proposition~\ref{prop:ext}.

\begin{figure}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=8cm]{fig/FTC_experiments_.pdf}
    \vspace{-2mm}
    \caption{
    Fine-tuning loss $\ell_{\text{FT}}$ computed for a network trained on a synthetic dataset, for various $N$ and $m$. Here, the number of data points used for pre-training is set to $K=1000$.
    As shown in the orange line, the number of neurons $m$ required in the fine-tuning network to achieve a small loss $\ell_{\text{FT}}$ grows in the order of $\Theta(\sqrt{N})$, the square root of the number of modified samples. This result coincides with our theoretical result in Theorem \ref{thm:ftc_add_bound_3layer}.
    }
    \vspace{-2mm}
    \label{fig:experiments}
\end{figure}

\section{Experiments}

% \cmt{Add the plot we made during the rebuttal, explain the experimental setup, state our claim that experimental results coincide with the theoretical results, etc.... Please check our author response in the openreview...}
%To support our theoretical findings, 
In this section, we provide experimental results on a synthetic dataset, which support our theoretical results.
%First, we construct a synthetic dataset as follows. 
The experimental setup is as follows. 
We first randomly generate $K$ samples $D=\{(\vx_i, y_i)\}_{i=1}^K$ where the feature and the label of the $i$-th sample have the following distributions: $\vx_i \sim N(0, I_d)$ and $y_i \sim \text{Unif}[-1, 1]$, where the feature dimension is $d=10$. Then, we train a ReLU network $f$ that fits the dataset $D$, \ie~\eqref{eqn:new_dataset_label2} holds, thus having zero mean-squared-error (MSE) loss $\ell = \frac{1}{K} \sum_{i=1}^K (f(\vx_i)-y_i)^2$. 
Considering the fine-tuning scenario, we construct another dataset $D'=\{(\vx_i, y_i' )\}_{i=1}^K$ as follows. We first initialize $D' = D$. Then, we randomly choose $N$ out of $K$ samples in $D'$, and re-define the label of the $N$ samples as $y_i' \sim \text{Unif}[-1, 1]$.
Finally, we implement the fine-tuning process. Following our additive fine-tuning scenario, we freeze $f$ and train a 3-layer ReLU network $g_{\theta}$ with $m$ neurons, in a way that $f+g_{\theta}$ fits the new dataset $D'$. We define the fine-tuning loss as $\ell_{\text{FT}} = \frac{1}{K}\sum_{i=1}^K (f(\vx_i) + g_{\theta}(\vx_i)-y_i')^2$.

Figure~\ref{fig:experiments} shows the fine-tuning loss $\ell_{\text{FT}}$ for different $m$ and $N$. As expected, for a given $N$, the fine-tuning loss decreases as $m$ increases. For each $N$, the yellow line in the figure shows the minimum $m$ satisfying $\ell_{\text{FT}}(m,N) \le 0.04$. This yellow line indicates that the number of neurons required to achieve a small fine-tuning loss follows the $\Theta(\sqrt{N})$ tendency shown by the red line in the figure, which coincides with our theoretical result in Theorem~\ref{thm:ftc_add_bound_3layer}.

\section{Conclusion}
\vspace{-3mm}
We introduced Fine-Tuning Capacity (FTC), a generalization of the memorization capacity concept to fine-tuning applications. 
This concept is defined to provide a theoretical view on the current paradigm of fine-tuning large pre-trained models. 
As an initial step towards analyzing FTC, we focused on the additive fine-tuning scenario where a side network is added to the frozen pre-trained network. 
We obtained upper/lower bounds on FTC for shallow ReLU networks.
For 2-layer networks, we showed that fine-tuning $N$ samples is possible by using ReLU networks with $m = \Theta (N)$ neurons, irrespective of the size of the pre-trained network and the number of total samples $K$ used during pre-training.
For 3-layer networks, the required number of neurons reduces to $m = \Theta (\sqrt{N})$, for practical scenarios where the number of samples $N$ whose labels we want to change is far less than the number of total samples $K$ used for pre-training.

% \section*{Acknowledgement}
% JS acknowledges the support of the National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT) No. RS-2024-00345351.





%\newpage

\appendix

% \section{dummy}

% \begin{lemma}\label{lemma:N_ft_upper} %Suppose $\vx_i \ne \vx_j$ for all $i \ne j$. Then,
% $m \le 3N $.\end{lemma}



% \begin{proof}
% Choose $T \subseteq [K]$ such that $|T| = \lfloor \frac{m}{3} \rfloor$. %Let $m = 3k + r$ for $k \in \mathbb{N}$, and $r \in \{0,1,2\}$. 
% For simplicity, let $T = \{1, 2, \cdots, |T|\}$

% %The other cases can be handled similarly. \dk{To be discussed}

% We prove this by specifying $\mW,  \vv$, and $ \vb$ such that \eqref{eqn:finetune_fit_well} holds, which concludes our lower bound.

% %and $\lVert \Delta \mW \rVert_{0} + \lVert \Delta \vv \rVert_{0} + \lVert \Delta \vb \rVert_{0} \le 2d + 8 \lvert  T \rvert $.

% We choose $\delta>0$ an arbitrary positive number satisfying 
% \begin{align}
% \label{eq:N_ft_upper11}
%     \delta < \frac{\min_{i, j \in [n]} | \vx_i^T \va - \vx_j^T \va | }{2}.
% \end{align} 
% %We set the number of additional neurons as $\Delta m = 4 \lvert T \rvert$, and 

% Choose $\mW \in \sR^{m \times d}$, $\vv \in \sR^{m}$ and $\vb \in \sR^{m}$ as follows:
% \begin{align}\label{eqn:finetune_param}
% \mW &= \mathbf{1}_{m} \va^T, \\
% v_{3j-2} &= z_{j}/\delta, \\
% v_{3j-1} &= - 2 z_{j}/\delta, \\
% v_{3j} &= z_{j}/\delta, \\
% b_{3j-2} &= - \vx_{j}^T \va - \delta \\
% b_{3j-1} &= - \vx_{j}^T \va \\
% b_{3j} &= - \vx_{j}^T \va + \delta
% \end{align}
% for $j \in \{1, 2, \cdots, |T|\}$. Note that we use $3 |T| = 3 \lfloor \frac{m}{3} \rfloor \leq m$ here. All other $v_i$'s and $b_i$'s are set to be zero.

% Now we claim that  \eqref{eqn:finetune_fit_well} holds.
% %\begin{itemize}
% %    \item   
% %    \item $\lVert \mW \rVert_{0} + \lVert \vv \rVert_{0} + \lVert \vb \rVert_{0} \le 2d + 8 |T|$
% %\end{itemize}
% %when $\mW, \vv, \vb$ are as specified in~\eqref{eqn:finetune_param}.
% %We begin by showing that \eqref{eqn:finetune_fit_well} holds. 
% %From~\eqref{eqn:new_dataset} and~\eqref{eqn:finetuned_function}, 
% %From Definition~\ref{def:ftc2},
% %all we need to prove is $g_\theta(\vx) := \vv^T \sigma(\mW \vx + \vb) $ satisfies 
% %\begin{align}
% %g_\theta(\vx_{i}) &= \tilde{y}_{i} - y_{i}, \quad \text{ for } i\in T, \\
% %g_\theta(\vx_{i}) &= 0, \quad \text{ for } i\in [K] \setminus T.
% %\end{align}
% Note that 
% \begin{align}
% g_\theta(\vx) &= \sum_{p=1}^{ m}  v_p \sigma (\va^T \vx +  b_p) \\
% &= \sum_{j=1}^{|T|} \sum_{l=0}^2  v_{3j-2+l} \sigma (\va^T \vx +  b_{3j-2+l}).
% \end{align}
% Here, the inner summation can be represented as
% \begin{align}
% &\sum_{l=0}^2  v_{3j-2+l} \sigma (\va^T \vx +  b_{3j-2+l})\\ &= \frac{z_{j}}{\delta} \{ \sigma(\va^T \vx - \va^T \vx_{j} - \delta) - 2\sigma(\va^T \vx - \va^T \vx_{j}) \\
% &\hspace{20mm} + \sigma(\va^T \vx - \va^T \vx_{j} + \delta) \} \\
% &= 
% \begin{cases}
% z_{j}, & \quad \text{ for } \vx = \vx_{j}, \\
% 0, & \quad \text{ for } \vx = \vx_{k} \text{ with } k\ne j
% \end{cases}
% \end{align}
% where the last equality holds since $|\va^T \vx_{i} - \va^T \vx_j | > 2 \delta$ for all $i \ne j$, by the definition of $\delta$.
% Thus, we conclude. 
% %\begin{align}
% %\Delta f(\vx) &= 
% %\begin{cases}
% %\tilde{y}_{i_j} - y_{i_j}, & \quad \text{ for } \vx = \vx_{i_j}, \forall j \in \{1, 2, \cdots, |T|\} %\\
% %0, & \quad \text{ for } \vx = \vx_{i} \text{ with } i \in [K] \setminus T.
% %\end{cases}
% %\end{align}
% %This combined with~\eqref{eqn:finetuned_function} shows that~\eqref{eqn:finetune_fit_well} holds.
% %Now we show $\lVert \Delta \mW \rVert_{0} + \lVert \Delta \vv \rVert_{0} + \lVert \Delta \vb \rVert_{0} \le 2d + 8 |T|$. Since $\Delta \mW$ is rank-1 matrix, $\lVert \Delta \mW \rVert_{0} = 2d$. One can easily confirm that $\lVert \Delta \vv \rVert_{0}  \le \Delta m $ and $\lVert \Delta \vb \rVert_{0} \le \Delta m$ hold. This completes the proof.
% %\cmt{TBD..} The proof technique is making a bump at target point using a sum of 4 ReLU functions.
% \end{proof}







% 

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    %Briefly acknowledge people and organizations here.

    %\emph{All} acknowledgements go in this section.
    JS acknowledges the support of the National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT) No. RS-2024-00345351. DK was partially supported by the National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT) (No. RS-2023-00252516) and the POSCO Science Fellowship of POSCO TJ Park Foundation.
    KL was supported by the NSF Award CCF-2339978 and a grant from FuriosaAI.

    
    %\cmt{TODO: Add acknowledgements for other authors}
\end{acknowledgements}

% References
\bibliography{__ref}
%\bibliography{___uai2024-template}

\newpage

\onecolumn

\title{Memorization Capacity for Additive Fine-Tuning\\(Supplementary Material)}
 \maketitle



% This Supplementary Material should be submitted together with the main paper.

 \appendix
\section{Proof of $m \le 4 N+4$ (deferred from Sec.~\ref{sec:three_layer_upper}) }

%     \paragraph{Proof of $m \le 4N + 4$:}

    We construct a 3-layer neural network $g_{\theta}$ with $4N+4$ ReLU neurons which fine-tunes $N$ samples. 
    %Following settings are same as ~\citep{yun2019small}: 
    %\cmt{GD: start from here..}
    Note that we can find $\vv$ such that $\vv^T\vx_i \ne \vv^T\vx_j$ for all $i \ne j$, since we assume $\vx_i \ne \vx_j$ for all $i \ne j$. 
    %\sy{sy: it might be confusing we used same $\vu$ notation with previous $x_i=i\vu$} \cmt{Use $\vv$}
    Without loss of generality, we order samples such that $\vv^T\vx_1<\vv^T\vx_2<\dotsi<\vv^T\vx_K$. Let $c_i := \vv^T\vx_i$ and define $c_0 = c_1-\epsilon$, $c_{K+1} = c_K+\epsilon$ for arbitrary $\epsilon > 0$. 
    %\sy{sy: $\delta$ means difference of two data points as previous, so we can change to $\epsilon$ if needed}
    %Note that we need to assume that $N$ is even. (if $N$ is odd, we can move one data point which does not need to be fitted to $T$)
    Recall that the indices for the samples we want to fine-tune are denoted by $T = \{T_1, \cdots, T_{N} \}$. We define dummy indices $T_0 = 1$ and $T_{N+1} = K$. Consider $2N+1$ groups of disjoint indices,% specified as
        \begin{align*}
        s_1 = \{T_0, \cdots, T_1 - 1\}, \quad 
        s_2 &= \{T_1 \}, \\
        s_3 = \{T_1 + 1, \cdots, T_2 - 1\}, \quad 
        s_4 &= \{T_2 \}, \\
        \cdots, \quad
        s_{2N} &= \{T_N\}, \\
        s_{2N+1} = \{T_N + 1, \cdots, T_{N+1} \},
    \end{align*}
    % \begin{align*}
    %     s_1 &= \{T_0, \cdots, T_1 - 1\} \\
    %     s_2 &= \{T_1 \} \\
    %     s_3 &= \{T_1 + 1, \cdots, T_2 - 1\} \\
    %     s_4 &= \{T_2 \} \\
    %     \vdots \\
    %     s_{2N} &= \{T_N\} \\
    %     s_{2N+1} &= \{T_N + 1, \cdots, T_{N+1} \},
    % \end{align*}
    where group $s_1$ is empty when $T_1 = 1$, and group $s_{2N+1}$ is empty when $T_N = K$.  
    We denote the maximum/minimum element of set $s_j$ as $s_j^{\op{max}}$ and $s_j^{\op{min}}$, respectively, \ie $s_j^{\op{max}} = \max s_j$ and $s_j^{\op{min}} = \min s_j$.
    %Define the weight/bias of the first layer as below:
    %To begin with, we define an index set $\mathcal{I}_T = \{i : \vx_i \in T\}$, and we also define the modified index set $\mathcal{I}_{T'} = \bigcup_{j=1}^{N}(\mathcal{I}_{T_j}, \mathcal{I}_{T_j}+1)$, and define $\mathcal{I}_{T_{K}}+1 = \mathcal{I}_{T_K}$.  
    %\sy{notation $\mW_{1_j}$ need to be defined}
    We place $2N+1$ neurons on layer 1, and define parameters for layer 1 as 
    \begin{align}\label{eqn:firstlayerweights}
        \mW_{1,j} &= (-1)^{j-1}\frac{4}{c_{s_j^{\op{max}}} +  c_{s_{j+1}^{\op{min}}} - c_{s_{j-1}^{\op{max}}} - c_{s_j^{\op{min}}}} \vv^T \\
        b_{1,j} &= (-1)^j\frac{
        c_{s_j^{\op{max}}} +  c_{s_{j+1}^{\op{min}}} +
        c_{s_{j-1}^{\op{max}}} + c_{s_j^{\op{min}}}}{c_{s_j^{\op{max}}} +  c_{s_{j+1}^{\op{min}}} - c_{s_{j-1}^{\op{max}}} - c_{s_j^{\op{min}}}}
    \end{align}
    for all $j =1, \cdots, 2N+1$. 
    %where the weight matrix and the bias for layer 1 are denoted by $\mW_1 = [\mW_{1,1}, \cdots, \mW_{1,N}]$ and $\vb_1 = [b_{1,1}, \cdots, b_{1,N}]$, respectively. 
    Under such setting, it can be easily checked that
    \begin{align*}
        \alpha_j^1 (\vx_i) \in [-1, 1] \quad \text{ for } i \in s_j \\
        \alpha_j^1 (\vx_i) \notin [-1, 1] \quad \text{ for } i \notin s_j
    \end{align*}
    for all $j \in [2N+1]$, where $\alpha_j^l (\vx)$ defined in Eq.~\ref{eqn:alpha} is the input value of node $j$ in layer $l$, when the input for the network is $\vx$. 
    %Note that this can be easily proved by the technique used for constructing the layer 1, for the proof of Theorem 3.3 of~\citep{yun2019small}. 
    Fig.~\ref{fig:3layer4N+3} shows an example of such a construction, when $K=16$, $N=2$ and $T = \{4,7\}$. In this case, we have $2N+1 = 5$ disjoint groups:
        \begin{align*}
        s_1 = \{1, 2, 3\}, \quad
        s_2 &= \{4\}, \quad
        s_3 = \{5,6\}, \\
        s_4 &= \{7\}, \quad
        s_5 = \{8, 9, \cdots, 16\}. 
    \end{align*}
    % \begin{align*}
    %     s_1 &= \{1, 2, 3\} \\
    %     s_2 &= \{4\} \\
    %     s_3 &= \{5,6\} \\
    %     s_4 &= \{7\} \\
    %     s_5 &= \{8, 9, \cdots, 16\}. 
    % \end{align*}
    As in Fig.~\ref{fig:3layer4N+3}, the input of the $j$-th neuron lies in the non-clipping region $\alpha_j^{1}(\vx_i) \in [-1, 1]$ when $i \in s_j$. For example, the first neuron ($j=1$) in the first layer has $\alpha_1^1 (\vx_i) \in [-1, 1]$ for $i \in s_1 = \{1,2,3\}$.


    \begin{figure}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=8cm]{fig/3layer_U3.pdf}
    \vspace{-2mm}
    \caption{
    The construction of 3-layer network achieving upper bound $U_3 = 4N + 4$ in Sec.~\ref{sec:three_layer_upper}. Here, we consider the case when $K=16$, $N=2$ and $T = \{ 4,7 \}$.
    }
    \label{fig:3layer4N+3}
    \end{figure}
    
    
    Now we construct layer 2 as below. Let $S = \bigcup_{j = 1}^{2N+1} \{ s_j^{\op{min}} \}$ be the index set containing the minimum index of each group $s_j$ for $j \in [2N+1]$, which is $S = \{1,4,5,7,8\}$ in the above example. 
    %\sy{sy: $I$ notation was used for defining consecutive set} \cmt{Use $S$} 
    Our goal is to construct $\mW_2 \in \mathbb{R}^{2N+1}$ and $b_2 \in \mathbb{R}$ such that a single node in layer 2 locates (1) $\{\vx_i\}_{i \in S}$ to the desired target $z_i$, and (2) $\{\vx_i\}_{i \notin S}$ to the clipping region $[-1,1]^c$. In other words, our desired conditions are
    \begin{align}\label{eqn:conditions}
        \alpha_1^2(\vx_i) &= z_i, \quad \forall i \in S, \\
        \alpha_1^2(\vx_i) &\in [-1, 1]^c, \quad \forall i \notin S.
    \end{align}
    Note that the input of the first node in the second layer is represented as
    \begin{align}\label{eqn:alpha_1^2}
        \alpha_1^2(\vx_i) = \sum_{j=1}^{2N+1} W_{2,j} \beta_j^1 (\vx_i) + b_2
    \end{align} 
    where $\mW_2 = [W_{2,1}; \cdots; W_{2,2N+1}]$ and
    \begin{align*}
        \beta_j^1 (\vx_i) = \sigma_H(\alpha_j^1 (\vx_i))
    \end{align*} 
    is the output of node $j$ in layer 1, when the input to the network is $\vx_i$.
    Thus, the first condition in Eq.~\ref{eqn:conditions}
    can be represented as a linear system 
    %\sy{sy: $W_2$ doesn't need transpose?}
    \begin{align}\label{eqn:linear_system}
        \mK \begin{bmatrix}
            \mW_2 \\
            b_2
        \end{bmatrix} 
        = \begin{bmatrix}
            z_{i_1} \\
            \vdots \\
            z_{i_{2N+1}}
        \end{bmatrix}
    \end{align}
    where $i_k$ for $k \in [2N+1]$ are defined as the elements of $S = \{i_1, \cdots, i_{2N+1}\}$ and
    \begin{align}
        \mK = \begin{bmatrix}
           \beta_1^1 (\vx_{i_1}) & ... & \beta_{2N+1}^1 (\vx_{i_1}) & 1 \\
           \vdots & \ddots & \vdots & \vdots \\
           \beta_1^1 (\vx_{i_{2N+1}}) & ... & \beta_{2N+1}^1 (\vx_{i_{2N+1}}) & 1
        \end{bmatrix}.
    \end{align}
    In our above example, we have %\sy{sy: also should be $\beta$}
    \begin{align*}
        \mK = \begin{bmatrix}
           \beta_1^1 (\vx_1) &  +1 & -1 & +1 & -1 & 1 \\
           +1 &  \beta_2^1 (\vx_4) & -1 & +1 & -1 & 1 \\
           +1 &  -1 & \beta_3^1 (\vx_5) & +1 & -1 & 1 \\
           +1 &  -1 & +1 & \beta_4^1 (\vx_7)  & -1 & 1 \\
           +1 &  -1 & +1 & -1  & \beta_5^1 (\vx_8) & 1
        \end{bmatrix}
    \end{align*}
    Using a technique similar to that in~\citet{yun2019small}, one can show that this matrix $\mK \in \mathbb{R}^{(2N+1) \times (2N+2)}$ satisfies two conditions:
    \begin{align*}
        & (1) \ \op{rank}(\mK) = 2N+1,  \\
        & (2) \ \exists \bold{\nu} = [\nu_1, \cdots, \nu_{2N+2}] \in \op{null}(\mK) \\
        &  \quad \quad  \text{ such that } \nu_i > 0 \quad \forall i \in [2N+1].
    \end{align*}
    Thus, the linear system in Eq.~\ref{eqn:linear_system} has infinitely many solutions of the form 
    \begin{align}\label{linear_equation}
        \begin{bmatrix}
        \mW_2 \\ b_2
        \end{bmatrix} = \mu + \lambda \nu
    \end{align}
    for any scalar $\lambda$ and a particular solution $\mu$. With the logic used in the proof of Lemma B.1 in~\citep{yun2019small}, we can scale $\lambda$ sufficiently such that the second condition in Eq.~\ref{eqn:conditions} holds. 
    Thus, by using such weight $\mW_2$ and bias $b_2$, the input of layer 2 looks as illustrated in Fig.~\ref{fig:3layer4N+3}. Let the neuron in layer 2 have the activation function 
    \begin{align*}
        \sigma_T(t) =
        \begin{cases}
            t, & \text{ if } t < 1 \\
            -\frac{1}{\delta} (t-1-\delta), & \text{ if } 1 \le t < 1+\delta \\
            0, & \text{ if } t \ge 1+ \delta
        \end{cases}
    \end{align*}
    Then, the output of the network 
    \begin{align*}
    g_{\theta}(\vx) = \sigma_T(\mW_2 (\sigma_H( \mW_1 \vx + \vb_1 ) ) + b_2  ) = \sigma_T (\alpha_1^2 (\vx) )
    \end{align*}
    satisfies 
    \begin{align*}
        g_{\theta}(\vx_i) = 
        \begin{cases}
            z_i, & \quad i \in T \\
            0, & \quad i \in [K] \setminus T
        \end{cases}
    \end{align*}
    by using the definition of $\sigma_T$ and Eq.~\ref{eqn:conditions}. Note that the first layer of this construction uses $2N+1$ neurons with $\sigma_H$ activation, and the second layer uses 1 neuron with $\sigma_T$ activation. Since each $\sigma_H$ or $\sigma_T$ neuron can be converted into 2 ReLU neurons, our construction uses a total of $2(2N+1) + 2 = 4N+4$ neurons, which completes the proof.%\\

\section{Proof of Proposition~\ref{prop:ext}}\label{sec:proof-ext}


Recall the 3-layer network illustrated in Figure~\ref{fig:3layersqrt6N}. For given $T \subset [K]$, let us denote this 3-layer network satisfying \eqref{eqn:finetune_fit_well} by $g_{\theta, T}$ which has maximum width $4 \min\{\sqrt{3|T|+2}, \sqrt{K} \}$. In what follows, we construct an $L$-layer network based on $g_{\theta, T}$.


We partition $T$ into $\left\lfloor\frac{L-1}{2}\right\rfloor$ subsets: $T_1, T_2, \cdots, T_{\left\lfloor\frac{L-1}{2}\right\rfloor}$ satisfying $|T_i| \leq |T|/ \left\lfloor\frac{L-1}{2}\right\rfloor + 1$ for all $i$. It can be easily seen that $g_{\theta, T_1}(\vx) + g_{\theta, T_2}(\vx) + \cdots + g_{\theta, T_{\left\lfloor\frac{L-1}{2}\right\rfloor}}(\vx)$ satisfies \eqref{eqn:finetune_fit_well}. Using the construction idea provided in Figure 2 of \citet{yun2019small}, the above function can be represented as an $L$-layer network, which has a maximum width less than or equal to
\begin{align}
    \max_{i} \{4 \min\{\sqrt{3|T_i|+2}, \sqrt{K} \} + 2\},
\end{align}
and we conclude.





\end{document}
