\subsection{Abbreviation and Notation}
\begin{table}[htbp]
\caption{
  Table of abbreviations.
  }
\centering
\begin{tabular}{lc}
    \toprule Abbreviation & Definition\\ \midrule
    % PEFT & parameter-efficient fine-tuning \\
    % LLM & large language model \\
    % NLP & natural language processing \\
    FT & fine-tuning\\
    LP & linear probing\\
    LP-FT & linear probing then fine-tuning\\
    NTK & neural tangent kernel \\
    LoRA&low rank adaptation~\citep{hu2022lora}\\
    ECE & expected calibration error~\citep{naeini2015obtaining}\\
    MCE & maximum calibration error~\citep{naeini2015obtaining}\\
    % NLL & negative loss lilelihood \\
    ID~/~OOD & in-distribution~/~out-of-distribution\\
    FDR & Fisher's discriminant ratio~\citep{fisher1936use}\\
    % MHA & multi-head attention (\citet{vaswani2017attention})\\
    % FFN & feed-forward network \\
    % LogitNorm & Logit Normalization (\citet{wei2022mitigating})\\
    % SWA & Stochastic Weight Averaging (\citet{izmailov2018averaging})\\
    % SGD & Stochastic Gradient Descent\\
    \bottomrule
\end{tabular}
  \label{table:abbreviation}
\end{table}

\begin{table}[htbp] % Where should this table be placed?
\caption{
  Table of notations.
  }
\centering
\begin{tabular}{lc}
    \toprule Variable & Definition\\ \midrule
    $C~/~N$ & number of classes~/~training samples \\
    $d~/~h~/~r$ & input dimension~/~hidden dimension~/~rank of LoRA\\
    $\mathcal{X}~/~\mathcal{Y}$&training samples~/~labels\\
    $\bm{x}~/~y~$&sample~/~label \\
    $[\bm{a}]_{k}$ & $k$-th element of vector $\bm{a}$ \\
    $\|\cdot \|~/~\|\cdot \|_F~/~\langle \cdot, \cdot \rangle$ & Euclidean norm~/~Frobenius norm~/~inner product\\
    $\bm{e}_{y} \in \{0,1\}^C$ & one-hot encoding of label $y$ \\
    $\bm{I}_{C}$ & $C \times C$ identity matrix \\
    $\ell(\bm{f}(\bm{x}), y)$ & loss function \\
    $L$ & empirical risk \\
    $\bm{\sigma}_{\text{SM}}$ & softmax function\\
    $\bm{f}(\bm{x})$ & model output \\
    $\bm{\phi}(\bm{x})$ & feature extractor \\
    $\bm{V}~/~\bm{b}$ & classifier weight~/~bias \\
    $\bm{V}_0~/~\bm{\phi}_0$ & classifier weight~/~feature extractor at the start of training \\
    $\bm{B}$ & feature extractor weight matrix in two-layer linear model \\
    $\bm{A}^{\text{LoRA}}~/~\bm{B}^{\text{LoRA}}$ & low-rank weight matrices in LoRA \\
    $\theta^{\bm{g}}~/~\theta^{\bm{A}}~/~\theta^{\bm{a}}$ & parameter of function $\bm{g}$~/~matrix $\bm{A}$~/~vector $\bm{a}$\\
    $\Theta^{\bm{f}}~/~\Theta^{\bm{\phi}}$ & NTK matrix of model~/~feature extractor \\
    $\bm{P}(\bm{x}, \bm{x}_i)~/~\bm{F}(\bm{x}, \bm{x}_i)$ & pre-train-effective~/~FT-effective component of NTK matrix \\
    $\bm{\delta}_i$ & difference between one-hot label and predicted probability \\
    $\eta$ & learning rate \\
    $\otimes$ & Kronecker product of two matrices \\
     \bottomrule
\end{tabular}
  \label{table:notations}
\end{table}
% ---------------------------------------------
%-------------------------------------------------------------------------------------------------------------------
\subsection{Proof of theoretical results}
\label{subsec:proof}
\paragraph{Additional notation}
  The parameters of a function $\bm{g}$, a weight matrix $\bm{A}$, and a vector $\bm{a}$ are denoted as $\theta^{\bm{g}}$, $\theta^{\bm{A}}$, and $\theta^{\bm{a}}$, respectively. Given a function $\bm{g}(\cdot ;\theta^{\bm{g}}): \mathbb{R}^d \rightarrow \mathbb{R}^s$ trained on $N$ training samples $\mathcal{X} = \{\bm{x}_1, \bm{x}_2, \ldots, \bm{x}_N\} \subseteq \mathbb{R}^d$, we denote the NTK matrix of $\bm{g}$ at time $t$ as $\Theta^{\bm{g}}_t$, which is defined as $\Theta^{\bm{g}}_t := \frac{\partial \bm{g}_t(\mathcal{X})}{\partial \theta^{\bm{g}}} \left( \frac{\partial \bm{g}_t(\mathcal{X})}{\partial \theta^{\bm{g}}} \right)^\top \in \mathbb{R}^{Ns \times Ns}$, where $\bm{g}_t(\mathcal{X}) := \operatorname{vec}\left(\bm{g}_t(\bm{x}_i)\right)_{\bm{x}_i \in \mathcal{X}}$, and $\frac{\partial \bm{g}_t(\mathcal{X})}{\partial \theta^{\bm{g}}} \in \mathbb{R}^{Ns \times p}$, where $p$ is the number of parameters. The sub-matrix $\Theta^{\bm{g}}_t(\bm{x}_i, \bm{x}_j)$ is defined as $\Theta^{\bm{g}}_t(\bm{x}_i, \bm{x}_j) := \frac{\partial \bm{g}_t(\bm{x}_i)}{\partial \theta^{\bm{g}}} \left( \frac{\partial \bm{g}_t(\bm{x}_j)}{\partial \theta^{\bm{g}}} \right)^\top \in \mathbb{R}^{s\times s}$, describing the relationship between training samples $\bm{x}_i$ and $\bm{x}_j$ in $\mathcal{X}$. In the infinite width limit with NTK parameterization and general assumptions, the NTK matrix converges to $\Theta^{\bm{g}} := \frac{\partial \bm{g}_0(\mathcal{X})}{\partial \theta^{\bm{g}}} \left( \frac{\partial \bm{g}_0(\mathcal{X})}{\partial \theta^{\bm{g}}} \right)^\top$, as shown by \citet{jacot2018neural}. Subscripts represent iteration or epoch, so $\bm{g}_t(\cdot)$ denotes the model $\bm{g}$ at time $t$. $\otimes$ denotes the Kronecker product of two matrices, defined as
  \begin{align}
    \bm{A} \otimes \bm{B} := \begin{bmatrix}
      a_{11}\bm{B} & a_{12}\bm{B} & \cdots & a_{1n}\bm{B} \\
      a_{21}\bm{B} & a_{22}\bm{B} & \cdots & a_{2n}\bm{B} \\
      \vdots       & \vdots       & \ddots & \vdots       \\
      a_{m1}\bm{B} & a_{m2}\bm{B} & \cdots & a_{mn}\bm{B} \\
  \end{bmatrix},
  \end{align}
  where $ \bm{A} = [a_{ij}] $ is an $ m \times n $ matrix and $ \bm{B} $ is any matrix.
%------------------------------------------------------------------------
\subsubsection{Proof of Proposition \ref{prop:ntk}}
\label{subsec:ntk}
\newtheorem*{RestateProposition}{\normalfont\bfseries \Cref{prop:ntk}}
\begin{RestateProposition}
  The NTK matrix of a model $\bm{f}(\bm{x})=\bm{V}\bm{\phi}(\bm{x})+\bm{b}$, denoted by $\Theta^{\bm{f}}$, can be decomposed as:
  \begin{align}
    \Theta^{\bm{f}}(\bm{x}, \bm{x}_i) = \bm{P}(\bm{x}, \bm{x}_i) + \bm{F}(\bm{x}, \bm{x}_i),
  \end{align}
  where the pre-train-effective component $\bm{P}(\bm{x}, \bm{x}_i)$ and the FT-effective component $\bm{F}(\bm{x}, \bm{x}_i)$ are defined using the classifier weight matrix $\bm{V}_0$ and the feature extractor $\bm{\phi}_0$ at the starting point of training as:
  \begin{align}
    \bm{P}(\bm{x}, \bm{x}_i) &:= (\langle \bm{\phi}_0(\bm{x}), \bm{\phi}_0(\bm{x}_i)\rangle + 1) \bm{I}_{C}, \\
    \bm{F}(\bm{x}, \bm{x}_i) &:=  \bm{V}_0 \frac{\partial \bm{\phi}_0(\bm{x})}{\partial \theta^{\bm{\phi}}} \frac{\partial \bm{\phi}_0(\bm{x}_i)}{\partial \theta^{\bm{\phi}}}^\top \bm{V}_0^\top.
  \end{align}
  Consequently, assuming that one-epoch training within the NTK regime approximates FT, the changes in the logits and feature vectors for a sample $\bm{x}$ after FT, denoted as $\bm{f}^{\text{FT}}(\bm{x})$ and $\bm{\phi}^{\text{FT}}(\bm{x})$, relative to those at the starting point of training, $\bm{f}_0(\bm{x})$ and $\bm{\phi}_0(\bm{x})$, can be expressed as:
  \begin{align}
    \bm{f}^{\text{FT}}(\bm{x}) - \bm{f}_0(\bm{x})
    &= \eta \sum_{i=1}^N \left(\bm{P}(\bm{x}, \bm{x}_i) +  \bm{F}(\bm{x}, \bm{x}_i)\right) \bm{\delta}_i,\\
    \bm{\phi}^{\text{FT}}(\bm{x}) - \bm{\phi}_0(\bm{x})
    &= \eta \sum_{i=1}^N \Theta^{\bm{\phi}}(\bm{x}, \bm{x}_i) \bm{V}_0^\top \bm{\delta}_i,
  \end{align}
  where $\bm{\delta}_i := \bm{e}_{y_i} - \bm{\sigma}_{\text{SM}}(\bm{f}_0(\bm{x}_i))$ represents the difference between the one-hot label and the predicted probability, and $\eta$ is the learning rate.
\end{RestateProposition}
%--------------------------------------------------------------------------------------------------------------------
\paragraph{Proof of \Cref{prop:ntk}}
\begin{proof}
  The parameters of $\bm{f}$, denoted as $\bm{\theta}^{\bm{f}}$, consist of $\bm{\theta}^V$, $\bm{\theta}^b$, and $\bm{\theta}^{\bm{\phi}}$. The derivative of the model $\bm{f}$ with respect to each parameter is given by:
  \begin{align}
    \frac{\partial \bm{f}(\bm{x})}{\partial \bm{\theta}^V} &= \bm{\phi}(\bm{x})^\top \otimes \bm{I}_{C}, \label{eq:derivativeV}\\
    \frac{\partial \bm{f}(\bm{x})}{\partial \bm{\theta}^b} &= \bm{I}_{C},\label{eq:derivativeb}\\
    \frac{\partial \bm{f}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} &= \bm{V} \frac{\partial \bm{\phi}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}}.\label{eq:derivativePhi}
  \end{align}

  Therefore, the NTK matrix of $\bm{f}$, defined as $\Theta^{\bm{f}}(\bm{x}, \bm{x}_i) := \frac{\partial \bm{f}_0(\bm{x})}{\partial \bm{\theta}^{\bm{f}}} \left(\frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}\right)^\top$, can be expressed as:
  \begin{align}
    \Theta^{\bm{f}}(\bm{x}, \bm{x}_i) &= \frac{\partial \bm{f}_0(\bm{x})}{\partial \bm{\theta}^{\bm{f}}} \left(\frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}\right)^\top \\
    &= \frac{\partial \bm{f}_0(\bm{x})}{\partial \bm{\theta}^V} \frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^V}^\top + \frac{\partial \bm{f}_0(\bm{x})}{\partial \bm{\theta}^b} \frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^b}^\top + \frac{\partial \bm{f}_0(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}^\top \\
    &= \left(\bm{\phi}_{0}(\bm{x})^\top \otimes \bm{I}_{C}\right) \left(\bm{\phi}_{0}(\bm{x}_i)^\top \otimes \bm{I}_{C}\right)^\top + \bm{I}_{C} + \bm{V}_0 \frac{\partial \bm{\phi}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \left(\bm{V}_0 \frac{\partial \bm{\phi}_{0}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \quad (\because \text{Eqs.}~\eqref{eq:derivativeV}, \eqref{eq:derivativeb}, \eqref{eq:derivativePhi}) \\
    &= \langle \bm{\phi}_0(\bm{x}), \bm{\phi}_0(\bm{x}_i)\rangle \bm{I}_{C} + \bm{I}_{C} + \bm{V}_0 \frac{\partial \bm{\phi}_0(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \left(\frac{\partial \bm{\phi}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \bm{V}_0^\top \\
    &= (\langle \bm{\phi}_0(\bm{x}), \bm{\phi}_0(\bm{x}_i)\rangle + 1) \bm{I}_{C} + \bm{V}_0 \frac{\partial \bm{\phi}_0(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \left(\frac{\partial \bm{\phi}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \bm{V}_0^\top \\
    &= \bm{P}(\bm{x}, \bm{x}_i) + \bm{F}(\bm{x}, \bm{x}_i)\label{eq:NTKF}.
  \end{align}

  For gradient descent, the update to the parameters $\bm{\theta}^{\bm{f}}$ at time $t$ is given by:
  \begin{align}
    \bm{\theta}^{\bm{f}}_{t+1} - \bm{\theta}^{\bm{f}}_{t} &= -\eta \left(\frac{\partial L(\bm{f}_{t})}{\partial \bm{\theta}^{\bm{f}}}\right)^\top\\
    &= -\eta \sum_{i=1}^{N} \left(\frac{\partial L(\bm{f}_{t})}{\partial \left([\bm{\sigma}_{\text{SM}}(\bm{f}_{t}(\bm{x}_i))]_{y_i}\right)} \frac{\partial \left([\bm{\sigma}_{\text{SM}}(\bm{f}_{t}(\bm{x}_i))]_{y_i}\right)}{\partial \bm{f}_{t}(\bm{x}_i)} \frac{\partial \bm{f}_{t}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}\right)^\top \\
    &= \eta \sum_{i=1}^{N} \left(\bm{\delta}_i^\top \frac{\partial \bm{f}_{t}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}\right)^{\top} \\
    &= \eta \sum_{i=1}^{N} \frac{\partial \bm{f}_{t}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}^\top \bm{\delta}_i \label{eq:updateThetaF}.
  \end{align}
  Assuming that one-epoch training approximates FT, the model is expressed as $\bm{f}^{\text{FT}} = \bm{f}_{1}$. Therefore, the update to the model $\bm{f}$ in the linearized regime is given by:
  \begin{align}
    \bm{f}^{\text{FT}}(\bm{x}) - \bm{f}_{0}(\bm{x}) &= \bm{f}_{1}(\bm{x}) - \bm{f}_{0}(\bm{x}) \\
    &= \frac{\partial \bm{f}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{f}}} (\bm{\theta}^{\bm{f}}_{1} - \bm{\theta}^{\bm{f}}_{0})\quad (\because \text{linearized regime})\\
    &= \eta \sum_{i=1}^{N} \frac{\partial \bm{f}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{f}}} \left(\frac{\partial \bm{f}_{0}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{f}}}\right)^\top \bm{\delta}_i \quad (\because \text{Eq.}~\eqref{eq:updateThetaF}) \\
    &= \eta \sum_{i=1}^{N} \left(\bm{P}(\bm{x}, \bm{x}_i) + \bm{F}(\bm{x}, \bm{x}_i)\right) \bm{\delta}_i.\quad (\because \text{Eq.}~\eqref{eq:NTKF})
  \end{align}

  Finally, following the same derivation as for Eq.~\eqref{eq:updateThetaF}, the update to the parameters $\bm{\theta}^{\bm{\phi}}$ at time $t$ is given by
  \begin{align}
    \bm{\theta}^{\bm{\phi}}_{t+1} - \bm{\theta}^{\bm{\phi}}_{t} &= \eta \sum_{i=1}^{N} \frac{\partial \bm{f}_{t}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}^\top \bm{\delta}_i \label{eq:updateThetaPhi}.
  \end{align}  
  Therefore, the update to the feature extractor after FT, given by $\bm{\phi}^{\text{FT}} = \bm{\phi}_{1}$ under the same assumption, is:
  \begin{align}
    \bm{\phi}^{\text{FT}}(\bm{x}) - \bm{\phi}_{0}(\bm{x}) &= \bm{\phi}_{1}(\bm{x}) - \bm{\phi}_{0}(\bm{x}) \\
    &= \frac{\partial \bm{\phi}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} (\bm{\theta}^{\bm{\phi}}_{1} - \bm{\theta}^{\bm{\phi}}_{0})\quad (\because \text{linearized regime})\\
    &= \frac{\partial \bm{\phi}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \eta \sum_{i=1}^{N} \left(\frac{\partial \bm{f}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \bm{\delta}_i \quad (\because \text{Eq.}~\eqref{eq:updateThetaPhi})\\
    &= \frac{\partial \bm{\phi}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \eta \sum_{i=1}^{N} \left(\bm{V}_{0} \frac{\partial \bm{\phi}_{0}(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \bm{\delta}_i \quad (\because \text{Eq.}~\eqref{eq:derivativePhi})\\
    &= \eta \sum_{i=1}^{N} \frac{\partial \bm{\phi}_{0}(\bm{x})}{\partial \bm{\theta}^{\bm{\phi}}} \left(\frac{\partial \bm{\phi}_0(\bm{x}_i)}{\partial \bm{\theta}^{\bm{\phi}}}\right)^\top \bm{V}_{0}^\top \bm{\delta}_i \\
    &= \eta \sum_{i=1}^{N} \Theta^{\bm{\phi}}(\bm{x}, \bm{x}_i) \bm{V}_{0}^\top \bm{\delta}_i.
  \end{align}
  This completes the proof.
\end{proof}
\subsubsection{Proof of Corollary~\ref{cor:feature_distortion}}
\label{proof:corollary}
\newtheorem*{RestateProposition3}{\normalfont\bfseries \Cref{cor:feature_distortion}}
\begin{RestateProposition3}
  Within the context of the linear model (\Cref{dfn:linearModel}), for any sample $\bm{x} \in \operatorname{Span}(\mathcal{X})^{\bot}$, the orthogonal complement of the subspace spanned by the training sample set $\mathcal{X}$, the features after FT remain unchanged, expressed as:
  \begin{align}
    \bm{\phi}^{\text{FT}}(\bm{x}) = \bm{\phi}_0(\bm{x}),
  \end{align}
  where $\bm{\phi}^{\text{FT}}(\bm{x})$ and $\bm{\phi}_0(\bm{x})$ denote the feature vectors after and before FT, respectively.
\end{RestateProposition3}
\paragraph{Proof of \Cref{cor:feature_distortion}}
\begin{proof}
  The feature extractor is given by $\bm{\phi}(\bm{x}) = \bm{B}\bm{x}$, where $\bm{B}$ is the weight matrix. The derivative of the feature extractor with respect to the parameters $\theta^{\bm{\phi}}=\theta^{\bm{B}}$ is:
  \begin{align}
    \frac{\partial \bm{\phi}(\bm{x})}{\partial \theta^{\bm{\phi}}}=\frac{\partial \bm{B}\bm{x}}{\partial \theta^{\bm{B}}} = \bm{x}^\top \otimes \bm{I}_h,
  \end{align}
  so the empirical NTK matrix of the feature extractor becomes:
  \begin{align}
    \Theta^{\bm{\phi}}(\bm{x}_i, \bm{x}_j) &:= \frac{\partial \bm{\phi}_{0}(\bm{x}_i)}{\partial \theta^{\bm{\phi}}} \frac{\partial \bm{\phi}_{0}(\bm{x}_j)}{\partial \theta^{\bm{\phi}}}^\top \\
    &= \langle \bm{x}_i, \bm{x}_j \rangle \otimes \bm{I}_h,
  \end{align}
  where $\otimes$ denotes the Kronecker product.

  From \Cref{prop:ntk}, the feature update is given by:
  \begin{align}
    \bm{\phi}^{\text{FT}}(\bm{x}) - \bm{\phi}_0(\bm{x}) &= \eta \sum_{i=1}^{N} \Theta^{\bm{\phi}}(\bm{x}, \bm{x}_i) \bm{V}_0^\top \bm{\delta}_i \\
    &= \eta \sum_{i=1}^{N} \langle \bm{x}, \bm{x}_i \rangle \bm{V}_0^\top \bm{\delta}_i,
  \end{align}
  where $\bm{\delta}_i = \bm{e}_{y_i} - \bm{\sigma}_{\text{SM}}(\bm{f}_0(\bm{x}_i))$, $\bm{V}_0$ is the classifier weight matrix at the start of training, and $\eta$ is the learning rate. For any sample $\bm{x} \in \operatorname{Span}(\mathcal{X})^{\bot}$, $\langle \bm{x}, \bm{x}_i \rangle = 0$ for all $\bm{x}_i \in \mathcal{X}$, so the feature update is $0$ for OOD samples, namely:
  \begin{align}
    \bm{\phi}^{\text{FT}}(\bm{x}) - \bm{\phi}_0(\bm{x}) = 0.
  \end{align}
  This completes the proof.
\end{proof}
%-------------------------------------------------------------------------------------------------------------------
\subsubsection{Proof of Proposition~\ref{prop:lora}}
\label{subsec:lora_ft}
\newtheorem*{RestateProposition2}{\normalfont\bfseries \Cref{prop:lora}}
\begin{RestateProposition2}
  Consider the linear model setting (\Cref{dfn:linearModel}) and let $\bm{f}^{\text{LoRA}}$ and $\bm{f}^{\text{FT}}$ be the models obtained via one-epoch training with LoRA and standard FT in the NTK regime. Let $r$ denote the rank hyperparameter of LoRA, and let $\sigma^2$ represent the variance of the low-rank weight matrix initialization. Assume the input samples $\bm{x}$ satisfy $\|\bm{x}\| \leq c$. Then, for each sample pair $\bm{x}_i, \bm{x}_j \in \mathcal{X}$, the pre-train-effective components of the NTK matrix for LoRA and FT, $\bm{P}^{\text{LoRA}}(\bm{x}_i, \bm{x}_j)$ and $\bm{P}^{\text{FT}}(\bm{x}_i, \bm{x}_j)$, are identical:
  \begin{align}
    \bm{P}^{\text{LoRA}}(\bm{x}_i, \bm{x}_j) = \bm{P}^{\text{FT}}(\bm{x}_i, \bm{x}_j).
  \end{align}
  Moreover, with probability at least $1 - 4\exp(-(\epsilon^2 - \epsilon^3)r/4)$, their FT-effective components, $\bm{F}^{\text{LoRA}}(\bm{x}_i, \bm{x}_j)$ and $\bm{F}^{\text{FT}}(\bm{x}_i, \bm{x}_j)$, satisfy:
  \begin{align}
    \|\bm{F}^{\text{LoRA}}(\bm{x}_i, \bm{x}_j) - \sigma^2 r \bm{F}^{\text{FT}}(\bm{x}_i, \bm{x}_j)\| \leq c \sigma^2 r \epsilon \|\bm{V}_0 \bm{V}_0^\top\|.
  \end{align}
\end{RestateProposition2}
%-------------------------------------------------------------------------------------------------------------------
\paragraph{Proof Approach}
To prove this proposition, we use the following corollary of the distributional Johnson-Lindenstrauss lemma:

\begin{lem}[Corollary of the distributional Johnson-Lindenstrauss Lemma]
    Given vectors $\bm{u}, \bm{v} \in \mathbb{R}^d$ with $\|\bm{u}\|, \|\bm{v}\| \leq c$, and a random matrix $\bm{A} \in \mathbb{R}^{k \times d}$ with i.i.d.\ entries from a distribution with mean 0 and variance 1, for any $\epsilon > 0$:
    \begin{align}
        \Pr\left[ \left|\frac{1}{k}(\bm{A}\bm{u})^\top (\bm{A}\bm{v}) - \bm{u}^\top \bm{v}\right| \geq c \epsilon \right] \leq 4\exp\left( -(\epsilon^2 - \epsilon^3)k/4 \right).
    \end{align}
    \label{lem:johnson_lindenstrauss}
\end{lem}
%---------------------------------------------------------------
\paragraph{Proof of \Cref{prop:lora}}
\begin{proof}
  The feature vector of LoRA is given by $\bm{\phi}^{\text{LoRA}}(\bm{x}) = \bm{B}_0 \bm{x} + \bm{B}^{\text{LoRA}} \bm{A}^{\text{LoRA}} \bm{x}$, where the pre-trained feature weight matrix $\bm{B}_0$ is fixed during training, and $\bm{A}^{\text{LoRA}} \in \mathbb{R}^{r \times d}$ and $\bm{B}^{\text{LoRA}} \in \mathbb{R}^{h \times r}$ are the low-rank weight matrices in LoRA. $\bm{A}^{\text{LoRA}}$ is initialized from a normal distribution with mean 0 and variance $\sigma^2$, while $\bm{B}^{\text{LoRA}}$ is initialized with zeros.

  The pre-train-effective components of LoRA and FT, denoted as $\bm{P}^{\text{LoRA}}(\bm{x}, \bm{x}_i)$ and $\bm{P}^{\text{FT}}(\bm{x}, \bm{x}_i)$ respectively, are defined as:
  \begin{align}
    \bm{P}^{\text{LoRA}}(\bm{x}, \bm{x}_i) &= (\langle \bm{\phi}^{\text{LoRA}}_0(\bm{x}), \bm{\phi}^{\text{LoRA}}_0(\bm{x}_i)\rangle + 1) \bm{I}_{C},\\
    \bm{P}^{\text{FT}}(\bm{x}, \bm{x}_i) &= (\langle \bm{\phi}^{\text{FT}}_0(\bm{x}), \bm{\phi}^{\text{FT}}_0(\bm{x}_i)\rangle + 1) \bm{I}_{C},
  \end{align}
  where $\bm{I}_{C}$ is the identity matrix of size $C$. These pre-train-effective components are identical since:
  \begin{align}
    \bm{\phi}^{\text{LoRA}}_0(\bm{x}) = \bm{B}_0 \bm{x} + \bm{B}_0^{\text{LoRA}} \bm{A}_0^{\text{LoRA}} \bm{x} = \bm{B}_0 \bm{x} = \bm{\phi}^{\text{FT}}_0(\bm{x}),
  \end{align}
  for all $\bm{x} \in \mathcal{X}$ because $\bm{B}^{\text{LoRA}}$ is initialized as a zero matrix, i.e., $\bm{B}^{\text{LoRA}}_0 = \bm{O}$.

  For the FT-effective component of the NTK matrix, consider the derivatives with respect to the LoRA parameters $\bm{B}^{\text{LoRA}}$ and $\bm{A}^{\text{LoRA}}$:
  \begin{align}
    \frac{\partial \bm{\phi}^{\text{LoRA}}(\bm{x})}{\partial \theta^{\bm{B}^{\text{LoRA}}}} &= (\bm{A}^{\text{LoRA}} \bm{x})^\top \otimes \bm{I}_h,\\
    \frac{\partial \bm{\phi}^{\text{LoRA}}(\bm{x})}{\partial \theta^{\bm{A}^{\text{LoRA}}}} &= \bm{x}^\top \otimes \bm{B}^{\text{LoRA}}.
  \end{align}
  Here, $\theta^{\bm{B}^{\text{LoRA}}}$ and $\theta^{\bm{A}^{\text{LoRA}}}$ denote the parameters of $\bm{B}^{\text{LoRA}}$ and $\bm{A}^{\text{LoRA}}$, respectively.

  The FT-effective component of the NTK matrix for LoRA, denoted as $\bm{F}^{\text{LoRA}}(\cdot, \cdot)$, is derived by combining these partial derivatives:
  \begin{align}
    \bm{F}^{\text{LoRA}}(\bm{x}, \bm{x}_i)
    =& \bm{V}_0 \left( \frac{\partial \bm{\phi}_{0}^{\text{LoRA}}(\bm{x})}{\partial \theta^{\bm{B}^{\text{LoRA}}}} \frac{\partial \bm{\phi}_{0}^{\text{LoRA}}(\bm{x}_i)}{\partial \theta^{\bm{B}^{\text{LoRA}}}}^\top + \frac{\partial \bm{\phi}_{0}^{\text{LoRA}}(\bm{x})}{\partial \theta^{\bm{A}^{\text{LoRA}}}} \frac{\partial \bm{\phi}_{0}^{\text{LoRA}}(\bm{x}_i)}{\partial \theta^{\bm{A}^{\text{LoRA}}}}^\top \right) \bm{V}_0^\top \notag \\
    =& \bm{V}_0 \left( \langle \bm{A}^{\text{LoRA}}_{0}\bm{x}, \bm{A}^{\text{LoRA}}_{0}\bm{x}_i\rangle \bm{I}_h + \langle \bm{x}, \bm{x}_i\rangle \bm{B}^{\text{LoRA}}_{0}\bm{B}^{\text{LoRA}\top}_{0} \right) \bm{V}_0^\top \notag \\
    =& \langle \bm{A}_{0}^{\text{LoRA}}\bm{x}, \bm{A}_{0}^{\text{LoRA}}\bm{x}_i\rangle \bm{V}_0 \bm{V}_0^\top,
    \label{eq:complex_lora}
  \end{align}
  where the last equality holds because $\bm{B}^{\text{LoRA}}_{0}$ is a zero matrix.

  Similarly, the FT-effective component of the NTK matrix for standard FT, $\bm{F}^{\text{FT}}(\cdot, \cdot)$, is given by:
  \begin{align}
    \bm{F}^{\text{FT}}(\bm{x}, \bm{x}_i)
    =& \bm{V}_0 \left(\frac{\partial \bm{\phi}^{\text{FT}}_0(\bm{x})}{\partial \theta^{\bm{B}}}\frac{\partial \bm{\phi}^{\text{FT}}_0(\bm{x}_i)}{\partial \theta^{\bm{B}}}^\top \right) \bm{V}_0^\top \notag \\
    =& \langle \bm{x}, \bm{x}_i\rangle \bm{V}_0 \bm{V}_0^\top.
    \label{eq:complex_ft}
  \end{align}

  Using the Johnson-Lindenstrauss lemma, with a probability of at least $1 - 4\exp(-(\epsilon^2 - \epsilon^3) r / 4)$:
  \begin{align}
    |\langle \bm{A}^{\text{LoRA}}_{0} \bm{x}, \bm{A}^{\text{LoRA}}_{0} \bm{x}_i \rangle - \sigma^2 r \langle \bm{x}, \bm{x}_i \rangle| \leq c \sigma^2 r \epsilon,
  \end{align}
  which implies:
  \begin{align}
    \| \bm{F}^{\text{LoRA}}(\bm{x}, \bm{x}_i) - \sigma^2 r \bm{F}^{\text{FT}}(\bm{x}, \bm{x}_i) \|
    =& \|\langle \bm{A}^{\text{LoRA}}_{0} \bm{x}, \bm{A}^{\text{LoRA}}_{0} \bm{x}_i\rangle \bm{V}_0 \bm{V}_0^\top - \sigma^2 r \langle \bm{x}, \bm{x}_i\rangle \bm{V}_0 \bm{V}_0^\top\| \\
    \leq& |\langle \bm{A}^{\text{LoRA}}_{0} \bm{x}, \bm{A}^{\text{LoRA}}_{0} \bm{x}_i\rangle - \sigma^2 r \langle \bm{x}, \bm{x}_i\rangle| \|\bm{V}_0 \bm{V}_0^\top\| \\
    \leq& c \sigma^2 r \epsilon \|\bm{V}_0 \bm{V}_0^\top\|.
  \end{align}

  This completes the proof.
\end{proof}
%-------------------------------------------------------------------------------------------------------------------
\subsection{Experimental details}
\label{subsec:experimentalDetails}
\subsubsection{Datasets}
From the SuperGLUE benchmark~\citep{wang2019superglue}, we used the five datasets: BoolQ~\citep{clark-etal-2019-boolq}, CB (CommitmentBank)~\citep{de2019commitmentbank}, RTE (Recognizing Textual Entailment)~\citep{dagan2005pascal, rte2, giampiccolo2007third, bentivogli2009fifth}, WiC (Words in Context)~\citep{burstein2019proceedings}, and WSC (Winograd Schema Challenge)~\citep{levesque2012winograd}. From the GLUE benchmark~\citep{wang2018glue}, we used the three datasets: CoLA (Corpus of Linguistic Acceptability)~\citep{warstadt-etal-2019-neural}, MRPC (Microsoft Research Paraphrase Corpus)~\citep{dolan2005automatically}, and SST-2 (Stanford Sentiment Treebank, version 2)~\citep{socher-etal-2013-recursive}. Four datasets from BOSS~\citep{yuan2023revisiting} were used in OOD evaluation: Amazon Reviews~\citep{mcAuley2013hidden}, Dynasent~\citep{potts-etal-2021-dynasent}, SemEval~\citep{nakov-etal-2016-semeval}, and SST-5~\citep{socher-etal-2013-recursive}. Finally, we used the PubMed $20$k RCT dataset~\citep{dernoncourt2017pubmed} for validation in practical settings. The dataset statistics are detailed in~\Cref{tab:dataset}.

For the datasets from the GLUE, SuperGLUE, and BOSS benchmarks, we divided the original training set using a 9:1 training-to-validation ratio, using the original validation set as the test set, in accordance with~\citet{chen-etal-2022-revisiting}. For PubMed $20$k RCT, we used the original training, validation, and test sets for their respective purposes.
%-------------------------------------------------------------------------------------------------------------------
\subsubsection{Implementation and training details}
LoRA was applied only to the query and value projection matrices of the attention mechanism in the Transformer architecture, following the approach described in the original paper by \citet{hu2022lora}. The LoRA settings were fixed at $\alpha = 8$ and $r = 8$ for all experiments.

The model was trained for $10$ epochs without early stopping, and the one showing the best performance on the validation set was chosen for further evaluation. We used the Adam optimizer~\citep{kingma2017adam}. Our code is built on PyTorch~\citep{paszke2019pytorch}, using the HuggingFace Transformers library~\citep{wolf-etal-2020-transformers} and AdapterHub~\citep{pfeiffer-etal-2020-adapterhub}. All experiments were run on a single NVIDIA A$100$ GPU. The results reported are averages from 3 tuning seeds and 5 evaluation seeds.

For LP, cross-validation and automatic hyperparameter adjustment were used to find the optimal L2 regularization strength, using scikit-learn~\citep{scikit-learn} with its standard training parameters.

Details on the hyperparameters for our experiments can be found in~\Cref{tab:hyperparameters}.
%-------------------------------------------------------------------------------------------------------------------
\subsubsection{Details of each experiment}
\paragraph{Experiments on the GLUE and SuperGLUE benchmarks}
For the FT and LoRA methods, the learning rate and batch size were adopted from~\citet{chen-etal-2022-revisiting}, where these hyperparameters were optimized using grid search on the validation set. For LP-FT and LP-LoRA, the batch size was fixed at 8 and the learning rate was tuned.

\paragraph{Experiments on the BOSS benchmark and the PubMed $20$k RCT dataset}
For the experiments on the BOSS benchmark and the PubMed $20$k RCT dataset, we tuned the learning rate and batch size using grid search based on the validation set performance.

\paragraph{Calculation of the NTK matrix}
We computed the NTK matrix for FT, LoRA, LP-FT, and LP-LoRA as specified in Eq.~\eqref{eq:ntkTraining}. We separately calculated the pre-train-effective and FT-effective components of the NTK matrix. Following the methodology by~\citet{malladi2024fine}, we used functorch~\citep{functorch2021} and forward-mode auto-differentiation~\citep{pmlr-v162-novak22a} for these calculations. To reduce computational costs, we randomly selected $10$\% of the parameters from the word embedding matrix for derivative calculations. For datasets with more than $250$ samples, we used a subset of $250$ randomly selected samples to compute the NTK matrix.

\paragraph{Solving the Kernel Regression}
Following the methodology described by~\citet{malladi2024fine}, we treated each output logit independently in our kernel regression model. This method is based on the representer theorem, where the empirical risk minimizer is expressed as a linear combination of kernel features from the training data: $\bm{f}(\bm{x}) = \sum_{i=1}^{NC} \alpha_i \bm{K}(\bm{x}, \bm{x}_i)$, with $\bm{K}$ representing the NTK matrix (or one of its components) of size $NC \times NC$ for the training set. We solved this optimization using logistic regression with L2 regularization and used the resulting coefficients $\alpha_i$ to compute logits on the test set via its corresponding NTK matrix.

\paragraph{Effects of classifier weight norms in training}
We scaled the norms of the classifiers using values from $\{0.1, 0.5, 1, 2, 5, 10, 50, 100\}$ before proceeding to the FT stage of training, specifically after random initialization in FT and after LP training in LP-FT. We conducted this experiment using the CB and RTE datasets and the BOSS benchmark. We applied the LoRA method on the CB and RTE datasets. We averaged the results over 5 seeds for the CB and RTE datasets and 3 seeds for the BOSS benchmark, plotting these with their standard deviations.

\paragraph{Temperature scaling}
We applied temperature scaling~\citep{pmlr-v70-guo17a} to the logits of the model at test time. Following the methodology of the original paper~\citep{pmlr-v70-guo17a}, we tuned the temperature parameter using the validation set to minimize the negative log-likelihood. For implementation, we employed the Adam optimizer~\citep{kingma2017adam} with a learning rate of $1\times 10^{-3}$, optimizing the temperature for $1 \times 10^5$ steps. We incorporated early stopping based on the negative log-likelihood, with a patience of $10$ iterations starting from an initial temperature value of $1.0$. The number of the bins to calculate ECE and MCE is set to $15$.
%-------------------------------------------------------------------------------------------------------------------
\begin{table}[htbp]
  \caption{Hyperparameter configurations. The settings include batch size (bs), learning rate (lr), alpha ($\alpha$), and rank ($r$).}
  \label{tab:hyperparameters}
  \centering
  \small
  \setlength{\tabcolsep}{3pt} % Adjusts the space between columns
  \begin{tabular}{cccccccccccc}
  \toprule
  Method              & Name & CB & RTE & BoolQ & WiC & WSC & CoLA & SST-2 & MRPC & Amazon & PubMed \\
  \midrule
  \multirow{2}{*}{FT} & bs   & 16 & 16  & 32    & 32   & 16  & 32  & 32   & 16 & 16 & 8 \\
                      & lr & 5e-5 & 1e-5 & 1e-5 & 1e-5 & 1e-3 & 5e-5 & 1e-5 & 1e-5 & 1e-5 & 5e-6 \\
  \midrule
  \multirow{4}{*}{LoRA} & bs & 16 & 16  & 32    & 16   & 16  & 16  & 32  & 32 & 16 & 8 \\
                        & lr & 1e-3 & 1e-3 & 5e-4 & 1e-3 & 1e-4 & 1e-3 & 5e-4 & 5e-4 & 1e-3 & 5e-4 \\
   & $\alpha$ & \multicolumn{10}{c}{8} \\
   & $r$ & \multicolumn{10}{c}{8} \\
  \midrule
  \multirow{2}{*}{LP-FT} & bs & \multicolumn{10}{c}{8} \\
 & lr & 5e-6 & 1e-5 & 1e-5 & 1e-5 & 1e-3 & 1e-5 & 1e-5 & 1e-5 & 1e-6 & 5e-6 \\
  \midrule
  \multirow{4}{*}{LP-LoRA} & bs & \multicolumn{10}{c}{8} \\
& lr & 1e-4 & 5e-4 & 5e-4 & 1e-3 & 1e-4 & 1e-3 & 1e-3 & 1e-3 & 5e-4 & 1e-3 \\
   & $\alpha$ & \multicolumn{10}{c}{8} \\
   & $r$ & \multicolumn{10}{c}{8} \\
  \bottomrule
  \end{tabular}
\end{table}
% --------------------------------------------------------------------------------------------------------------------------------
\begin{table}[htbp]
  \caption{Dataset statistics. This table provides detailed counts of the classes, training, validation, and test samples for different datasets across various tasks including natural language inference (NLI), word sense disambiguation (WSD), question answering (QA), coreference resolution (coref.), sentiment analysis (sentiment), and sequential sentence classification (sequential).}
    \label{tab:dataset}
    \centering\small
    \begin{tabular}{@{}l l c r r r l@{}}
    \toprule
    Dataset & Benchmark & Classes & Train & Val & Test & Task \\
    \midrule
    CB & \multirow{5}{*}{SuperGLUE} & 3 & 225 & 25 & 57 & NLI \\
    RTE & & 2 & 2,241 & 249 & 277 & NLI \\
    BoolQ & & 2 & 8,484 & 943 & 3,270 & QA \\
    WiC & & 2 & 5,400 & 600 & 638 & WSD \\
    WSC & & 2 & 498 & 56 & 104 & coref. \\
    \midrule
    CoLA & \multirow{3}{*}{GLUE} & 2 & 7,695 & 855 & 1,040 & acceptability \\
    SST-2 & & 2 & 60,614 & 6,735 & 872 & sentiment \\
    MRPC & & 2 & 3,301 & 367 & 408 & paraphrase \\
    \midrule
    Amazon & \multirow{4}{*}{BOSS} & 3 & 3,000 & 1,000 & 1,000 & sentiment \\
    Dynasent & & 3 & - & - & 1,000 & sentiment \\
    SemEval & & 3 & - & - & 1,000 & sentiment \\
    SST-5 & & 3 & - & - & 1,000 & sentiment \\
    \midrule
    PubMed $20$k RCT & PubMed & 5 & 15,000 & 2,500 & 2,500 & sequential \\
    \bottomrule
    \end{tabular}
  \end{table}

%-------------------------------------------------------------------------------------------------------------------
\subsection{Additional experimental results}
\label{subsection:ExperimentAppendix}
%----------------------------------------------------------------------------------
\subsubsection{Results on the SuperGLUE and GLUE benchmarks}
\label{subsection:idExperimentAppendix}
\Cref{tab:GLUESuperGLUE} shows the test results for the SuperGLUE and GLUE benchmarks. We report accuracy and its standard deviation on the test sets, except for the CoLA dataset, which uses the Matthews correlation coefficient as the performance metric.

\Cref{fig:NormIncreaseCB} shows the increase in the classifier weight norm during training on the CB dataset. With more iterations or epochs, there is a noticeable increase in both accuracy and the classifier weight norm.

\Cref{fig:featureAnalysis} and \Cref{fig:featureAnalysisLoRA} display t-SNE visualizations of the feature vectors from the CB dataset. After FT, the features are distinctly separated by class. In contrast, the classifier row vectors remain nearly identical to those of the pre-trained model. After LP-FT, the features retain the structure of the pre-trained model, but the classifier row vectors deviate from their initial state. A similar pattern is observed with the LoRA method.
%----------------------------------------------------------------------------------
\begin{table}[htbp]
  % notebook/01_paper_v2/additional_dataset_superGLUE.ipynb
  \caption{Test results on the SuperGLUE and GLUE benchmarks. We report the accuracy and its standard deviation, except for the CoLA dataset, which is evaluated by the Matthews correlation coefficient. We take the average of five seeds.}
  \label{tab:GLUESuperGLUE}
\centering
\begin{tabular}{crrrrr}
  \toprule
  Dataset & \multicolumn{1}{c}{LP}& \multicolumn{1}{c}{FT} & \multicolumn{1}{c}{LP-FT} & \multicolumn{1}{c}{LoRA} & \multicolumn{1}{c}{LP-LoRA} \\ \midrule
  CB & $77.86 \pm 4.24$&$81.43 \pm 3.91 $&$ \bm{84.64} \pm \bm{2.40} $&$ 77.50 \pm 5.30 $&$ 75.71 \pm 2.04$ \\
  RTE & $57.69 \pm 1.10$&$74.73 \pm 3.04 $&$ \bm{76.75} \pm \bm{0.87} $&$ 72.85 \pm 1.41 $&$ 74.08 \pm 2.57$ \\
  SST-2 & $86.31 \pm 0.10$& $92.41 \pm 0.32$ & $\bm{94.52} \pm \bm{0.26}$ & $50.92 \pm 0.00$ & $94.22 \pm 0.45$ \\
  WiC & $61.32 \pm 0.28$& $65.89 \pm 1.15$ & $\bm{66.14} \pm \bm{1.83}$ & $62.70 \pm 7.37$ & $64.29 \pm 1.82$ \\
  CoLA & $46.27 \pm 0.33$& $\bm{58.75} \pm \bm{1.70}$ & $57.95 \pm 1.95$ & $57.29 \pm 2.98$ & $58.21 \pm 1.55$ \\
  MRPC & $73.09 \pm 0.86$& $\bm{88.14} \pm \bm{0.73}$ & $87.60 \pm 0.79$ & $68.38 \pm 0.00$ & $87.79 \pm 1.00$ \\
  WSC & $\bm{63.46} \pm \bm{0.00}$& $\bm{63.46} \pm \bm{0.00}$ & $\bm{63.46} \pm \bm{0.00}$ & $\bm{63.46} \pm \bm{0.68}$ & $\bm{63.46} \pm \bm{0.00}$ \\
  BoolQ & $64.66 \pm 0.08$& $78.69 \pm 0.27$ & $\bm{79.00} \pm \bm{0.42}$ & $77.59 \pm 0.39$ & $77.67 \pm 0.50$ \\ \bottomrule
\end{tabular}
\end{table}
%----------------------------------------------------------------------------------
\begin{figure}[htbp]
  %notebook/00_paper/prediction_analysis.ipynb
    \centering
    \begin{minipage}[b]{0.49\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{images/prediction_analysis/norm_increase_LP_noreg_cb.png}
      \subcaption{LP (CB)}
  \end{minipage}
  \vspace{0.5\baselineskip}
  \centering
  \begin{minipage}[b]{0.49\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/prediction_analysis/norm_increase_normal_cb_ft.png}
    \subcaption{FT (CB)}
\end{minipage}
\centering
\begin{minipage}[b]{0.49\linewidth}
  \centering
  \includegraphics[width=0.8\linewidth]{images/prediction_analysis/norm_increase_normal_cb_lora.png}
  \subcaption{LoRA (CB)}
\end{minipage}
\centering
\begin{minipage}[b]{0.49\linewidth}
  \centering
  \includegraphics[width=0.8\linewidth]{images/prediction_analysis/norm_increase_normal_rte_lora.png}
  \subcaption{LoRA (RTE)}
\end{minipage}
\centering
\begin{minipage}[b]{0.49\linewidth}
  \centering
  \includegraphics[width=0.8\linewidth]{images/prediction_analysis/norm_final_cb_weight_norms.png}
  \subcaption{After training (CB)}
\end{minipage}
%notebook/00_paper/prediction_analysis.ipynb
  \caption{The increase in the norm of the classifier weight during training. }
  \label{fig:NormIncreaseCB}
\end{figure}
%------------------------------------------------------------------------------------------
\begin{figure}[htbp]
  \centering
  \begin{minipage}[b]{0.49\linewidth}
    \centering
    \includegraphics[width=0.7\linewidth]{images/tsne/tsne_cb_ft_normal.png}
    \subcaption{FT}
\end{minipage}
\begin{minipage}[b]{0.49\linewidth}
  \centering
  \includegraphics[width=0.7\linewidth]{images/tsne/tsne_cb_ft_lp_ft.png}
  \subcaption{LP-FT}
\end{minipage}
  \caption{Small changes in features and large changes in classifier weights during LP-FT. We visualize the t-SNE plot of the penultimate layer features and the classifier row vectors of the model trained on the CB dataset. (a) The features after FT are clearly separated by class, while the classifier row vectors are plotted at nearly the same place as those of the pre-trained model. (b) The features after LP-FT keep the structure of the pre-trained model, while the classifier row vectors are changed from the initialization.}
  \label{fig:featureAnalysis}
\end{figure}
%----------------------------------------------------------------------------------
\begin{figure}[htbp]
  \centering
  \begin{minipage}[b]{0.49\linewidth}
    \centering
    \includegraphics[width=0.7\linewidth]{images/tsne/tsne_cb_lora_normal.png}
    \subcaption{LoRA}
\end{minipage}
\begin{minipage}[b]{0.49\linewidth}
  \centering
  \includegraphics[width=0.7\linewidth]{images/tsne/tsne_cb_lora_lp_ft.png}
  \subcaption{LP-LoRA}
\end{minipage}
  \caption{The t-SNE plot of the penultimate layer features and the classifier row vector of the model trained with LoRA on the CB dataset.}
  \label{fig:featureAnalysisLoRA}
\end{figure}
%----------------------------------------------------------------------------------
\subsubsection{Results of NTK analysis}
\label{subsection:ntkAnalysisAppendix}
\Cref{tab:ntkStatisticsAppendix} displays the kernel statistics, while~\Cref{fig:ntkAppendix} shows the distribution of singular values.~\Cref{fig:ntkHeatmap} and~\Cref{fig:ntkHeatmapLoRA} visually depict the trace norms of sub-matrices within the NTK matrix. For the kernel matrix $\bm{K} \in \mathbb{R}^{NC\times NC}$, we calculated the trace norms of the sub-matrix $\bm{K}(\bm{x}_i, \bm{x}_j) \in \mathbb{R}^{C\times C}$ for each sample pair $(\bm{x}_i, \bm{x}_j)$ in the training sets.

\Cref{fig:ntkHeatmap} reveals a consistent pattern in the FT-effective component of the NTK matrix across all datasets: pairs of identical samples in diagonal positions typically exhibit higher trace norms. This suggests that the FT-effective component is more effective at capturing relationships among samples compared to the pre-train-effective component. Additionally, in the CB dataset, certain sample pairs, particularly in classes 1 and 3, show notably high trace norms, indicating that the pre-trained model effectively differentiates between these class samples.

\begin{table}[htbp]
  % notebook/01_paper_v2/ntk_statistics.ipynb
  \centering
  \caption{Kernel statistics on the RTE, BoolQ, and WiC datasets. FN, Acc, and FT Ratio denote the Frobenius norm, kernel regression accuracy, and  contribution of the FT-effective component, respectively. Pre-train E and FT E refer to the pre-train-effective and FT-effective components of the NTK matrix.}
  \label{tab:ntkStatisticsAppendix}
  \begingroup
\renewcommand{\arraystretch}{0.6}
\begin{tabular}{llcrrrr}
  \toprule
  Dataset & Method & Kernel & Rank & FN & Acc (train/test) & FT Ratio \\
  \midrule
 \multirow{9}{*}{RTE}& - & Pre-train E &$28$&$4.70\times 10^{4}$&$66.40 / 51.20$ & - \\ \cmidrule{2-7}
 & \multirow{2}{*}{FT} & FT E &$488$&$1.29\times 10^{4}$&$96.60 / 53.40$ & \multirow{2}{*}{$0.2148$} \\
 &  & NTK &$191$&$5.98\times 10^{4}$&$97.60 / 53.00$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LoRA} & FT E &$432$&$2.51\times 10^{1}$&$70.80 / 54.60$ & \multirow{2}{*}{$0.0005$} \\
 & & NTK &$30$&$4.70\times 10^{4}$&$59.60 / 54.80$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-FT} & FT E &$250$&$3.80\times 10^{6}$&$100.00 / 51.20$ & \multirow{2}{*}{$0.9918$} \\
 &  & NTK &$251$&$3.84\times 10^{6}$&$100.00 / 51.20$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-LoRA} & FT E &$243$&$7.60\times 10^{3}$&$84.80 / 51.20$ & \multirow{2}{*}{$0.1942$} \\
 & & NTK &$103$&$5.26\times 10^{4}$&$88.00 / 51.20$ &  \\\midrule
 \multirow{9}{*}{BoolQ}& - & Pre-train E &$32$&$4.48\times 10^{4}$&$53.60 / 57.20$ & -  \\ \cmidrule{2-7}
 & \multirow{2}{*}{FT} & FT E &$495$&$1.24\times 10^{4}$&$100.00 / 56.40$ & \multirow{2}{*}{$0.2139$} \\
 &  & NTK &$215$&$5.67\times 10^{4}$&$53.80 / 57.20$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LoRA} & FT E &$448$&$2.48\times 10^{1}$&$53.60 / 57.20$ & \multirow{2}{*}{$0.0005$} \\
 & & NTK &$34$&$4.48\times 10^{4}$&$53.60 / 57.20$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-FT} & FT E &$247$&$4.46\times 10^{6}$&$100.00 / 61.60$ & \multirow{2}{*}{$0.9921$} \\
 &  & NTK &$248$&$4.49\times 10^{6}$&$100.00 / 61.20$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-LoRA} & FT E &$237$&$8.56\times 10^{3}$&$68.80 / 63.60$ & \multirow{2}{*}{$0.2118$} \\
 & & NTK &$99$&$5.07\times 10^{4}$&$86.00 / 59.20$ &  \\\midrule
 \multirow{9}{*}{WiC}& - & Pre-train E &$16$&$4.81\times 10^{4}$&$66.00 / 54.00$ & - \\ \cmidrule{2-7}
 & \multirow{2}{*}{FT} & FT E &$488$&$1.45\times 10^{4}$&$89.00 / 59.00$ & \multirow{2}{*}{$0.2216$} \\
 &  & NTK &$235$&$6.17\times 10^{4}$&$90.60 / 59.00$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LoRA} & FT E &$438$&$2.58\times 10^{1}$&$72.00 / 52.00$ & \multirow{2}{*}{$0.0005$} \\
 & & NTK &$19$&$4.81\times 10^{4}$&$65.80 / 56.40$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-FT} & FT E &$218$&$7.77\times 10^{7}$&$100.00 / 56.80$ & \multirow{2}{*}{$0.9996$} \\
 & & NTK &$219$&$7.77\times 10^{7}$&$100.00 / 56.40$ &  \\ \cmidrule{2-7}
 & \multirow{2}{*}{LP-LoRA} & FT E &$218$&$1.09\times 10^{5}$&$72.00 / 59.60$ & \multirow{2}{*}{$0.7454$} \\
 & & NTK &$195$&$1.47\times 10^{5}$&$80.80 / 59.60$ &  \\
  \bottomrule
\end{tabular}
\endgroup
\end{table}
%------------------------------------------------------------------------------------------
\begin{figure}[htbp]
  % notebook/11_LP/visualize_btk0418.ipynb
\begin{minipage}[b]{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/ntk/singularvalue_normalized_rte.png}
    \subcaption{RTE}
  \end{minipage}
  \centering
  \begin{minipage}[b]{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/ntk/singularvalue_normalized_boolq.png}
    \subcaption{BoolQ}
\end{minipage}
    \begin{minipage}[b]{0.49\linewidth}
      \centering
      \includegraphics[width=\linewidth]{images/ntk/singularvalue_normalized_wic.png}
      \subcaption{WiC}
  \end{minipage}
  \caption{Singular value distribution normalized by the maximum singular value on the RTE, BoolQ, and WiC datasets. Pre-train E denotes the pre-train-effective component, and the other plots denote the FT-effective component of the NTK matrix with each training option.}
\label{fig:ntkAppendix}
\end{figure}
%---------------------------------------------------------

%----------------------------------------------------------------------------------

  \begin{figure}[htbp]
    % notebook/01_paper_v2/ntk.ipynb
    \begin{minipage}[b]{0.3\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{images/ntk/trace_simple_cb_ft_normal.png}
      \subcaption{Pre-train E (CB)}
    \end{minipage}
    \begin{minipage}[b]{0.3\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_cb_ft_normal.png}
      \subcaption{FT E (FT, CB)}
    \end{minipage}
    \begin{minipage}[b]{0.3\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_cb_ft_lp_ft-sol.png}
      \subcaption{FT E (LP-FT, CB)}
      \end{minipage}

    \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_simple_rte_ft_normal.png}
    \subcaption{Pre-train E (RTE)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_rte_ft_normal.png}
    \subcaption{FT E (FT, RTE)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_rte_ft_lp_ft-sol.png}
    \subcaption{FT E (LP-FT, RTE)}
    \end{minipage}

  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_simple_boolq_ft_normal.png}
    \subcaption{Pre-train E (BoolQ)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_boolq_ft_normal.png}
    \subcaption{FT E (FT, BoolQ)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_boolq_ft_lp_ft-sol.png}
    \subcaption{FT E (LP-FT, BoolQ)}
    \end{minipage}

  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_simple_wic_ft_normal}
    \subcaption{Pre-train E (WiC)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
    \centering
    \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_wic_ft_normal.png}
    \subcaption{FT E (FT, WiC)}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
  \centering
  \includegraphics[width=0.8\linewidth]{images/ntk/trace_complex_wic_ft_lp_ft-sol.png}
  \subcaption{FT E (LP-FT, WiC)}
  \end{minipage}
    \caption{Heat map of the NTK matrix on the CB, RTE, BoolQ, and WiC datasets. We calculate the trace norm of the sub-matrix of the NTK matrix for each sample pair and visualize them grouped by class. Pre-train E and FT E refer to the pre-train-effective and FT-effective components of the NTK matrix.}
  \label{fig:ntkHeatmap}
  \end{figure}
%----------------------------------------------------------------------------------
\begin{figure}[htbp]
  % notebook/01_paper_v2/ntk.ipynb
  \begin{minipage}[b]{0.24\linewidth}
    \centering
    \includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_cb_lora_normal.png}
    \subcaption{LoRA, CB}
  \end{minipage}
  \begin{minipage}[b]{0.24\linewidth}
  \centering
  \includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_rte_lora_normal.png}
  \subcaption{LoRA, RTE}
\end{minipage}
\begin{minipage}[b]{0.24\linewidth}
  \centering
  \includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_boolq_lora_normal.png}
  \subcaption{LoRA, BoolQ}
\end{minipage}
\begin{minipage}[b]{0.24\linewidth}
  \centering
  \includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_wic_lora_normal.png}
  \subcaption{LoRA, WiC}
\end{minipage}

\begin{minipage}[b]{0.24\linewidth}
  \centering
  \includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_cb_lora_lp_ft-sol.png}
  \subcaption{LP-LoRA, CB}
  \end{minipage}
\begin{minipage}[b]{0.24\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_rte_lora_lp_ft-sol.png}
\subcaption{LP-LoRA, RTE}
\end{minipage}
\begin{minipage}[b]{0.24\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_boolq_lora_lp_ft-sol.png}
\subcaption{LP-LoRA, BoolQ}
\end{minipage}
\begin{minipage}[b]{0.24\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{images/ntk/trace_complex_wic_lora_lp_ft-sol.png}
\subcaption{LP-LoRA, WiC}
\end{minipage}
  \caption{Heat map of the FT-effective component of the NTK matrix with LoRA on the CB, RTE, BoolQ, and WiC datasets. We calculate the trace norm of the sub-matrix of the NTK matrix for each sample pair and visualize them grouped by class.}
\label{fig:ntkHeatmapLoRA}
\end{figure}
%----------------------------------------------------------------------------------
\subsubsection{Experiments on the BOSS benchmark}
\label{subsection:oodExperiment}
The results in \Cref{tab:oodResults} indicate that LP-FT surpasses FT in OOD robustness and achieves higher accuracy in ID settings than LoRA. This suggests that LP-FT is effective in enhancing model robustness to OOD samples with reduced feature changes.
%------------------------------------------------------------------------------------------

\Cref{tab:oodFeatureAppendix} displays the statistics of feature and classifier changes on the Amazon, Dynasent, SemEval, and SST-5 datasets. The FDR within the ID is lower for LP-FT than for FT, whereas the FDR for OOD is higher for LP-FT than for FT. This indicates that LP-FT is learning robust features that are less sensitive to OOD data.
%---------------------------------------------------------
% notebook/01_paper_v2/ood.ipynb
\begin{table}[tp]
  \caption{Evaluation results on the BOSS benchmark. We report the average accuracy and standard deviation over five seeds. The best results are highlighted in bold.}
  \label{tab:oodResults}
  \centering
  \begin{tabular}{cllll}
    \toprule
    \multirow{2}{*}{Method}& \multicolumn{1}{c}{ID} & \multicolumn{3}{c}{OOD} \\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    & \multicolumn{1}{c}{Amazon} & \multicolumn{1}{c}{Dynasent} & \multicolumn{1}{c}{SemEval} & \multicolumn{1}{c}{SST-5} \\
    \midrule
    LP & $83.04 \pm 0.01$ & $42.69 \pm 0.05$ & $50.04 \pm 0.01$ & $56.81 \pm 0.11$ \\
    FT & $88.66 \pm 1.62$ & $44.33 \pm 1.11$ & $52.20 \pm 1.82$ & $72.52 \pm 1.28$ \\
    LoRA & $86.05 \pm 2.16$ & \bm{$46.70 \pm 1.68$} & \bm{$55.29 \pm 2.93$} & $72.88 \pm 1.84$ \\
    LP-FT & \bm{$88.89 \pm 1.02$} & $45.41 \pm 0.80$ & $51.96 \pm 2.72$ & \bm{$73.78 \pm 1.05$} \\
    LP-LoRA & $88.17 \pm 1.97$ & $43.37 \pm 1.50$ & $48.84 \pm 3.20$ & $72.31 \pm 1.30$ \\
    \bottomrule
  \end{tabular}
\end{table}
%--------------------------------------------------------------------------------------------------------------------
\begin{table}[htbp]
  \caption{Comparison of feature and classifier changes on the Amazon (ID), Dynasent, SemEval, and SST-5 (OOD) datasets. CS, Diff, FDR, and Norm denote cosine similarity, difference norm, Fisher's discriminant ratio, and norm, respectively. (F) and (C) indicate feature and classifier statistics. Averages were calculated over five seeds.}
  \label{tab:oodFeatureAppendix}
  \centering
  \small
  {
\tabcolsep = 3pt
\begin{minipage}{\textwidth}
  \centering
  \begin{tabular}{cccccccc}
  \toprule
  \multirow{2}{*}{Method} & \multicolumn{4}{c}{Amazon} & \multicolumn{3}{c}{Dynasent} \\
  \cmidrule(r){2-5} \cmidrule(r){6-8}
   & CS(F) & Diff(F) & FDR(F) & Norm(C) & CS(F) & Diff(F) & FDR(F) \\
  \midrule
  Pre-trained &$ 0.996 $&$ - $&$  1.30\times 10^{0}$ & $ 9.51\times 10^{-1} $&$ 0.996 $&$ - $&$  1.94\times 10^{0}$  \\
  LP &$ 0.996 $&$ - $&$  1.30\times 10^{0}$ & $ 1.20\times 10^{2} $&$ 0.996 $&$ - $&$  1.94\times 10^{0}$ \\
  FT          &$ 0.691 $&$ 1.94\times 10^{1} $&$  3.74\times 10^{0}$ & $ 9.50\times 10^{-1} $&$ 0.652 $&$ 1.80\times 10^{1} $&$  2.03\times 10^{0}$ \\
  LoRA        &$ 0.848 $&$ 1.16\times 10^{1} $&$  3.38\times 10^{0}$ & $ 1.81\times 10^{0} $&$ 0.855 $&$ 7.53\times 10^{0} $&$  2.06\times 10^{0}$  \\
  LP-FT       &$ 0.999 $&$ 2.27\times 10^{0} $&$  3.00\times 10^{0}$ & $ 1.20\times 10^{2} $&$ 0.998 $&$ 2.54\times 10^{0} $&$  2.20\times 10^{0}$ \\
  LP-LoRA     &$ 0.999 $&$ 2.24\times 10^{0} $&$  3.01\times 10^{0}$ & $ 1.18\times 10^{2} $&$ 0.999 $&$ 2.56\times 10^{0} $&$  2.04\times 10^{0}$ \\
  \bottomrule
  \end{tabular}
\end{minipage}
\par\vspace{0.5cm}\par
\begin{minipage}{\textwidth}
  \centering
  \begin{tabular}{ccccccc}
      \toprule
      \multirow{2}{*}{Method} & \multicolumn{3}{c}{SemEval} & \multicolumn{3}{c}{SST-5} \\
      \cmidrule(r){2-4} \cmidrule(r){5-7}
       & CS(F) & Diff(F) & FDR(F) & CS(F) & Diff(F) & FDR(F) \\
      \midrule
      Pre-trained &$ 0.996 $&$ - $&$  1.24\times 10^{0}$ &$ 0.998 $&$ - $&$  1.69\times 10^{1}$  \\
      LP &$ 0.996 $&$ - $&$  1.24\times 10^{0}$ &$ 0.998 $&$ - $&$  1.69\times 10^{1}$  \\
      FT          &$ 0.727 $&$ 1.68\times 10^{1} $&$  1.49\times 10^{0}$ &$ 0.604 $&$ 1.84\times 10^{1} $&$  2.26\times 10^{1}$ \\
      LoRA        &$ 0.885 $&$ 6.74\times 10^{0} $&$  1.44\times 10^{0}$ &$ 0.837 $&$ 8.72\times 10^{0} $&$  2.01\times 10^{1}$  \\
      LP-FT       &$ 0.997 $&$ 2.06\times 10^{0} $&$  1.45\times 10^{0}$ &$ 0.998 $&$ 1.86\times 10^{0} $&$  2.02\times 10^{1}$  \\
      LP-LoRA     &$ 0.999 $&$ 2.08\times 10^{0} $&$  1.19\times 10^{0}$ &$ 0.998 $&$ 1.85\times 10^{0} $&$  1.95\times 10^{1}$  \\
      \bottomrule
      \end{tabular}
  \end{minipage}
  }
\end{table}

%----------------------------------------------------------------------------------
\subsubsection{Change of feature and classifier norms}
\label{subsection:featureNormAppendix}
\Cref{tab:featureAppendix} shows the changes in features during the FT stage, indicating that the changes are smaller during LP-FT compared to FT. \Cref{tab:classifierNormAppendix} shows the classifier norms, which increase during training, with a more noticeable increase observed during LP than during FT.
\begin{table}[htbp]
  % notebook/01_paper_v2/feature_analysis.ipynb
  \caption{Feature change in FT stage. The change during LP-FT is smaller than during FT.}
  \label{tab:featureAppendix}
\centering
\begin{tabular}{crrrr}
  \toprule
  Dataset & \multicolumn{1}{c}{FT} & \multicolumn{1}{c}{LoRA}& \multicolumn{1}{c}{LP-FT} & \multicolumn{1}{c}{LP-LoRA} \\ \midrule
  CB &$2.11 \times 10^{1}$&$2.07 \times 10^{1}$&$1.15 \times 10^{1}$&$7.85 \times 10^{0}$\\
  RTE &$2.12 \times 10^{1}$&$1.51 \times 10^{1}$&$3.33 \times 10^{0}$&$3.87 \times 10^{0}$\\
  CoLA &$1.91 \times 10^{1}$&$1.10 \times 10^{1}$&$3.05 \times 10^{0}$&$2.75 \times 10^{0}$\\
  SST-2 &$2.31 \times 10^{1}$&$3.78 \times 10^{0}$&$6.95 \times 10^{0}$&$2.17 \times 10^{0}$\\
  MRPC &$2.11 \times 10^{1}$&$1.80 \times 10^{0}$&$1.84 \times 10^{0}$&$1.94 \times 10^{0}$\\
  BoolQ &$2.23 \times 10^{1}$&$1.55 \times 10^{1}$&$2.31 \times 10^{0}$&$1.95 \times 10^{0}$\\
  WiC &$2.08 \times 10^{1}$&$1.04 \times 10^{1}$&$2.28 \times 10^{0}$&$2.16 \times 10^{0}$\\
  WSC &$9.14 \times 10^{0}$&$2.44 \times 10^{-1}$&$7.33 \times 10^{0}$&$2.02 \times 10^{-1}$\\
  Amazon &$1.98 \times 10^{1}$&$1.35 \times 10^{1}$&$2.21 \times 10^{0}$&$2.28 \times 10^{0}$\\
  Dynasent &$1.94 \times 10^{1}$&$8.02 \times 10^{0}$&$2.47 \times 10^{0}$&$2.59 \times 10^{0}$\\
  SemEval &$1.83 \times 10^{1}$&$6.97 \times 10^{0}$&$1.99 \times 10^{0}$&$2.14 \times 10^{0}$\\
  SST-5 &$2.03 \times 10^{1}$&$9.08 \times 10^{0}$&$1.79 \times 10^{0}$&$1.89 \times 10^{0}$\\
  \bottomrule
\end{tabular}
\end{table}
%----------------------------------------------------------------------------------
%-------------------------------------------------------------------------
\begin{table}[tbp]
  % notebook/01_paper_v2/feature_analysis.ipynb
  \caption{The classifier weight norms. The classifier weight norms increase during training, and the increase is more pronounced in LP.}
  \label{tab:classifierNormAppendix}
\centering
{
\tabcolsep = 3pt
\begin{tabular}{crrrrrr}
  \toprule
  Dataset & \multicolumn{1}{c}{Pretrain} & \multicolumn{1}{c}{FT} & \multicolumn{1}{c}{LoRA} & \multicolumn{1}{c}{LP} & \multicolumn{1}{c}{LP-FT} & \multicolumn{1}{c}{LP-LoRA} \\ \midrule
  CB &$9.47 \times 10^{-1}$&$9.51 \times 10^{-1}$&$1.56 \times 10^{0}$&$3.35 \times 10^{1}$&$3.35 \times 10^{1}$&$3.35 \times 10^{1}$\\
  RTE &$7.95 \times 10^{-1}$&$8.05 \times 10^{-1}$&$1.45 \times 10^{0}$&$2.86 \times 10^{1}$&$2.86 \times 10^{1}$&$2.85 \times 10^{1}$\\
  CoLA &$7.95 \times 10^{-1}$&$7.88 \times 10^{-1}$&$1.06 \times 10^{0}$&$3.46 \times 10^{1}$&$3.46 \times 10^{1}$&$3.51 \times 10^{1}$\\
  SST-2 &$7.95 \times 10^{-1}$&$7.20 \times 10^{-1}$&$1.96 \times 10^{0}$&$1.32 \times 10^{2}$&$1.09 \times 10^{2}$&$1.03 \times 10^{2}$\\
  MRPC &$7.95 \times 10^{-1}$&$7.98 \times 10^{-1}$&$1.35 \times 10^{0}$&$1.12 \times 10^{1}$&$1.12 \times 10^{1}$&$1.12 \times 10^{1}$\\
  BoolQ &$7.95 \times 10^{-1}$&$7.98 \times 10^{-1}$&$1.15 \times 10^{0}$&$1.27 \times 10^{1}$&$1.27 \times 10^{1}$&$1.25 \times 10^{1}$\\
  WiC &$7.95 \times 10^{-1}$&$7.98 \times 10^{-1}$&$1.14 \times 10^{0}$&$3.21 \times 10^{1}$&$3.25 \times 10^{1}$&$3.27 \times 10^{1}$\\
  WSC &$7.95 \times 10^{-1}$&$6.87 \times 10^{-1}$&$7.88 \times 10^{-1}$&$2.26 \times 10^{-4}$&$1.08 \times 10^{-1}$&$2.16 \times 10^{-2}$\\
  Amazon &$9.51 \times 10^{-1}$&$9.47 \times 10^{-1}$&$1.67 \times 10^{0}$&$1.21 \times 10^{2}$&$1.21 \times 10^{2}$&$1.20 \times 10^{2}$\\
  \bottomrule
\end{tabular}}
\end{table}
%----------------------------------------------------------------------------------
\subsubsection{Effects of classifier weight norms in training}
\Cref{fig:scaleNormOOD} (BOSS benchmark) and \Cref{fig:scaleNormID} (the CB and RTE datasets) illustrate the changes in features from the pre-trained models. Except for the CB dataset, the change in features in LP-FT is generally smaller than in FT when using large classifier norms. The CB dataset has a smaller sample size, which could explain this exception.
% ---------------------------------------------------------------------------------
\begin{figure}
    \centering
    \begin{minipage}{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/scaling/amazon_ood_norm_scale_feature}
    \subcaption{Amazon (ID)}
  \end{minipage}
  \centering
    \begin{minipage}{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/scaling/dynasent_ood_norm_scale_feature}
    \subcaption{Dynasent (OOD)}
  \end{minipage}
  \centering
    \begin{minipage}{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/scaling/semeval_ood_norm_scale_feature}
    \subcaption{SemEval (OOD)}
  \end{minipage}
  \caption{Difference in the features of the samples when scaling the classifier weight norms on the BOSS benchmark. The dashed vertical lines indicate the original norms of the classifier weight.}
  \label{fig:scaleNormOOD}
\end{figure}
%----------------------------------------------------------------------------------
\begin{figure}
    \centering
    \begin{minipage}{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/scaling/cb_norm_scale_feature.png}
    \subcaption{CB}
  \end{minipage}
  \centering
    \begin{minipage}{0.49\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/scaling/rte_norm_scale_feature.png}
    \subcaption{RTE}
  \end{minipage}
  \caption{Difference in the features of the samples when scaling the classifier weight norms on the CB and RTE datasets. The dashed vertical lines indicate the original norms of the classifier weight.}
  \label{fig:scaleNormID}
\end{figure}
%----------------------------------------------------------------------------------
%----------------------------------------------------------------------------------
\subsubsection{Temperature scaling}
The results of temperature scaling on SuperGLUE and GLUE are presented in \Cref{tab:temperatureScalingAppendixSuperGLUE,tab:temperatureScalingAppendixGLUE}.
%------------------------------------------------------------------------------------------
\begin{table}[ht]
  % notebook/01_paper_v2/temperature_test_ece.ipynb
  \centering
  \caption{ECE and MCE with temperature scaling on SuperGLUE. w/o TS and w/ TS denote without and with temperature scaling, respectively, and Imp. represents the improvement because of temperature scaling. We bold the best improvements. We take 5 seeds and report the mean and the standard deviation.}
  \label{tab:temperatureScalingAppendixSuperGLUE}
  \centering
  \small
  {
\tabcolsep = 3pt
  \begin{tabular}{lccrrr}
  \toprule
  Dataset & Metric & Method & w/o TS & w/ TS & Imp. \\
  \midrule
  \multirow{8}{*}{CB}&  \multirow{4}{*}{ECE (\%)}
         & FT & $15.60 \pm 0.96$ & $14.64 \pm 1.75$ & $0.95$ \\
       & & LP-FT & $13.93 \pm 0.45$ & $13.13 \pm 0.56$ & $0.80$ \\
       & & LoRA & $12.89 \pm 0.41$ & $16.22 \pm 0.55$ & $-3.34$ \\
       & & LP-LoRA & $14.78 \pm 0.93$ & $13.51 \pm 1.67$ & $1.27$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $75.99 \pm 6.12$ & $69.99 \pm 5.83$ & $6.01$ \\
       & & LP-FT & $76.78 \pm 3.66$ & $70.28 \pm 3.27$ & $6.50$ \\
       & & LoRA & $52.58 \pm 4.72$ & $66.75 \pm 7.96$ & $-14.16$ \\
       & & LP-LoRA & $68.16 \pm 4.95$ & $60.80 \pm 2.30$ & $7.36$ \\
  \midrule
  \multirow{8}{*}{RTE}&  \multirow{4}{*}{ECE (\%)}
         & FT & $21.16 \pm 1.36$ & $5.13 \pm 0.63$ & $16.03$ \\
       & & LP-FT & $21.72 \pm 0.28$ & $5.48 \pm 0.77$ & $16.24$ \\
       & & LoRA & $11.92 \pm 2.23$ & $6.17 \pm 0.20$ & $5.76$ \\
       & & LP-LoRA & $18.14 \pm 0.99$ & $5.72 \pm 0.48$ & $12.42$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $53.11 \pm 8.51$ & $25.87 \pm 6.30$ & $27.24$ \\
       & & LP-FT & $63.95 \pm 7.70$ & $13.94 \pm 1.80$ & $50.01$ \\
       & & LoRA & $25.04 \pm 3.33$ & $13.75 \pm 0.91$ & $11.29$ \\
       & & LP-LoRA & $40.46 \pm 7.22$ & $18.82 \pm 2.00$ & $21.63$ \\
  \midrule
 \multirow{8}{*}{BoolQ}&  \multirow{4}{*}{ECE (\%)}
         & FT & $13.63 \pm 0.61$ & $1.83 \pm 0.09$ & $11.81$ \\
       & & LP-FT & $18.93 \pm 0.15$ & $2.41 \pm 0.42$ & $16.51$ \\
       & & LoRA & $8.88 \pm 0.38$ & $1.45 \pm 0.18$ & $7.43$ \\
       & & LP-LoRA & $14.09 \pm 0.92$ & $2.07 \pm 0.19$ & $12.02$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $23.26 \pm 1.48$ & $5.79 \pm 0.90$ & $17.47$ \\
       & & LP-FT & $40.82 \pm 1.94$ & $5.21 \pm 0.53$ & $35.60$ \\
       & & LoRA & $13.96 \pm 0.72$ & $3.85 \pm 0.56$ & $10.11$ \\
       & & LP-LoRA & $24.60 \pm 2.52$ & $5.51 \pm 0.72$ & $19.09$ \\
  \midrule
 \multirow{8}{*}{WiC}&  \multirow{4}{*}{ECE (\%)}
         & FT & $25.88 \pm 2.39$ & $8.85 \pm 0.53$ & $17.03$ \\
       & & LP-FT & $29.47 \pm 1.57$ & $7.68 \pm 0.55$ & $21.78$ \\
       & & LoRA & $18.66 \pm 4.39$ & $5.93 \pm 1.42$ & $12.73$ \\
       & & LP-LoRA & $22.22 \pm 1.98$ & $8.06 \pm 0.60$ & $14.15$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $41.59 \pm 5.39$ & $17.01 \pm 2.87$ & $24.58$ \\
       & & LP-FT & $39.20 \pm 2.74$ & $17.04 \pm 1.50$ & $22.16$ \\
       & & LoRA & $27.95 \pm 7.38$ & $11.40 \pm 2.77$ & $16.54$ \\
       & & LP-LoRA & $30.99 \pm 3.64$ & $14.45 \pm 1.01$ & $16.54$ \\
  \midrule
 \multirow{8}{*}{WSC}&  \multirow{4}{*}{ECE (\%)}
         & FT & $6.26 \pm 2.37$ & $7.97 \pm 0.06$ & $-1.71$ \\
       & & LP-FT & $6.38 \pm 1.78$ & $8.01 \pm 0.06$ & $-1.63$ \\
       & & LoRA & $10.53 \pm 1.35$ & $9.19 \pm 0.60$ & $1.34$ \\
       & & LP-LoRA & $11.40 \pm 0.23$ & $8.24 \pm 0.01$ & $3.15$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $6.26 \pm 2.37$ & $7.97 \pm 0.06$ & $-1.71$ \\
       & & LP-FT & $6.38 \pm 1.78$ & $8.01 \pm 0.06$ & $-1.63$ \\
       & & LoRA & $13.27 \pm 1.12$ & $11.12 \pm 1.51$ & $2.15$ \\
       & & LP-LoRA & $11.40 \pm 0.23$ & $8.24 \pm 0.01$ & $3.15$ \\
  \bottomrule
\end{tabular}
}
\end{table}
%------------------------------------------------------------------------------------------
\begin{table}[ht]
  % notebook/01_paper_v2/temperature_test_ece.ipynb
  \centering
  \caption{ECE and MCE with temperature scaling on GLUE. w/o TS and w/ TS denote without and with temperature scaling, respectively, and Imp. represents the improvement because of temperature scaling. We bold the best improvements. We take 5 seeds and report the mean and the standard deviation.}
  \label{tab:temperatureScalingAppendixGLUE}
  \centering
  \small
  {
\tabcolsep = 3pt
  \begin{tabular}{lccrrr}
  \toprule
  Dataset & Metric & Method & w/o TS & w/ TS & Imp. \\
  \midrule
 \multirow{8}{*}{CoLA}&  \multirow{4}{*}{ECE (\%)}
         & FT & $15.08 \pm 0.55$ & $4.46 \pm 0.83$ & $10.61$ \\
       & & LP-FT & $15.74 \pm 0.40$ & $9.53 \pm 1.23$ & $6.21$ \\
       & & LoRA & $11.25 \pm 1.32$ & $4.18 \pm 0.40$ & $7.07$ \\
       & & LP-LoRA & $13.82 \pm 0.48$ & $4.30 \pm 0.43$ & $9.52$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $47.19 \pm 5.15$ & $24.35 \pm 3.33$ & $22.84$ \\
       & & LP-FT & $54.59 \pm 2.94$ & $20.31 \pm 1.37$ & $34.28$ \\
       & & LoRA & $31.01 \pm 5.83$ & $15.23 \pm 2.74$ & $15.78$ \\
       & & LP-LoRA & $38.36 \pm 7.85$ & $15.36 \pm 1.83$ & $23.00$ \\
  \midrule
  \multirow{8}{*}{SST-2}&  \multirow{4}{*}{ECE (\%)}
         & FT & $4.61 \pm 0.31$ & $2.26 \pm 0.22$ & $2.35$ \\
       & & LP-FT & $5.67 \pm 0.12$ & $2.00 \pm 0.21$ & $3.66$ \\
       & & LoRA & $4.84 \pm 0.13$ & $2.71 \pm 0.16$ & $2.12$ \\
       & & LP-LoRA & $6.22 \pm 0.10$ & $2.53 \pm 0.08$ & $3.69$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $49.22 \pm 4.78$ & $42.72 \pm 5.24$ & $6.50$ \\
       & & LP-FT & $74.91 \pm 1.72$ & $42.77 \pm 5.75$ & $32.13$ \\
       & & LoRA & $54.20 \pm 2.84$ & $36.58 \pm 5.82$ & $17.63$ \\
       & & LP-LoRA & $71.12 \pm 3.97$ & $32.47 \pm 3.74$ & $38.65$ \\
  \midrule
 \multirow{8}{*}{MRPC}&  \multirow{4}{*}{ECE (\%)}
         & FT & $10.71 \pm 0.39$ & $4.61 \pm 0.24$ & $6.10$ \\
       & & LP-FT & $10.35 \pm 0.14$ & $3.68 \pm 0.10$ & $6.68$ \\
       & & LoRA & $6.58 \pm 0.68$ & $4.04 \pm 0.87$ & $2.54$ \\
       & & LP-LoRA & $9.03 \pm 0.85$ & $3.89 \pm 0.40$ & $5.14$ \\
  \cmidrule(lr){2-6}
 & \multirow{4}{*}{MCE (\%)}
         & FT & $61.84 \pm 7.93$ & $32.72 \pm 1.69$ & $29.12$ \\
       & & LP-FT & $74.43 \pm 2.22$ & $22.73 \pm 1.33$ & $51.70$ \\
       & & LoRA & $28.80 \pm 5.05$ & $17.57 \pm 2.00$ & $11.23$ \\
       & & LP-LoRA & $52.20 \pm 6.64$ & $22.76 \pm 7.60$ & $29.44$ \\
  \bottomrule
\end{tabular}
}

\end{table}
%----------------------------------------------------------------------------------
\subsubsection{PubMed 20k}
In addition to the natural language understanding benchmarks, we also evaluated LP-FT on the PubMed $20$k RCT dataset to assess its effectiveness in practical applications. The PubMed $20$k RCT dataset, a subset of PubMed $200$k~\citep{dernoncourt2017pubmed}, comprises 20,000 medical abstracts from randomized controlled trials, categorized into five classes. Efficient tools for navigating extensive medical literature are essential for the medical community.

The results are presented in~\Cref{tab:pubMed}. The LoRA model outperforms the other models, although the performance of the FT, LP-FT, and LoRA models is relatively similar.
\begin{table}[htbp]
  \centering
  % notebook/01_paper_v2/pubmed.ipynb
  \caption{Test accuracy on PubMed $20$k.}
  \begin{tabular}{rrrrr}
      \toprule
       LP &FT & LP-FT & LoRA & LP-LoRA \\ \midrule
       $82.64 \pm 0.02$&$87.09 \pm 0.17$&$87.05 \pm 0.11$&$\bm{87.13 \pm 0.09}$&$86.85 \pm 0.07$\\\bottomrule
  \end{tabular}
  \label{tab:pubMed}
\end{table}
%----------------------------------------------------------------------------------
