%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{tikz,xcolor}
\usepackage{amsmath,amsfonts}
\usepackage{multirow}
\usepackage{colortbl}  % package required for colored tables
%\usepackage{wrapfig}
\usepackage{amssymb, booktabs}
\usepackage{epsfig}
\usepackage[pagebackref=true,breaklinks=true,letterpaper=true,colorlinks,citecolor=blue,linkcolor=blue,bookmarks=false]{hyperref}
\definecolor{defaultcolor}{gray}{0.9}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Trusted re-weighting for label distribution learning}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
%\author[1]{Harry~Q.~Bovik}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
%\affil[1]{%
%    Computer Science Dept.\\
%    Cranberry University\\
%    Pittsburgh, Pennsylvania, USA
%}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
\author[1]{Zhuoran Zheng}
\author[2]{Chen Wu}
\author[3]{Yeying Jin}
\author[1]{Xiuyi Jia\thanks{Correspondence.  This work was supported by the National Natural
		Science Foundation of China (62176123).}}
% Add affiliations after the authors
\affil[1]{%
	School of Computer Science and Engineering\\
	Nanjing University of Science and Technology\\
	Nanjing, China
}
\affil[2]{%
	University of Science and Technology of China\\
	Hefei, China
}
\affil[3]{%
	Department of Electrical and Computer Engineering\\
	National University of Singapore\\
	Singapore
}  
 

  
\begin{document}
\maketitle

\begin{abstract}
	Label distribution learning (LDL) is a novel machine learning paradigm that aims to shift 0/1 labels into descriptive degrees to characterize the polysemy of instances. 
	%
	Since the description degree takes a value in $[0,1]$, it is difficult for the annotator to accurately annotate each label. 
	%
	Therefore, the predictive ability of numerous LDL algorithms may be degraded by the presence of noise in the label space. 
	%
	To address this problem, we propose a novel stability-trust LDL framework that aims to reconstruct the feature space of an arbitrary LDL dataset by using feature decoupling and prototype guidance.
	%
	Specifically, first, we use prototype learning to select reliable cluster centers (representative vectors of label distributions) to filter a set of clean samples out of the original dataset (which contains label noise).
	%
	Then, we decouple the feature space (eliminating correlations among features) by modeling a weight assignor that is learned on this clean sample set, thus assigning weights to each sample of the original dataset.
	%
	Finally, all existing LDL algorithms can be trained on this new re-weighted dataset for the goal of robust modeling. 
	%
	In addition, we create a new image dataset to support the training and testing of compared models. 
	%
	Experimental results demonstrate that the proposed framework boosts the performance of the LDL algorithm on datasets with label noise.
\end{abstract}

\section{Introduction}
%
Currently, label distribution learning (LDL) plays a landmark role in characterizing task uncertainty and conveying the polysemy of an instance~\citep{gao2017deep,geng2016label,zheng2021uncertainty,zheng2022label}.
%
In contrast to the classical multi-label learning paradigm~\citep{zhang2013review}, LDL describes an instance as a distribution of descriptive degrees rather than a vector of 0/1 labels (see Figure~\ref{f0}).
%
Therefore, a learner (classifier or regressor) tends to focus more on tracking the decision boundaries, and hence the robustness of the whole algorithm is boosted~\citep{le2023uncertainty}.


\begin{figure}[t]\scriptsize
	\begin{center}
		\tabcolsep 1pt
		\vspace{-2mm}
		\begin{tabular}{@{}cccc@{}}
			\includegraphics[width = 0.46\textwidth]{img/f0.png}             \\
		\end{tabular}
	\end{center}
	\vspace{-2mm}
	\caption{This figure visualizes the difference between label distribution learning and multi-label learning describing a single instance.
		If the description object is an image, the label distribution value represents the percentage of components in the image.}
	\vspace{-4mm}
	\label{f0}
\end{figure}


Recently, a large body of work~\citep{chen2021toward,gao2018age,li2022ultra,liu2021ngdnet,si2022towards,zhao2021robust} leverages the properties of LDL to characterize the relation between feature space and label space for achieving competitive performance.
%
However, most researchers overlook the fact that the label space of the dataset may be noisy since the uncertainty of manual annotation and the inductive bias of the label enhancement algorithm~\citep{xu2019label} can introduce noise into the label space (for example, the annotators misrecorded the percentage of the two components and exchanged their label distribution values).
%
This low-quality set of labels can cause the LDL algorithm to be off the right modeling track, usually showing up as under-performance on the test set.


Several works~\citep{li2022label,zheng2022label} attempt to address the problem of label noise on LDL benchmarks.
%
\citet{li2022label} build expert knowledge to endow the training set samples with different weights during the model iteration.
%
\citet{zheng2022label} estimate label uncertainty by building a label distribution matrix on the label space.
%
%Unfortunately, these approaches are expensive to model due to their reliance on experts' prior knowledge and data sampling algorithms.
%
Existing work on noise processing is embedded in the prediction algorithm, which is tightly coupled with the algorithm and has low generalizability.
%
In contrast, we develop a generalized LDL pre-processing framework (stability-trust framework) in this paper, which is a catch-all paradigm for constructing high-quality LDL datasets to serve existing LDL algorithms to boost performance.
%
Specifically, first, we attempt to create a pseudo-label space over the label distribution space, to distinguish which samples may be noisy with the help of the prototype guidance.
%
%Here, we propose a new insight to conduct sample filtering by raising the representation dimension of input tabular samples.
%
Then, a clean training set is filtered by a look-up table for training an efficient weight assignor (when the weight assignor is trained on a noisy dataset it leads to a degradation of the algorithm performance).
%
Note that the prototype space includes several vectors that can represent the characteristics of the label distribution space.
%
For instance, if the label distribution space consists of six label values, we divide the dataset into six subsets and estimate their expectations to obtain six representative vectors.
%
Finally, a weight assignor reconstructs the original LDL dataset using a feature decoupling scheme (including a kernel map).
%
Feature decoupling is to treat the feature space of the customized LDL datasets as tabular information and decouple the correlation between features by assigning weights to the samples.
%
However, the label distributions predicted under this weight assignor can be overly compact due to the prototype guidance; for this reason, we use uncertainty modeling to assign tiny weights to a small number of samples as positive incentive noise~\citep{li2022positive}.
%
The contributions of this paper are summarized as:

\noindent {\textbf{1)}} We propose a stability-trust framework that achieves the dual purpose of denoising and feature decoupling by assigning weights to the raw sample space that helps the downstream learners enhance their regression abilities.

\noindent {\textbf{2)}} We use a prototype space to guide the weight assignor to inscribe a compact learning space for faster convergence of the model. In addition, we consider an uncertainty modeling algorithm to construct some positive incentive noise to boost the performance of the learner.

\noindent {\textbf{3)}} In contrast to the existing tabular datasets (customized LDL dataset), we build a new image dataset stored in the form of image-to-label\footnote[1]{\url{https://github.com/zzr-idam/LDL}} to evaluate the deep networks. 





\section{Related Work}
%
\noindent \textbf{Label distribution learning.}
%
%
Currently, LDL plays a vital role in estimating a task's uncertainty and thus boosting the model generalization capability.
%
LDL is similar to deep learning modeling approaches, where the output of a model is usually standardized into probability vectors by Softmax.
%
However, in contrast, LDL gives semantic information and a priori distributional constraints, which can allow it to be used as a regularization term to help improve the performance of existing methods.
%
%As shown in Figure~\ref{f0}, although a vector with probability values as components has a richer representation than logic values however the probability values are not stable.
%Label distribution learning has attracted several attention as a new learning paradigm.
The LDL paradigm is built from an age estimation task~\citep{geng2016label}.
%
%
%Label distribution learning comes from the scheme proposed by (\cite{?}) to address the age estimation task.
%
Since then a large number of approaches have been proposed, such as low-rank hypothesis-based~\citep{jia2019facial,ren2019label}, metric-based~\citep{gao2018age}, manifold-based~\citep{c2022label,wang2021label}, and label correlation-based~\citep{qian2022feature,teng2021incomplete}.
%
Moreover, some approaches are implemented in computer vision~\citep{chen2021toward,gao2018age,li2022unimodal,zhao2021robust}, and speech recognition~\citep{si2022towards} tasks to boost the performance of classifiers.
%
Recently, several approaches based on LDL start to tackle the label noise problem~\citep{li2022label,zheng2022label}.
%
However, these approaches are customized strategies, and we attempt to build a generalized preprocessing method to serve extant LDL algorithms.
%


\noindent \textbf{Prototype learning.}
%
Prototype learning~\citep{deng2021variational,dong2018few,li2021adaptive,ren2022prototype,wang2021interactive,yang2018robust} is a classical learning paradigm in machine learning and pattern recognition, which aims to select a representative subset to guide the behavior of downstream tasks.
%
For example, the nearest neighbor algorithm (KNN)~\citep{guo2003knn} is a typical prototype learning case, which guides the aggregation of the whole dataset by obtaining the centroids of a cluster.
%
Currently, prototype learning is utilized in several domains, such as image recognition, speech recognition~\citep{rouat2021prototype}, and inference of textual content~\citep{haghighi2006prototype}.
%
%Overall, prototype learning aims to build a robust pool or space for the algorithm to learn a robust decision boundary.
%
In the LDL domain, prototype learning plays the role of feature selection to help downstream LDL learners~\citep{gonzalez2020prolsfeo}.
%
In this paper, the prototype learning paradigm is used to help model a weight assignor by filtering a clean subset.
%To the best of our knowledge, this is the first try at introducing prototype learning into LDL tasks.
%


%\noindent \textbf{Confident learning.}
%
%Our work is built on a large body of work termed ``confident learning''~\cite{fang2022confident,northcutt2021confident,wang2021efficientclip,zhang2020characterizing}.
%
%Elkan et al.~\cite{elkan2008learning} first propose the use of counting to evaluate the ability of the model on a binary classification task.
%
%Later, some iteration-based methods~\cite{han2019deep,natarajan2013learning} are proposed to tackle the problem of the learner's weak robustness.
%
%Among them, methods based on conditional noise assumptions dominate~\cite{northcutt2021confident,xie2022ccmn}.
%%
%To the best of our knowledge, our model is the first algorithm that attempts to introduce confident (trust) learning on LDL tasks.
%
%It is worth noting that the class condition assumption cannot be enforced on our framework due to the existence of constraint relations between the values of the distributions (the label distribution with sum 1).

\noindent \textbf{Label noise estimation.}
%
The existing label space of large datasets hardly avoids the disturbance of noise, due to the complexity of the task, the subjectivity of the annotator, the inaccuracy of the annotation algorithm, etc.
%
Based on this, numerous works are presented to address the problem of noise disturbance~\citep{arazo2019unsupervised,ju2022improving,kaneko2019label,li2022improving,reeve2019fast,xie2022ccmn,yao2020dual,zhu2021second}.
%
There are two main strategies to solve such problems, one is to build a robust learning target or regularization term, and the other is to renovate the model for unbiased estimation.
%
In the field of LDL, there are already some works~\citep{li2022label,zheng2022label} that consider the presence of noise in the label space, however, these works are only applicable to customized LDL algorithms.
%
In contrast, our algorithm is a general framework as a data pre-processing technique.





\section{Stability-trust Framework}
%
In this paper, we develop a stability-trust framework focusing on tackling the problem of label distribution datasets with noisy labels.
%
Be aware that our framework can also handle multi-class tasks with noisy labels.
%

%
%Although the existing algorithms~\cite{zheng2022label, li2022label} address the problem of label distribution datasets with noisy labels, they overlook the potential value of labels.
%
\noindent \textbf{Notation.}
%
Given a particular instance, the goal of LDL is to learn the degree to which each label describes that instance. 
%
Input matrix  (tabular data) $\mathbf{X} \in \mathbb{R}^{M \times N}$, where $M$ is the number of instances and $N$ is the dimension of features.
%
%Input tensor (image data) $\mathcal{T} \in \mathbb{R}^{C \times W \times H}$, where $C$ is the number of channels, $W$ and $H$ is the width and height of the tensor, respectively. 
%
%We define the \textit{i-th} instance in the dataset as $x_{i}$. 
$x_i$ is the $i$-th instance in the dataset.
%
The label distribution space is defined as $\mathcal{D} \in \mathbb{R}^{M \times L} $, and $\mathcal{D}_j$ is the $j$-th label. 
%
For each instance $x_{i}$, its label distribution is $\mathcal{D}_{i}=\left\{{d_{x_{i}}^{y_{1}},d_{x_{i}}^{y_{2}}, \dots ,d_{x_{i}}^{y_{L}} }\right\} $, where $ d_{x_{i}}^{y_{j}} $ is the description degree of the label $y_{j} $ for $x_{i} $. 
%
The $d_{x_{i}}^{y_{j}}$ is constrained by ${{d_{x_{i}}^{y_{j}}}}\in[0,1] $ and $\sum_{j=1}^{L}{d_{x_{i}}^{y_{j}}}=1$.
%
In addition, the prototype space is defined as $\mathcal{P} \in \mathbb{R}^{L \times L} $, then the prototype vector is defined as $p_{j}$.
%
The virtual label vector of all instances guided by the prototype learning is $\mathcal{VL} = \{vl_1, \dots, vl_M\}$.
%
The label distribution that is predicted by the model is defined as $\mathcal{L}_{i} = \left\{{l_{x_{i}}^{y_{1}},l_{x_{i}}^{y_{2}},\dots,l_{x_{i}}^{y_{L}} }\right\}$.
%
The pseudo-label vector built on the label space $\mathcal{Y}$ is $\mathcal{Q} = \{q_1, q_2, \dots, q_M\}$, where $q_{i}$ denotes the pseudo-label for instance $x_i$.


\noindent \textbf{Assumptions.}
%
We rely on three key principles or assumptions for developing a stability-trust framework.
%
\noindent \textbf{a)} Prototypes are usually the information least disturbed by noise, such as the output of the mean filter and adaptive weighted average filter.
%
The prototype space as a ``clean'' set can push the predictive distribution of the model closer to the central data distribution.
%
In other words, using the prototype space as a guiding principle may lead to the construction of a new sample space that is more compact within the class and expands the distance between classes.
%
For blind datasets with noisy labels, this strategy yields a high-quality set with minimal outlay.
%
\noindent \textbf{b)} We use the prototype space to check the estimated flags on the label distribution against the flags that are self-contained by the label distribution to filter out high-quality learning space for the weight assignor.
%
This is an efficient filtering mechanism that uses the consistency of these two flags as the base for whether the sample is credible or not.
%This strategy is a vector search algorithm that estimates the flag of each label distribution by matching modeling due to the relative confidence of the vectors in the prototype space.
%
%
\noindent \textbf{c)} Based on the stable learning paradigm~\citep{shen2020stable}, we attempt to improve the inference ability of the classifier by decoupling the correlation between features.
%
Specifically, stable learning uses a tactic of assigning weights to samples to achieve feature decoupling, and the overall framework can be written as Algorithm~\ref{alg1}.
%
%
Here $w$ can be a linear algorithm or a deep network, and $\hat{\beta}$ works ultimately on the raw sample space $\textbf{X}$.

% Dataset D=$\{ \textbf{x}^{(i)}=(x_{1}^{(i)},...,x_{d}^{(i)}), y^{(i)}\}_{i=1}^{n}$
%
%In the training stage of the model, we pre-configured some pseudo-labels in the training dataset.
%
%This pair of models aim to estimate whether these pseudo-labels are consistent or not.
\begin{algorithm}[t]
	%\textsl{}\setstretch{1.8}
	\renewcommand{\algorithmicrequire}{\textbf{Input:}}
	\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{Stable Learning Framework}
	\label{alg1}
	\begin{algorithmic}[1]
		\STATE $\textbf{Input}:$ Dataset $\mathcal{B}$=$\{ \textbf{x}^{(i)}=(x_{1}^{(i)},...,x_{d}^{(i)}), y^{(i)}\}_{i=1}^{n}$
		\STATE $\textbf{Output}:$ Coefficients $\hat{\beta}$ for each variable
		\STATE /*Step I*/
		\STATE Learn weights $w(\textbf{X})$ such that the variables in $\textbf{X}$ are mutually independent.
		\STATE /*Step II*/
		\STATE Solve weighted least squares with weighting function $w(\textbf{X})$. The solution is $\hat{\beta}_{w}^{(n)}$.
		\STATE Return $\hat{\beta}_{w}^{(n)}$.
	\end{algorithmic}  
	%\vspace{-2mm}
\end{algorithm}

%

\noindent \textbf{Goal.}
%
Although the framework aims to assign weights to each sample in the raw sample space formally, it has two key goals.
%
On the one hand, decoupling the correlation between features constructs a stable and robust learning space.
%
On the other hand, the output space of the network is guided by prototypes to create a high-quality training set with compact intra-class distance and relaxed inter-class distance.
%
However, the principle of prototype learning makes the model's predictions overly compact; to alleviate this problem, we add a moderate amount of noise to the training set.

\begin{figure}[t!] 
	\begin{center}
		\begin{tabular}{@{}c@{}}
			\includegraphics[width = 0.49\textwidth]{img/F12.png}                   
		\end{tabular}
	\end{center}
	\vspace{-4mm}
	\caption{\textbf{Our architecture.} Panel~(a) shows the architecture of the proposed stability-trust framework, which consists of three parts. Panel~(b) reports the performance of the three LDL algorithms on the dataset, with the \textcolor{blue}{blue line} indicating the training set without the modification and the \textcolor{red}{red line} indicating the dataset with the reconstruction scheme. The data in panel~(b) are normalized to lie between 0 and 1. Predictably, a curve that encloses a larger area indicates better performance.}
	\vspace{-4mm}
	\label{fw}
\end{figure}


\section{Proposed Method}
%
As shown in Figure~\ref{fw}, our framework is divided into three stages.
%
First, we use standard prototype learning to filter out a relatively clean training set $\hat{\mathcal{B}}$; next, we design some customized loss terms by which a corresponding coefficient is learned for each instance $x_{i}$ of the raw space; finally, potentially noisy instances are given slight weights and kept in the training set as positive incentive noise.
%
This approach is a plug-and-play data pre-processing strategy to model arbitrary sizes of tabular data sets.


\noindent \textbf{Obtaining a clean set $\hat{\mathcal{B}}$ from the raw space $\mathcal{B}$.}
%
Faced with a label distribution dataset of arbitrary size with label noise $\mathcal{B}$, we need to simply clean it with the help of the prototype space $\mathcal{P}$.
%
The purpose of cleaning dataset $\mathcal{B}$ to obtain $\hat{\mathcal{B}}$ is to provide a high-quality training set for generating a weight assignor.
%
So far, one question needs to be discussed, \textit{why do we require prototype learning to guide the reconstruction of datasets}?
%
We visualize the label space of a label distribution dataset as shown in Figure~\ref{fig-GridMap}.



\begin{figure}[!h] 
	\begin{center}
		\begin{tabular}{@{}c@{}}
			\includegraphics[width = 0.48\textwidth]{img/f2.png}                   
		\end{tabular}
	\end{center}
	\vspace{-4mm}
	\caption{We visualize the label space of the SBU-3DFE dataset by using the t-SNE algorithm~\citep{van2014accelerating}, where t-SNE is based on the KPCA algorithm~\citep{anowar2021conceptual}. Intuitively, the label distribution space of SBU-3DFE can be viewed as having 6 clusters, a property that exactly matches the dimensionality of the label space.  Even if the label distribution space is noisy, the center position of each cluster can still serve as reliable target information.}
	\vspace{-4mm}
	\label{fig-GridMap}
\end{figure}

%
The label space of this dataset (SBU-3DFE) has 6 dimensions, which correspond exactly to the 6 clusters in Figure~\ref{fig-GridMap}.
%
%Thus, we attempt to move the target of each learned example closer to the center of the nearest cluster, to generate a compact learning space.
%
We leverage prototype learning to sieve out representative vectors of each cluster as prototypes $p_{j}$.
%
Specifically, we start with building the prototype space $\mathcal{P}$ on the training dataset $\mathcal{B}$.
%
In the first step, $L$ subsets are constructed, and each subset stores the vectors $\mathcal{D}$ that can represent this label.
%
%For example, the dimension of the label space of the dataset is $L$ and the vector of label distributions is viewed as an array, the array with index 0 greater than $\frac{1}{L}$ forms a subset, and so on.
%
In the second step, the mean values in each of the $L$ sets are obtained as a prototype to build a prototype space of size $L \times L$.
%
The formal expression under the Python style:
\begin{small}
	\begin{equation}
		\begin{aligned}
			\text{prototype}[\text{j}, :] = \text{mean}(\mathcal{D}\underbrace{[\text{where}(\mathcal{D}[:, \text{j}] > (1/L)), :]}_{\textcolor{blue}{\text{prototype vector}}}), \quad \text{j} \in \{1, \dots, L\}.
		\end{aligned}
	\end{equation}
\end{small}
%
Following the prototype space $\mathcal{P}$ being constructed, 
%
we introduce how to build the pseudo-label vector $\mathcal{Q}$ and the prototype learning guided virtual label vector $\mathcal{VL}$. 
%
In label distribution space $\mathcal{D}$, the index number of the maximum value in each label distribution $\mathcal{D}_i$ is assigned as the pseudo-label $q_i$ for instance $x_i$. For example, for the instance with label distribution [ 0.1, 0.1, 0.1, \textcolor{red}{0.4}, 0.1, 0.2 ], its pseudo-label is 4. For the virtual label vector $\mathcal{VL}$ guided by prototype learning, we apply KNN (K=1) on the prototype space $\mathcal{P}$ to search the virtual label for each instance. For example, for $x_i$, we calculate the Euclidean distance from its label distribution $\mathcal{D}_i$ to each prototype in the prototype space $\mathcal{P}$, select the nearest prototype $p_j$, and use the index number with the maximum value in prototype $p_j$ as the virtual label of $x_i$. Finally, we compare the constructed $\mathcal{Q}$ and $\mathcal{VL}$: if the paired $q_i$ and $vl_i$ take the same value, we keep the instance $x_i$ and its corresponding label distribution $\mathcal{D}_i$, thereby obtaining the clean training set $\hat{\mathcal{B}}$.  




\noindent \textbf{Learning the coefficients of the raw samples.}
%
We try to design a set of coefficients $\hat{\beta}_{w}^{(n)}$ assigned to the raw samples.
%
%Note that the sample size of the training data $\hat{\mathcal{B}}$ is usually smaller than the raw sample space $\mathcal{B}$, which we discuss here in two subsections.
%
%On one hand, we introduce how to use $\hat{\mathcal{B}}$ to reconstruct some of the samples in $\mathcal{B}$ by using coefficients $\hat{\beta}_{w}^{(i)}$; on the other hand, we help the LDL algorithm to improve the noise resistance by using coefficients $\hat{\beta}_{w}^{(n-i)}$ for the rest of the samples.
%

%\noindent 1) Obtaining a set of coefficients $\hat{\beta}_{w}^{(i)}$. 
%
%The creation of this set of coefficients has the crucial purpose of decoupling the feature space and, in addition, since the samples corresponding to these coefficients $\hat{\beta}_{w}^{(i)}$ are filtered there is no need to take into account the noise characteristics.
%
Specifically, we use a simple linear model to learn these coefficients in an end-to-end manner.
%
%So far, one challenge blocks the development of the algorithm; because of the different feature dimensions of the LDL dataset (see Table~\ref{T1}), a standard line model is difficult to model on a varied tabular space.
%
First, we introduce the \textbf{stable learning} problems as follows:
%

\textit{Problem}. Given the target $\hat{\beta}_{w}^{(i)}$ and input variables $x = [x_1, \dots, x_{s}] \in \mathbb{R}^{s}$, the task is to learn a predictive model which can achieve uniformly small error on any data point.

We consider the linear regression problem with model misspecification.
%
Specifically, we can assume the target $\hat{\beta}_{w}^{(i)}$ is generated by the following form:
\begin{equation}
	\hat{\beta}_{w}^{(i)} = x^\top \mathbf{W}_{1:s}
	+ \mathbf{W}_{0} + b(x) + \epsilon,
\end{equation}
where $x \in \mathbb{R}^{s}$ is an input vector, $b(x)$ is a bias term that depends on $x$, such that $|b(x)| \leq \delta $, and $\epsilon$ is zero-mean noise with variance $\sigma^{2}$.
%
Next, we need to use this model to build a set of training data with an optimization target to generate $\hat{\beta}_{w}^{(i)}$.
%
Here, we eliminate the values of the non-diagonal elements (ND) of the correlation matrix with the help of an L2 norm.
%
\begin{equation}
	\min~\lVert\sum_{i=1}^{N}\text{ND}(((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})^\top)_{i} - \rho\rVert_{2},
\end{equation}
where $\hat{\mathbf{X}}$ denotes the feature space of a clean set of samples $\hat{\mathcal{B}}$, $\mathbf{X}$ denotes feature space of raw samples corresponding to $\mathcal{B}$, and $\rho$ denotes a small number ($\rho \leq 0.01$).
%
Note that ND assembles the non-diagonal elements of a square matrix into a one-dimensional array.
%
Predictably, we only address the correlations that exist between features in a linear space.
%
High-order correlations may still exist; to completely decouple the correlations between features, we apply a kernel map (Gaussian kernel: $e^{- \frac{\lVert x-x'\rVert^{2}}{2\sigma^{2}}}$) to the reconstruction matrix, i.e., $\text{ker}((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})$.
%
The optimization objective of this algorithm can be written:
\begin{equation} 
	\begin{aligned}
		& \min~\lVert\sum_{i=1}^{N}\text{ND}(((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})^\top)_{i}  - \rho\rVert_{2} + \\ & \lambda \lVert\sum_{i=1}^{N}\text{ND}(\text{ker}((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})\text{ker}((\hat{\mathbf{X}}^\top \mathbf{W})\mathbf{X})^\top)_{i} - \rho\rVert_{2},
	\end{aligned}
\end{equation}
%
where $\lambda$ denotes a hyperparameter, which is obtained by parameter sensitivity analysis.
%
In addition, we add a regularization term for $\mathbf{W}$. The overall optimization objective can be written as follows:
\begin{equation} 
	\label{eq} 
	\begin{aligned}
		& \min~\lVert\sum_{i=1}^{N}\text{ND}(\bar{\mathbf{X}}\bar{\mathbf{X}}^{\top})_{i} - \rho\rVert_{2} \\ & + \lambda \lVert\sum_{i=1}^{N}\text{ND}(\text{ker}(\bar{\mathbf{X}})\text{ker}(\bar{\mathbf{X}})^{\top})_{i} - \rho\rVert_{2} + \gamma \lVert\mathbf{W}\rVert_{1},
	\end{aligned}
\end{equation}
%
where $\bar{\mathbf{X}} = \hat{\mathbf{X}}^\top \mathbf{W}\mathbf{X}$, $\gamma$ is a hyperparameter.
%
To eliminate the higher-order correlation between features, we apply a soft trick that drives the diagonal elements of $\bar{\mathbf{X}}\bar{\mathbf{X}}^{\top}$ toward 1.
%
%This soft trick aims to make the whole model easy to converge.
%
The approach considered in this paper is motivated by the following theoretical result, which shows the effect of model misspecification bias even when the sample size is infinity.
%

\textbf{\textit{Proposition}}. Consider the L2 norm when the sample size is infinity:
\begin{equation}
	\hat{\beta} = \min \mathbf{E}_{(\mathbf{X},\rho)}(\sum_{i=1}^{N}\text{ND}(\bar{\mathbf{X}}\bar{\mathbf{X}}^{\top})_{i} - \rho)^{2}.
\end{equation}
The estimation bias caused by the worst case perturbation error $|b(x)| \leq \delta$ can be as bad as $||\hat{\beta} - \mathbf{W}||^{2} \leq 2(\delta/\psi) + \delta$, where $\psi^{2}$ is the smallest eigenvalue of $\mathbf{E}((\mathbf{X}-\mathbf{E}(\mathbf{X}))(\mathbf{X}-\mathbf{E}(\mathbf{X}))^{\top})$.

\textit{Proof}. Let $\Delta\mathbf{W} = \mathbf{W} - \bar{\mathbf{W}}$ and $\Delta\hat{\beta} = \hat{\beta} - \bar{\mathbf{W}}$. We have $\Delta\hat{\beta} = \min \mathbf{E}(\mathbf{X}\Delta\mathbf{W}-b(\mathbf{X}))^{2}.$
%
At the optimal solution, we have $\Delta\hat{\beta}_{0} = \mathbf{E}(b(\mathbf{X})) - \mathbf{E}(\mathbf{X}^{\top}\Delta\hat{\beta}_{1:s})$. By eliminating $\mathbf{W}_{0}$ and letting $\tilde{\mathbf{X}} = \mathbf{X} - \mathbf{E}(\mathbf{X})$ and $\tilde{b}(\mathbf{X}) = b(\mathbf{X})-\mathbf{E}_{\mathbf{X}}(b(\mathbf{X}))$, we have 
$\Delta\hat{\beta}_{1:s} = \min \mathbf{E}(\tilde{\mathbf{X}}^{\top}\Delta\mathbf{W}_{1:s}-\tilde{b}(\mathbf{X}))^{2}.$
%
It follows that
$
\Delta\hat{\beta}_{1:s} =(\mathbf{E}(\tilde{\mathbf{X}}\tilde{\mathbf{X}}^{\top}))^{-1}\mathbf{E}(\tilde{b}(\mathbf{X})\tilde{\mathbf{X}}).
$
This implies that $||\Delta\hat{\beta}_{1:s}|| \leq \delta/\psi$. Moreover, it implies that $|\Delta\hat{\beta}_{0}| \leq \delta + \delta/\psi$.
We thus obtain the desired bound.
%



In the proposition, we observe that the worst-case estimation error tends to infinity when $\psi$ tends to 0. 
%
This means that when the variables are highly collinear, ordinary least squares yields a bad solution even when the training data is very large (or infinite).
%
To alleviate this problem, we introduce the re-weighting theorem of~\citet{shen2020stable}.
%
This strategy leads to a total bias that is a constant value, providing a base for stable learning.
\begin{equation}
	||\hat{\beta}-\bar{\mathbf{W}}||^{2} = O(1) + O(n^{-1/2})\sqrt{\mathbf{E}_{\mathbf{X}\sim \mathbb{N}}w(\mathbf{X})^{2}\sigma},
\end{equation}
where $\mathbb{N}$ denotes the Gaussian distribution.
%

In this paper, we use an automatic differentiation framework (PyTorch) to optimize Eq.~\ref{eq} on an RTX 3090 GPU with 24\,GB of memory.
%
Note that since the feature spaces of Gene, Twitter, and Flickr are vast, we split the batch to conduct the learning of weights $\hat{\beta}$.
%
Because of the split-batch implementation on these datasets, the method cannot directly model them globally; for this reason, we train for more epochs (training rounds) on these three datasets.
%
%We train on these three datasets with more epochs and bigger hyperparameters $\gamma$ to trade off the performance of this algorithm.
%
%The weight coefficients $\hat{\beta}^{i}_{w}$ obtained by this algorithm correspond to only one part of the samples in $\mathcal{B}$.

So far, we observe a phenomenon that the sample space reconstructed by the weight assignor is overly compact for the downstream learners, and these learners underperform on the test samples.
%
To solve this problem, we introduce some positive incentive noise.
%
The source of these positive incentive noises is the doubtful samples ($\mathcal{B} - \hat{\mathcal{B}}$) after being filtered by the prototype guidance.


\begin{table}[t] \tiny
	\begin{center}
		%\vspace{-2mm}
		\caption{Statistics of the experimental datasets. $\hat{\mathcal{B}}$  denotes a relatively clean dataset obtained from the raw sample space.}
		\vspace{-2mm}
		\label{T1}
		\begin{tabular}{lcccccc}
			\toprule
			ID & Dataset 	      & Examples	& Features  & Labels  & $\hat{\mathcal{B}}$   & Full-rank         \\
			\midrule
			1  &  wc-LDL          &  500        & 243       & 12   &  163          & Yes\\
			2  &  SJAFFE          &  213        & 243       & 6    &  180  & Yes\\
			3  &  SBU-3DFE        &  2500       & 243       & 6    &  156  & Yes\\
			4  &  Scene           &  2000       & 294       & 9    &  204  & Yes\\
			5  &  Gene            &  17892      & 36        & 68   &  9868 & Yes\\
			6  &  Movie           &  7755       & 1869      & 5    &  6045 & Yes\\
			7  &  M2B             &  1240       & 250       & 5    &  799  & Yes\\
			8  &  SCUT            &  1500       & 300       & 5    &  879  & Yes\\
			9  &  fbp5500         &  5500       & 512       & 5    &  362  & Yes\\
			10  &  RAF-ML         &  4908       & 200       & 6    &  3120 & Yes\\
			11  &  Twitter        &  10040      & 200       & 8    &  7802 & Yes\\
			12  &  Flickr         &  11150      & 200       & 8    &  4978 & Yes\\
			\bottomrule
		\end{tabular}%
		\vspace{-4mm}
	\end{center}
\end{table}

\begin{table*}[!h]  \scriptsize
	\begin{center}
		\vspace{-0mm}
		\caption{The performance of our proposed method with the comparison algorithms on 12 datasets. The best-performing results are marked in \textbf{bold}.}
		%, with the last column (BLB) showing the degree of performance improvement for each algorithm compared to those without regularization techniques.}
	\vspace{-2mm}
	\label{T3}
	\resizebox{\linewidth}{!}{
		\begin{tabular}{c|c|cccccc}
			\toprule
			Dataset                             & Algorithm	   & Chebyshev $\downarrow$  & Clark $\downarrow$   & Canberra $\downarrow$   & K-L $\downarrow$    & Cosine $\uparrow$   & Intersection $\uparrow$           \\ 	
			\midrule 
			
			&     Ours     & \textbf{0.0743 $\boldsymbol{\pm}$ \text{0.0011}}           & \textbf{0.3884 $\boldsymbol{\pm}$ \text{0.0055}}       & \textbf{0.7667 $\boldsymbol{\pm}$ \text{0.0033}}          & 0.0421 $\pm$ \text{0.0008}     & \textbf{0.9896 $\boldsymbol{\pm}$ \text{0.0009}}      & \textbf{0.8813 $\boldsymbol{\pm}$ \text{0.0014}}                \\
			
			&     Baseline-LDL     & 0.0788 $\pm$ \text{0.0019}           & 0.4008 $\pm$ \text{0.0042}       & 0.7770 $\pm$ \text{0.0023}          & 0.0408 $\pm$ \text{0.0056}     & 0.9801 $\pm$ \text{0.0017}      & 0.8760 $\pm$ \text{0.0015}                \\
			
			&     INP     & 0.0779 $\pm$ \text{0.0021}           & 0.3980 $\pm$ \text{0.0051}       & 0.7779 $\pm$ \text{0.0030}          & \textbf{0.0404 $\boldsymbol{\pm}$ \text{0.0020}}     & 0.9883 $\pm$ \text{0.0009}      & 0.8778 $\pm$ \text{0.0014}                \\
			
			&     PCA    & 0.0748 $\pm$ \text{0.0122}           & 0.4008 $\pm$ \text{0.0020}       & 0.7883 $\pm$ \text{0.0012}          & 0.0422 $\pm$ \text{0.0051}     & 0.9887 $\pm$ \text{0.0012}      & 0.8790 $\pm$ \text{0.0034}                \\
			
			
			&  LDL-LRR     & 0.0923 $\pm$ \text{0.0030}          & 0.4212 $\pm$ \text{0.0036}      & 0.8135 $\pm$ \text{0.0024}          & 0.0511 $\pm$ \text{0.0049}     & 0.9718 $\pm$ \text{0.0022}      & 0.8669 $\pm$ \text{0.0047}                   \\
			
			&  LDL-LCLR    & 0.1057 $\pm$ \text{0.0019}           & 1.0569 $\pm$ \text{0.0039}       & 0.7890 $\pm$ \text{0.0039}          & 0.0545 $\pm$ \text{0.0037}     & 0.9668 $\pm$ \text{0.0049}      & 0.8383 $\pm$ \text{0.0018}                     \\
			
			&  LDLSF       & 0.1009 $\pm$ \text{0.0038}           & 0.4199 $\pm$ \text{0.0044}       & 0.9008 $\pm$ \text{0.0015}          & 0.0519 $\pm$ \text{0.0040}     & 0.9779 $\pm$ \text{0.0018}      & 0.8660 $\pm$ \text{0.0022}                     \\
			
			&  LALOT       & 0.0989 $\pm$ \text{0.0019}          & 0.6689 $\pm$ \text{0.0019}       & 0.8089 $\pm$ \text{0.0049}          & 0.0477 $\pm$ \text{0.0018}     & 0.9476 $\pm$ \text{0.0020}      & 0.8700 $\pm$ \text{0.0033}                    \\
			
			\multirow{-7}{*}{wc-LDL} 		            &  BFGS-LLD    & 0.1122 $\pm$ \text{0.0039}           & 1.5657 $\pm$ \text{0.0021}       & 0.7998 $\pm$ \text{0.0020}          & 0.0498 $\pm$ \text{0.0051}     & 0.9704 $\pm$ \text{0.0036}      & 0.8611 $\pm$ \text{0.0016}                 \\
			
			\midrule
			
			&  Ours        & \textbf{0.0822 $\boldsymbol{\pm}$ \text{0.0019}}           & \textbf{0.4001 $\boldsymbol{\pm}$ \text{0.0033}}       & \textbf{0.7888 $\boldsymbol{\pm}$ \text{0.0043}}          & 0.4053 $\pm$ \text{0.0013}      & \textbf{0.9891 $\boldsymbol{\pm}$ \text{0.0002}}      & \textbf{0.8846 $\boldsymbol{\pm}$ \text{0.0055}}                   \\
			
			&  Baseline-LDL     & 0.0899 $\pm$ \text{0.0033}           & 0.4128 $\pm$ \text{0.0027}       & 0.8007 $\pm$ \text{0.0013}          & 0.4212 $\pm$ \text{0.0074}     & 0.9709 $\pm$ \text{0.0013}      & 0.8699 $\pm$ \text{0.0015}                \\
			
			&  INP        & 0.0854 $\pm$ \text{0.0018}           & 0.4008 $\pm$ \text{0.0030}       & 0.7955 $\pm$ \text{0.0023}          & \textbf{0.4010 $\boldsymbol{\pm}$ \text{0.0012}}      & 0.9799 $\pm$ \text{0.0014}      & 0.8809 $\pm$ \text{0.0015}                   \\
			
			&  PCA        & 0.0832 $\pm$ \text{0.0033}           & 0.4012 $\pm$ \text{0.0008}       & 0.7910 $\pm$ \text{0.0043}          & 0.4155 $\pm$ \text{0.0087}      & 0.9823 $\pm$ \text{0.0049}      & 0.8832 $\pm$ \text{0.0055}                   \\
			
			
			&  LDL-LRR     & 0.0866 $\pm$ \text{0.0021}           & 0.4220 $\pm$ \text{0.0036}       & 0.8001 $\pm$ \text{0.0024}          & 0.4258 $\pm$ \text{0.0049}     & 0.9610 $\pm$ \text{0.0022}      & 0.8689 $\pm$ \text{0.0047}                       \\
			
			&  LDL-LCLR    & 0.1057 $\pm$ \text{0.0019}           & 1.0569 $\pm$ \text{0.0039}       & 0.7890 $\pm$ \text{0.0039}          & 0.5045 $\pm$ \text{0.0037}     & 0.9668 $\pm$ \text{0.0049}      & 0.8383 $\pm$ \text{0.0018}                      \\
			
			&  LDLSF       & 0.1122 $\pm$ \text{0.0038}           & 0.4397 $\pm$ \text{0.0044}       & 0.9212 $\pm$ \text{0.0015}          & 0.5557 $\pm$ \text{0.0040}     & 0.9779 $\pm$ \text{0.0018}      & 0.8660 $\pm$ \text{0.0022}                       \\
			
			&  LALOT       & 0.0979 $\pm$ \text{0.0018}           & 0.6799 $\pm$ \text{0.0021}       & 0.8077 $\pm$ \text{0.0039}          & 0.4756 $\pm$ \text{0.0015}     & 0.9433 $\pm$ \text{0.0111}      & 0.8423 $\pm$ \text{0.0034}                      \\
			
			\multirow{-6}{*}{SJAFFE} 		            &  BFGS-LLD    & 0.1334 $\pm$ \text{0.0139}           & 1.6648 $\pm$ \text{0.0023}       & 0.7999 $\pm$ \text{0.0022}          & 0.4771 $\pm$ \text{0.0051}     & 0.9711 $\pm$ \text{0.0036}      & 0.8655 $\pm$ \text{0.0116}                                        \\
			
			\midrule
			
			&  Ours        & \textbf{0.0811 $\boldsymbol{\pm}$ \text{0.0023}}           & \textbf{0.3987 $\boldsymbol{\pm}$ \text{0.0024}}       & \textbf{0.7533 $\boldsymbol{\pm}$ \text{0.0027}}          & \textbf{0.0354 $\boldsymbol{\pm}$ \text{0.0031}}      & \textbf{0.9888 $\boldsymbol{\pm}$ \text{0.0066}}      & \textbf{0.8997 $\boldsymbol{\pm}$ \text{0.0030}}                    \\
			
			&  Baseline-LDL     & 0.0970 $\pm$ \text{0.0442}           & 0.4151 $\pm$ \text{0.0088}       & 0.7810 $\pm$ \text{0.0023}          & 0.0414 $\pm$ \text{0.0019}     & 0.9711 $\pm$ \text{0.0013}      & 0.8797 $\pm$ \text{0.0016}                \\
			
			&  INP        & 0.0833 $\pm$ \text{0.0020}           & 0.3994 $\pm$ \text{0.0010}       & 0.7611 $\pm$ \text{0.0020}          & 0.0365 $\pm$ \text{0.0014}      & 0.9811 $\pm$ \text{0.0015}      & 0.8900 $\pm$ \text{0.0017}                    \\
			
			
			&  PCA       & 0.0820 $\pm$ \text{0.0045}           & 0.3999 $\pm$ \text{0.0011}       & 0.7689 $\pm$ \text{0.0111}          & 0.0370 $\pm$ \text{0.0077}      & 0.9866 $\pm$ \text{0.0015}      & 0.8953 $\pm$ \text{0.0044}                    \\
			
			&  LDL-LRR     & 0.0912 $\pm$ \text{0.0036}           & 0.4013 $\pm$ \text{0.0039}      & 0.7602 $\pm$ \text{0.0021}          & 0.0369 $\pm$ \text{0.0028}     & 0.9697 $\pm$ \text{0.0029}      & 0.8891 $\pm$ \text{0.0033}                    \\
			
			&  LDL-LCLR    & 0.1100 $\pm$ \text{0.0025}           & 0.9660 $\pm$ \text{0.0039}       & 0.7897 $\pm$ \text{0.0033}          & 0.0511 $\pm$ \text{0.0021}     & 0.9677 $\pm$ \text{0.0056}      & 0.8555 $\pm$ \text{0.0032}                   \\
			
			&  LDLSF       & 0.1009 $\pm$ \text{0.0038}           & 0.4199 $\pm$ \text{0.0044}       & 0.9008 $\pm$ \text{0.0015}          & 0.0519 $\pm$ \text{0.0040}     & 0.9780 $\pm$ \text{0.0029}      & 0.8660 $\pm$ \text{0.0022}                     \\
			
			&  LALOT       & 0.0899 $\pm$ \text{0.0021}           & 0.6563 $\pm$ \text{0.0019}       & 0.8132 $\pm$ \text{0.0100}          & 0.0468 $\pm$ \text{0.0021}     & 0.9441 $\pm$ \text{0.0011}      & 0.8723 $\pm$ \text{0.0034}                    \\
			
			\multirow{-7}{*}{SBU} 		            &  BFGS-LLD    & 0.1119 $\pm$ \text{0.0030}           & 1.4657 $\pm$ \text{0.0022}       & 0.7700 $\pm$ \text{0.0025}          & 0.0492 $\pm$ \text{0.0053}     & 0.9753 $\pm$ \text{0.0036}      & 0.8710 $\pm$ \text{0.0019}                    \\
			
			\midrule
			
			&  Ours        & \textbf{0.2981 $\boldsymbol{\pm}$ \text{0.0024}}           & \textbf{2.3077 $\boldsymbol{\pm}$ \text{0.0013}}       & \textbf{6.4133 $\boldsymbol{\pm}$ \text{0.0029}}          & \textbf{0.8029 $\boldsymbol{\pm}$ \text{0.0020}}      & \textbf{0.7991 $\boldsymbol{\pm}$ \text{0.0011}}      & \textbf{0.5699 $\boldsymbol{\pm}$ \text{0.0014}}                    \\
			
			&  Baseline-LDL     & 0.3155 $\pm$ \text{0.0022}           & 2.3559 $\pm$ \text{0.0155}       & 6.6958 $\pm$ \text{0.1231}          & 0.8533 $\pm$ \text{0.0099}     & 0.7664 $\pm$ \text{0.0015}      & 0.5349 $\pm$ \text{0.0014}                \\
			
			&  INP        & 0.2998 $\pm$ \text{0.0020}           & 2.3374 $\pm$ \text{0.0018}       & 6.5163 $\pm$ \text{0.0018}          & 0.8111 $\pm$ \text{0.0029}      & 0.7890 $\pm$ \text{0.0049}      & 0.5691 $\pm$ \text{0.0010}                    \\
			
			&  PCA        & 0.3010 $\pm$ \text{0.0213}           & 2.3266 $\pm$ \text{0.0085}       & 6.533 $\pm$ \text{0.0091}          & 0.8097 $\pm$ \text{0.0031}      & 0.7913 $\pm$ \text{0.0033}      & 0.5612 $\pm$ \text{0.0006}                    \\
			
			&  LDL-LRR     & 0.2989 $\pm$ \text{0.0111}           & 2.3698 $\pm$ \text{0.0051}      & 6.4777 $\pm$ \text{0.0025}          & 0.8362 $\pm$ \text{0.0069}     & 0.7744 $\pm$ \text{0.0077}      & 0.5444 $\pm$ \text{0.0049}                  \\
			
			&  LDL-LCLR    & 0.3740 $\pm$ \text{0.0066}           & 2.4986 $\pm$ \text{0.0066}       & 6.8600 $\pm$ \text{0.0067}          & 0.8559 $\pm$ \text{0.0039}     & 0.7119 $\pm$ \text{0.0122}      & 0.5119 $\pm$ \text{0.0081}                   \\
			
			&  LDLSF       & 0.3441 $\pm$ \text{0.0249}           & 2.9884 $\pm$ \text{0.0055}       & 6.6900 $\pm$ \text{0.0055}          & 0.8391 $\pm$ \text{0.0044}     & 0.7336 $\pm$ \text{0.0088}      & 0.5660 $\pm$ \text{0.0041}                     \\
			
			&  LALOT       & 0.3129 $\pm$ \text{0.0152}           & 2.3999 $\pm$ \text{0.0044}       & 6.6366 $\pm$ \text{0.0078}          & 0.8226 $\pm$ \text{0.0033}     & 0.7390 $\pm$ \text{0.0100}      & 0.5224 $\pm$ \text{0.0066}                   \\
			
			\multirow{-7}{*}{Scene} 		            & BFGS-LLD    & 0.3598 $\pm$ \text{0.0020}           & 2.4998 $\pm$ \text{0.0033}       & 6.7999 $\pm$ \text{0.0049}          & 0.8400 $\pm$ \text{0.0033}     & 0.7333 $\pm$ \text{0.0064}      & 0.5199 $\pm$ \text{0.0055}                    \\ \midrule
			
			&  Ours        & \textbf{0.0480 $\boldsymbol{\pm}$ 0.0033}            & \textbf{2.1008 $\boldsymbol{\pm}$ 0.0259}       & \textbf{14.0800 $\boldsymbol{\pm}$ 0.0153}          & \textbf{0.2320 $\boldsymbol{\pm}$ 0.0094}      & \textbf{0.8406 $\boldsymbol{\pm}$ 0.0023}      & \textbf{0.7997 $\boldsymbol{\pm}$ 0.0077}                   \\ 
			
			&  Baseline-LDL     & 0.0509 $\pm$ \text{0.0066}           & 2.2004 $\pm$ \text{0.0055}       & 14.1449 $\pm$ \text{0.2448}          & 0.2440 $\pm$ \text{0.0024}     & 0.8345 $\pm$ \text{0.0009}      & 0.7821 $\pm$ \text{0.0016}                \\
			
			&  INP        & 0.0488 $\pm$ \text{0.0012}           & 2.1029 $\pm$ \text{0.0259}       & 14.0888 $\pm$ \text{0.0551}          & 0.2335 $\pm$ \text{0.0044}      & 0.8395 $\pm$ \text{0.0032}      & 0.7984 $\pm$ \text{0.0066}                   \\
			
			&  PCA        & 0.0482 $\pm$ \text{0.0013}           & 2.1020 $\pm$ \text{0.0212}       & 14.0835 $\pm$ \text{0.0142}          & 0.2321 $\pm$ \text{0.0087}      & 0.8390 $\pm$ \text{0.0016}      & 0.7989 $\pm$ \text{0.0099}                   \\
			
			&  LDL-LRR     & 0.0494 $\pm$ \text{0.0039}           & 2.1888 $\pm$ \text{0.0861}      & 14.2550 $\pm$ \text{0.0144}          & 0.2400 $\pm$ \text{0.0077}     & 0.8388 $\pm$ \text{0.0144}      & 0.7789 $\pm$ \text{0.0040}                     \\
			
			&  LDL-LCLR    & 0.0511 $\pm$ \text{0.0022}           & 2.2201 $\pm$ \text{0.0444}       & 14.2101 $\pm$ \text{0.0510}          & 0.2566 $\pm$ \text{0.0047}     & 0.8302 $\pm$ \text{0.0012}      & 0.7722 $\pm$ \text{0.0060}                    \\
			
			&  LDLSF       & 0.0513 $\pm$ \text{0.0030}           & 2.2221 $\pm$ \text{0.0036}       & 14.3667 $\pm$ \text{0.0265}          & 0.2445 $\pm$ \text{0.0077}     & 0.8320 $\pm$ \text{0.0010}      & 0.7701 $\pm$ \text{0.0026}                      \\
			
			&  LALOT       & 0.0505 $\pm$ \text{0.0033}           & 2.1989 $\pm$ \text{0.0194}       & 14.1855 $\pm$ \text{0.0922}          & 0.2443 $\pm$ \text{0.0088}     & 0.8297 $\pm$ \text{0.0060}      & 0.7888 $\pm$ \text{0.0013}                     \\
			
			\multirow{-7}{*}{Gene} 		            &  BFGS-LLD    & 0.0578 $\pm$ \text{0.0066}           & 2.3008 $\pm$ \text{0.0188}       & 14.3559 $\pm$ \text{0.1556}          & 0.2480 $\pm$ \text{0.0015}     & 0.8300 $\pm$ \text{0.0049}      & 0.7786 $\pm$ \text{0.0070}                     \\ \midrule
			
			&  Ours        & \textbf{0.1071 $\boldsymbol{\pm}$ 0.0008}           & \textbf{0.4997 $\boldsymbol{\pm}$ 0.0064}       & \textbf{0.9710 $\boldsymbol{\pm}$ 0.0044}          & \textbf{0.0970 $\boldsymbol{\pm}$ 0.0008}      & \textbf{0.9595 $\boldsymbol{\pm}$ 0.0063}      & 0.8791 $\pm$ \text{0.0019}                     \\
			
			& Baseline-LDL     & 0.1109 $\pm$ \text{0.0033}           & 0.5119 $\pm$ \text{0.0155}       & 1.0889 $\pm$ \text{0.0111}          & 0.1355 $\pm$ \text{0.0022}     & 0.9422 $\pm$ \text{0.0333}      & 0.8744 $\pm$ \text{0.0054}                \\
			
			&  INP        & 0.1089 $\pm$ \text{0.0018}           & 0.5001 $\pm$ \text{0.0044}       & 0.9722 $\pm$ \text{0.0040}          & 0.0977 $\pm$ \text{0.0008}      & 0.9585 $\pm$ \text{0.0061}      & \textbf{0.8861 $\boldsymbol{\pm}$ 0.0006}                     \\
			
			&  PCA        & 0.1077 $\pm$ \text{0.0006}           & 0.5013 $\pm$ \text{0.0032}       & 0.9720 $\pm$ \text{0.0032}          & 0.0972 $\pm$ \text{0.0005}      & 0.9590 $\pm$ \text{0.0002}      & 0.8853 $\pm$ \text{0.0022}                     \\
			
			&  LDL-LRR     & 0.1107 $\pm$ \text{0.0009}           & 0.5019 $\pm$ \text{0.0010}      & 0.9801 $\pm$ \text{0.0061}          & 0.1045 $\pm$ \text{0.0049}     & 0.9591 $\pm$ \text{0.0022}      & 0.8772 $\pm$ \text{0.0027}                    \\
			
			&  LDL-LCLR    & 0.1177 $\pm$ \text{0.0086}           & 0.5345 $\pm$ \text{0.0040}       & 1.1533 $\pm$ \text{0.0111}          & 0.1559 $\pm$ \text{0.0030}     & 0.9360 $\pm$ \text{0.0049}      & 0.8222 $\pm$ \text{0.0011}                    \\
			
			&  LDLSF       & 0.1155 $\pm$ \text{0.0045}           & 0.5339 $\pm$ \text{0.0062}       & 1.1152 $\pm$ \text{0.0050}          & 0.1540 $\pm$ \text{0.0041}     & 0.9445 $\pm$ \text{0.0020}      & 0.8551 $\pm$ \text{0.0044}                     \\
			
			&  LALOT       & 0.1221 $\pm$ \text{0.0110}           & 0.5440 $\pm$ \text{0.0033}       & 1.1112 $\pm$ \text{0.0040}          & 0.1503 $\pm$ \text{0.0008}     & 0.9477 $\pm$ \text{0.0022}      & 0.8559 $\pm$ \text{0.0002}                    \\
			
			\multirow{-7}{*}{Movie} 		            &  BFGS-LLD    & 0.1310 $\pm$ \text{0.0032}           & 0.5230 $\pm$ \text{0.0022}       & 1.1170 $\pm$ \text{0.0024}          & 0.1595 $\pm$ \text{0.0155}     & 0.9400 $\pm$ \text{0.0003}      & 0.8491 $\pm$ \text{0.0018}                    \\
			
			%			
			\bottomrule
		\end{tabular}%
	}
	\vspace{-6mm}
\end{center}
\end{table*}





%The rest of the raw samples (positive incentive noise).
%
The samples weighted by our algorithm occupy only 50--90\% of the raw samples, as shown in Table~\ref{T1}.
%
Although the label space of the remaining samples has a high probability of carrying noise, noisy data are not always noxious and may have positive incentive properties~\citep{li2022positive}.
%
We want to assign a certain amount of weight to these samples to model the robust decision boundary without disrupting the generalization ability of the model as much as possible.
%
Here, we use a customized normal distribution $0.01 \times \mathbb{N}(0, 1)$ to randomly assign weights to these samples.
%
In the experiments, we observe that this strategy makes the prediction of the label distribution more relaxed; it plays the role of regularization, counteracting the overly compact predictions of the LDL model that result from feature decoupling.
%

\section{Experiments}

\textbf{Algorithm configurations.}
%
We conduct experiments on 12 datasets and the characteristics of the datasets are summarized in Table~\ref{T1}.
%
%All datasets are configured by reference to ~\cite{?}. 
Except for the wc-LDL dataset, the configurations of all datasets follow~\citep{wang2021label}.
%We also add the Yeast dataset and a newly collected dataset.
%
This newly released dataset (wc-LDL) contains 500 watercolor images and their corresponding label distributions (12 emotions).
%
%wc-LDL is constructed with a thorough discussion in the supplementary material.
For the wc-LDL dataset, we give 12 emotion tips to the annotators including 5 men and 5 women.
%
Finally, the outputs of these 10 annotators are normalized as the label distribution corresponding to the image.
%
Note that before watercolor images are annotated, we ask these experts to take a comprehensive view based on the lines and color combinations of the images.
%
For example, dense lines express vexation,  blue denotes depression, and red denotes enthusiasm.
%
To construct a training set with noisy labels, we apply a randomized switching algorithm to 20--35\% of the training set.
%
This algorithm exchanges the values within a label distribution (e.g., [0.1, 0.2, 0.7] $\rightarrow$ [0.7, 0.1, 0.2]).
%The label has 12 emotions in total, and each pair of images is evaluated by 10 professionals.
%
%As a result of the final evaluation, a mean value is taken as the distribution value of the label space.
We develop a simple linear model with a data pre-processing method (Ours). 
%
In addition, we set up a baseline (Baseline-LDL) with data pre-processing (without the strategy of randomly assigning weights to noisy samples).
%We develop the networks ($\texttt{MLP}^{1}$, $\texttt{MLP}^{2}$, and $\texttt{MLPs}^{3}$) with configurations on different datasets also reported in Table~\ref{T1}.
%
%Each multilayer perceptron-based network has several neurons in the input and output layers defined by the features and labels of the dataset, and $\text{H}_{2:end-1}$ denotes the number of hidden layer neurons.
%
To evaluate the performance of LDL models, we use the six metrics proposed by~\citet{geng2016label}: Chebyshev distance $\downarrow$, Clark distance $\downarrow$, Canberra distance $\downarrow$, KL divergence $\downarrow$, Cosine similarity $\uparrow$, and Intersection similarity $\uparrow$.
%
%LF and DA denote the loss function and data augmentation method, respectively.
%
$\downarrow$ represents the indicator's performance favoring low values and $\uparrow$ represents the indicator's performance favoring high values.
%


\begin{table*}[h] \scriptsize
\begin{center}
	\vspace{-2mm}
	\caption{Ablation study. Effectiveness of the loss functions and the modules on Gene. Quantitative results demonstrate the effectiveness of each module.}
	\vspace{-2mm}
	\label{T5}
	\resizebox{\linewidth}{!}{
		\begin{tabular}{c|cccccc}
			\toprule
			Algorithm	   & Chebyshev $\downarrow$  & Clark $\downarrow$   &Canberra $\downarrow$   &K-L $\downarrow$    &Cosine $\uparrow$   &Intersection $\uparrow$                \\
			
			
			\midrule
			\rowcolor{defaultcolor}
			Ours        & 0.0480 $\pm$ \text{0.0033}           & 2.1008 $\pm$ \text{0.0259}       &14.0800 $\pm$ \text{0.0153}          & 0.2320 $\pm$ \text{0.0094}      & 0.8406 $\pm$ \text{0.0023}      & 0.7997 $\pm$ \text{0.0077}                   \\
			
			w/o FD       & 0.0499 $\pm$ \text{0.0063}           & 2.1331 $\pm$ \text{0.0220}       &14.1866 $\pm$ \text{0.0155}          & 0.2329 $\pm$ \text{0.0110}      & 0.8361 $\pm$ \text{0.0023}      & 0.7900 $\pm$ \text{0.0056}                   \\
			
			w/o KT       & 0.0511 $\pm$ \text{0.0034}           & 2.1226 $\pm$ \text{0.0230}       &14.1164 $\pm$ \text{0.0163}          & 0.2445 $\pm$ \text{0.0011}      & 0.8389 $\pm$ \text{0.0023}      & 0.7884 $\pm$ \text{0.0039}                   \\
			
			
			w/o RT        & 0.0498 $\pm$ \text{0.0019}           & 2.1121 $\pm$ \text{0.0100}       &14.2911 $\pm$ \text{0.0156}          & 0.2333 $\pm$ \text{0.0094}      & 0.8398 $\pm$ \text{0.0022}      & 0.7990 $\pm$ \text{0.0075}                   \\
			
			\bottomrule
		\end{tabular}%
	}
	\vspace{-6mm}
\end{center}
\end{table*}


{\flushleft \textbf{Experimental setting.}}
%
We conduct comparative experiments with seven LDL algorithms (Baseline-LDL, INP~\citep{zheng2022label}, BFGS-LLD~\citep{geng2016label}, LDL-LRR~\citep{jia2021label}, LDL-LCLR~\citep{ren2019label}, LDLSF~\citep{ren2019labelf}, and LALOT~\citep{zhao2018label}) as well as principal component analysis (PCA).
%
Baseline-LDL as one of the methods of comparison is trained only on a relatively clean set of samples.
%Meanwhile, this baseline model uses regularization techniques (weight decay, early stopping, greed soup, data augmentation) to enhance generalization, and the optimizer uses the standard SGD.
%
INP presents an implicit representation to estimate the uncertainty of the label space.
%
BFGS-LLD is based on a linear model, the loss function is K-L divergence, and the optimization method is the quasi-Newton approach.
%
LDL-LRR and LDL-LCLR both consider label correlations in the learning process, with the former considering the order relationship of the labels and the latter capturing global relationships between labels.
%
For LDL-LRR, the parameters $\lambda$ and $\beta$ are tuned from $10^{\{-6,-5, \ldots,-2,-1\}}$ and $10^{\{-3,-2, \ldots, 1,2\}}$, respectively. For LDL-LCLR, the parameters $\lambda_{1}, \lambda_{2}, \lambda_{3}, \lambda_{4}$ and $k$ are set to $0.0001,0.001,0.001,0.001$ and $4$, respectively. 
%
LDLSF leverages label-specific features and common features simultaneously, whose parameters $\lambda_{1}, \lambda_{2}$ and $\lambda_{3}$ are tuned from $10^{\{-6,-5, \ldots,-2,-1\}}$, respectively, and $\rho$ is set to $10^{-3}$. 
%
LALOT adopts optimal transport distance as the loss function, and the trade-off parameter $C$ and the regularization coefficient $\lambda$ are set to $200$ and $0.2$, respectively.
%
%Our approach for experimental settings is reported in the supplementary material.
The fine-tuning settings for all comparison methods are referenced in~\citep{jia2021label}.
%
%It is worth noting that early stopping and greed soup are also used on all the datasets where the comparison algorithm is implemented.
In addition to the above comparison algorithms, we introduce PCA as one of the comparison algorithms because PCA also serves to decouple the feature space. PCA serves as a preprocessing framework (retaining 80 percent of the features), followed immediately by a standard linear regressor.

{\flushleft \textbf{Results and analysis.}}
%
We conduct ten repetitions of 5-fold cross-validation on each dataset.
%
The experimental results are presented in the form of ``mean$\pm$std'' in Table~\ref{T3} (the remaining results are provided in the supplemental material).
%
Overall, our proposed method outperforms the other comparison algorithms on most evaluation metrics.
%
Three main reasons contribute to the competitive results of our approach.
%
%
\textbf{i):} With the uniform optimization scheme, our algorithm performs better than the baseline algorithm (Baseline-LDL) due to the feature decoupling. 
%
%Moderate noise, especially on the Gene dataset, due to the uncertainty that comes with manual annotation, our approach has a huge performance gain with the help of implicit distribution representation with Gaussian priors.
%
\textbf{ii):} Compared with the baseline model, our method obtains competitive results on most of the metrics, thanks to the samples with uncertainty. In addition, we note that methods with label constraints (e.g., LDL-LRR) also perform well, which may be because the label constraints suppress the interference of noisy labels.
%
\textbf{iii):} Owing to the powerful learning capability of kernel mapping, our approach has a large advantage on image and text datasets.
%
Moreover, we evaluate the range of $p$-values for the six metrics on the 12 datasets.
%'




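Chebyshev $[1.54\times 10^{-104}, 1.00]$, Clark $[5.44\times 10^{-97}, 1.98\times 10^{-2}]$, Canberra $[9.62\times 10^{-98}, 1.10\times 10^{-1}]$, K-L $[1.77\times 10^{-102}, 1.99\times 10^{-1}]$, Cosine $[1.33\times 10^{-99}, 2.01\times 10^{-1}]$, and Intersection $[1.33\times 10^{-113}, 7.88\times 10^{-1}]$.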
%According to the test results, the LDL methods have significantly different performances in terms of each metric on all datasets except \textit{Gene} (at a 0.05 significance level). 
%
%Compared with other benchmarks, our approach underperforms the \textit{Gene} dataset may be because the label distribution space tends to be uniformly distributed.

\begin{figure}[h] 
\begin{center}
	\begin{tabular}{@{}c@{}}
		\includegraphics[width = 0.485\textwidth]{img/F4.png}        
	\end{tabular}
\end{center}
\vspace{-4mm}
\caption{This figure shows the performance comparison of these four algorithms after implementing our framework. The first row indicates that the algorithms are run on the Movie dataset with label noise, the second row indicates that the algorithms are run on the M2B dataset with label noise, and the third row indicates that the algorithms are run on the SCUT dataset with label noise.}
\vspace{-4mm}
\label{ra}
\end{figure}


{\flushleft \textbf{Parameter sensitivity analysis.}}
%
Our method has two regularization parameters, $\lambda$ and $\gamma$.
%
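To analyze the sensitivity of $\lambda$ and $\gamma$, we vary each of them over the set \{0.001, 0.005, 0.01, 0.05, 0.1\} on the Gene dataset.
% NOTE(review): the best Cosine value reported below (0.9896 +/- 0.0009) matches the wc-LDL row of Table T3, not the Gene row (0.8406) -- confirm which dataset was used.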
%
We conduct 5-fold cross-validation and obtain the following results (Cosine $\uparrow$):
%
$\lambda_{i}$ $\rightarrow$ \{0.9841 $\pm$ 0.0133, 0.9802 $\pm$ 0.0007, \textbf{0.9896 $\pm$ 0.0009}, 0.9876 $\pm$ 0.0044, 0.9877 $\pm$ 0.0084\}.
%
$\gamma_{i}$ $\rightarrow$ \{0.9813 $\pm$ 0.0043, 0.9821 $\pm$ 0.0075, \textbf{0.9896 $\pm$ 0.0009}, 0.9890 $\pm$ 0.0100, 0.9892 $\pm$ 0.0023\}.



{\flushleft \textbf{Ablation study.}}
%
To demonstrate the effectiveness of the loss function and the module of our model, we conduct an ablation study involving the following three experiments, and the results are shown in Table~\ref{T5}:
%
\textbf{(a)} w/o feature decoupling: We remove the weight assignment strategy, and our model is trained only on samples that are weighted by clean samples and those with uncertainty.
%
\textbf{(b)} w/o the kernel mapping: We remove the loss term in Eq.~\ref{eq} for the kernel trick, keeping only the first loss term and a regularization term.
%
\textbf{(c)} w/o regularization term: We remove the third loss term in Eq.~\ref{eq}.
%
%\textbf{(d)} We replace $\texttt{Tmax}$ with $\texttt{nn.Softmax}$ in deep prototype learning (w/o $\texttt{Tmax}$).
%The effectiveness of SNN: we use standard MLPs to replace SNNs in the same network architecture, shown in Table~\ref{T4}. 
%
%\textbf{(d)} %The effectiveness of GNN: for deep implicit function construction, we use standard MLPs to replace GNNs, as shown in Table~\ref{T4}. 
%
We conduct ten repetitions of 5-fold cross-validation on the dataset of the ablation experiment.
%
%We note that in the ablation experiment (c), the pre-trained model has a lower KL divergence and the similarity of the distribution is weaker without pre-training.
% 

\begin{table*}[!htb] \tiny
	\caption{ Overall performance of \textit{MedMNIST} (v2) in metrics of AUC and ACC, using ResNet-18 / ResNet-50~\citep{al2020breast} with resolution $28$ and $224$, auto-sklearn, AutoKeras, Google AutoML Vision, FPVT~\citep{liu2022feature}, and Ours.}
	\vspace{-4mm}
	\label{tab:4dResults}
	\vspace{-1mm}
	\begin{center}
		\resizebox{\linewidth}{!}{
			\begin{tabular}{@{}ccccccccccccc@{}}
				\toprule
				\multirow{2}{*}{Methods} &
				\multicolumn{2}{c}{PathMNIST} &
				\multicolumn{2}{c}{ChestMNIST} &
				\multicolumn{2}{c}{DermaMNIST} &
				\multicolumn{2}{c}{OCTMNIST} &
				\multicolumn{2}{c}{PneumoniaMNIST} &
				\multicolumn{2}{c}{BloodMNIST} 
				\\
				& AUC & ACC & AUC & ACC & AUC & ACC & AUC & ACC & AUC & ACC & AUC & ACC\\ \midrule
				ResNet-18 (28)     & 0.970 & 0.823 & 0.700 & 0.941 & 0.846 & 0.711 & 0.950 & 0.730 & 0.953 & 0.840 & 0.990 & 0.932\\
				ResNet-18 (224)    & 0.971 & 0.860 & 0.702 & 0.943 & 0.890 & 0.721 & 0.952 & 0.753 & 0.960 & 0.848  &0.990 &0.955\\
				ResNet-50 (28)     & 0.971 & 0.833 & 0.691 & 0.942 & 0.883 & 0.705 & 0.923 & 0.744 & 0.941 & 0.833 & 0.989 & 0.950\\
				ResNet-50 (224)    & 0.973 & 0.841 & 0.676 & 0.929 & 0.890 & 0.713 & 0.944 & 0.702 & 0.960 & 0.893 & 0.972 & 0.935 \\
				auto-sklearn       & 0.444 & 0.386 & 0.640 & 0.625 & 0.886 & 0.730 & 0.843 & 0.591 & 0.940 & 0.863 & 0.982 & 0.870\\
				AutoKeras          & 0.951 & 0.860 & 0.711 & 0.932 & 0.910 & 0.755 & 0.950 & 0.731 & 0.965 & 0.911 & 0.994 & 0.950\\
				Google AutoML Vision  & 0.981 & 0.833 & 0.710 & 0.941 & 0.920 & 0.749 & 0.932 & 0.722 & 0.990 & 0.930  & 0.992 & 0.957\\
				
				FPVT  & 0.965& 0.900 & 0.715 & 0.940 & 0.911 & 0.753 & 0.952 & 0.769 & 0.930 & 0.892 & 0.970 & 0.942\\
				Ours  & \bf0.992& \bf0.939 & \bf0.816 & \bf0.959 & \bf0.931 & \bf0.826 & \bf0.966 & \bf0.835 & \bf0.988 & \bf0.960 & \bf0.992 & \bf0.989\\
				
				\bottomrule
			\end{tabular}
		}
		\vspace{-4mm}
	\end{center}
\end{table*}


\noindent \textbf{Discussion.}
%
%Movie, M2B, SCUT, fbp5500
%
The stability-trust framework can offer a more compact regression space with the help of prototype learning.
%
To verify our theory, we use t-SNE to visualize both the predicted label distributions and the ground-truth label distributions of the raw dataset.
%
We evaluate four datasets (Movie, M2B, SCUT, and fbp5500) and the results are visualized in the supplemental material.
%
We note that the stability-trust framework can aggregate similar label vectors more compactly.
%
Although a compact prediction space can reduce the number of outliers, it results in a loss of accuracy in the quantitative evaluation.
%
In this paper, we propose to alleviate this problem by also including the remaining noisy samples in the training set and assigning them small weights.
%
To evaluate the effectiveness of this method, we propose a metric that computes the average distance between the predicted label distribution and the prototype vector, which is written as:
\begin{equation}
\text{Score} = \text{Sigmoid}\left(\frac{1}{N}\sum_{i=1}^{N}\lVert\mathcal{L}_{i}-p\rVert_{2}\right),
\end{equation}
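where $\mathcal{L}_{i}$ is the predicted label distribution of the $i$-th sample, $p$ is the prototype vector, $N$ is the number of samples, and Sigmoid is intended to normalize the output.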
%
We use our algorithm to evaluate the above problem on four datasets (Movie, M2B, SCUT, and fbp5500).
%
On these four datasets, the scores of our algorithm are \{0.35, 0.42, 0.44, 0.29\}, respectively, when trained on only clean samples, and \{0.47, 0.45, 0.51, 0.33\}, respectively, when trained on samples containing noise.
%
Training on a set that contains noisy samples yields a more relaxed prediction result, which theoretically extends the decision boundary of the model.
%Note that our proposed metric is based on the distance.

In addition, we need to evaluate whether the stability-trust framework is suited for multi-classification tasks (MedMNIST (v2)~\citep{yang2023medmnist}).
%
%We used the \textit{MedMNIST} (v2) dataset as the evaluation benchmark.
%
%The \textit{MedMNIST} (v2) dataset includes twelve 2D and six 3D standardized datasets from carefully selected sources covering primary data modalities (X-ray, OCT, ultrasound, CT, and electron microscope), diverse classification tasks (binary/multi-class, ordinal regression, and multi-label) and dataset scales (from 100 to 100,000).
%
Here, this framework is evaluated only on the multi-class dataset.
%
%Following the existing work, we select AUC and ACC as evaluation methods.
%
%This task is to assign a certain number of error labels to the label space in 10\% of the training set.
%

First, we use a label enhancement algorithm~\citep{xu2019label} to convert \textit{MedMNIST} (v2) into a pseudo-LDL dataset.
%
We use ResNet-18~\citep{ayyachamy2019medical} as the baseline method.
%
We use cross-entropy and set the batch size to 128 during the model training.
%
We utilize an AdamW optimizer with an initial learning rate of 0.001 and train the network for 100 epochs, decaying the learning rate by a factor of 0.1 after 50 and 75 epochs.
%
This ResNet-18 is applied to \textit{MedMNIST} (v2) after the data have been pre-processed by the stability-trust framework.
%
It is worth noting that the image is flattened and then input to the stability-trust framework.
%
As shown in Table~\ref{tab:4dResults}, our method achieves the best overall results on the datasets with noisy labels (10\% noise).
%
%We use cross-entropy and set the batch size to 128 during the model training.
%
%We utilize an AdamW optimizer~\cite{loshchilov2017fixing} with an initial learning rate of 0.001 and train the network for 100 epochs, delaying the learning rate by 0.1 after 50 and 75 epochs.
%
%This ResNet is implemented on \textit{MedMNIST} (v2) after being enforced by the stability-trust framework.
%
%It is worth noting that the image is flattened and then input to the stability-trust framework.
%
%As shown in Table~\ref{tab:4dResults}, our method achieves optimal results on the datasets containing noisy labels.
%
%
In addition, we demonstrate the degree of performance improvement for the BFGS-LLD, LALOT, LDLSF, and LDL-LRR algorithms on the Movie, M2B, and SCUT datasets.
%
These methods used our framework to pre-process the dataset before implementation.
%
As shown in Figure~\ref{ra}, the stability-trust framework, used as a data pre-processing technique, can enhance the performance of LDL algorithms on a benchmark with noise.
%
%In terms of training time, the training time of our method takes only \textbf{2h} on the Movie dataset, while INP takes \textbf{10/~15} hours (3090 RTX GPU shader).

%\section{Performance of Non-LDL Dataset}

%We need to evaluate whether the stability-trust framework is suited for the multi-classification tasks (\textit{MedMNIST} (v2)).


%\vspace{-2mm}
%\section{limitations}
%\vspace{-2mm}
%Compared with other benchmarks, our approach underperforms the \textit{Gene} dataset may be because the label distribution space tends to be uniformly distributed.
%
%Moreover, the configuration of the parameters of the model is also a mind-numbing task conducted on the 12 benchmarks.


%

\section{Conclusion}

%
We propose a stability-trust framework to overcome the problem of noisy labels, and evaluate it on 13 benchmarks (12 label distribution datasets and 1 multi-classification dataset).
%
Our approach has two key components: one is prototype learning, which guides the model to learn a compact space, and the other is the feature decoupling strategy.
%
Our method is more efficient than existing LDL de-noising methods, and it requires neither additional knowledge nor an expensive sampling process.
%
Extensive experimental results demonstrate the effectiveness of our approach.

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Trusted re-weighting for label distribution learning\\(Supplementary Material)}
\maketitle

\section*{\centering Overview}
In this supplemental material, we introduce the implementation details of the experiment in Section I.
We show a heat map for our algorithm to eliminate correlations between features in Section II.




\section{Implementation of Experiment}
\label{s1}

We repeat 5-fold cross-validation ten times on each dataset.
%
The experimental results are presented in the form of ``mean$\pm$std'' in Tables~\ref{TT} and~\ref{TTT}.
%
Our algorithm achieves competitive results compared to other algorithms.

\begin{table*}[h]  \scriptsize
	\begin{center}
		\vspace{-0mm}
		\caption{The performance of our proposed method with the comparison algorithms on 12 datasets. The best-performing results are marked in \textbf{bold}.}
		%, with the last column (BLB) showing the degree of performance improvement for each algorithm compared to those without regularization techniques.}
	\vspace{-0mm}
	\label{TT}
	\resizebox{\linewidth}{!}{
		\begin{tabular}{c|c|cccccc}
			\toprule
			Dataset                             & Algorithm	   & Chebyshev $\downarrow$  & Clark $\downarrow$   & Canberra $\downarrow$   & K-L $\downarrow$    & Cosine $\uparrow$   & Intersection $\uparrow$           \\ 	
			\midrule 
			
			&  Ours        & \textbf{0.0480 $\boldsymbol{\pm}$ 0.0033}            & \textbf{2.1008 $\boldsymbol{\pm}$ 0.0259}       & \textbf{14.0800 $\boldsymbol{\pm}$ 0.0153}          & \textbf{0.2320 $\boldsymbol{\pm}$ 0.0094}      & \textbf{0.8406 $\boldsymbol{\pm}$ 0.0023}      & \textbf{0.7997 $\boldsymbol{\pm}$ 0.0077}                   \\
			
			&  Baseline-LDL     & 0.0509 $\pm$ \text{0.0066}           & 2.2004 $\pm$ \text{0.0055}       & 14.1449 $\pm$ \text{0.2448}          & 0.2440 $\pm$ \text{0.0024}     & 0.8345 $\pm$ \text{0.0009}      & 0.7821 $\pm$ \text{0.0016}                \\
			
			&  INP        & 0.0488 $\pm$ \text{0.0012}           & 2.1029 $\pm$ \text{0.0259}       & 14.0888 $\pm$ \text{0.0551}          & 0.2335 $\pm$ \text{0.0044}      & 0.8395 $\pm$ \text{0.0032}      & 0.7984 $\pm$ \text{0.0066}                   \\
			
			&  PCA        & 0.0482 $\pm$ \text{0.0013}           & 2.1020 $\pm$ \text{0.0212}       & 14.0835 $\pm$ \text{0.0142}          & 0.2321 $\pm$ \text{0.0087}      & 0.8390 $\pm$ \text{0.0016}      & 0.7989 $\pm$ \text{0.0099}                   \\
			
			&  LDL-LRR     & 0.0494 $\pm$ \text{0.0039}           & 2.1888 $\pm$ \text{0.0861}      & 14.2550 $\pm$ \text{0.0144}          & 0.2400 $\pm$ \text{0.0077}     & 0.8388 $\pm$ \text{0.0144}      & 0.7789 $\pm$ \text{0.0040}                     \\
			
			&  LDL-LCLR    & 0.0511 $\pm$ \text{0.0022}           & 2.2201 $\pm$ \text{0.0444}       & 14.2101 $\pm$ \text{0.0510}          & 0.2566 $\pm$ \text{0.0047}     & 0.8302 $\pm$ \text{0.0012}      & 0.7722 $\pm$ \text{0.0060}                    \\
			
			&  LDLSF       & 0.0513 $\pm$ \text{0.0030}           & 2.2221 $\pm$ \text{0.0036}       & 14.3667 $\pm$ \text{0.0265}          & 0.2445 $\pm$ \text{0.0077}     & 0.8320 $\pm$ \text{0.0010}      & 0.7701 $\pm$ \text{0.0026}                      \\
			
			&  LALOT       & 0.0505 $\pm$ \text{0.0033}           & 2.1989 $\pm$ \text{0.0194}       & 14.1855 $\pm$ \text{0.0922}          & 0.2443 $\pm$ \text{0.0088}     & 0.8297 $\pm$ \text{0.0060}      & 0.7888 $\pm$ \text{0.0013}                     \\
			
			\multirow{-9}{*}{Gene} 		            &  BFGS-LLD    & 0.0578 $\pm$ \text{0.0066}           & 2.3008 $\pm$ \text{0.0188}       & 14.3559 $\pm$ \text{0.1556}          & 0.2480 $\pm$ \text{0.0015}     & 0.8300 $\pm$ \text{0.0049}      & 0.7786 $\pm$ \text{0.0070}                     \\
			
			\midrule
			
			&  Ours        & \textbf{0.1071 $\boldsymbol{\pm}$ 0.0008}           & \textbf{0.4997 $\boldsymbol{\pm}$ 0.0064}       & \textbf{0.9710 $\boldsymbol{\pm}$ 0.0044}          & \textbf{0.0970 $\boldsymbol{\pm}$ 0.0008}      & \textbf{0.9595 $\boldsymbol{\pm}$ 0.0063}      & 0.8791 $\pm$ \text{0.0019}                     \\
			
			& Baseline-LDL     & 0.1109 $\pm$ \text{0.0033}           & 0.5119 $\pm$ \text{0.0155}       & 1.0889 $\pm$ \text{0.0111}          & 0.1355 $\pm$ \text{0.0022}     & 0.9422 $\pm$ \text{0.0333}      & 0.8744 $\pm$ \text{0.0054}                \\
			
			&  INP        & 0.1089 $\pm$ \text{0.0018}           & 0.5001 $\pm$ \text{0.0044}       & 0.9722 $\pm$ \text{0.0040}          & 0.0977 $\pm$ \text{0.0008}      & 0.9585 $\pm$ \text{0.0061}      & \textbf{0.8861 $\boldsymbol{\pm}$ 0.0006}                     \\
			
			&  PCA        & 0.1077 $\pm$ \text{0.0006}           & 0.5013 $\pm$ \text{0.0032}       & 0.9720 $\pm$ \text{0.0032}          & 0.0972 $\pm$ \text{0.0005}      & 0.9590 $\pm$ \text{0.0002}      & 0.8853 $\pm$ \text{0.0022}                     \\
			
			&  LDL-LRR     & 0.1107 $\pm$ \text{0.0009}           & 0.5019 $\pm$ \text{0.0010}      & 0.9801 $\pm$ \text{0.0061}          & 0.1045 $\pm$ \text{0.0049}     & 0.9591 $\pm$ \text{0.0022}      & 0.8772 $\pm$ \text{0.0027}                    \\
			
			&  LDL-LCLR    & 0.1177 $\pm$ \text{0.0086}           & 0.5345 $\pm$ \text{0.0040}       & 1.1533 $\pm$ \text{0.0111}          & 0.1559 $\pm$ \text{0.0030}     & 0.9360 $\pm$ \text{0.0049}      & 0.8222 $\pm$ \text{0.0011}                    \\
			
			&  LDLSF       & 0.1155 $\pm$ \text{0.0045}           & 0.5339 $\pm$ \text{0.0062}       & 1.1152 $\pm$ \text{0.0050}          & 0.1540 $\pm$ \text{0.0041}     & 0.9445 $\pm$ \text{0.0020}      & 0.8551 $\pm$ \text{0.0044}                     \\
			
			&  LALOT       & 0.1221 $\pm$ \text{0.0110}           & 0.5440 $\pm$ \text{0.0033}       & 1.1112 $\pm$ \text{0.0040}          & 0.1503 $\pm$ \text{0.0008}     & 0.9477 $\pm$ \text{0.0022}      & 0.8559 $\pm$ \text{0.0002}                    \\
			
			\multirow{-9}{*}{Movie} 		            &  BFGS-LLD    & 0.1310 $\pm$ \text{0.0032}           & 0.5230 $\pm$ \text{0.0022}       & 1.1170 $\pm$ \text{0.0024}          & 0.1595 $\pm$ \text{0.0155}     & 0.9400 $\pm$ \text{0.0003}      & 0.8491 $\pm$ \text{0.0018}                    \\
			
			\midrule
			
			&  Ours        & \textbf{0.3691 $\boldsymbol{\pm}$ 0.0021}           & \textbf{1.1541 $\boldsymbol{\pm}$ 0.0131}       & \textbf{2.0880 $\boldsymbol{\pm}$ 0.0056}          & \textbf{0.4872 $\boldsymbol{\pm}$ 0.0026}      & \textbf{0.8028 $\boldsymbol{\pm}$ 0.0033}      & \textbf{0.6800 $\boldsymbol{\pm}$ 0.0082}                   \\
			
			& Baseline-LDL     & 0.3997 $\pm$ \text{0.0077}           & 1.2889 $\pm$ \text{0.0056}       & 2.1992 $\pm$ \text{0.2887}          & 0.5006 $\pm$ \text{0.0044}     & 0.7887 $\pm$ \text{0.0099}      & 0.6558 $\pm$ \text{0.0065}                \\
			
			&  INP        & 0.3763 $\pm$ \text{0.0022}           & 1.1560 $\pm$ \text{0.0102}       & 2.0889 $\pm$ \text{0.0055}          & 0.4880 $\pm$ \text{0.0023}      & 0.7998 $\pm$ \text{0.0022}      & 0.6703 $\pm$ \text{0.0033}                   \\
			
			&  PCA        & 0.3731 $\pm$ \text{0.0017}           & 1.1555 $\pm$ \text{0.0123}       & 2.0893 $\pm$ \text{0.0048}          & 0.4883 $\pm$ \text{0.0112}      & 0.7999 $\pm$ \text{0.0091}      & 0.6745 $\pm$ \text{0.0044}                   \\
			
			
			&  LDL-LRR     & 0.3793 $\pm$ \text{0.0010}           & 1.1590 $\pm$ \text{0.0167}      & 2.1084 $\pm$ \text{0.0034}          & 0.4998 $\pm$ \text{0.0012}     & 0.7931 $\pm$ \text{0.0023}      & 0.6634 $\pm$ \text{0.0077}                     \\
			
			&  LDL-LCLR    & 0.4040 $\pm$ \text{0.0082}           & 1.2444 $\pm$ \text{0.0045}       & 2.2000 $\pm$ \text{0.0009}          & 0.4996 $\pm$ \text{0.0013}     & 0.7760 $\pm$ \text{0.0079}      & 0.6555 $\pm$ \text{0.0012}                 \\
			
			&  LDLSF       & 0.4159 $\pm$ \text{0.0055}          & 1.3105 $\pm$ \text{0.0041}       & 2.2155 $\pm$ \text{0.0076}          & 0.5002 $\pm$ \text{0.0006}     & 0.7552 $\pm$ \text{0.0004}      & 0.6234 $\pm$ \text{0.0033}                    \\
			
			&  LALOT       & 0.3881 $\pm$ \text{0.0099}           & 1.4883 $\pm$ \text{0.0012}       & 2.1257 $\pm$ \text{0.0268}          & 0.4990 $\pm$ \text{0.0008}     & 0.7549 $\pm$ \text{0.0021}      & 0.6620 $\pm$ \text{0.0053}                    \\
			
			\multirow{-9}{*}{M2B} 		            &  BFGS-LLD    & 0.3811 $\pm$ \text{0.0044}          & 1.3650 $\pm$ \text{0.0002}       & 2.1992 $\pm$ \text{0.0095}          & 0.4995 $\pm$ \text{0.0005}     & 0.7699 $\pm$ \text{0.0040}      & 0.6532 $\pm$ \text{0.0009}                   \\
			
			\midrule
			
			& Ours        & \textbf{0.3851 $\boldsymbol{\pm}$ 0.0034}           & 1.2580 $\pm$ \text{0.0191}       & \textbf{2.1901 $\boldsymbol{\pm}$ 0.0042}          & \textbf{0.4900 $\boldsymbol{\pm}$ 0.0036}      & \textbf{0.7007 $\boldsymbol{\pm}$ 0.0002}      & \textbf{0.6955 $\boldsymbol{\pm}$ 0.0004}                   \\
			
			&  Baseline-LDL        & 0.4008 $\pm$ \text{0.0008}     & 1.3365 $\pm$ \text{0.0155}       & 2.2110 $\pm$ \text{0.0339}          & 0.5119 $\pm$ \text{0.0044}     & 0.6697 $\pm$ \text{0.0012}      & 0.6489 $\pm$ \text{0.0055}                \\
			
			&  INP        & 0.3895 $\pm$ \text{0.0021}           & 1.2640 $\pm$ \text{0.0111}       & 2.1995 $\pm$ \text{0.0095}          & 0.4911 $\pm$ \text{0.0030}      & 0.6990 $\pm$ \text{0.0002}      & 0.6904 $\pm$ \text{0.0001}                   \\
			
			&  PCA        & 0.3903 $\pm$ \text{0.0023}           & 1.2642 $\pm$ \text{0.0155}       & 2.1942 $\pm$ \text{0.0044}          & 0.4903 $\pm$ \text{0.0017}      & 0.6992 $\pm$ \text{0.0001}      & 0.6912 $\pm$ \text{0.0022}                   \\
			
			
			&  LDL-LRR     & 0.3901 $\pm$ \text{0.0011}             &1.3000 $\pm$ \text{0.0122}      &2.2006 $\pm$ \text{0.0039}          & 0.5088 $\pm$ \text{0.0026}     & 0.6992 $\pm$ \text{0.0023}      & 0.6889 $\pm$ \text{0.0007}                  \\
			
			&  LDL-LCLR    & 0.4240 $\pm$ \text{0.0042}           & 1.3444 $\pm$ \text{0.0055}       & 2.2450 $\pm$ \text{0.0016}          & 0.5131 $\pm$ \text{0.0022}     & 0.6261 $\pm$ \text{0.0005}      & 0.5500 $\pm$ \text{0.0012}                  \\
			
			&  LDLSF       & 0.4360 $\pm$ \text{0.0015}           & \textbf{1.2185 $\boldsymbol{\pm}$ 0.0022}       & 2.2159 $\pm$ \text{0.0076}          & 0.5120 $\pm$ \text{0.0006}     & 0.6261 $\pm$ \text{0.0004}      & 0.5534 $\pm$ \text{0.0030}                     \\
			
			&  LALOT       & 0.3999 $\pm$ \text{0.0009}           & 1.4983 $\pm$ \text{0.0012}       & 2.2207 $\pm$ \text{0.0158}          & 0.4995 $\pm$ \text{0.0002}     & 0.6549 $\pm$ \text{0.0020}      & 0.6411 $\pm$ \text{0.0044}                    \\
			
			\multirow{-9}{*}{SCUT} 		            &  BFGS-LLD    & 0.3992 $\pm$ \text{0.0055}          & 1.5656 $\pm$ \text{0.0163}       & 2.2832 $\pm$ \text{0.0080}          & 0.4966 $\pm$ \text{0.0011}     & 0.6491 $\pm$ \text{0.0040}      & 0.6333 $\pm$ \text{0.0013}                    \\
			
			
			%
			
			\bottomrule
		\end{tabular}%
	}
	\vspace{-1mm}
\end{center}
\end{table*}

\begin{table*}[h] \scriptsize
\begin{center}
	\vspace{-0mm}
	\caption{The performance of our proposed method with the comparison algorithms on 12 datasets.  The best-performing results are marked in \textbf{bold}.}
	%, with the last column (BLB) showing the degree of performance improvement for each algorithm compared to those without regularization techniques.}
\vspace{-0mm}
\label{TTT}
\resizebox{\linewidth}{!}{
	\begin{tabular}{c|c|cccccc}
		\toprule
		Dataset                             & Algorithm	   & Chebyshev $\downarrow$  & Clark $\downarrow$   & Canberra $\downarrow$   & K-L $\downarrow$    & Cosine $\uparrow$   & Intersection $\uparrow$           \\
		\midrule
		
		&  Ours        & \textbf{0.1212 $\boldsymbol{\pm}$ 0.0001}           & \textbf{1.1666 $\boldsymbol{\pm}$ 0.0123}       & \textbf{2.0921  $\boldsymbol{\pm}$ 0.0200}          & \textbf{0.1031 $\boldsymbol{\pm}$ 0.0012}      & \textbf{0.9708 $\boldsymbol{\pm}$ 0.0012}      & \textbf{0.8600 $\boldsymbol{\pm}$ 0.0035}                 \\
		
		&  Baseline-LDL    & 0.1287 $\pm$ \text{0.0091}           & 1.1899 $\pm$ \text{0.0333}       & 2.1177 $\pm$ \text{0.0432}          & 0.1100 $\pm$ \text{0.0033}     & 0.9610 $\pm$ \text{0.0022}      & 0.8447 $\pm$ \text{0.0064}                \\
		
		&  INP        & 0.1251 $\pm$ \text{0.0002}           & 1.1890 $\pm$ \text{0.0120}       & 2.0980  $\pm$ \text{0.0223}          & 0.1053 $\pm$ \text{0.0009}      & 0.9643 $\pm$ \text{0.0015}      & 0.8501 $\pm$ \text{0.0025}                 \\
		
		&  PCA        & 0.1220 $\pm$ \text{0.0030}           & 1.1755 $\pm$ \text{0.0111}       & 2.0999  $\pm$ \text{0.0123}          & 0.1044 $\pm$ \text{0.0014}      & 0.9650 $\pm$ \text{0.0007}      & 0.8534 $\pm$ \text{0.0031}                 \\
		
		&  LDL-LRR     & 0.1222 $\pm$ \text{0.0030}           & 1.1733 $\pm$ \text{0.0038}      & 2.0992 $\pm$ \text{0.0095}          & 0.1077 $\pm$ \text{0.0077}     & 0.9633 $\pm$ \text{0.0021}      & 0.8512 $\pm$ \text{0.0066}                  \\
		
		&  LDL-LCLR    & 0.1277 $\pm$ \text{0.0016}          & 1.1969 $\pm$ \text{0.0039}       & 2.1194 $\pm$ \text{0.0046}          & 0.1135 $\pm$ \text{0.0006}     & 0.9588 $\pm$ \text{0.0044}      & 0.8483 $\pm$ \text{0.0014}                  \\
		
		&  LDLSF       & 0.1270 $\pm$ \text{0.0028}           & 1.1909 $\pm$ \text{0.0164}       & 2.1846 $\pm$ \text{0.0119}          & 0.1193 $\pm$ \text{0.0041}     & 0.9609 $\pm$ \text{0.0019}      & 0.8460 $\pm$ \text{0.0007}                     \\
		
		&  LALOT       & 0.1306 $\pm$ \text{0.0022}           & 1.1921 $\pm$ \text{0.0015}       & 2.1111 $\pm$ \text{0.0171}          & 0.1120 $\pm$ \text{0.0015}     & 0.9430 $\pm$ \text{0.0019}      & 0.8400 $\pm$ \text{0.0004}                   \\
		
		\multirow{-9}{*}{fbp5500} 		            &  BFGS-LLD    & 0.1299 $\pm$ \text{0.0049}           & 1.4655 $\pm$ \text{0.0041}       & 2.1675 $\pm$ \text{0.0024}          & 0.1135 $\pm$ \text{0.0055}     & 0.9595 $\pm$ \text{0.0030}      & 0.8419 $\pm$ \text{0.0018}                   \\
		
		\midrule
		
		&  Ours       & \textbf{0.1421 $\boldsymbol{\pm}$ 0.0025}           & \textbf{1.3588 $\boldsymbol{\pm}$ 0.0321}       & \textbf{2.6798 $\boldsymbol{\pm}$ 0.0026}          & \textbf{0.2006 $\boldsymbol{\pm}$ 0.0033}      & \textbf{0.9429 $\boldsymbol{\pm}$ 0.0023}      & \textbf{0.8334 $\boldsymbol{\pm}$ 0.0029}                   \\
		
		&  Baseline-LDL     & 0.1489 $\pm$ \text{0.0023}           & 1.3994 $\pm$ \text{0.0451}       & 2.7006 $\pm$ \text{0.0903}          & 0.2118 $\pm$ \text{0.0022}     & 0.9288 $\pm$ \text{0.0019}      & 0.8196 $\pm$ \text{0.0044}                \\
		
		&  INP        & 0.1456 $\pm$ \text{0.0021}           & 1.3651 $\pm$ \text{0.0441}       & 2.6888 $\pm$ \text{0.0023}          & 0.2017 $\pm$ \text{0.0012}      & 0.9394 $\pm$ \text{0.0026}      & 0.8247 $\pm$ \text{0.0077}                   \\
		
		&  PCA        & 0.1432 $\pm$ \text{0.0020}           & 1.3660 $\pm$ \text{0.0454}       & 2.6889 $\pm$ \text{0.0019}          & 0.2008 $\pm$ \text{0.0010}      & 0.9390 $\pm$ \text{0.0009}      & 0.8320 $\pm$ \text{0.0023}                   \\
		
		&  LDL-LRR     & 0.1426 $\pm$ \text{0.0033}           & 1.3659 $\pm$ \text{0.0211}      & 2.7125 $\pm$ \text{0.0422}          & 0.2149 $\pm$ \text{0.0007}     & 0.9390 $\pm$ \text{0.0013}      & 0.8277 $\pm$ \text{0.0044}                      \\
		
		&  LDL-LCLR    & 0.1515 $\pm$ \text{0.0022}           & 1.5923 $\pm$ \text{0.0117}       & 2.7779 $\pm$ \text{0.0239}          & 0.2244 $\pm$ \text{0.0030}     & 0.9262 $\pm$ \text{0.0062}      & 0.8189 $\pm$ \text{0.0098}                   \\
		
		&  LDLSF       & 0.1488 $\pm$ \text{0.0024}           & 1.3889 $\pm$ \text{0.0086}       & 2.7672 $\pm$ \text{0.0660}          & 0.2302 $\pm$ \text{0.0044}     & 0.9111 $\pm$ \text{0.0051}      & 0.8117 $\pm$ \text{0.0022}                     \\
		
		&  LALOT       & 0.1479 $\pm$ \text{0.0010}           & 1.3659 $\pm$ \text{0.0099}       & 2.6956 $\pm$ \text{0.0144}          & 0.2221 $\pm$ \text{0.0064}     & 0.9311 $\pm$ \text{0.0021}      & 0.8107 $\pm$ \text{0.0008}                    \\
		
		\multirow{-9}{*}{RAF-ML} 		            &  BFGS-LLD    & 0.1499 $\pm$ \text{0.0009}           & 1.6656 $\pm$ \text{0.0066}       & 2.7101 $\pm$ \text{0.0211}          & 0.2541 $\pm$ \text{0.0055}     & 0.9204 $\pm$ \text{0.0023}      & 0.8157 $\pm$ \text{0.0050}                    \\
		\midrule
		
		&  Ours        & \textbf{0.2770 $\boldsymbol{\pm}$ 0.0081}           & \textbf{2.2309 $\boldsymbol{\pm}$ 0.0113}       & \textbf{5.1097 $\boldsymbol{\pm}$ 0.0051}          & \textbf{0.5104 $\boldsymbol{\pm}$ 0.0054}      & \textbf{0.8987 $\boldsymbol{\pm}$ 0.0044}      & \textbf{0.7987 $\boldsymbol{\pm}$ 0.0016}                  \\
		
		&  Baseline-LDL     & 0.2887 $\pm$ \text{0.0040}           & 2.3008 $\pm$ \text{0.0151}       & 5.4999 $\pm$ \text{0.1555}          & 0.6060 $\pm$ \text{0.0042}     & 0.8667 $\pm$ \text{0.0066}      & 0.7774 $\pm$ \text{0.0031}                \\
		
		&  INP        & 0.2777 $\pm$ \text{0.0021}           & 2.2374 $\pm$ \text{0.0110}       & 5.1163 $\pm$ \text{0.0018}          & 0.5111 $\pm$ \text{0.0029}      & 0.8807 $\pm$ \text{0.0049}      & 0.7891 $\pm$ \text{0.0014}                  \\
		
		&  PCA       & 0.2771 $\pm$ \text{0.0082}           & 2.2343 $\pm$ \text{0.0144}       & 5.1167 $\pm$ \text{0.0072}          & 0.5110 $\pm$ \text{0.0036}      & 0.8837 $\pm$ \text{0.0012}      & 0.7924 $\pm$ \text{0.0041}                  \\
		
		&  LDL-LRR     & 0.2802 $\pm$ \text{0.0021}           & 2.2441 $\pm$ \text{0.0051}      & 5.2002 $\pm$ \text{0.0023}          & 0.5189 $\pm$ \text{0.0035}     & 0.8662 $\pm$ \text{0.0042}      & 0.7789 $\pm$ \text{0.0014}                     \\
		
		&  LDL-LCLR    & 0.2994 $\pm$ \text{0.0045}           & 2.4900 $\pm$ \text{0.0012}       & 6.9609 $\pm$ \text{0.0041}          & 0.6056 $\pm$ \text{0.0031}     & 0.7110 $\pm$ \text{0.0021}      & 0.7110 $\pm$ \text{0.0088}                     \\
		
		&  LDLSF       & 0.3007 $\pm$ \text{0.0002}           & 2.7887 $\pm$ \text{0.0057}       &5.6101 $\pm$ \text{0.0118}         & 0.6396 $\pm$ \text{0.0022}     & 0.7939 $\pm$ \text{0.0098}      & 0.7660 $\pm$ \text{0.0007}                       \\
		
		&  LALOT       & 0.3133 $\pm$ \text{0.0021}           & 2.3141 $\pm$ \text{0.0016}       & 5.5336 $\pm$ \text{0.0241}          & 0.5233 $\pm$ \text{0.0012}     & 0.8595 $\pm$ \text{0.0550}      & 0.7214 $\pm$ \text{0.0049}                    \\
		
		\multirow{-9}{*}{Twitter} 		            &  BFGS-LLD    & 0.3114 $\pm$ \text{0.0044}           & 2.5511 $\pm$ \text{0.0028}       & 5.7145 $\pm$ \text{0.0041}          & 0.5461 $\pm$ \text{0.0153}     & 0.8335 $\pm$ \text{0.0055}      & 0.7744 $\pm$ \text{0.0020}     \\
		
		\midrule
		
		&  Ours        & \textbf{0.2801 $\boldsymbol{\pm}$ 0.0088}           & \textbf{2.3169 $\boldsymbol{\pm}$ 0.0064}       & \textbf{5.2188 $\boldsymbol{\pm}$ 0.0159}          & \textbf{0.5314 $\boldsymbol{\pm}$ 0.0033}      & \textbf{0.8406 $\boldsymbol{\pm}$ 0.0044}      & \textbf{0.7832 $\boldsymbol{\pm}$ 0.0025}                 \\
		
		&  Baseline-LDL     & 0.3134 $\pm$ \text{0.0021}           & 2.6641 $\pm$ \text{0.1051}       & 5.5599 $\pm$ \text{0.0130}          & 0.6007 $\pm$ \text{0.0022}     & 0.8330 $\pm$ \text{0.0099}      & 0.7661 $\pm$ \text{0.0034}                \\
		
		&  INP        & 0.2816 $\pm$ \text{0.0031}           & 2.3356 $\pm$ \text{0.0097}       & 5.2222 $\pm$ \text{0.0159}          & \textbf{0.5314 $\boldsymbol{\pm}$ 0.0013}      & \textbf{0.8406 $\boldsymbol{\pm}$ 0.0014}      & 0.7741 $\pm$ \text{0.0025}                 \\
		
		&  PCA        & 0.2813 $\pm$ \text{0.0074}           & 2.3226 $\pm$ \text{0.0061}       & 5.2210 $\pm$ \text{0.0103}          & 0.5317 $\pm$ 0.0072      & 0.8405 $\pm$ 0.0055      & 0.7749 $\pm$ \text{0.0092}                 \\
		
		&  LDL-LRR     & 0.2885 $\pm$ \text{0.0012}           & 2.3209 $\pm$ \text{0.0174}      & 5.3459 $\pm$ \text{0.0229}          & 0.5558 $\pm$ \text{0.0032}     & 0.8401 $\pm$ \text{0.0040}      & 0.7699 $\pm$ \text{0.0037}                    \\
		
		&  LDL-LCLR    & 0.2970 $\pm$ \text{0.0009}           & 2.4444 $\pm$ \text{0.0063}       & 6.1600 $\pm$ \text{0.0041}         & 0.6222 $\pm$ \text{0.0013}     & 0.7919 $\pm$ \text{0.0029}      & 0.7090 $\pm$ \text{0.0070}                     \\
		
		&  LDLSF       & 0.3301 $\pm$ \text{0.0009}           & 2.8888 $\pm$ \text{0.0459}       & 5.9152 $\pm$ \text{0.0121}          & 0.6100 $\pm$ \text{0.0021}     & 0.8139 $\pm$ \text{0.0098}      & 0.7360 $\pm$ \text{0.0037}                      \\
		
		&  LALOT       & 0.3411 $\pm$ \text{0.0026}           & 2.9140 $\pm$ \text{0.0019}       & 5.3333 $\pm$ \text{0.0243}          & 0.5737 $\pm$ \text{0.0012}     & 0.8225 $\pm$ \text{0.0202}     & 0.7144 $\pm$ \text{0.0004}                    \\
		
		\multirow{-9}{*}{Flickr} 		            &  BFGS-LLD    & 0.3200 $\pm$ \text{0.0041}           & 2.7517 $\pm$ \text{0.0060}       & 5.8149 $\pm$ \text{0.0048}          & 0.5961 $\pm$ \text{0.0099}     & 0.8131 $\pm$ \text{0.0011}      & 0.7407 $\pm$ \text{0.0077}       \\
		
		\bottomrule
	\end{tabular}%
}
\vspace{-1mm}
\end{center}
\end{table*}


The stability-trust framework (no positive incentive noise) can offer a more compact regression space with the help of prototype learning.
%
However, our algorithm with positive incentive noise can extend the decision boundary.
%
To verify our theory, we use t-SNE~\citep{van2014accelerating} to visualize both the predicted label distributions and the ground-truth label distributions of the raw dataset.
%
We evaluate four datasets (Movie, M2B, SCUT, and fbp5500) and the results are visualized in Figure~\ref{fig-TT}.


\section{Heat Maps of Feature Correlations}


We use heat maps to evaluate feature correlations in the Gene dataset to verify that our method has the capability of attribute decoupling (see Figure~\ref{fig-heat}).
%
Figure~\ref{fig-heat}(a) shows strong correlations among the features of the raw dataset, whereas Figure~\ref{fig-heat}(b) shows only weak correlations among the features after applying our method.




\begin{figure*}[t]\scriptsize
\begin{center}
\tabcolsep 1pt
\vspace{-2mm}
\begin{tabular}{@{}cccc@{}}
	\includegraphics[width = 0.23\textwidth]{img/movie1.png}               &
	\includegraphics[width = 0.23\textwidth]{img/m2b1.png}                &
	\includegraphics[width = 0.23\textwidth]{img/scut1.png}                 &
	\includegraphics[width = 0.23\textwidth]{img/fbp55001.png}               
	\\
	
	\includegraphics[width = 0.23\textwidth]{img/movie2.png}               &
	\includegraphics[width = 0.23\textwidth]{img/m2b2.png}                 &
	\includegraphics[width = 0.23\textwidth]{img/scut2.png}                  &
	\includegraphics[width = 0.23\textwidth]{img/fbp55002.png}                 \\
	
	
	(a) Movie &
	(b) M2B & 
	%NL & 
	(c) SCUT & 
	
	(d) fbp5500
	\\
\end{tabular}
\end{center}
\vspace{-2mm}
\caption{This figure visualizes the data distribution in the label space, with the first row indicating the spatial distribution of the raw dataset and the second row indicating the predicted label distribution.}
\vspace{-2mm}
\label{fig-TT}
\end{figure*}





\begin{figure*}[h]\scriptsize
\begin{center}
\tabcolsep 1pt
\vspace{-2mm}
\begin{tabular}{@{}cccc@{}}
	\includegraphics[width = 0.46\textwidth]{img/Figure_1.png}               &
	\includegraphics[width = 0.46\textwidth]{img/Figure_2.png}       \\
	
	
	(a) Raw dataset &
	(b) Human gene with our method  \\
\end{tabular}
\end{center}
\vspace{-2mm}
\caption{This figure shows the feature correlation of the Gene dataset before (a) and after (b) applying our method. Our approach has a clear ability to decouple features.}
\vspace{-2mm}
\label{fig-heat}
\end{figure*}




\end{document}
