%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx  (with options: `sigconf')
%% 
%% IMPORTANT NOTICE:
%% 
%% For the copyright see the source file.
%% 
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%% 
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%% 
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass command.
\documentclass[prologue,table,sigconf]{acmart}
%% NOTE that a single column version is required for 
%% submission and peer review. This can be done by changing
%% the \documentclass[...]{acmart} in this template to 
%% \documentclass[manuscript,screen]{acmart}
%% 
%% To ensure 100% compatibility, please check the white list of
%% approved LaTeX packages to be used with the Master Article Template at
%% https://www.acm.org/publications/taps/whitelist-of-latex-packages 
%% before creating your document. The white list page provides 
%% information on how to submit additional LaTeX packages for 
%% review and adoption.
%% Fonts used in the template cannot be substituted; margin 
%% adjustments are not allowed.

%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
  \providecommand\BibTeX{{%
    \normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}

%% Rights management information.  This information is sent to you
%% when you complete the rights form.  These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.



%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}

%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear styles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%

%%
%% The majority of ACM publications use numbered citations and
%% references.  The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}

%%
%% end of the preamble, start of the body of the document source.
\usepackage{subfigure}
\usepackage{bm}
\usepackage{amsmath,amsfonts}
\usepackage[ruled,linesnumbered]{algorithm2e}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\usepackage{lineno,hyperref}
\usepackage{caption}
\usepackage{multirow}
\usepackage{bbding}

\copyrightyear{2024}
\acmYear{2024}
\setcopyright{acmlicensed}\acmConference[MM '24]{Proceedings of the 32nd ACM International Conference on Multimedia}{October 28-November 1, 2024}{Melbourne, VIC, Australia}
\acmBooktitle{Proceedings of the 32nd ACM International Conference on Multimedia (MM '24), October 28-November 1, 2024, Melbourne, VIC, Australia}
\acmDOI{10.1145/3664647.3681414}
\acmISBN{979-8-4007-0686-8/24/10}


\begin{document}

%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Deep Incomplete Multi-View Semi-Supervised Multi-Label Learning Network with Unbiased Loss}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.


\author{Quanjiang Li}
\affiliation{%
  \institution{National University of Defense Technology}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Changsha}
  \country{China}}
\email{liquanjiang@nudt.edu.cn}


\author{Tingjin Luo}
\authornote{Corresponding author}
\affiliation{%
  \institution{National University of Defense Technology}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Changsha}
  \country{China}}
\email{tingjinluo@hotmail.com}

\author{Mingdie Jiang}
\affiliation{%
  \institution{National University of Defense Technology}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Changsha}
  \country{China}}
\email{jiangmingdie20@nudt.edu.cn}

\author{Jiahui Liao}
\affiliation{%
  \institution{National University of Defense Technology}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Changsha}
  \country{China}}
\email{liaojiahui19@nudt.edu.cn}


\author{Zhangqi Jiang}
\affiliation{%
  \institution{National University of Defense Technology}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Changsha}
  \country{China}}
\email{jiangzq@nudt.edu.cn}


%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{Quanjiang Li and Tingjin Luo et al.}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.
\begin{abstract}
  Due to the explosive growth in data sources and label categories, multi-view multi-label learning has garnered widespread attention. However, multi-view multi-label data often exhibits incomplete features and  a huge number of unlabeled instances, due to the technical limitations  and high  cost of manual labeling in practice. Learning for such simultaneous missing of view features and labels is crucial but rarely studied, particularly when the labeled samples  are limited. In this paper, we tackle this problem by proposing a novel Deep Incomplete Multi-View Semi-Supervised Multi-Label Learning method (DIMvSML).
Specifically, to improve high-level representations of missing features, a deep graph network is first employed to recover the feature information with structural similarity relations. Meanwhile, we design the structure-specific deep feature extractors to obtain discriminative information and preserve the cross-view consistency for the recovered data with an instance-level contrastive loss. Furthermore, to eliminate the bias of the estimate of the risk that the semi-supervised multi-label methods minimise, we design a safe estimation framework with an unbiased loss and improve its empirical performance by using pseudo-labels of unlabeled data. Besides, we provide both the theoretical proof of better estimate variance and the intuitive explanation of our debiased framework. Finally, extensive experimental results on public datasets validate the superiority of DIMvSML compared with state-of-the-art methods.
\end{abstract}

%%
%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.
%%
\begin{CCSXML}
<ccs2012>
   <concept>
       <concept_id>10010147.10010257.10010293.10010294</concept_id>
       <concept_desc>Computing methodologies~Neural networks</concept_desc>
       <concept_significance>500</concept_significance>
       </concept>
   <concept>
       <concept_id>10010147.10010257.10010282.10011305</concept_id>
       <concept_desc>Computing methodologies~Semi-supervised learning settings</concept_desc>
       <concept_significance>500</concept_significance>
       </concept>
 </ccs2012>
\end{CCSXML}

\ccsdesc[500]{Computing methodologies~Neural networks}
\ccsdesc[500]{Computing methodologies~Semi-supervised learning settings}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\keywords{Deep Learning, Incomplete Multi-view Learning, Semi-supervised Classification, Multi-label Learning.}

%% A "teaser" image appears between the author and affiliation
%% information and the body of the document, and typically spans the
% %% page.
% \begin{teaserfigure}
%   \includegraphics[width=\textwidth]{sampleteaser}
%   \caption{Seattle Mariners at Spring Training, 2010.}
%   \Description{Enjoying the baseball game from the third-base
%   seats. Ichiro Suzuki preparing to bat.}
%   \label{fig:teaser}
% \end{teaserfigure}

% \received{20 February 2007}
% \received[revised]{12 March 2009}
% \received[accepted]{5 June 2009}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle

\section{Introduction}
Multi-label learning has attracted increasing attention due to its widespread application in areas such as text
classification  \cite{ueda2002parametric,chang2020taming}, image annotation  \cite{cabral2011matrix,shu2016image}, and computer
vision tasks \cite{everingham2010pascal,gong2013deep}. Furthermore, with the exponential increase of data sources and feature extraction methods, it is no longer adequate  to  describe and analyze instances from a singular perspective \cite{fang2021animc}. In real-world applications, objects are usually processed in multiple views, like face information captured by diverse sensors and image data stored using both video and audio techniques. Doubtlessly, the utilization of multi-view data enables comprehensive and accurate description of observed instances \cite{gao2015multi,zhou2022rehearsal}. Besides, multi-view data offers abundant data presentation modes, which can be combined with multi-label to represent the rich information content  and semantic structure of complex data \cite{xiao2024new,zhou2022open}. Therefore, this paper focuses on the multi-view multi-label classification task, namely MVMLC.

\begin{figure}[h]
\includegraphics[width=0.35\textwidth]{./figure/newshiyitu.pdf}
  \caption{The example of incomplete multi-view semi-supervised multi-label data.}\label{fig1}
\end{figure}

For MVMLC, many methods  have been proposed, such as the manifold regularization MVMLC \cite{luo2013multiview}, potential semantic-aware LSA-MML \cite{zhang2018latent} and label-embedding based method \cite{zhu2018multi}. However, these traditional methods assume that the given data has complete views and labels, which is violated in
practice. On the one hand, the heterogeneous data collected from multiple sources may contain missing views due to the quality of storage equipment and the difficulty of storage methods \cite{chen2022adaptively}. For instance, in multi-view multimedia annotation tasks, video, audio and subtitle serve as distinct views. It is common to face situations where not all multimedia content encompasses all three views \cite{wen2023deep}. On the other hand, since manual tagging of all labels is both challenging and expensive, label information in real datasets frequently exhibits varying degrees of incompleteness \cite{liu2018svm}. Clearly, the absence of views and labels detrimentally impacts MVMLC. In recent years, the challenges posed by the two types of missing data have received widespread attention and some works which can simultaneously handle both  issues have been proposed. iMvWL \cite{tan2018incomplete} combined two weighted matrix factorization models into a unified framework to extract the consensus representation and derive a subspace from missing views with multiple labels. NAIML  \cite{li2021concise} exploited both consensus across multiple views and the global and local structures among multiple labels from rank constraint. By incorporating missing information into the weighted fusion and classification module,
the proposed DD-IMvMLC \cite{wen2023deep} could effectively explore available data and label information to obtain the discriminative feature extractor and classifier. DICNet \cite{liu2023dicnet} emphasized the utilization of stacked auto-encoders to exploit the high-level semantic representations of samples.  Besides, it introduced an incomplete instance-level contrastive learning scheme to capture consistent representations. Liu et al. \cite{liu2023incomplete} proposed
LMVCAT, which employed two transformer-style modules for cross-view feature aggregation and multi-label classification and utilized an adaptively weighted view fusion module to obtain view-consistent embedding features. 

These MVMLC methods under incomplete data only focus on the partial absence of multiple labels, where each instance has a subset of labels that can be utilized to infer the missing ones. However, multi-view data not only suffers from missing features but also encounters unlabeled instances in reality. For example, the features and labels of tumor patient data are obtained from various examinations and tests \cite{miller2022modeling}. Certain patients may face limitations in undergoing MRI or PET scans, or in providing blood samples for laboratory testing, thereby leading to corresponding views being unavailable. Besides, factors such as research focus, resource constraints, and annotation complexity greatly contribute to patient labels, including tumor types, grading, treatment responses and so on, remaining unannotated. In aggregate, real-world datasets often present simultaneous problems of missing views and numerous unlabeled instances as shown in Fig.~\ref{fig1}. There are few methods available today that can effectively address both issues.  It is well known that in supervised multi-label scenarios, correlations within labels are beneficial for label recovery. However, in semi-supervised problems \cite{luo2018semi,sun2023semi}, where a sample may lack any annotations, relying solely on label correlations to handle the missing labels is insufficient. In fact, to address the problem of semi-supervised multi-label learning under incomplete views, we need to focus on the following three aspects: i) Since the stability of the model will be seriously affected by the limited availability of data, we should consider maximizing the reconstruction of missing information. Rather than establishing losses only on observed data  \cite{wen2018incomplete,zhuge2023absent}, we should enhance feature semantics from a data recovery perspective and maintain the stability of the subsequent modules \cite{jiang2024deep}. 
ii) High-level representations should be explored to improve the contribution of features to classification with limited labeled instances.
iii) Improperly incorporating unlabeled data to construct an unsupervised loss undermines the unbiased estimation of ideal risk from supervised losses, which makes the algorithm often lack the support of statistical theory \cite{van2020survey,zhou2018brief}. Besides, it introduces bias to the solution of  supervised losses optimization, resulting in potential performance degradation of semi-supervised methods. In Fig. \ref{fig22}, we select ten relatively balanced labels of Yeast to perform the experiment of label distribution simulation and only set $20\%$ of the test data to be labeled. We can observe that when introducing unlabeled data and learning with traditional biased semi-supervised losses, the performance (marked in purple) tends to be worse than using only supervised data (marked in green). Moreover, the harm caused by the bias expands as the number of label categories increases, which needs to be  controlled in multi-label learning. Therefore, the third aspect is to improve the framework of loss functions to make semi-supervised model safe and robust.

\begin{figure}[t]
\includegraphics[width=0.38\textwidth]{./figure/dis.pdf}
  \caption{The label distribution obtained by three different methods on Yeast. }\label{fig22}
\end{figure}

To tackle these problems, we propose a novel deep incomplete multi-view semi-supervised multi-label learning method named DIMvSML. Specifically, for mitigating the negative influence of missing views, DIMvSML employs the Graph Neural Network to recover the missing data by capitalizing on the existing similarity relations. Based on the auto-encoder structures, we design feature extractors  and decoder networks to learn high-level semantic and discriminative representations from all views. In addition, to preserve the cross-view consensus, we adopt the instance-level contrastive loss to enhance the mutual information between different views. An unbiased version of loss function is designed to eliminate the risk of latent downgrade due to the introduction of the unlabeled data and we prove theoretically that this  framework also has a lower estimate variance. As depicted in Fig. \ref{fig22},  utilizing our unbiased loss function leads to  stable performance compared to the traditional semi-supervised loss when using unlabeled data. During training, the pseudo-labels are assigned to explore the additional supervisory information contained in unlabeled data. The main contributions of our work are summarised as follows:
\begin{itemize}
\item[$\bullet$] We propose the DIMvSML to solve this crucial, but rarely studied problem. To our knowledge, this is the first GNN-based  multi-view  multi-label learning framework  capable of handling both incomplete views and few labeled instances.
\item[$\bullet$] DIMvSML is a unified framework  designed to recover the absent views, preserve the high-level semantic representations with cross-view consensus and provide a safe risk estimation framework simultaneously.   
\item[$\bullet$] Extensive experimental results show that our DIMvSML outperforms other compared approaches in almost all cases, demonstrating its superiority and effectiveness.
\end{itemize}

\section{Methodology}
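% The method consists of three main modules: GNN-based view completion, multi-view multi-label representation learning, and trustworthy (safe) semi-supervised learning.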
\textbf{Notations and Problem Formulation.} Suppose an incomplete multi-view dataset with $N$ instances and $V$ views, i.e., $\mathcal{X}=\{\bm{X}^{(v)}\}_{v=1}^{V}$, where $\bm{X}^{(v)}=\{\bm{x}_{i}^{(v)}\}_{i=1}^{N} \in \mathbb{R}^{N\times d_{v}}$ is the $d_{v}$ dimensional feature matrix of the $v$-th view. Let $\bm{Y} \in\{0,1\}^{N \times C}$ represent the label matrix and $C$ is
the number of categories. Besides, $\bm{y}_{i} \in\{0,1\}^{C}$ is a row vector that denotes  the label of the $i$-th instance. 
$\bm{M} \in \{0,1\}^{N \times V}$ is an indicator matrix, where $\bm{M}_{i,j}=1$ indicates that the $i$-th instance has the feature of the $j$-th view, whereas $\bm{M}_{i,j}=0$ means the feature is missing and set as `NaN'. For convenience, we define $n_{l}$ and $n_{u}$ as the numbers of labeled and unlabeled instances, respectively, which satisfy $n_{l} \ll N$ and $n_{l} + n_{u} = N$. We also denote $\mathcal{L}$ and $\mathcal{U}$ as the index sets of labeled and unlabeled instances, respectively.

\begin{figure*}[h]
\includegraphics[width=0.9\textwidth]{./figure/newflow.pdf}
  \caption{The main framework of our proposed DIMvSML, which is composed of three modules: (a) GNN-based feature completion module; (b) Multi-view representation learning module and (c) Safe semi-supervised learning module.}\label{fig2}
\end{figure*}

Obviously, the missing features will affect the learning of complementarity and consistency across all views and prevent subsequent
modules from using valid information. Furthermore, the scarcity
of labeled samples will seriously limit the learning of multi-label
semantics, which demands better utilization of unlabeled data to
prevent the harm caused by the bias of the semi-supervised loss. To address these challenges, we propose a novel deep learning framework named DIMvSML. The main framework of our DIMvSML is illustrated in Fig. \ref{fig2}. Specifically, DIMvSML consists of three main modules: (a) GNN-based feature completion module for recovering the incomplete feature information; (b) Multi-view representation learning module for capturing the high-level semantic representations and discriminative information from all views; (c) Safe semi-supervised multi-label learning module for providing an unbiased semi-supervised risk estimator with lower variance. 
\subsection{GNN-based Feature Completion Module}
Since the absence of features will lead to the poor performance of deep learning \cite{shorten2019survey}, data recovery is required to realize data augmentation. Recently, GNN-based approaches have garnered attention in data recovery owing to their capacity to extract the geometric details embedded within data \cite{liang2022spatial, wang2022incomplete}. Sato  \cite{sato2023graph}  further presented theoretical evidence that substantiates the  effectiveness of GNNs in recovering hidden features. Therefore, we employ GNNs to recover the missing views by leveraging the similarity  relations  between the available data.

Firstly, we construct the view-specific graph $\bm{S}^{(v)} \in \mathbb{R}^{N \times N}$  through $k$-nearest neighbors ($k$-NN) algorithm. $\bm{S}^{(v)}$ demonstrates the relations between  the  corresponding instances of existing data in the $v$-th view, where $\bm{S}_{i, j}^{(v)}=1$ means $\bm{M}_{i, v} \bm{M}_{j, v}=1$ and $\bm{x}_{j}^{(v)}$ is the neighbor of $\bm{x}_{i}^{(v)}$. Considering the consistency  across multiple views, similarity relations between instances in existing views are valid for the missing views. Therefore, we transfer the established graph relations to find the available instances that are associated with the missing ones in each view. Then the  transferred $k$-NN graph can be obtained by
\begin{equation}\label{eq1}
\bm{K}^{(v)}  =\sum_{k  =  1, k \neq v}^{V} \bm{S}^{(k)} \operatorname{diag}\left(\bm{M}_{: , v}\right), 
\end{equation}
where the operator $\operatorname{diag}(\cdot)$ forms a diagonal matrix from a vector, and $\bm{M}_{: , v}$ denotes the $v$-th column of the matrix $\bm{M}$. Secondly,  we employ $\bm{K}^{(v)}$ as the adjacency matrix  and the related existing  features  as the input nodes in each view-specific GNN to recover the missing data. After the  propagation of relation information  over $\bm{K}^{(v)}$ in the first layer of the GNN, the initially reconstructed data can be obtained by
\begin{equation}\label{eq2}
\hat{\bm{x}}_{i}^{(v)}=\sigma\left(\bm{b}_{v}+\sum_{\bm{K}_{i, j}^{(v)} \geq 1} \bm{K}_{i, j}^{(v)}  \bm{\omega}_{v} \bm{x}_{j}^{(v)}\right),
\end{equation}
where $\bm{b}_{v}$ and  $\bm{\omega}_{v}$ denote the bias and transformation matrix of
the $v$-th view, respectively.  In our experiments, we set $\sigma$ as the  rectified linear unit (ReLU) activation
function.  Finally, we combine the reconstructed missing data with the existing data to acquire the recovered matrices $\{\widetilde{\bm{X}}^{(v)}\}_{v=1}^{V}$.
Moreover, in order to consolidate
 the recovery performance, 
we minimize the rebuilding loss $L_{\mathrm{rb}}$ only over recovered data as
follows:
\begin{equation}
L_{\mathrm{rb}}=\sum_{v=1}^{V} \sum_{\bm{K}_{i, j}^{(v)} \geq 1}\left\|\widetilde{\bm{X}}_{i, :}^{(v)}-\bm{x}_{j}^{(v)}\right\|_{2}^{2}(1-\bm{M}_{i,v}).
\end{equation}

\subsection{Multi-view Representation Learning Module}
Due to the small number of labeled instances in semi-supervised learning (SSL), we need an efficient representation module to simultaneously explore the high-level semantic of features, unique characteristic for each view and substantial connections between features and labels. Therefore, we adopt the deep neural network rather than the shallow linear model for adaptively extracting the advanced representations. Besides, considering that different views have their distinctive characteristics, the  feature extraction network $E^{(v)}(\cdot)$ and decoder network $D^{(v)}(\cdot)$ should be tailored for each view.
Following \cite{xie2016unsupervised, wen2023deep}, we adopt the well-known network structure of stacked auto-encoder. These view-specific networks are all composed of multi-layer perceptrons but with different hidden layer dimensions. By constructing such network structure, we can effectively capture the discriminative information inherent in each view. Therefore, we set the structure of both the $E^{(v)}(\cdot)$ and $D^{(v)}(\cdot)$  as four stacked linear layers with  ReLU activation functions, i.e., \{Linear, ReLU, Linear, ReLU, Linear, ReLU, Linear\}. Specifically, for the $d_{v}$ dimensional feature data of the $v$-th view, the dimensions of the four linear layers in the encoder network are adaptively set as $0.8d_{v}$, $0.8d_{v}$, $1500$, and $d$, where $d$ is the corresponding dimension of the last layer and can be adjusted according to the number of label categories. In reverse, the dimensions of the decoder network are set as $1500$, $0.8d_{v}$, $0.8d_{v}$, $d_{v}$. Through extraction of networks, we can get the representation matrix $\bm{Z}^{(v)}=E^{(v)}(\widetilde{\bm{X}}^{(v)})$ by minimizing
\begin{equation}
L_{\mathrm{rc}}=\frac{1}{n_{l} V} \sum_{v=1}^{V}\left[\sum_{i \in \mathcal{L}}\left(\left\|\widetilde{\bm{X}}_{i,:}^{(v)}-D^{(v)}\left(\bm{Z}_{i,:}^{(v)}\right)\right\|_{2}^{2}  /d_{v}\right)\right].
\end{equation}

To integrate the coded features, we obtain a common representation matrix $\bm{Z} \in \mathbb{R}^{N \times d V}$ by concatenating the view-specific representations, which maximizes the retention of recovered features while minimizing complexity. For the purpose of effectively exploring  the correlations between features and labels, we employ a deep network classifier which  transforms the feature space into a probability space  associated with the labels, i.e.,  the elements of the output  can be regarded as the probability of the instance belonging to the corresponding label. Therefore, we design the classifier as the combination of two  stacked linear layers  and a Sigmoid activation function, where the dimensions are set as $d/2$ and $C$. For improving prediction accuracy, we use  the cross entropy \cite{chen2019learning}  to guide model training. Suppose $\bm{F} \in \mathbb{R}^{N \times C}$ is the prediction matrix generated by the classifier; we employ the following loss for labeled instances:
\begin{equation}
L_{\mathrm{bce}}=-\frac{1}{n_{l} C} \sum_{i \in \mathcal{L}} \sum_{j=1}^{C}\left(Y_{ij} \log \left(F_{ij}\right)+\left(1-Y_{ij}\right) \log \left(1-F_{ij}\right)\right).
\end{equation}


\subsection{Safe Semi-supervised Learning Module}
To prevent the introduction of unlabeled data from causing performance degradation, we provide a safe semi-supervised loss function framework in this section. Assume that the shared parameters of the whole network are denoted by $\Theta$ and the ultimate objective of our training framework is  to minimise an ideal risk $\mathcal{R}$ over a data distribution $p(x, y)$. Since the distribution $p(x, y)$ is  unknowable, we generally minimise an empirical risk $\hat{\mathcal{R}}(\Theta)$, which acts as a surrogate for $\mathcal{R}$  and is computed on a sample of $N$ i.i.d.\ points drawn from $p(x, y)$. Considering supervised  losses from  each labeled instance, we suppose $L_{\mathrm{rc}}=\frac{1}{n_{l}} \sum_{i \in \mathcal{L}} G(\Theta ; x_{i})$ and $L_{\mathrm{bce}}=\frac{1}{n_{l}} \sum_{i \in \mathcal{L}} T(\Theta ; x_{i};y_{i})$. Besides, we let loss $L\left(\Theta ; x_{i};y_{i}\right)=G\left(\Theta ; x_{i}\right)+\lambda_{1} T\left(\Theta ; x_{i};y_{i}\right)$. Then the supervised risk to minimise is
\begin{eqnarray}\label{eq_rcc}
\hat{\mathcal{R}}_{C C}(\Theta)  =  \frac{1}{n_{l}} \sum_{i \in \mathcal{L}} L\left(\Theta ; x_{i}; y_{i}\right).
\end{eqnarray}
This traditional supervised risk estimate is unbiased and converges pointwise to $\mathcal{R}(\Theta)$. However, a notable limitation of this  framework in the semi-supervised setting is that a considerable amount of unlabeled data is not utilized.  Therefore, we employ the instance-level contrastive loss \cite{lin2022dual} on the unlabeled data to maximize the mutual information between the representations of different views. To calculate the mutual
information, we use
a Softmax activation function $\sigma_{s}$  at the last layer of the
encoder and then we obtain  that ${\widetilde{\bm{z}}}_{i}^{(v)}=\sigma_{s}(\bm{z}_{i}^{(v)})$, which is treated as a distribution probability vector  \cite{ji2019invariant}. In other words, $\widetilde{\bm{z}}^{(v)}$ and $\widetilde{\bm{z}}^{\left(v^{*}\right)}$ $(1 \leq v<v^{*} \leq V)$ can be seen as the distribution of two discrete cluster assignment variables over $d$ classes. Therefore, we can compute the joint probability distribution as below:
\begin{eqnarray}
\bm{P}^{\left(v, v^{*}\right)}=\frac{1}{n_{u}} \sum_{i \in \mathcal{U}} \left(\widetilde{\bm{z}}_{i}^{(v)}\right)^{T} \widetilde{\bm{z}}_{i}^{\left(v^{*}\right)}.
\end{eqnarray}
Then the mutual
information between the $v$-th and  $v^{*}$-th view can be calculated through
\begin{equation}
\ell_{v, v^{*}}=-\sum_{t=1}^{d} \sum_{t^{\prime}=1}^{d} \bm{P}_{t, t^{\prime}}^{\left(v, v^{*}\right)} \ln \left(\frac{\bm{P}_{t, t^{\prime}}^{\left(v, v^{*}\right)}}{\left(\bm{P}_{t}^{(v)}\right)^{\alpha+1}\left(\bm{P}_{t^{\prime}}^{\left(v^{*}\right)}\right)^{\alpha+1}}\right),
\end{equation}
where $\bm{P}^{\left(v\right)}$ and $\bm{P}^{\left(v^{*}\right)}$ are the marginal probability distributions of the $v$-th and $v^{*}$-th views, respectively. In our experiments, we simply fix the balance parameter $\alpha$ to $9$. The loss ${L_\mathrm{icl}}^{-}$ under unlabeled data can be obtained by enumerating the mutual information between different views, i.e., 
\begin{equation}
{L_\mathrm{icl}}^{-}=\frac{1}{V} \sum_{1 \leq v<v^{*} \leq V} \ell_{v, v^{*}}.
\end{equation}
To facilitate subsequent analysis,
we turn ${L_\mathrm{icl}}^{-}$ into the form ${L_\mathrm{icl}}^{-}=\frac{1}{n_{u}} \sum_{i \in \mathcal{U}} H\left(\Theta ; x_{i}\right)$. The concrete form
of the three instance-level losses $G\left(\Theta ; x_{i}\right)$, $T\left(\Theta ; x_{i};y_{i}\right)$ and $H\left(\Theta ; x_{i}\right)$ is easily acquired and will be listed in the Appendix.
After introducing unlabeled data, we aim to minimise the SSL risk:
\begin{eqnarray}\label{eq10}
\hat{\mathcal{R}}_{S S L}(\Theta)  =  \frac{1}{n_{l}} \sum_{i \in \mathcal{L}} L\left(\Theta ; x_{i}; y_{i}\right)+\frac{\lambda_2}{n_{u}} \sum_{i \in \mathcal{U}} H\left(\Theta ; x_{i}\right).
\end{eqnarray}
Since unlabeled data lacks the necessary label guidance for classification, introducing it  often carries the risk of potential performance degradation, especially when the data distribution assumption is not satisfied \cite{li2014towards}. Even though the learning methods presented in Eq.~(\ref{eq10}) can handle some basic SSL problems, the SSL risk estimate is biased, even asymptotically, which not only hinders the use of  statistical learning theory, but also damages the actual effect of the model \cite{mey2022improved}. Moreover, the harm caused by the bias becomes prominent when the number of label categories increases. Hence, in multi-label classification, we should compensate for this bias in the loss function. Inspired by Schmutz et al. \cite{schmutz2022don}, we  obtain the following unbiased version of the SSL estimator:
\begin{equation}\label{eq11}
\begin{split}
\hat{\mathcal{R}}_{{DeSSL }}(\Theta)&=\frac{1}{n_{l}} \sum_{i \in \mathcal{L}} L\left(\Theta ; x_{i} ; y_{i}\right)+\frac{\lambda_{2}}{n_{u}} \sum_{i \in \mathcal{U}} H\left(\Theta ; x_{i}\right)\\
&\quad-\frac{\lambda_{2}}{n_{l}} \sum_{i \in \mathcal{L}} H\left(\Theta ; x_{i}\right).
\end{split}
\end{equation}
This framework uses labeled data to annul the bias and does not rely on any data distribution assumption. In addition to being unbiased, this framework also has a favorable estimation variance. To measure the variance of the risk estimate in Eq.~\eqref{eq11}, we require information about which instances are labeled. Therefore, we introduce a binary random variable $r\sim \mathcal{B}(\pi)$ that states whether or not a data point is labeled: $r_i=1$ denotes that the $i$-th instance is labeled and $r_i=0$ denotes that its label is missing, where $\pi \in(0,1)$ is the probability of being labeled. Under the assumption that the missingness of a label is independent of its feature and value, we can obtain the following theorem:
\begin{theorem}\label{11}
When $\lambda_{1}$ is fixed, the function $\lambda_{2} \mapsto \mathbb{V}\left(\hat{\mathcal{R}}_{D e S S L}(\Theta) \mid r\right)$ reaches its minimum at
\begin{equation*}
\begin{split}
{\lambda_{2}}^{*}=\frac{n_{u}}{n}\Biggl(&\frac{\operatorname{Cov}(G(\Theta ; x, y), H(\Theta ; x))}{\mathbb{V}(H(\Theta ; x))}\\
&{}+\frac{\operatorname{Cov}\left(\lambda_{1} T(\Theta ; x, y), H(\Theta ; x)\right)}{\mathbb{V}(H(\Theta ; x))}\Biggr),
\end{split}
\end{equation*}
and at ${\lambda_{2}}^{*}$:
\[
\begin{aligned}
\left.\mathbb{V}\left(\hat{\mathcal{R}}_{D e S S L}(\Theta) \mid r\right)\right|_{{\lambda_{2}}^{*}} & =\left(1-\frac{n_{u}}{n} \rho_{L, H}^{2}\right) \mathbb{V}\left(\hat{\mathcal{R}}_{C C}(\Theta)\mid r\right) \\
& \leq \mathbb{V}\left(\hat{\mathcal{R}}_{C C}(\Theta)\mid r\right),
\end{aligned}
\]
where $\rho_{L, H}=\operatorname{Corr}(L(\Theta; x, y), H(\Theta; x))$.
\end{theorem}
A detailed proof of this theorem is presented in the Appendix. From Theorem~\ref{11}, we know that the variance of the unbiased estimate $\hat{\mathcal{R}}_{D e S S L}(\Theta)$ is no greater than that of $\hat{\mathcal{R}}_{C C}(\Theta)$, which uses only supervised data, when $\lambda_{1}$ and $\lambda_{2}$ meet certain conditions.
This theorem also guides us to simultaneously
adjust
$\lambda_{1}$ and $\lambda_{2}$ to achieve a stable risk estimate in practice. When our estimate is unbiased and its variance is smaller, we can theoretically ensure that our semi-supervised module is safe when introducing the unlabeled data, i.e., no worse than using only supervised data.
To validate the correctness of our unbiased framework analysis, we train
our DIMvSML on Yeast and split the test dataset into $20\%$ labeled and $80\%$ unlabeled data to calculate the $\hat{\mathcal{R}}_{S S L}(\Theta)$ and $\hat{\mathcal{R}}_{{DeSSL }}(\Theta)$ risks, which
we compare to the oracle risk estimate computed on the entire test set. For the variance experiment, we split
the test set 50 times to estimate the variance of the risk estimator. Besides, we compute ${\lambda_{2}}^{*}$ using the entire test set. As shown in Fig.~\ref{figure 66}, the result illustrates that $\hat{\mathcal{R}}_{{DeSSL}}(\Theta)$ is unbiased for any value of $\lambda_{2}$ and that its variance can be optimised by adjusting $\lambda_{2}$ when $\lambda_{1}$ is fixed. Besides, it can be seen that $\mathbb{V}(\hat{\mathcal{R}}_{D e S S L}(\Theta))$ is less than $\mathbb{V}(\hat{\mathcal{R}}_{S S L}(\Theta))$ in most cases and that the theoretical value of ${\lambda_{2}}^{*}$ is close to the minimum point calculated from the actual sampling.
\begin{figure}[htb]	
\subfigure{\includegraphics[width=0.235\textwidth]{./figure/debiased.pdf}}
\subfigure{\includegraphics[width=0.236\textwidth]{./figure/var.pdf}}
  \caption{Intuitive explanation of our theoretical analysis. (Left) Risk estimate value for $\hat{\mathcal{R}}_{S S L}(\Theta)$ and $\hat{\mathcal{R}}_{{DeSSL }}(\Theta)$ compared to the true value of the risk. (Right) The influence of $\lambda_{2}$ on the ratio $\mathbb{V}(\hat{\mathcal{R}}_{D e S S L}(\Theta))/\mathbb{V}(\hat{\mathcal{R}}_{S S L}(\Theta))$ when $\lambda_{1}=1$. }\label{figure 66}
\end{figure}


Due to their capability to supply new labeled data for training, pseudo-label methods have gained significant prominence in deep semi-supervised classification tasks~\cite{jia2020semi}. Therefore, we assign pseudo-labels to exploit additional supervisory information and improve model performance. Since the classification confidence of the classifier is low in the early stage of training, we take the output of the classifier as labels for unlabeled instances halfway through the total number of training epochs. Then all instances are incorporated into the training process.




\subsection{Training Strategy}
The training strategy employed in DIMvSML contains two phases: pre-training and alternating optimization. In the pre-training phase, we only use the rebuilding loss $L_{\mathrm{rb}}$ to simply train the view-specific GNNs. During the alternating optimization phase, the three proposed modules complement each other, resulting in simultaneous enhancement of classification performance. Denoting ${L_\mathrm{icl}}^{+}=\frac{1}{n_{l}} \sum_{i \in \mathcal{L}} H\left(\Theta; x_{i}\right)$, the overall loss of our DIMvSML in the alternating optimization phase can be formulated as
\begin{equation}\label{eq12}
L=L_{\mathrm{rc}}+\lambda_{1}L_{\mathrm{bce}}+\lambda_{2}({L_\mathrm{icl}}^{-}-{L_\mathrm{icl}}^{+}),
\end{equation}
where $\lambda_{1}$ and $\lambda_{2}$ are penalty coefficients. The training process of our DIMvSML is summarized in the Appendix.
% \begin{table*}[!ht]
% \captionsetup{justification=raggedright,singlelinecheck=false}
% 		\caption{A brief description of data sets.}\label{table 11}
% 		%\centering
% 		%\tiny
% 		%\scriptsize
%   \renewcommand\arraystretch{1.0}
%   \renewcommand\tabcolsep{6.0pt} % column spacing
% 		\begin{tabular}{lllllll}
% 			\hline
% 			View&Yeast&VOC 2007&Corel 5k&Esp Game&IAPR TC-12 &MIR FLICKR\\
% 			\hline 
%     1
% 			 %&Genetic Expression(79)&DenseHue(100)&DenseHue(100)&DenseHue(100)&DenseHue(100)&DenseHue(100)\\
%     &Genetic Expression(79)&DenseHue(100)&DenseHue(100)&DenseHue(100)&DenseHue(100)&DenseHue(100)\\
%     2
%               &Phylogenetic Profile(24)&DenseSift(1000)&DenseSift(1000)&DenseSift(1000)&DenseSift(1000)&DenseSift(1000)\\
             
%     3
%               &-&GIST(512)&GIST(512)&GIST(512)&GIST(512)&GIST(512)\\
%     4         &-&HSV(4096)&HSV(4096)&HSV(4096)&HSV(4096)&HSV(4096)\\
%     5         &-&RGB(4096)&RGB(4096)&RGB(4096)&RGB(4096)&RGB(4096)\\
%     6         &-&LAB(4096)&LAB(4096)&LAB(4096)&LAB(4096)&LAB(4096)\\
%             \hline
%     Label&14&20&260&268&291&38\\
%     Instance&2417&9963&4999&20770&19627&25000\\
    
% 			\hline
% 		\end{tabular}
% 	\end{table*}



\begin{table*}[!ht]
		\caption{Ranking Loss, ACC, AP and AUC of different methods on six public datasets with  LER fixed to 20$\%$ and PER  fixed to 50$\%$. The best result on each row is bolded and the second-best result is underlined.}\label{table 1}
		\centering
		%\tiny
		%\scriptsize
  \renewcommand\arraystretch{1.0}
  \renewcommand\tabcolsep{4.0pt} % column spacing
		\begin{tabular}{cccccccc>{\columncolor{gray!25}}c}
			\hline
			Dataset&Metric&TM3L&iMvWL&NAIML&DD-IMvMLC&DICNet&LMVCAT&DIMvSML\\
			\hline 
			 &Ranking Loss$\downarrow$ &25.21$\pm$0.55&\underline{21.86$\pm$0.68}&23.67$\pm$0.80&25.63$\pm$4.89&22.52$\pm$0.49&24.54$\pm$0.78&\textbf{20.04$\pm$0.64}\\
            
    &ACC $\uparrow$&69.53$\pm$0.31&72.13$\pm$0.40&\underline{75.53$\pm$0.34}&71.37$\pm$2.16&70.46$\pm$1.02&73.51$\pm$0.73&\textbf{78.34$\pm$0.72}\\
            Yeast
            
			 &AP$\uparrow$&66.73$\pm$0.57&\underline{69.92$\pm$0.31}&69.26$\pm$0.73&67.72$\pm$3.75&69.32$\pm$0.51&67.57$\pm$0.99&\textbf{72.48$\pm$0.41}\\
            
			&AUC$\uparrow$&55.42$\pm$1.19&49.99$\pm$0.76&59.09$\pm$1.19&52.42$\pm$2.06&54.30$\pm$1.06&\underline{60.41$\pm$1.45}&\textbf{62.47$\pm$1.58}\\
			
			\hline 
			 &Ranking Loss$\downarrow$ &30.39$\pm$0.32&22.62$\pm$0.77&22.59$\pm$0.42&16.93$\pm$0.44&\underline{16.40$\pm$0.35}&17.33$\pm$0.79&\textbf{14.42$\pm$0.90}\\
            
    &ACC $\uparrow$&98.65$\pm$0.02&97.46$\pm$0.09&98.63$\pm$0.03&98.68$\pm$0.00&98.69$\pm$0.01&\underline{98.69$\pm$0.01}&\textbf{98.70$\pm$0.01}\\
            Corel 5k
			&AP$\uparrow$&22.53$\pm$0.75&14.74$\pm$1.96&27.61$\pm$0.64&26.17$\pm$0.45&\underline{28.54$\pm$0.74}&26.97$\pm$0.87&\textbf{29.90$\pm$1.79}\\
            
			&AUC$\uparrow$&58.53$\pm$1.31&50.18$\pm$0.74&\underline{61.67$\pm$0.63}&54.22$\pm$0.83&55.80$\pm$1.07&56.62$\pm$1.70&\textbf{66.38$\pm$2.83}\\
            
			\hline 
			 &Ranking Loss$\downarrow$ &29.05$\pm$0.41&32.54$\pm$1.24&29.36$\pm$1.72&29.76$\pm$2.36&24.67$\pm$0.41&\underline{20.57$\pm$1.38}&\textbf{17.87$\pm$0.65}\\
            
    &ACC $\uparrow$&92.29$\pm$0.04&87.83$\pm$0.25&88.37$\pm$2.45&92.48$\pm$0.04&\underline{92.68$\pm$0.05}&91.22$\pm$0.55&\textbf{93.41$\pm$0.09}\\
            VOC 2007
			&AP$\uparrow$&44.44$\pm$0.48&41.02$\pm$0.71&41.84$\pm$1.00&43.28$\pm$1.06&46.45$\pm$0.45&\underline{51.21$\pm$1.32}&\textbf{56.05$\pm$0.57}\\
            
            &AUC$\uparrow$&59.60$\pm$0.20&50.08$\pm$0.14&49.99$\pm$1.33&52.33$\pm$1.01&58.36$\pm$0.84&\underline{73.45$\pm$1.40}&\textbf{76.30$\pm$1.42}\\


			\hline 
			&Ranking Loss$\downarrow$ &31.76$\pm$0.29&25.28$\pm$0.61&24.13$\pm$0.33&21.75$\pm$1.61&19.98$\pm$0.25&\underline{19.45$\pm$0.26}&\textbf{18.26$\pm$0.32}\\
            
    &ACC $\uparrow$&98.15$\pm$0.02&96.90$\pm$0.09&\underline{98.25$\pm$0.00}&98.24$\pm$0.00&98.24$\pm$0.01&98.24$\pm$0.01&\textbf{98.26$\pm$0.01}\\
            Esp Game
			 &AP$\uparrow$&18.02$\pm$0.17&15.76$\pm$2.13&25.03$\pm$0.26&21.56$\pm$1.35&25.00$\pm$0.35&\underline{26.04$\pm$0.28}&\textbf{26.17$\pm$0.56}\\
            
			&AUC$\uparrow$&55.82$\pm$0.45&49.92$\pm$0.18&58.94$\pm$0.49&52.69$\pm$1.85&53.98$\pm$0.18&\underline{62.00$\pm$0.89}&\textbf{65.10$\pm$0.58}\\

			\hline 
			 &Ranking Loss$\downarrow$ &25.40$\pm$0.19&23.62$\pm$0.80&24.63$\pm$0.57&19.72$\pm$0.99&17.16$\pm$0.23&\underline{16.50$\pm$0.42}&\textbf{15.21$\pm$1.69}\\
            
    &ACC $\uparrow$&97.90$\pm$0.02&96.61$\pm$0.06&97.04$\pm$1.22&98.03$\pm$0.01&\underline{98.04$\pm$0.01}&98.02$\pm$0.01&\textbf{98.06$\pm$0.01}\\
            IAPR TC-12
			&AP$\uparrow$&22.53$\pm$0.19&16.50$\pm$1.23&19.88$\pm$0.23&22.26$\pm$0.63&25.18$\pm$0.16&\underline{27.11$\pm$0.57}&\textbf{27.90$\pm$2.35}\\
            
			&AUC$\uparrow$&59.18$\pm$0.28&49.93$\pm$0.38&50.25$\pm$0.45&55.48$\pm$1.14&59.20$\pm$0.16&\underline{65.13$\pm$0.94}&\textbf{69.04$\pm$3.64}\\

			\hline 
			  &Ranking Loss$\downarrow$ &24.14$\pm$0.24&19.95$\pm$0.29&19.89$\pm$2.95&17.12$\pm$1.10&15.30$\pm$0.34&\underline{14.09$\pm$0.77}&\textbf{13.83$\pm$0.25}\\
            
    &ACC $\uparrow$&86.90$\pm$0.12&83.87$\pm$0.05&84.35$\pm$0.98&\underline{87.69$\pm$0.03}&87.58$\pm$0.07&87.61$\pm$0.64&\textbf{88.78$\pm$0.17}\\
            MIR FLICKR
			 &AP$\uparrow$&47.55$\pm$0.44&44.41$\pm$0.43&45.75$\pm$1.37&50.71$\pm$1.69&54.29$\pm$0.55&\underline{55.46$\pm$0.09}&\textbf{57.56$\pm$0.45}\\
            
			&AUC$\uparrow$&58.30$\pm$0.28&50.13$\pm$0.30&49.43$\pm$0.90&60.94$\pm$2.92&63.19$\pm$0.43&\underline{72.38$\pm$1.14}&\textbf{74.36$\pm$0.30}\\
			\hline
		\end{tabular}
\end{table*}

\begin{figure*}[htp]
\centering
\includegraphics[width=0.78\textwidth]{./figure/biaozhu.pdf}\\
\subfigure[Yeast]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_yeast_label.pdf}}
\subfigure[Corel 5k]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_corel5k_label.pdf}}
\subfigure[VOC 2007]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_pascal_label.pdf}}\\
\subfigure[Esp Game]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_espgame_label.pdf}}
\subfigure[IAPR TC-12]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_iaprtc12_label.pdf}}
\subfigure[MIR FLICKR]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_mirflickr_label.pdf}}
	\caption{AUC comparisons on six datasets with LER varying from $15\%$ to $55\%$  while PER=$50\%$.}\label{figure 3}
\end{figure*}


\section{Experiments}
\subsection{Experimental Setup}
\textbf{Datasets.}
In our experiments, 
six public multi-view multi-label datasets are selected to validate the proposed method, i.e., \textbf{Yeast}~\cite{guillaumin2010multimodal}, \textbf{Corel 5k}~\cite{duygulu2002object}, \textbf{VOC 2007}~\cite{everingham2010pascal}, \textbf{Esp Game}~\cite{2004Labeling}, \textbf{IAPR TC-12}~\cite{grubinger2006iapr}, and \textbf{MIR FLICKR}~\cite{huiskes2008mir}. For the first dataset, we
pick Genetic Expression and Phylogenetic Profile as two views; for the other five datasets, we choose six types of features as six views, i.e., GIST, HSV, DenseHue, DenseSift, RGB, and LAB.


\noindent\textbf{Comparison Methods.} To validate the effectiveness of DIMvSML, we compare it with six state-of-the-art approaches, which can be categorized into two groups: traditional methods and deep methods. Traditional methods include TM3L~\cite{zhao2021two}, iMvWL~\cite{tan2018incomplete}, and NAIML~\cite{li2021concise}, while deep methods include DD-IMvMLC~\cite{wen2023deep}, DICNet~\cite{liu2023dicnet}, and LMVCAT~\cite{liu2023incomplete}. Five of them are introduced in the preliminaries, and TM3L is a multi-view multi-label classification method that can handle partial multi-label data. Note that, except for TM3L, the other five methods can handle missing features and missing labels simultaneously. Therefore, for TM3L, the missing views are filled with the average instance calculated from the available instances of the same view in our experiments. For all comparison methods, we prioritize the parameter settings recommended in the original code implementations or specified in their respective papers.




\noindent\textbf{Data Preparation.} Each dataset is divided into training, validation and test sets in the ratio of 7:1:2. To simulate the partial view setting, we randomly remove some views of samples from each set. Concretely, according to the pre-set partial example ratio (PER), PER$\%$ of the instances are randomly selected as incomplete instances, each of which randomly misses between $1$ and $V-1$ views (at least one view per instance is available to
keep the total number of samples constant). For the SSL setting, according to the pre-set labeled example ratio (LER), we randomly select LER$\%$ of the instances in the training set as labeled instances.

\noindent\textbf{Implementation Details.}
The $k$-NN graphs are constructed based on the Euclidean distance metric, where the neighbor number $k$ is fixed to 10 for all datasets. The Adam optimizer is employed with an initial learning rate of 0.0001 for optimizing the training loss. In addition, Ranking Loss (RL), Accuracy (ACC), Average Precision (AP) and adapted area under curve (AUC) are adopted as four evaluation metrics. All the experimental results are derived from ten independent runs of the methods, and the final average results along with their corresponding standard deviations are presented. Our model is implemented in PyTorch and run on one NVIDIA GeForce RTX 4090 GPU with 24\,GB of memory.

\begin{figure*}[htp]
\centering
\includegraphics[width=0.78\textwidth]{./figure/biaozhu.pdf}\\
\subfigure[Yeast]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_yeast_feature.pdf}}
\subfigure[Corel 5k]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_corel5k_feature.pdf}}
\subfigure[VOC 2007]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_pascal_feature.pdf}}\\
\subfigure[Esp Game]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_espgame_feature.pdf}}
\subfigure[IAPR TC-12]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_iaprtc12_feature.pdf}}
\subfigure[MIR FLICKR]{\includegraphics[width=5.3cm,height=4.4cm]{./figure/AUC_mirflickr_feature.pdf}}
    \caption{AUC comparisons on six datasets with PER varying from $0\%$ to $90\%$  while LER=$20\%$.}\label{figure 4}
\end{figure*}
% \renewcommand\arraystretch{1}
% \renewcommand\tabcolsep{6pt} % column spacing
% 	\begin{table}[H]
% 		\caption{\label{table 1}Table showing the specific conditions of each dataset.}
% 		\centering
% 		\normalsize
% 		\begin{tabular}{cccccc}
% 			\hline		Dataset&Instance&Label&View
%    \\
% 		\hline
% 			Yeast& 2417 & 14  & 2\\
% 		Corel 5k &4999 &260 &6 \\
%            VOC 2007 &9963 &20 &6\\
%             Esp Game  & 20770 & 268 & 6 \\
%             IAPR TC-12&19627 &291 &6\\
%         MIR FLICKR &25000 &38 &6\\
% 			\hline
%       \end{tabular}
% 		\label{tabel1}
% 	\end{table} 

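% Six comparison methods are selected in total. Three deep-learning works by Jie Wen: DIMC, DICNET, LMVCAT. Two methods that handle incomplete multi-view multi-label data: iMvWL (IJCAI) and NAIML (TPAMI). One more method, TM3L, handles missing labels in multi-view multi-label learning; when applying it, the missing features are filled in by mean imputation.
% Metrics chosen: ACC, AUC, AP, Ranking Loss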
\subsection{Performance Evaluation}
%\noindent\textbf{Performance Evaluation.} 
To comprehensively verify our DIMvSML, we compare it with six competitive methods from two key aspects: i) missing views and ii) insufficient labels. For missing views, we fix LER to 20$\%$, while PER is selected from $\{0\%,10\%,30\%,50\%,70\%,90\%\}$. For insufficient labels, we fix PER to 50$\%$, while LER is chosen from $\{15\%,20\%,25\%,30\%,35\%,40\%,45\%\}$. The statistical results are presented in Table~\ref{table 1}, Fig.~\ref{figure 3} and Fig.~\ref{figure 4}. Table~\ref{table 1} displays the four metrics with LER fixed at $20\%$ and PER fixed at $50\%$, while Fig.~\ref{figure 3} and Fig.~\ref{figure 4} show the AUC when LER and PER change, respectively. The additional results of Ranking Loss and AP are shown in the Appendix.

Regarding missing views, we find that 1) when PER$=0\%$, DIMvSML achieves the best performance on all datasets, which indicates that the proposed method is also stable and effective in classification tasks under complete views. 2) With PER increasing from $10\%$ to $90\%$, DIMvSML still outperforms the other six methods. This shows that our method adequately addresses the problem of missing views and is beneficial for feature completion. 3) DIMvSML has the capability to address scenarios where the absence of certain views significantly impacts the classification process. In other words, even when all compared methods fail to deliver satisfactory results, our approach continues to perform well. For example, DIMvSML and the most competitive method achieve AUC of $70.27\%$ and $61.78\%$ when PER=$10\%$ on Corel 5k, an improvement of about $8.5$ percentage points. As for insufficient labels, we have the following observations: 1) Our method achieves better performance than all compared methods in almost all cases. 2) Our DIMvSML is robust to few labeled instances since it consistently exhibits relatively promising performance with lower LER. For example, DIMvSML and the second-best method LMVCAT achieve AUC of $74.79\%$ and $73.8\%$ when LER=$45\%$ on IAPR TC-12. When LER=$15\%$, the AUC of DIMvSML is $71.64\%$, superior to the $61.25\%$ of LMVCAT.

\subsection{Ablation Study} 
The ablation experiments on VOC 2007 and IAPR TC-12 are carried out to thoroughly investigate the impact of the three critical modules of DIMvSML. When the GNN-based feature completion module ($S_{1}$) is disabled, we employ the average strategy to fill the missing data. When the multi-view representation learning module ($S_{2}$) is disabled, we simply concatenate each view and remove the loss $L_{\mathrm{rc}}$. As for the safe semi-supervised learning module, we compare the classification performance under the $\hat{\mathcal{R}}_{C C}(\Theta)$ and the $\hat{\mathcal{R}}_{{DeSSL}}(\Theta)$ loss frameworks, since we focus on whether the introduction of unlabeled data actually improves performance. The ablation results are listed in Table~\ref{table:ablation}. We observe that: 1) In the first three rows, performance is reduced when $S_{1}$ and $S_{2}$ are removed respectively, which indicates that our method is effective for data recovery and feature extraction. 2) In the last two rows, the performance of our debiased framework $\hat{\mathcal{R}}_{{DeSSL}}(\Theta)$ is better than that of $\hat{\mathcal{R}}_{C C}(\Theta)$, which uses only supervised data.
This demonstrates that our method indeed enhances model performance with the introduction of unlabeled data and provides a reliable effect for semi-supervised classification.

\begin{table}[thp]
\caption{Ablation study on VOC 2007 and IAPR TC-12 with PER=$50\%$ and LER=$20\%$. `\Checkmark' and `\XSolidBrush' indicate that the corresponding item is used and not used, respectively.}\label{table:ablation}
    \centering
    \resizebox{\linewidth}{!}{
    \begin{tabular}{cccc|cc|cc}
    \toprule
    \multirow{2}{*}{$S_{1}$} & \multirow{2}{*}{$S_{2}$}& \multirow{2}{*}{$\hat{\mathcal{R}}_{C C}(\Theta)$} & \multirow{2}{*}{$\hat{\mathcal{R}}_{{DeSSL}}(\Theta)$}&
    \multicolumn{2}{c|}{VOC 2007} & \multicolumn{2}{c}{IAPR TC-12} \\
    \cmidrule{5-8}
    &  &  &  & AP & AUC & AP & AUC \\
    \midrule   
    \XSolidBrush & \Checkmark & \XSolidBrush &  \Checkmark &   54.14 & 73.81 & 25.98 & 67.15 \\
    \Checkmark &  \XSolidBrush &  \XSolidBrush &   \Checkmark & 49.08 & 68.98 & 19.93 & 59.92 \\
     \Checkmark & \Checkmark & \XSolidBrush &  \Checkmark &  \textbf{56.05} & \textbf{76.30}  & \textbf{27.90}  & \textbf{69.04} \\
    \midrule
    \Checkmark & \Checkmark & \Checkmark &  \XSolidBrush &  53.55 & 73.28  & 23.54  & 66.21\\
     \Checkmark & \Checkmark & \XSolidBrush &  \Checkmark &  \textbf{56.05} & \textbf{76.30}  & \textbf{27.90}  & \textbf{69.04} \\
    \bottomrule
    \end{tabular}
    }
    
\end{table}





\subsection{Parameter Sensitivity}
% \setlength{\parskip}{1em}
% \noindent\textbf{Parameter Sensitivity.}   
We conduct experiments on VOC 2007 and IAPR TC-12 to analyze the sensitivity of $\lambda_{1}$ and $\lambda_{2}$. The two parameters are selected from the range of $\{0.01,0.1,1,10,100\}$ and their joint influence is presented in the heatmaps shown in Fig.~\ref{figure 6}. Since the difference between the best performance and the worst is 22.9 on VOC 2007, we can learn that our method is sensitive to both parameters. The result further validates Theorem~\ref{11} and emphasizes the need to simultaneously adjust $\lambda_{1}$ and $\lambda_{2}$ to approach the condition of Theorem~\ref{11}, which ensures stable performance.
 
 \begin{figure}[htb]	
  \subfigure[VOC 2007]{\includegraphics[width=0.23\textwidth]{./figure/canshu_pa.pdf}}
  \subfigure[IAPR TC-12]{\includegraphics[width=0.23\textwidth]{./figure/canshu_ia.pdf}}
   \caption{Parameter analysis of the trade-off parameters $\lambda_{1}$ and $\lambda_{2}$ on VOC 2007 and IAPR TC-12.}\label{figure 6}
\end{figure}

\section{Conclusion}
To tackle the incomplete multi-view semi-supervised multi-label problem, we propose a novel deep learning based method named DIMvSML in this paper.
DIMvSML incorporates a GNN-based feature completion module, a view-specific representation extraction network, and a safe semi-supervised multi-label learning module to preserve discriminative features and enhance the semantic label information. Besides, we design an unbiased loss to alleviate the bias from a large amount of unlabeled data and provide a theoretical analysis of our safe risk estimator. Therefore, our DIMvSML can eliminate the negative effect of the incomplete data and use unlabeled information safely for efficient classification. Finally, extensive experimental results on six public datasets demonstrate the effectiveness and superiority of DIMvSML. In the future, we will further extend our method to solve other multi-label problems under incomplete views, such as class imbalance and noisy labels.


\section*{Acknowledgments}

This work was supported by the National Science Foundation of China Grant [62036013, 62376281], and the NSF for Huxiang Young Talents Program of Hunan Province under Grant [2021RC3070]. %Tingjin Luo is the corresponding author.

%%
%% The next two lines define the bibliography style to be used, and
%% the bibliography file.
\bibliographystyle{ACM-Reference-Format}
\bibliography{sample-base}

%%
%% If your work has an appendix, this is the place to put it.
% \appendix

% \section{Research Methods}

% \subsection{Part One}

% Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi
% malesuada, quam in pulvinar varius, metus nunc fermentum urna, id
% sollicitudin purus odio sit amet enim. Aliquam ullamcorper eu ipsum
% vel mollis. Curabitur quis dictum nisl. Phasellus vel semper risus, et
% lacinia dolor. Integer ultricies commodo sem nec semper.

% \subsection{Part Two}

% Etiam commodo feugiat nisl pulvinar pellentesque. Etiam auctor sodales
% ligula, non varius nibh pulvinar semper. Suspendisse nec lectus non
% ipsum convallis congue hendrerit vitae sapien. Donec at laoreet
% eros. Vivamus non purus placerat, scelerisque diam eu, cursus
% ante. Etiam aliquam tortor auctor efficitur mattis.

% \section{Online Resources}

% Nam id fermentum dui. Suspendisse sagittis tortor a nulla mollis, in
% pulvinar ex pretium. Sed interdum orci quis metus euismod, et sagittis
% enim maximus. Vestibulum gravida massa ut felis suscipit
% congue. Quisque mattis elit a risus ultrices commodo venenatis eget
% dui. Etiam sagittis eleifend elementum.

% Nam interdum magna at lectus dignissim, ac dignissim lorem
% rhoncus. Maecenas eu arcu ac neque placerat aliquam. Nunc pulvinar
% massa et mattis lacinia.

\end{document}
\endinput
%%
%% End of file `sample-sigconf.tex'.
