\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

%-----Packages-----
% \usepackage[colorlinks=true, urlcolor=blue, linkcolor=red]{hyperref}
\usepackage{multirow}
\usepackage{color}
% \usepackage{underscore}
% \usepackage{cite}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{algorithm}\usepackage{algpseudocode}
% \usepackage{subfigure}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{makecell}
\usepackage{amssymb}
\usepackage{float}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{dsfont}
\usepackage{multirow}
\usepackage{arydshln}
\usepackage{soul}

% Support for easy cross-referencing
% \usepackage[capitalize]{cleveref}
% \crefname{section}{Sec.}{Secs.}
% \Crefname{section}{Section}{Sections}
% \Crefname{table}{Table}{Tables}
% \crefname{table}{Tab.}{Tabs.}
%------------------

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2025}
% \editors{Under Review for MIDL 2025}

\title[Semi-Supervised Skin Lesion Segmentation under DME \& FD Co-Training]{Semi-Supervised Skin Lesion Segmentation under Dual Mask Ensemble with Feature Discrepancy Co-Training}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Thanh-Huy Nguyen\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{thanh-huy\_nguyen@etu.u-bourgogne.fr}\\
\Name{Hoang-Thien Nguyen\midlotherjointauthor\nametag{$^{2,3}$}} \Email{n21dccn080@student.ptithcm.edu.vn}\\
\Name{Xuan-Bach Nguyen\nametag{$^{2,4}$}} \Email{bach.nguyenspring@hcmut.edu.vn}\\
\Name{Nguyen Lan Vi Vu\nametag{$^{2,4}$}} \Email{vi.vuvivu2203@hcmut.edu.vn}\\
\Name{Quang-Vinh Dinh\nametag{$^{2}$}} \Email{vinh.dinhquang@aivietnam.edu.vn}\\
\Name{Fabrice Meriaudeau\midljointauthortext{Corresponding Author}\nametag{$^{1}$}} \Email{Fabrice.Meriaudeau@u-bourgogne.fr}\AND
\addr $^{1}$ Université Bourgogne Europe, CNRS, ICMUB UMR 6302, 21000 Dijon, France\\
\addr $^{2}$ AI Vietnam Research Lab, 660000 Ninh Thuan, Vietnam\\
\addr $^{3}$ Posts and Telecommunications Institute of Technology, Ho Chi Minh City, Vietnam \\ 
\addr $^{4}$ Ho Chi Minh University of Technology, Vietnam
}
% \midljointauthortext{Corresponding Author}
\begin{document}

\maketitle              % typeset the header of the contribution
%

\begin{abstract}
   Skin Lesion Segmentation with supportive Deep Learning has become essential in skin lesion analysis and skin cancer diagnosis. However, in the practical scenario of clinical implementation, there is a limitation in human-annotated labels for training data, which leads to poor performance in supervised training models. In this paper, we propose Dual Mask Ensemble (DME) based on a dual-branch co-training network, which aims to enforce two models to exploit information from different views. Specifically, we introduce a novel feature discrepancy loss trained with a cross-pseudo supervision strategy, which enhances model representation by encouraging the sub-networks to learn from distinct features, thereby mitigating feature collapse. Additionally, Dual Mask Ensemble training enables the sub-models to extract more meaningful information from unlabeled data by combining mask predictions. Experimental results demonstrate the effectiveness of our approach, achieving state-of-the-art performance across several metrics (Dice and Jaccard) on the ISIC2018 and HAM10000 datasets. Our code is available at \url{https://github.com/antares0811/DME-FD}.
\end{abstract}

%%%%%%%%% BODY TEXT
\section{Introduction}
\label{sec:intro}
Lesion segmentation plays an important role in automated skin lesion analysis, as it facilitates the extraction of clinically relevant features such as lesion size, border irregularity, and contrast with the surrounding skin. While many successful machine learning models bypass explicit segmentation, studies have shown that these features contribute to improved lesion characterization and diagnosis \cite{marchetti20233d}. By accurately delineating the lesion, segmentation can enhance downstream tasks such as feature extraction and classification. However, manual annotation of skin lesion images is labor-intensive and prone to variability, making it difficult to produce large, accurately labeled datasets required for training robust models. These challenges underscore the importance of semi-supervised learning approaches, which leverage both labeled and unlabeled data to reduce dependence on extensively labeled datasets while improving model generalization in real-world deployment.

In recent years, semi-supervised learning (SSL) techniques have gained significant attention for training models with limited pixel-wise annotated data and a larger set of unlabeled data. Among these, pseudo-labeling methods \cite{9880151, 10.1007/978-3-030-58526-6_9} are widely used. However, they often face challenges related to confirmation bias \cite{9880151}, where incorrect pseudo-labels reinforce errors during training, leading to performance degradation due to training instability. Consistency regularization-based methods \cite{10.5555/3495724.3495775, 10203865}  generate predictions from weakly perturbed inputs to create pseudo-labels but still remain vulnerable to confirmation bias issues.

Conversely, co-training allows different sub-networks to infer the same instance from various perspectives and transfer knowledge from one view to another through pseudo-labeling. Co-training, in particular, leverages multi-view references to improve the model's perception and increase the reliability of the pseudo-labels generated \cite{10.1007/978-3-030-01267-0_9}. Cross-pseudo supervision (CPS) \cite{9577639} enforces consistency between the outputs of two networks by using cross-network pseudo-labels. CCVC \cite{ccvc} proposes a cross-view consistency strategy that pushes the feature extractor outputs of two networks apart, enabling the sub-networks to learn richer semantic information from conflicting predictions. \cite{zeng2024consistency} employs a single-encoder dual-decoder architecture, where differential decoder features are then used as feedback signals to the encoder.

To design an effective method that prevents sub-networks from collapsing into similar, ineffective representations, we revisit the dual-branch network \cite{9577639} and extend it with a Dual Mask Ensemble (DME) for semi-supervised segmentation. Unlike CPS, our method leverages not only the information from the opposing subnet but also its own generated mask. This self-generated mask is combined with the opponent's predicted mask to guide the model during backpropagation. Specifically, we first introduce the Dual Mask Ensemble, a mask combination technique designed to enable the model to extract additional information from unlabeled data, thereby enhancing its ability to produce precise and reliable predictions. Similar to \cite{ccvc, lefed}, to prevent the sub-networks from collapsing into similar representations, we propose a new feature discrepancy loss that encourages the models to extract distinct features, thus diversifying their representation space. However, \cite{ccvc} relies on conflict-based consistency but lacks an explicit mechanism to address low-confidence predictions. In contrast, our advanced DME module adaptively combines predictions from dual sub-networks based on their confidence and consistency, resulting in more stable pseudo-labels and better generalization. Furthermore, unlike \cite{lefed}, which focuses on decoder-level discrepancy learning, our Feature Discrepancy module emphasizes learning from representation-level discrepancies between sub-networks, enhancing the diversity and complementarity of their predictions. Our contributions can be summarized as follows:
\begin{itemize}
    \item We introduce the Dual Mask Ensemble, integrated with a dual-branch co-training framework, to enhance the model's ability to generate more reliable predictions.
    \item We propose a novel feature discrepancy loss that promotes the extraction of distinct features, effectively diversifying the model's representation space.
    \item Extensive experiments with our method on the ISIC2018 \cite{8363547} and HAM10000 \cite{tschandl2018ham10000, Tschandl2020HumancomputerCF} datasets show state-of-the-art performance, demonstrating its robustness in the semi-supervised skin segmentation task.
\end{itemize}


% \begin{figure}[t]
%     \centering
%     \includegraphics[width=1.0\linewidth]{images/pre-figure.jpg}

%     \caption{A comparison between CPS\cite{9577639} and our method. The main difference between CPS and our method is that our proposed method utilizes not only the information of the opponent subnet but also mask information generated by itself, which combines with the predicted mask of the opponent one, to guide the model in the backpropagation process}
%     \label{fig:pre-figure}
% \end{figure}

% \section{Related Work}
% \label{sec:formatting}
% \textbf{Skin Lesion Segmentation.} Most of the outstanding performance methods in skin lesion segmentation are in a supervised manner. It can be represented in four model architectures: single network models\cite{refinenet, focusnet}, multiple network models\cite{BI201978, 8990108, LEI2020101716}, hybrid feature models\cite{Jayapriya, PEZHMANPOUR2020113129} and transformer models\cite{10.1007/978-3-030-87193-2_2, 10.1007/978-3-030-87193-2_31}. However, due to limits in high-quality annotated labels, generalizability to unseen, out-of-distribution data, semi-supervised\cite{9104928, ZHANG2022369, 9957055} and self-supervised\cite{9871734, 9761620} approaches are utilized to learn from both labeled and unlabeled samples. \cite{9104928} proposed a semi-supervised technique that minimizes the difference between the input image's network prediction under different transformations (random perturbations, flipping, and rotation). Self-supervision, on the other hand, is used to extract intrinsic information from data by solving pretext tasks, allowing the use of large amounts of unlabeled data to pre-train a model before fine-tuning it on the downstream task. For skin lesion segmentation, \cite{9098527} attempted to estimate the color distribution, such as blue and red channels, using the given green channel as input to improve the model's ability to extract global representations. 

% \textbf{Semi-supervised Semantic Segmentation.} There are two popular approaches: consistency regularization \cite{zou2021pseudoseg, 9157032, 10203865} and co-training techniques \cite{9577639, 9880095}. Consistency regularization \cite{zou2021pseudoseg, 9157032, 10203865} ensures that input predictions do not differ between distinct augmentations.PseudoSeg \cite{zou2021pseudoseg} used a weak-to-strong design to ensure that strong and weak augmented images produce similar results. CCT\cite{9157032} used feature perturbation techniques to transform features to the new representation, resulting in predictions that were similar to the original. Unimatch \cite{10203865} is a revisiting method that combines the weak-to-strong and feature perturbation paradigms. They also used the CutMix augmentation \cite{9008296} to increase diversity in training samples. Furthermore, co-training-based methods \cite{9577639} \cite{9880095} allow multiple models to learn from each other instead of collapsing on themselves. CPS\cite{9577639} used cross-supervision loss to exploit information between two distinct models. UCC \cite{9880095} utilized multiple decoders with weak-strong augmented inputs to enhance generalization and learn compact feature representation from two perspectives.

% However, these methods cannot fully exploit information from unsupervised data, causing models to generate noisy pseudo-predictions, leading to the confirmation bias problem. To address the aforementioned issue, we propose the Dual Mask Ensemble, which encourages the two models to learn more information from unlabeled data. Furthermore, to maintain one of the key co-training assumptions, that two sub-nets should be distinct, we increase the feature discrepancy loss, which may broaden the model representation space.

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.87\linewidth]{images/DME-FD_updated.png}

    \caption{Overview of our proposed method. The Dual Mask Ensemble module combines masks predicted from weakly augmented inputs into a reliable mask and computes the DME loss with those predicted from strongly augmented inputs. The feature discrepancy loss is applied to features from both sub-networks' encoder outputs.}
    \label{fig:main-figure}
\end{figure*}

\section{Methodology}

Given a set of labeled images $D_l = \{(x^l, y^l)\}$ along with a set of unlabeled images $D_u = \{x^u\}$, the main objective is to leverage information from the unlabeled set through two distinct training flows: Cross-Pseudo Supervision training \cite{9577639} and Dual Mask Ensemble training (Section~\ref{DME}). However, using both flows may lead to model collapse, where the predictions of the two models become identical. To address this, we propose a feature discrepancy loss (Section~\ref{feature_diss}) to preserve the diversity between the model views. A brief overview of our pipeline is provided in Section~\ref{overall_framework} and illustrated in Figure~\ref{fig:main-figure}.

% \subsection{Preliminaries}
% \label{cps}
% Based on the foundation of CPS \cite{9577639}, two models are parallel initialized with having the same structure but different parameters. Each model includes $\{f_i, g_i\}$ where $i\in\{1, 2\}$, $f$ and $g$ is the encoder and decoder respectively. 

% \textbf{Supervised training:} The labeled set $D_l$, which is augmented by a weak augmentation $A_w$ (specifically described in \cref{DME}), is fed into two models to get the prediction. We obtain the augmented version of $x^l$ as $X^l = A_w(x^l)$. After that, the predicted probabilities of the first model and the second model ($P_1^l$ and $P_2^l$) are computed as $P_1^l = g_1(f_1(X^l))$ and $P_2^l = g_2(f_2(X^l))$.

% Finally, the supervised loss between the prediction and ground truth labels is computed as:  
% \begin{align}
%      &L_{sup} = L_{bce,dice}(P_1^l, y^l) + L_{bce,dice}(P_2^l, y^l)     
% \end{align}
% where $y_i^l$ is the ground truth label, and $L_{bce,dice}(P, Y)$ is defined as:
% \begin{align}
% &L_{bce,dice}(P, Y) = \frac{1}{2}(L_{bce}(P, Y) + L_{dice}(P, Y)).
% \end{align}

% Both binary cross-entropy loss ($L_{bce}$) and dice loss ($L_{dice}$) are applied to help the model learn knowledge from the labels more efficiently. The detail of each loss is described as follows:
% \begin{align}
%      &L_{bce}(P, Y) = \sum_{i,j} BCE\left(y_{ij}, p_{ij}\right)
%      &L_{dice}(P, Y) = \sum_{i,j} \left( 1 - \frac{2 \sum_{c} y_{ij,c} p_{ij,c} }{ \sum_{c} y_{ij,c} + \sum_{c} p_{ij,c} } \right)
% \end{align}


% \textbf{Cross-pseudo supervision training:} We apply the same augmentation $A_w$ for the unlabeled input $x^u$ and feed forward into two initialized models to get the confidence maps ($P_1^{cps}$ and $P_2^{cps}$):
% \begin{align}
%      &P_1^{cps} = g_1(f_1(A_w(x^u))), 
%      &P_2^{cps} = g_2(f_2(A_w(x^u)))
% \end{align}
% Then, $P_1^{cps}$ and $P_2^{cps}$ are transformed into the one-hot label map, called pseudo segmentation maps by the \textit{argmax} function:
% \begin{align}
%      &Y_1^{cps} = \arg\max_{c} p_i(y = c \mid P_1^{cps}),
%      &Y_2^{cps} = \arg\max_{c} p_i(y = c \mid P_2^{cps}).
% \end{align}
% The bidirectional cross-supervision loss is applied between these predictions.  The pixel-wise one hot label map is selected $Y_1^{cps}$ to supervise the pixel-wise confidence map $P_2^{cps}$ and the other one is $Y_2^{cps}$ to $P_1^{cps}$.  The cross-pseudo supervision loss on the unlabeled data is computed as:
% \begin{equation}
% \begin{aligned}
%      L_{cps}^u = L_{bce,dice}(&P_1^{cps}, Y_2^{cps}) 
%      &+ L_{bce,dice}(P_2^{cps}, Y_1^{cps})
% \end{aligned}
% \end{equation}
\subsection{Dual Mask Ensemble}
\label{DME}
To fully exploit the information from unlabeled data, we adopt a weak-to-strong paradigm to help each model understand the semantic meaning of images by themselves. Let $A_w$ and $A_s$ denote weak and strong augmentations, respectively. Weak augmentation involves Random Flipping, while strong augmentations include Gaussian Noise, Brightness Contrast, and Color Jittering.

Firstly, the unlabeled input $x^u$ is transformed into weak ($X_w$) and strong ($X_s$) augmented versions  as $X_w = A_w(x^u)$ and $X_s = A_s(A_w(x^u))$. Then, the transformed inputs are fed into each model to obtain the confidence maps:
\begin{align}
     &P_1^W = g_1(f_1(X_w)), P_1^S = g_1(f_1(X_s)),
     &P_2^W = g_2(f_2(X_w)), P_2^S = g_2(f_2(X_s)).
\end{align}

\begin{figure*}[t]
    \centering
    \includegraphics[width=1\linewidth]{images/rebutal/feature.png}

    \caption{Visualization of the combination of Strong-Weak Augmentation, Feature Discrepancy Loss, and the Dual Mask Ensemble module from a feature perspective.}
    \label{fig:featurespace-figure}
\end{figure*}

Finally, we compute the loss between the weak and strong predictions. The one-hot label maps that both models predict from the weakly augmented input are combined to supervise the predictions on the strongly augmented input. However, raw predictions from weakly augmented images may contain noise, which can degrade model performance. To mitigate this, a fixed confidence threshold $\tau$ is applied:
\begin{align}
     &Y_1^W = \mathds{1} \left( \max(p_i) \geq \tau \right)\arg\max_{c} p_i(y = c \mid P_1^W),\\
     &Y_2^W = \mathds{1} \left( \max(p_i) \geq \tau \right)\arg\max_{c} p_i(y = c \mid P_2^W).
\end{align}
Here, $\tau$ serves to separate object pixels from background pixels, and $Y_1^W$ and $Y_2^W$ are the pseudo-label masks from the two models, which are then combined using the summation (OR) operation:
\begin{equation}
    \hat{Y}^W = Y_1^W \oplus Y_2^W
\end{equation}
The loss for the Dual Mask Ensemble (DME), $L_{DME}$, is defined as:
\begin{equation}
    L_{DME} = L_{bce, dice}(P_1^S, \hat{Y}^W) + L_{bce, dice}(P_2^S, \hat{Y}^W),
\end{equation}
where $L_{bce, dice}$ denotes the average of the binary cross-entropy loss and the Dice loss.

\subsection{Feature Discrepancy Loss}
\label{feature_diss}
The combination of both cross-supervision loss and DME loss can lead to model collapse, where all models produce identical predictions for a sample \cite{wu2024image}. To prevent this issue, we propose a feature discrepancy loss that ensures diversity in the model predictions by maintaining differences in the representation space. The feature discrepancy loss ($L_{dis}$), indicated in Figure \ref{fig:featurespace-figure}, is defined as:
\begin{equation}
    L_{dis}(f_1, f_2) = \frac{1}{D(f_1, f_2) + \epsilon}
\end{equation}
where $\epsilon = 10^{-6}$ prevents division by zero, $f_1$ and $f_2$ are the features from two models, and $D$ represents the Manhattan distance function.

We first extract the feature representations from the model encoder's output. $F_1^{sup}$ and $F_1^{w}$ are the features of supervised and weakly augmented samples from the first model, while $F_2^{sup}$ and $F_2^{w}$ are the corresponding features for the second model. Next, we normalize the feature values using the $Softmax$ function:
\begin{align}
     &F_1^{sup}, F_1^{w} = Softmax(f_1([x^l, X_w])),
     &F_2^{sup}, F_2^{w} = Softmax(f_2([x^l, X_w])).
\end{align}
Finally, the feature discrepancy loss is applied to both the supervised and weakly augmented features:
\begin{equation}
    L_{FDL} = \frac{1}{2}(L_{dis}(F_1^{sup},F_2^{sup}) + L_{dis}(F_1^{w},F_2^{w}))
\end{equation}

\subsection{Overall framework}
\label{overall_framework}
Overall, the final objective loss is written as:
\begin{equation}
\begin{aligned}
    L = L_{sup} + \alpha(L_{cps} + L_{DME}) + \beta L_{FDL}
\end{aligned}
\end{equation}
where $\alpha$ is the consistency warm-up weight of \cite{laine2017temporal}. Although the feature discrepancy loss increases the diversity between the different views, it can prevent the model from converging in the final epochs. To avoid this behavior, $\beta = 10^{-t/(0.25T)}$ is applied as a decay factor for $L_{FDL}$, where $t$ is the current epoch and $T$ is the maximum number of epochs.

\section{Experiments}
\begin{table*}[htbp]
\vspace{-5mm}
\centering
\caption{Quantitative results on the ISIC-2018 dataset under four labeled ratio configurations. \\ \textbf{L} and \textbf{U} are the training ratios of labeled and unlabeled sets, respectively.}
\label{table:tab1}
\resizebox{0.75\textwidth}{!}{\setlength{\tabcolsep}{5pt}
\def\arraystretch{0.9}
\begin{tabular}{|p{60pt}|c|c|p{65pt}|p{65pt}|p{65pt}|p{65pt}|}
\hline
\multirow{2}{*}{Method} & \multicolumn{2}{c|}{Ratio (\%)} & \multicolumn{4}{c|}{Metrics}\\ \cline{2-7}
& L & U & Dice (\%) & JC (\%) & PRE (\%) & ACC (\%)\\ \hline
 & 2 & - & 74.65 {\scriptsize$\pm$2.92} & 60.81 {\scriptsize$\pm$2.99} & 76.09 {\scriptsize$\pm$7.54} & 89.22 {\scriptsize$\pm$1.52} \\
SupOnly & 4 & - & 77.23 {\scriptsize$\pm$0.48} & 65.35 {\scriptsize$\pm$0.56} & 80.28 {\scriptsize$\pm$1.60} & 90.78 {\scriptsize$\pm$0.30} \\
% SupOnly & 8 & -  & 82.28 {\scriptsize$\pm$0.61} & 70.66 {\scriptsize$\pm$0.85} & 84.12 {\scriptsize$\pm$1.26} & 92.76 {\scriptsize$\pm$0.27} \\
 & 100 & - & 87.66 {\scriptsize$\pm$0.93} & 78.49 {\scriptsize$\pm$1.38} & 88.35 {\scriptsize$\pm$1.12} & 94.86 {\scriptsize$\pm$0.31} \\ 
\hline
%Ratio 1 - 99

PseudoSeg & & & 
76.34 {\scriptsize$\pm$3.88} & 
64.29 {\scriptsize$\pm$5.03} & 
81.92 {\scriptsize$\pm$4.28} 
& 90.30 {\scriptsize$\pm$2.01} \\
CCT & & & 
75.11 {\scriptsize$\pm$4.10} & 
62.99 {\scriptsize$\pm$5.79} & 
81.01 {\scriptsize$\pm$2.40} & 
89.23 {\scriptsize$\pm$2.93} \\
CPS & 1 & 99 & 
76.59 {\scriptsize$\pm$3.86} & 
64.31 {\scriptsize$\pm$3.81} & 
81.98 {\scriptsize$\pm$3.05} & 
90.01 {\scriptsize$\pm$1.40} \\
GTA-Seg & & & 
75.67 {\scriptsize$\pm$4.49} & 
63.87 {\scriptsize$\pm$4.18} & 
78.44 {\scriptsize$\pm$6.01} & 
89.55 {\scriptsize$\pm$1.39} \\
UniMatch & & & 
77.16 {\scriptsize$\pm$3.16} & 
65.05 {\scriptsize$\pm$4.45} & 
\textbf{82.43} {\scriptsize$\pm$5.17} & 
90.41 {\scriptsize$\pm$2.19} \\
\textbf{Ours} & & & 
\textbf{78.63} {\scriptsize$\pm$2.32} & 
\textbf{66.28} {\scriptsize$\pm$3.98} & 
82.12 {\scriptsize$\pm$2.87} & 
\textbf{90.86} {\scriptsize$\pm$1.66} \\
\hline
% }
 
\hline
%Ratio 2 - 98
PseudoSeg & & & 79.76 {\scriptsize$\pm$2.11} & 67.16 {\scriptsize$\pm$2.77} & \textbf{84.56} {\scriptsize$\pm$2.29} & 91.67 {\scriptsize$\pm$1.12} \\
CCT & & & 78.66 {\scriptsize$\pm$2.02} & 65.80 {\scriptsize$\pm$2.63} & 81.84 {\scriptsize$\pm$1.85} & 91.28 {\scriptsize$\pm$1.02} \\
CPS & 2 & 98 & 79.61 {\scriptsize$\pm$1.66} & 67.04 {\scriptsize$\pm$2.28} & 82.24 {\scriptsize$\pm$2.81} & 91.56 {\scriptsize$\pm$0.86} \\
GTA-Seg & & & 77.33 {\scriptsize$\pm$2.20} & 64.21 {\scriptsize$\pm$2.59} & 76.65 {\scriptsize$\pm$5.66} & 90.30 {\scriptsize$\pm$0.73} \\
UniMatch & & & 80.03 {\scriptsize$\pm$2.04} & 67.55 {\scriptsize$\pm$2.71} & 83.30 {\scriptsize$\pm$3.87} & 91.74 {\scriptsize$\pm$1.00} \\
\textbf{Ours} & & & \textbf{80.07} {\scriptsize$\pm$1.75} & \textbf{67.62} {\scriptsize$\pm$2.37} & 82.59 {\scriptsize$\pm$1.52} & \textbf{91.75} {\scriptsize$\pm$0.98} \\
\hline
PseudoSeg & & & 81.77 {\scriptsize$\pm$0.66} & 71.18 {\scriptsize$\pm$1.03} & \textbf{85.23} {\scriptsize$\pm$2.47} & 92.72 {\scriptsize$\pm$0.30} \\
CCT & & & 80.96 {\scriptsize$\pm$1.11} & 68.95 {\scriptsize$\pm$1.41} & 83.37 {\scriptsize$\pm$0.83} & 92.22 {\scriptsize$\pm$0.55} \\
CPS & 4 & 96 & 80.89 {\scriptsize$\pm$0.91} & 70.31 {\scriptsize$\pm$1.07} & 83.90 {\scriptsize$\pm$2.26} & 92.29 {\scriptsize$\pm$0.28} \\
GTA-Seg & & & 80.83 {\scriptsize$\pm$0.80} & 70.03 {\scriptsize$\pm$1.07} & 83.30 {\scriptsize$\pm$2.45} & 91.96 {\scriptsize$\pm$0.84} \\
UniMatch & & & 81.41 {\scriptsize$\pm$1.22} & 69.46 {\scriptsize$\pm$1.58} & 84.51 {\scriptsize$\pm$2.05} & 92.43 {\scriptsize$\pm$0.77} \\
\textbf{Ours} & & & \textbf{82.06} {\scriptsize$\pm$0.69} & \textbf{71.54} {\scriptsize$\pm$1.04} & 84.81 {\scriptsize$\pm$1.55} & \textbf{92.83} {\scriptsize$\pm$0.40} \\
\hline
PseudoSeg & & & 83.96 {\scriptsize$\pm$0.86} & \textbf{73.08} {\scriptsize$\pm$1.27} & 85.98 {\scriptsize$\pm$2.62} & \textbf{93.48} {\scriptsize$\pm$0.25} \\
CCT & & & 83.65 {\scriptsize$\pm$0.93} & 72.58 {\scriptsize$\pm$1.36} & 85.32 {\scriptsize$\pm$2.40} & 93.24 {\scriptsize$\pm$0.25} \\
CPS & 8 & 92 & 83.75 {\scriptsize$\pm$0.74} & 72.77 {\scriptsize$\pm$1.14} & 85.04 {\scriptsize$\pm$1.45} & 93.34 {\scriptsize$\pm$0.13} \\
GTA-Seg & & & 83.65 {\scriptsize$\pm$0.98} & 72.62 {\scriptsize$\pm$1.51} & 83.98 {\scriptsize$\pm$1.38} & 93.21 {\scriptsize$\pm$0.50} \\
UniMatch & & & 83.90 {\scriptsize$\pm$0.56} & 72.89 {\scriptsize$\pm$0.80} & 84.64 {\scriptsize$\pm$2.20} & 93.30 {\scriptsize$\pm$0.06} \\
\textbf{Ours} & & & \textbf{84.00} {\scriptsize$\pm$0.31} & 73.06 {\scriptsize$\pm$0.52} & \textbf{86.58} {\scriptsize$\pm$0.52} & 93.44 {\scriptsize$\pm$0.24} \\
\hline
\end{tabular}
}
\vspace{-5mm}
\end{table*}

\subsection{Experimental Settings} We evaluated our proposed method on two publicly available datasets dedicated to the skin lesion segmentation task. Labeled samples constitute 1\%, 2\%, 4\%, or 8\% of the training set for ISIC-2018 and 2\% or 4\% for HAM10000, and the remaining training samples are used as unlabeled data. We also adopted 5-fold cross-validation to measure model performance.
% Due to table width restrictions and the high number of columns, we only report the standard deviation (std) of the average of some metrics in tables and provide the std for each dataset in the supplementary material.

\textbf{ISIC-2018:} The ISIC-2018 \cite{8363547} dataset contains 3694 images with labeled masks. We used 2955 samples for training and 739 samples for evaluating the performance. 

\textbf{HAM10000:} The HAM10000 \cite{tschandl2018ham10000, Tschandl2020HumancomputerCF} dataset consists of 10015 samples, partitioned into 8012 samples for training and 2003 samples for validation.

% \subsection{Evaluation Metrics}
% The mean Dice similarity coefficient (Dice), Jaccard coefficient (JC), sensitivity (SEN), specificity (SPE), precision (PRE), and accuracy (ACC) are utilized to evaluate the performance of our proposed method.
% % , which are defined as:
% % \begin{align*}
% %     &Dice=\frac{2TP}{2TP + FP + FN}, &&SEN=\frac{TP}{TP + FN} \\
% %     &Jaccard=\frac{TP}{TP + FP + FN}, &&SPE=\frac{TN}{TN + FP} \\
% %     &ACC=\frac{TP + TN}{TP + TN + FP + FN}, &&PRE=\frac{TP}{TP + FP}
% % \end{align*}
% % where TP, FP, TN, and FN represent true-positive, false-positive, true-negative, and false-negative predictions, respectively.

\subsection{Implementation Details} The proposed method was implemented with PyTorch and trained on a single NVIDIA RTX A6000 card with 48 GB of memory. SwinUnet \cite{10.1007/978-3-031-25066-8_9} is utilized as our main model architecture. We use the AdamW optimizer with an initial learning rate of $1\times10^{-4}$, a linear decay scheduler whose step size is 50 and decay factor $\gamma$ = 0.5. The input images were resized to 224 $\times$ 224. The batch size was set to 8 for ISIC-2018 and 24 for HAM10000. The model was trained for 80 epochs. In the augmentation stages, we adopted Random Flipping for weak augmentation, while Random Color Distortion, Color Jitter, and Gaussian Noise were implemented for strong augmentation. The confidence threshold $\tau$ was set to 0.85. We evaluated performance using mean Dice similarity coefficient (Dice), Jaccard coefficient (JC), precision (PRE), and accuracy (ACC).
% , which are defined as:
% \begin{align*}
%     &Dice=\frac{2TP}{2TP + FP + FN}, &&SEN=\frac{TP}{TP + FN} \\
%     &Jaccard=\frac{TP}{TP + FP + FN}, &&SPE=\frac{TN}{TN + FP} \\
%     &ACC=\frac{TP + TN}{TP + TN + FP + FN}, &&PRE=\frac{TP}{TP + FP}
% \end{align*}
% where TP, FP, TN, and FN represent true-positive, false-positive, true-negative, and false-negative predictions, respectively.

\subsection{Comparison With Existing Methods}
\begin{table}[htbp]
\centering
\caption{Quantitative results on the HAM10000 under two labeled ratio configurations.}%, where SupOnly is trained using only the labeled data.}
\vspace{2mm}
\label{table:tab2}
\setlength{\tabcolsep}{5pt}
\resizebox{0.75\textwidth}{!}{\def\arraystretch{0.9}
\begin{tabular}{|p{60pt}|c|c|p{65pt}|p{65pt}|p{65pt}|p{65pt}|}
\hline
\multirow{2}{*}{Method} & \multicolumn{2}{c|}{Ratio (\%)} & \multicolumn{4}{c|}{Metrics}\\ \cline{2-7}
& L & U & Dice (\%) & JC (\%) & PRE (\%) & ACC (\%)\\ \hline
 & 2 & - & 88.15 {\scriptsize$\pm$0.21} & 78.90 {\scriptsize$\pm$0.31} & 88.12 {\scriptsize$\pm$0.42} & 93.73 {\scriptsize$\pm$0.07} \\
SupOnly & 4 & - & 89.59 {\scriptsize$\pm$0.07} & 81.24 {\scriptsize$\pm$0.12} & 90.83 {\scriptsize$\pm$1.17} & 94.56 {\scriptsize$\pm$0.09} \\
 & 100 & - & 93.54 {\scriptsize$\pm$0.25} & 87.92 {\scriptsize$\pm$0.42} & 93.89 {\scriptsize$\pm$0.57} & 96.58 {\scriptsize$\pm$0.16} \\ 
\hline
PseudoSeg & & & 90.02 {\scriptsize$\pm$0.17} & 81.94 {\scriptsize$\pm$0.28} & 92.11 {\scriptsize$\pm$1.26} & 94.81 {\scriptsize$\pm$0.18} \\
CCT & & & 89.93 {\scriptsize$\pm$0.10} & 81.79 {\scriptsize$\pm$0.15} & 91.55 {\scriptsize$\pm$0.96} & 94.75 {\scriptsize$\pm$0.11} \\
CPS & 2 & 98 & 89.94 {\scriptsize$\pm$0.14} & 81.81 {\scriptsize$\pm$0.23} & 92.21{\scriptsize$\pm$0.77} & 94.78 {\scriptsize$\pm$0.15} \\
GTA-Seg & & & 89.55 {\scriptsize$\pm$0.32} & 81.17 {\scriptsize$\pm$0.54} & 90.39 {\scriptsize$\pm$0.10} & 94.48 {\scriptsize$\pm$0.21} \\
UniMatch & & & 89.66 {\scriptsize$\pm$0.15} & 81.35 {\scriptsize$\pm$0.26} & 91.68 {\scriptsize$\pm$0.60} & 94.62 {\scriptsize$\pm$0.20} \\
\textbf{Ours} & & & \textbf{90.45} {\scriptsize$\pm$0.17} & \textbf{82.65} {\scriptsize$\pm$0.27} & \textbf{92.40} {\scriptsize$\pm$1.02} & \textbf{95.04} {\scriptsize$\pm$0.20} \\
\hline
PseudoSeg & & & 90.97 {\scriptsize$\pm$0.39} & 83.21 {\scriptsize$\pm$0.64} & \textbf{92.72} {\scriptsize$\pm$1.19} & 95.20 {\scriptsize$\pm$0.28} \\
CCT & & & 90.64 {\scriptsize$\pm$0.53} & 82.97 {\scriptsize$\pm$0.86} & 92.43 {\scriptsize$\pm$0.15} & 95.12 {\scriptsize$\pm$0.25} \\
CPS & 4 & 96 & 90.76 {\scriptsize$\pm$0.51} & 83.17 {\scriptsize$\pm$0.84} & 92.56 {\scriptsize$\pm$0.46} & 95.16 {\scriptsize$\pm$0.29} \\
GTA-Seg & & & 90.86 {\scriptsize$\pm$0.19} & 83.34 {\scriptsize$\pm$0.31} & 92.18 {\scriptsize$\pm$0.54} & 95.21 {\scriptsize$\pm$0.12} \\
UniMatch & & & 90.32 {\scriptsize$\pm$0.44} & 82.43 {\scriptsize$\pm$0.73} & 91.96 {\scriptsize$\pm$1.21} & 94.95 {\scriptsize$\pm$0.30} \\
\textbf{Ours} & & & \textbf{91.13} {\scriptsize$\pm$0.30} & \textbf{83.79} {\scriptsize$\pm$0.50} & 92.36 {\scriptsize$\pm$0.30} & \textbf{95.34} {\scriptsize$\pm$0.19} \\
\hline
\end{tabular}
}
\vspace{-3mm}
\end{table}

\subsubsection{Quantitative Comparison}

Our proposed framework is compared with PseudoSeg, CCT, CPS, GTA-Seg, and UniMatch on ISIC-2018 and HAM10000. Quantitative results are detailed in Table~\ref{table:tab1} and Table~\ref{table:tab2}. A supervised baseline using only labeled data (``SupOnly'') is also evaluated. All methods employ the same data augmentation, training strategies, and backbones to ensure fair comparisons.

\textbf{Segmentation Results on ISIC-2018: } Table~\ref{table:tab1} compares our method with other semi-supervised segmentation frameworks on the ISIC-2018 dataset. With a setting of limited 2\% labeled data (59 labeled and 1896 unlabeled samples), our approach achieves notable improvements in both the Dice score (80.07\%) and the Jaccard coefficient (67.62\%), outperforming all competing methods. When the ratio of labeled data increases to 4\% (118 samples), the Dice and Jaccard scores further improve to 82.06\% and 71.54\%, maintaining the leading position. With an 8\% labeled dataset (236 labeled and 2791 unlabeled samples), our method achieves the highest Dice score (84.12\%) and ranks second in the Jaccard coefficient (73.24\%), slightly below the fully supervised baseline, while surpassing state-of-the-art methods like UniMatch and CPS.

\textbf{Segmentation Results on HAM10000: } Table \ref{table:tab2} displays a comparison of our performance with other semi-supervised segmentation frameworks on the HAM10000 dataset. Provided a limited set of 2\% labeled data (160 labeled and 7852 unlabeled images), our approach shows a marked improvement in both Dice score and Jaccard coefficient, achieving 90.45\% and 82.65\%, respectively. With 4\% (320) labeled images, our method achieves the highest performance, with a Dice score of 91.13\% and a Jaccard coefficient of 83.79\%.

\subsubsection{Qualitative Comparison}
\begin{figure*}[t]
    \centering
    \includegraphics[width=0.8\linewidth]{images/isic2018/isic.jpg}
    \vspace{-3mm}
    \caption{Visualization of semi-supervised model performance on the ISIC2018 dataset under various supervised training sample ratios: A: 2\%; B: 4\%; C: 8\%.}
    \label{fig:isic-figure}
\end{figure*}

\begin{figure*}[h]
    \centering
    \includegraphics[width=0.8\linewidth]{images/ham10000/00094.jpg}
    \vspace{-3mm}
    \caption{Visualization of semi-supervised model performance on the HAM10000 dataset under various supervised training sample ratios: A: 2\%; B: 4\%.}
    \label{fig:ham10000-figure}
\end{figure*}

Figs.~\ref{fig:isic-figure} and~\ref{fig:ham10000-figure} visually compare the proposed method with existing approaches, alongside the original images, ground-truth labels, and fully supervised predictions for a detailed assessment. Our method clearly delivers smoother predictions with fewer blending pixels compared to other methods. We also visualize the effectiveness of our method on different types of lesions and in cross-domain scenarios between HAM10000 and ISIC2018. The detailed figures are provided in the Appendix.
\subsection{Ablation Study}

\begin{table}[htbp]
\def\arraystretch{0.9}
\centering
\caption{Results on Mask Refinement Training with 4\% Labeled Samples in two datasets}
\vspace{2mm}
\label{table:ab_table3}
\resizebox{0.85\textwidth}{!}{\begin{tabular}{p{100pt}|p{65pt}|p{65pt}|p{65pt}|p{65pt}}
\hline
\multirow{2}{*}{Method} & \multicolumn{2}{c|}{ISIC-2018} & \multicolumn{2}{c}{HAM10000}\\
\cline{2-5}
& Dice (\%) & JC (\%) & Dice (\%) & JC (\%)\\
\hline
Intersect & 81.41 {\scriptsize$\pm$0.68} & 70.82 {\scriptsize$\pm$0.99} & 91.00 {\scriptsize$\pm$0.36} & 83.56 {\scriptsize$\pm$0.60}\\
Union & \textbf{82.06} {\scriptsize$\pm$0.69} & \textbf{71.54} {\scriptsize$\pm$1.04} & \textbf{91.13} {\scriptsize$\pm$0.30} & \textbf{83.79} {\scriptsize$\pm$0.50}\\
Self-teaching & 81.39 {\scriptsize$\pm$1.10} & 70.73 {\scriptsize$\pm$1.49} & 91.08 {\scriptsize$\pm$0.36} & 83.71 {\scriptsize$\pm$0.59}\\
Cross-view teaching & 81.49 {\scriptsize$\pm$0.97} & 70.89 {\scriptsize$\pm$1.38} & 91.07 {\scriptsize$\pm$0.42} & 83.70 {\scriptsize$\pm$0.70}\\
\hline
\end{tabular}
}
\end{table}

\begin{table}
% \vspace{2mm}
\def\arraystretch{0.9}
\centering
\caption{Ablation studies of our framework with 4\% labeled samples on ISIC2018}
\vspace{2mm}
\label{table:ab_table5}
\resizebox{0.65\textwidth}{!}{\begin{tabular}
{p{25pt}|p{30pt}|p{30pt}|p{30pt}|p{60pt}|p{60pt}}
\hline
Sup & CPS & DME & FDL & Dice (\%) & JC (\%)\\
\hline
\centering\checkmark &  &  &  & 77.23 {\scriptsize$\pm$0.48} & 65.35 {\scriptsize$\pm$0.56}\\
\centering\checkmark & \centering\checkmark &  &  & 79.61 {\scriptsize$\pm$1.66} & 67.04 {\scriptsize$\pm$2.28}\\
\centering\checkmark & \centering\checkmark & \centering\checkmark &  & 81.67 {\scriptsize$\pm$0.88} & 71.13 {\scriptsize$\pm$1.22}\\
\centering\checkmark & \centering\checkmark & \centering\checkmark & \centering\checkmark & \textbf{82.06} {\scriptsize$\pm$0.69} & \textbf{71.54} {\scriptsize$\pm$1.04}\\
\hline
\end{tabular}
}
\end{table}

\subsubsection{Mask Refinement Mechanism} 
Table \ref{table:ab_table3} compares four different approaches of mask integration for skin lesion segmentation on the ISIC2018 and HAM10000 datasets, using 4\% of labeled samples. The investigated approaches include Intersect, Union, Self-Teaching, and Cross-View Teaching.

\textbf{Intersect Method} employs a multiplication (AND) operation for mask integration, aiming to retain only the overlapping regions between different predictions. %As seen in Table 3, the Intersect method achieved a Dice coefficient of 81.41\% ± 0.68 and a Jaccard coefficient (JC) of 70.82\% ± 0.99 on the ISIC2018 dataset. In the HAM10000 dataset, it achieved a higher Dice of 91.00\% ± 0.36 and JC of 83.56\% ± 0.60. 
The performance, shown in Table \ref{table:ab_table3}, indicates that the strict intersection strategy can effectively filter out noisy predictions but risks discarding valuable information, leading to lower scores compared to other methods.

\textbf{Union Method} applies a summation (OR) operation to combine masks, encompassing all possible regions covered by different predictions. This method, adopted as our current approach, exhibits superior performance, particularly on the ISIC2018 dataset, with a Dice coefficient of 82.06\% $\pm$ 0.69 and a JC of 71.54\% $\pm$ 1.04. Similarly, on the HAM10000 dataset, the Union approach continues to deliver top performance with a Dice of 91.13\% $\pm$ 0.30 and a JC of 83.79\% $\pm$ 0.50. These results underline that the Union method effectively integrates multiple predictions, with complete ROI capture thanks to a comprehensive mask.

\textbf{Self-Teaching Method} \cite{zou2021pseudoseg} uses the weaker version of a pseudo mask to guide its own refinement towards a stronger version. %The results show that the Self-Teaching method achieves a Dice of 81.39\% ± 1.0 and a JC of 70.73\% ± 1.35 on the ISIC2018 dataset. In the HAM10000 dataset, it reaches a Dice of 91.08\% ± 0.38 and a JC of 83.71\% ± 0.69. 
 While Self-Teaching yields slightly lower scores than Union, it demonstrates competitive performance, especially in challenging cases where weak pseudo masks are iteratively refined to deliver accurate predictions.

\textbf{Cross-View Teaching Method} \cite{ngo2024dual} involves cross-guidance, where a weak pseudo mask supervises predictions of stronger augmented images from the opposite model. %The results indicate that Cross-View Teaching has a Dice of 81.49\% ± 0.97 and JC of 70.89\% ± 1.38 on the ISIC2018 dataset, and a Dice of 91.07\% ± 0.42 and JC of 83.70\% ± 0.70 on the HAM10000 dataset.
This approach achieves performance comparable to Self-Teaching. However, despite its added complexity, Cross-View Teaching does not consistently outperform Union.

% Across both datasets, the Union method shows the best overall performance in both Dice and Jaccard metrics %, demonstrating the effectiveness of combining masks through summation (OR) operations. 
% This indicates that retaining a more comprehensive region of possible lesion areas provides better segmentation performance. While the Self-Teaching and Cross-View Teaching methods are also competitive, their more complex designs do not consistently improve performance over the simpler Union approach.

\subsubsection{Analysis on component effectiveness}

Our method incorporates several key components: a CPS module, a Dual Mask Ensemble (DME) module, and a feature discrepancy strategy. %The method utilizes four distinct losses — supervised loss ($L_{sup}$), CPS loss ($L_{cps}$), DME loss ($L_{DME}$), and feature discrepancy loss ($L_{FDL}$).
Table \ref{table:ab_table5} investigates the individual contributions of these components on the ISIC2018 dataset with 4\% supervised samples.

Applying the cross-pseudo supervision loss ($L_{cps}$) improves the Dice and JC metrics by about 2.4\% and 1.7\%, respectively, showing its effectiveness despite some correlation between sub-net views. Leveraging the DME module ($L_{DME}$) further boosts Dice by 2\% and Jaccard by 4\%. Finally, adding the feature discrepancy loss ($L_{FDL}$) increases both metrics by 0.4\%, enabling sub-nets to learn from orthogonal views and outperforming state-of-the-art methods.
\subsubsection{Analysis on feature loss selection}
\begin{table}[h]
\centering
\setlength{\tabcolsep}{5pt}
\def\arraystretch{1.0}
\begin{tabular}{p{55pt}|c|c|p{65pt}|p{65pt}|p{65pt}|p{65pt}}
Method & L & U & Dice (\%) & JC (\%) & PRE (\%) & ACC (\%)\\
\hline
CCVC & 4\% & 96\% & 81.61 {\scriptsize$\pm$1.42} & 71.01 {\scriptsize$\pm$1.97} & \textbf{85.49} {\scriptsize$\pm$0.66} & 92.72 {\scriptsize$\pm$0.74} \\
\textbf{FDL} & & & \textbf{82.06} {\scriptsize$\pm$0.69} & \textbf{71.54} {\scriptsize$\pm$1.04} & 84.81 {\scriptsize$\pm$1.55} & \textbf{92.83} {\scriptsize$\pm$0.40} \\
\hline
CCVC & 8\% & 92\% & 83.67 {\scriptsize$\pm$0.59} & 72.66 {\scriptsize$\pm$0.80} & 84.80 {\scriptsize$\pm$1.21} & 93.33 {\scriptsize$\pm$0.06} \\
\textbf{FDL} & & & \textbf{84.00} {\scriptsize$\pm$0.31} & \textbf{73.06} {\scriptsize$\pm$0.52} & \textbf{86.58} {\scriptsize$\pm$0.52} & \textbf{93.44} {\scriptsize$\pm$0.24} \\
\hline
\end{tabular}
\caption{Feature loss design comparison on the ISIC-2018}
\label{table:feature_loss_selection}
\end{table}
\vspace{-1mm}
We compared our method with the most relevant approach on feature correlation between two networks---CCVC~\cite{ccvc}. In contrast, FDL leverages a modified Manhattan distance-based loss to enforce the difference between two feature representations. In Table~\ref{table:feature_loss_selection}, our loss design achieves a clear improvement over CCVC in most metrics. A deeper analysis of FDL is provided in Appendix A, specifically in Table~\ref{tab:late_corr} and Table~\ref{tab:fdl}.

\section{Conclusion}
In this work, we present a semi-supervised method based on a co-training framework for skin lesion segmentation. We have introduced the Dual Mask Ensemble module to enhance the model's ability to learn meaningful information from unlabeled data. Additionally, we demonstrate that our proposed feature discrepancy loss boosts model performance by encouraging distinct feature extraction, which avoids the collapse and diversifies the representation space of models, thus reducing the confirmation bias problem. Extensive experiments on benchmark datasets validate the robustness of the proposed approach. 

\section{Acknowledgement} 
We would like to thank the Graduate School INTHERAPI for its financial support.
%%%%%%%%% REFERENCES
% \bibliographystyle{splncs04}
\bibliography{midl25_094}
\pagebreak
\appendix

\section{Analysis on Feature Discrepancy Loss}

\begin{table}[h]
    \centering
    \begin{tabular}{cc} % Two columns
        \includegraphics[width=0.45\textwidth]{images/features/late/ours_corre1.jpg} & 
        \includegraphics[width=0.45\textwidth]{images/features/late/ours_corre2.jpg} \\
        (a) WA Unlabeled Samples & (b) Labeled Samples
    \end{tabular}
    \caption{Sample-wise Feature Correlation Using Cosine Similarity between both branches in the dual-network, where WA denotes the weak augmentation.}
    \label{tab:late_corr}
\end{table}
\noindent\textbf{Impact of Feature Discrepancy Loss:} To check whether the two parallel models utilize complementary or less-correlated features, we visualized the correlation between the features of each sample across branches using cosine similarity. As shown in Table~\ref{tab:late_corr}, the diagonal elements are close to zero, indicating that the feature representations from the two branches exhibit low similarity and suggesting that they capture distinct aspects of the data. Moreover, the observed differences between the labeled and unlabeled samples, where the labeled samples show slightly stronger decorrelation, support the idea that the feature discrepancy loss indeed encourages diverse feature learning. This addresses the concern about merely having shifted versions of similar feature vectors---if that were happening, we would expect more consistent and higher correlation patterns across the matrices. Instead, the observed variation and consistently low cosine similarity demonstrate that the models learn complementary and non-redundant features.

\begin{table}[h]
    \centering
    \setlength{\tabcolsep}{0pt}
    \begin{tabular}{ccc} % Three columns
        \includegraphics[width=0.33\textwidth]{images/features/early/ccvc_corre2.jpg} & 
        \includegraphics[width=0.33\textwidth]{images/features/middle/ccvc_corre2.jpg} & 
        \includegraphics[width=0.33\textwidth]{images/features/late/ccvc_corre2.jpg} \\
        & (a) Feature Correlations (CCVC) & \\
        \includegraphics[width=0.33\textwidth]{images/features/early/ours_corre2.jpg} & 
        \includegraphics[width=0.33\textwidth]{images/features/middle/ours_corre2.jpg} & 
        \includegraphics[width=0.33\textwidth]{images/features/late/ours_corre2.jpg} \\
        & (b) Feature Correlations (Ours) &
    \end{tabular}
    \caption{Visualization of the feature correlation of each sample. \textbf{Left}, \textbf{Middle}, and \textbf{Right} correspond to the \textbf{early}, \textbf{middle}, and \textbf{late} iterations, respectively.}
    \label{tab:fdl}
\end{table}

\noindent\textbf{Comparison to discrepancy loss of CCVC:} Table~\ref{tab:fdl} provides valuable insight into the distinct behavior of our proposed discrepancy loss compared to the CCVC discrepancy loss. The closeness of the diagonal cosine similarity values to zero reflects the degree of feature discrepancy between the two branches of the network. In the first figure ((a), CCVC), the consistently strong negative correlations along the diagonal indicate a more rigid and potentially less adaptive discrepancy mechanism. In contrast, our method (second figure, (b)) shows a more nuanced and flexible distribution of similarity values---this suggests that our approach captures a richer diversity in feature representations, likely leading to more robust and generalizable model performance. This highlights the advantage of our discrepancy loss in fostering complementary and well-differentiated feature learning between the branches.





\section{Rare skin lesion types}

\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\textwidth]{images/rebutal/r2_q1.png} % Path to your image
    \caption{Visualization of different types of skin lesions on the ISIC dataset.}
    \label{fig:isic_types}
\end{figure}

As seen in Figure~\ref{fig:isic_types}, the first row shows the original images with various challenges---including hair artifacts, small and large lesions, black frames, color calibration marks, blood vessels, water bubbles, and overall complex or `hard' cases. The second row illustrates the ground-truth segmentations, while the third row shows our model's predictions. Despite these difficult conditions, our approach consistently captures the lesion areas with high fidelity, maintaining accurate boundaries and minimizing false positives and negatives. Notably, even in cases with heavy occlusion (like hair) and small or irregularly shaped lesions, our method remains resilient, demonstrating its generalization ability across diverse and challenging data distributions. This highlights the robustness and effectiveness of our approach in real-world clinical scenarios.

\section{Cross-Domain Performance Evaluation} 

\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\textwidth]{images/rebutal/r2_q3.png} % Path to your image
    \caption{Visualization of cross-domain predictions between ISIC2018 and HAM10000. The top, middle, and bottom rows show the input images, ground-truth masks, and model predictions, respectively.}
    \label{fig:cross-domain}
\end{figure}

We perform cross-domain evaluation by training on one dataset and evaluating on the other. Specifically, we use the best-performing model weights from each dataset (ISIC2018 and HAM10000) and test them on the other dataset. Figure~\ref{fig:cross-domain} shows that while the segmentation performance generally transfers well, there are noticeable differences in mask quality, particularly in shape and boundary accuracy, indicating domain shifts between the datasets. This cross-domain evaluation highlights the model's robustness and its limitations when adapting to unseen data distributions.



\end{document}