




\section{Methods}

\begin{figure}[t]
\centering
\includegraphics[width=0.88\linewidth]{figures/framework_new.pdf}
\caption{The proposed framework adapts the widely used cross-supervision baseline (a) with uncertainty-guided supervision to obtain reliable pseudo-labels (b–c), and further incorporates multi-level mutual learning (d) to improve cross-network consistency. Panels (b–c) (in blue) operate only on unlabeled data $x_u$, whereas (d) is applied only to labeled data $x_l$. The two networks share the same architecture but are optimized independently. $y$, $\tilde{y}$, and $\tilde{y}^{uc}$ denote the ground-truth mask, the raw pseudo-label, and the uncertainty-guided pseudo-label, respectively. $\odot$ denotes the Hadamard (element-wise) product, and $U^b$ is the binary mask from uncertainty map $U$. $x_u^s$ represents a strongly intensity-augmented version of $x_u$.  
We define $\tilde{y}_1^{uc}=\tilde{y}_1\odot U_1^{b}$ and $\tilde{y}_2^{uc}=\tilde{y}_2\odot U_2^{b}$, and omit them for brevity.
}

\label{framework}
\end{figure}


We begin with a semi-supervised segmentation dataset $D$, which consists of limited labeled data $\{x_l, y_l\}$ and a large amount of unlabeled data $\{x_u\}$, where $x$ and $y$ represent the input images and their annotations, respectively. 
\subsection{Preliminaries}
\paragraph{Generic pseudo-label learning.}
Generic pseudo-label learning \cite{bellver2019budget} for a single network (referred to as Generic) first trains the model $f$, with forward pass $f(\cdot)$, on $\{x_l, y_l\}$ and then applies it to $x_u$ to obtain the logit map $f(x_u)$, which is binarized to form the pseudo-label $\tilde{y}_u$ and used as additional supervision. This can be described as:
\begin{equation}
\label{generic eq}
L = L_s + L_{p}
\end{equation}
where $L_s$ and $L_p$ denote the supervised and pseudo-supervised losses for $\{x_l, y_l\}$ and $\{x_u, \tilde{y}_u\}$, respectively.





\paragraph{Cross-supervision.} 
\label{cross-supervision parahraph}
Endo-SemiS employs two individual U-Nets \cite{ronneberger2015u} without sharing weights to obtain cross-supervision signals, as shown in Fig.~\ref{framework}(a). For a given input $x\in\{x_l,x_u\}$, the supervision can be simply extended from Generic (Eq.~\ref{generic eq}) as:
\begin{equation}
\label{pseudo_supervision_eq}
    L_p^{\text{cross}}(x) = L_{p}(f_1(x), \tilde{y}_2) + L_{p}(f_2(x), \tilde{y}_1)
\end{equation}
 where $L_{p}^{\text{cross}}$ represents the cross-supervision applied to both networks using the pseudo-label from the other model. The subscripts $i\in\{1,2\}$ indicate the corresponding network. 
 Note that $f_i(x)$ denotes the raw logit map produced by network $i$ for input $x$.
 % Note, $f(x)$ denotes the raw logit map in this work. 
 For brevity, we pass $f_i(x)$ directly to the loss function, since the conversion to probabilities is performed within the loss.


\subsection{Uncertainty-guided pseudo-label}
Uncertainty is introduced into the framework to mitigate confirmation bias (Fig.~\ref{framework}(b)). 
\textit{We hypothesize that uncertainty estimates allow us to identify unreliable pseudo-label regions and exclude them from supervision, so that training focuses on reliable areas.}



\paragraph{Aleatoric uncertainty.}\label{AC} We adopt the widely used weak-to-strong augmentation strategy \cite{sohn2020fixmatch}. Each unlabeled image $x_u$ first undergoes geometric augmentations, referred to as weak augmentation, and $x_u$ is further modified using intensity-based augmentations to obtain a strongly augmented image $x_u^s$. The corresponding pseudo-label $\tilde{y}_u$ is used to supervise the prediction from $x_u^s$. We also leverage CutMix \cite{yun2019cutmix} augmentation on $x_u$ and $x_u^s$ to further increase the robustness and segmentation performance.
% The CutMix is defined as: $
%     x_{cut} = M{_{cut}} \odot x_A + (1 - M{_{cut}}) \odot x_B
% $, where $x_A$ and $x_B$ follow the same augmentation pipeline, but undergo independent transformations to ensure diverse inputs. $M_{cut}$ is a randomly generated binary mask in each iteration. In this setup, $x_B$ serves as the auxiliary sample for $x_A$ and is only used for CutMix. The same $M_{cut}$ is used for both $x_u$ and $x_u^s$.



\begin{figure}[t]
\centering
\includegraphics[width=0.88\linewidth]{figures/uncertainty.pdf}
\caption{
(a) For an unlabeled image $x_u$, uncertainty-guided pseudo-labels $\tilde{y}_1^{uc}$ and $\tilde{y}_2^{uc}$ (green boxes) are obtained by dynamically filtering the raw pseudo-labels $\tilde{y}_1$ and $\tilde{y}_2$, leading to cleaner supervision. The label $y_u$ of the unlabeled image is shown for reference only.
(b) $M$ chooses the lower-uncertainty prediction at each pixel to obtain the joint pseudo-label $\tilde{y}_j^{uc}$ for more reliable supervision by correcting residual defects in $\tilde{y}_2^{uc}$ from (a).
(c) Compared with the pseudo-labels at epoch $n$ in (a), the  $\tilde{y}_1^{uc}$, $\tilde{y}_2^{uc}$ and $\tilde{y}_j^{uc}$  at epochs $n+1$ and $n+2$ become cleaner and more consistent with $y_u$, indicating the effectiveness of (a) and (b).}

\label{uncertainty}
\end{figure}


\paragraph{Epistemic uncertainty.}
The cross-supervision setup naturally accommodates stochastic regularization, so we insert Monte Carlo dropout \cite{kendall2017uncertainties} layers after each decoder convolution to estimate uncertainty and improve the reliability of pseudo-labels, which further improves segmentation performance \cite{yu2019uncertainty}. Specifically, as shown in Fig.~\ref{uncertainty}(a), each unlabeled sample $x_u$ is passed through both networks multiple times to estimate entropy-based uncertainty. For each network $f_i$ $(i\in\{1,2\})$, the final output probability map is computed as $P_i=\frac{1}{K}\sum_{k=1}^{K} p_{i,k}$, where $p_{i,k}$ denotes the probability map in the $k$-th forward pass of network $i$, and we set $K=5$. The entropy-based epistemic uncertainty map is derived as $U_i=\frac{1}{K}\sum_{k=1}^{K} h(p_{i,k})$, with $h(p)=-p\log p-(1-p)\log(1-p)$.



% Specifically, as shown in Fig.~\ref{uncertainty}(a), each unlabeled data $x_u$ is passed through both networks multiple times to estimate entropy-based uncertainty. 
% For each network, the final output probability map is computed as
% $P = \frac{1}{K} \sum_{k=1}^{K} p_k$, 
% where $p_k$ denotes probability map in each forward pass, and we set $K=5$. The entropy-based epistemic uncertainty ($U$) is derived as: $U = \frac{1}{K} \sum_{k=1}^{K} h(p_k)$,  with $h(p_k) = - p_k \log p_k - (1 - p_k) \log (1 - p_k)$.

\paragraph{Dynamic filtering.} Unlike previous works that use a fixed threshold \cite{sohn2020fixmatch}, the entire uncertainty map \cite{luo2022semi-uncertainty} or quantile-based selection \cite{yu2019uncertainty,yang2023revisiting}, we use a dynamic and data-driven thresholding strategy. Given $U_i$, the threshold is set as $T_i=\min[\mu(U_i)+\sigma(U_i),\, U_{i,0.95}]$, where $\mu(U_i)$, $\sigma(U_i)$ and $U_{i,0.95}$ denote the mean, standard deviation and 95\textsuperscript{th} percentile of $U_i$, respectively. Our adaptive thresholding effectively handles long-tail distributions and noisy predictions, yielding a more reliable uncertainty-based binary mask $U_i^{b}=\mathbbm{1}(U_i<T_i)$, where $\mathbbm{1}$ denotes the indicator function (see Fig.~\ref{uncertainty}(a)). The final uncertainty-guided pseudo-label for $x_u$ is then formulated as $\tilde{y}^{uc}_{i}=\tilde{y}_{i}\odot U_i^{b}$.






\subsection{Joint pseudo-label supervision} %The incorporation of aleatoric and epistemic uncertainty increases complexity. %While it increases robustness, it may also introduce additional noise into the pseudo-labels, particularly when supervising $x_u^s$, where the pseudo-label quality should be sufficient to guide harder samples. 

    Even with the incorporation of uncertainty estimates, the pseudo-labels may still be too noisy to provide appropriate supervision for harder samples. Most existing methods solely rely on the $\tilde{y}_u$ from each network for supervision, which may not be sufficient. To address this, \textit{our hypothesis is that joint supervision can effectively refine pseudo-labels by leveraging complementary information from both networks, providing more reliable supervision for challenging samples.}




As shown in Fig.~\ref{uncertainty}(b), the joint pseudo-label $\tilde{y}^{uc}_{j}$ is constructed in three steps: 
(1) Given the uncertainty maps $U_1$ and $U_2$ from the two networks in Endo-SemiS, we create a binary mask 
$M = \mathbbm{1}(U_1 < U_2)$ that selects the more confident prediction at each pixel. 
(2) Using this mask, we form the joint probability 
$P_j = M \odot P_1 + (1 - M) \odot P_2$ and obtain the raw pseudo-label $\tilde{y}_{j}$ by thresholding $P_j$ at $0.5$, while the joint uncertainty map is defined as 
$U_j = M \odot U_1 + (1 - M) \odot U_2$. 
 (3) Finally, we apply the dynamic filtering scheme to $U_j$ to obtain the binary uncertainty mask $U^b_j$ and compute the final uncertainty-guided joint pseudo-label as $\tilde{y}^{uc}_{j} = \tilde{y}_{j} \odot U^b_j$. 



For an unlabeled image $x_u$ and its strongly augmented version $x_u^s$, 
we extend the cross-supervision loss in Eq.~\ref{pseudo_supervision_eq} to a weak–strong setting, 
where pseudo-labels are generated from the weakly augmented image (see Sec.~\ref{AC}) and used to supervise the strongly augmented image. Together with uncertainty-guided pseudo-label learning,
the cross pseudo-supervised loss $L_p^{\text{cross}}(x_u, x_u^s)$ is defined as:
\begin{equation}
L_p^{\text{cross}}(x_u, x_u^s)
=
\underbrace{
    L_p\big(f_1(x_u), \tilde{y}^{uc}_{2}\big)
  + L_p\big(f_2(x_u), \tilde{y}^{uc}_{1}\big)
}_{\text{uncertainty-guided cross-supervision}}
+
\underbrace{
    L_p\big(f_1(x_u^s), \tilde{y}^{uc}_{j}\big)
  + L_p\big(f_2(x_u^s), \tilde{y}^{uc}_{j}\big)
}_{\text{joint pseudo-label supervision}}
\end{equation}







\subsection{Multi-level mutual learning}
Individual networks may independently learn different representations, which can cause divergence and inconsistencies in their predictions. If one network is consistently wrong, it can bias the other network and propagate errors. We propose a multi-level mutual learning approach to mitigate this variability by aligning the learning trajectories of both models and promoting consistency in their predictions. Although it does not guarantee correctness on unlabeled data, it reduces randomness and stabilizes the learning process, making models less likely to reinforce extreme errors. 



We use the labeled data to apply mutual learning between the two networks. This encourages similarity at both the encoders and the decoders. The consistency from encoder and bottleneck features helps align feature representations and reduce variability in learned embeddings. 
Unlike previous work, which enforces the similarity between the probability maps \cite{zhang2018deep}, we enforce prediction consistency at the decoder level by aligning the logit maps of the networks, which is particularly important when generating pseudo-labels. Since pseudo-labels are filtered based on uncertainty thresholds, mutual learning stabilizes training by reducing prediction variance between networks, making the pseudo-label selection process more reliable. 


% \xx{Unlike previous work, which enforces consistency in the probability space \cite{zhang2018deep}, we enforce consistency directly on the logit maps, which is particularly important when generating pseudo-labels. Operating in the logit space not only aligns the foreground versus background decisions, but also penalizes cases where one network is very confident and the other is only moderately confident, thereby encouraging the two networks to match their confidence levels. Since pseudo-labels are filtered by confidence thresholds, this stronger alignment in confidence reduces prediction variance between networks and stabilizes the pseudo-label selection process.}






% For a labeled image $x_l$, let $f_1^e, f_1^b, f_1$ and $f_2^e, f_2^b, f_2$ denote the
% first encoder feature maps, bottleneck features, and logit maps of the two networks, respectively.
% The multi-level mutual learning loss is defined as:
% \begin{equation}
% L_m(x_l) = L_{m_{1\rightarrow2}}(x_l) + L_{m_{2\rightarrow1}}(x_l),
% \end{equation}
% \begin{equation}
% L_{m_{1\rightarrow2}}(x_l)
% = 0.5\, L_{\mathrm{ssim}}(f_1^e, f_2^e)
% + 0.5\, L_{\mathrm{KL}}(f_1^b, f_2^b)
% + L_{\mathrm{mse}}(f_1, f_2)
% \end{equation}
% and $L_{m_{2\rightarrow1}}(x_l)$ is defined by swapping the indices $1$ and $2$.


For a labeled image $x_l$, let $f_1^e, f_1^b, f_1^l$ and $f_2^e, f_2^b, f_2^l$ denote the
first encoder feature maps, bottleneck features, and logit maps of the two networks, respectively.
The multi-level mutual learning loss is defined as:
\begin{equation}
L_m(x_l) =
 L_{\mathrm{ssim}}\!\big(f_1^e, f_2^e\big)
+  0.5\, \big( L_{\mathrm{kl}}(p_1^b \parallel p_2^b)
                  + L_{\mathrm{kl}}(p_2^b \parallel p_1^b) \big)
+ 2\, L_{\mathrm{mse}}\!\big(f_1^l, f_2^l\big)
\end{equation}
where $p_i^b = \mathrm{softmax}(f_i^b)$ denotes the channel-wise probability distribution of the bottleneck feature map, $i\in\{1,2\}$.





\paragraph{Total objective function.}
For labeled and unlabeled data, the total objectives are:
\begin{equation}
L(x_l) = L_s(x_l) + 0.5\, L_p^{\text{cross}}(x_l) + 0.5\, L_m(x_l), \quad L(x_u) = 0.5\, L_p^{\text{cross}}(x_u, x_u^s)
\end{equation}
% where $L_s$ is the supervised loss, $L_p(\cdot)$ and $L_p(\cdot,\cdot)$ denote the pseudo-supervised losses on $x_l$ and on the weak–strong pair $(x_u, x_u^s)$, respectively. The loss weights are set empirically, as our goal is to demonstrate a broadly applicable method rather than tune for a specific task.





\subsection{Spatiotemporal (ST) correction at frame level}
Segmentations produced on semi-supervised frames may exhibit frame-level inconsistencies due to the lack of temporal information, which appear as isolated false positive (FP) or false negative (FN) frames. As a post-processing step, we leverage the inherent spatiotemporal information in video clips and introduce a separate frame-level correction model ($f_{st}$) to mitigate FP and FN frames.

We denote the $n$-th test frame by $x_n$ and its predicted binary segmentation mask by $\tilde{y}_n$. For each frame $x_n$, we define $R_n$ as the total number of foreground pixels in $\tilde{y}_n$. Our key assumption is that adjacent frames should not exhibit large discrepancies in $R_n$. In particular, for FN frames, the target regions overlap across these frames, whereas for FP frames, the background region remains consistent (or contains little foreground). These assumptions motivate our inter-frame FP/FN detection and correction. To enforce temporal consistency, we classify $x_n$ as an FP frame when $R_n > 0$ and $R_{n-1} = R_{n+1} = 0$. Similarly, we classify $x_n$ as an FN frame when $R_n = 0$, $R_{n-1} > r$, and $R_{n+1} > r$. We set $r = \tfrac{1}{4} H W$, where $H$ and $W$ denote the frame height and width.





To refine the predictions, we train a separate correction model $f_{st}$ that operates on a local temporal window.
% Given training frames $x_{n-2}, \ldots, x_{n+2}$ and their corresponding masks $\{{y}_{n-2},\ldots,{y}_{n+2}\}$
Given labeled training pairs $\{(x_{n-2}, y_{n-2}), \ldots, (x_{n+2}, y_{n+2})\}$ sampled from $\{x_l, y_l\}$,
we concatenate them along the channel dimension to form $c_n$, and use this as input to predict a refined segmentation for the central frame $x_n$. During training, random corruptions are introduced to the masks with basic morphological operations or by setting them to zero. We use the MSE loss to enforce spatiotemporal consistency, and the total loss is: 
\begin{equation}
L = L_s(f_{st}(c_n), y_n)
  + 0.25 \sum_{k \in \{-1,1\}} L_{\mathrm{mse}}(f_{st}(c_n), y_{n+k})
  + 0.1  \sum_{k \in \{-2,2\}} L_{\mathrm{mse}}(f_{st}(c_n), y_{n+k})
\end{equation}
This formulation allows the network to leverage spatiotemporal information while preventing it from overly dominating the training process, thereby accommodating potential variations between frames. For inference, the correction model $f_{st}$ is applied to frames classified as FP or FN, and uses adjacent masks to satisfy the local-consistency assumption for challenging ureteroscopy videos.

















