% Disabled (comment environment) variant of the comparison table: CAMUS, BUSI and
% BUS-BRA only, Dice reported as mean +/- std together with explicit p-values.
% NOTE: reuses \label{tab:general_sota}, which the active table also defines;
% rename one of them before re-enabling this block.
\begin{comment}
\begin{table}[htbp]
\centering
\caption{%
Comparison with transformer-based and foundation models across diverse medical imaging datasets.
Values are reported as mean $\pm$ standard deviation.
$p$-values are computed for Dice score significance vs.\ ICL-NoiseUNet (Wilcoxon signed-rank pairwise tests).%
}
\label{tab:general_sota}
\Large
% Height is "!" so the table is scaled uniformly; forcing both width and height
% stretches the glyphs.
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccccc}
\toprule
\textbf{Model} &
\multicolumn{2}{c}{\textbf{CAMUS}} &
\multicolumn{2}{c}{\textbf{BUSI}} &
\multicolumn{2}{c}{\textbf{BUS-BRA}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
& \textbf{Dice (mean $\pm$ std)} & \textbf{$p$-value}
& \textbf{Dice (mean $\pm$ std)} & \textbf{$p$-value}
& \textbf{Dice (mean $\pm$ std)} & \textbf{$p$-value} \\
\midrule
% Every Dice cell uses the same "$\pm$" notation (previously a mix of "+-" and a raw glyph).
UNet-Transformers~\cite{unetr3d} & 0.921 $\pm$ 0.039 & $<\!1\times10^{-6}$ & 0.771 $\pm$ 0.216 & 0.045 & 0.923 $\pm$ 0.070 & 0.013 \\
SwinUNet~\cite{cao2021swinunetunetlikepuretransformer} & 0.912 $\pm$ 0.043 & $<\!1\times10^{-12}$ & 0.642 $\pm$ 0.240 & $<\!1\times10^{-10}$ & 0.911 $\pm$ 0.070 & 0.007 \\
nnU-Net~\cite{isensee2018nnunetselfadaptingframeworkunetbased} & 0.915 $\pm$ 0.046 & $<\!1\times10^{-20}$ & 0.612 $\pm$ 0.250 & $<\!1\times10^{-14}$ & 0.921 $\pm$ 0.070 & 0.016 \\
MultiverSeg~\cite{wong2025multiversegscalableinteractivesegmentation} & 0.882 $\pm$ 0.078 & $<\!1\times10^{-20}$ & 0.643 $\pm$ 0.231 & $<\!1\times10^{-10}$ & 0.781 $\pm$ 0.201 & $<\!1\times10^{-19}$ \\
UltraSAM~\cite{Meyer_2025} & 0.901 $\pm$ 0.065 & $1\times10^{-3}$ & 0.746 $\pm$ 0.201 & $1\times10^{-4}$ & 0.832 $\pm$ 0.102 & $<\!1\times10^{-12}$ \\
MedSAM2~\cite{ma2025medsam2segment3dmedical} & 0.892 $\pm$ 0.051 & $<\!1\times10^{-20}$ & 0.751 $\pm$ 0.216 & $4\times10^{-3}$ & 0.794 $\pm$ 0.160 & $<\!1\times10^{-20}$ \\
\midrule
\textbf{ICL-NoiseUNet (ours)} & \textbf{0.940 $\pm$ 0.028} & -- & \textbf{0.824 $\pm$ 0.146} & -- & \textbf{0.935 $\pm$ 0.065} & -- \\
\bottomrule
\end{tabular}%
}
\end{table}
\end{comment}
% Disabled (comment environment) variant of the comparison table: four datasets,
% Dice with the explicit Wilcoxon p-value against ICL-NoiseUNet in each column pair.
% NOTE: reuses \label{tab:general_sota}, which the active table also defines;
% rename one of them before re-enabling this block.
\begin{comment}
\begin{table}[ht]
\centering
\caption{%
Comparison with current state-of-the-art models across diverse medical imaging datasets. We compute the $p$-value for Dice score significance against ICL-NoiseUNet using Wilcoxon signed-rank pairwise tests.%
}
\label{tab:general_sota}
\Large
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lcccccccc}
\toprule
\textbf{Model} &
\multicolumn{2}{c}{\textbf{CAMUS}} &
\multicolumn{2}{c}{\textbf{BUSI}} &
\multicolumn{2}{c}{\textbf{BUS-BRA}} &
\multicolumn{2}{c}{\textbf{RADBOUD}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
& \textbf{Dice} & \textbf{$p$-value}
& \textbf{Dice} & \textbf{$p$-value}
& \textbf{Dice} & \textbf{$p$-value}
& \textbf{Dice} & \textbf{$p$-value} \\
\midrule
UNet-Transformer~\cite{unetr3d} & 0.921 & $<\!1\times10^{-6}$ & 0.761 & 0.045 & 0.923 & 0.013 & 0.881 & $<\!1\times10^{-16}$ \\
SwinUNet~\cite{cao2021swinunetunetlikepuretransformer} & 0.912 & $<\!1\times10^{-12}$ & 0.642 & $<\!1\times10^{-10}$ & 0.911 & 0.007 & 0.900 & $<\!1\times10^{-16}$ \\
nnU-Net~\cite{isensee2018nnunetselfadaptingframeworkunetbased} & 0.915 & $<\!1\times10^{-20}$ & 0.612 & $<\!1\times10^{-14}$ & 0.921 & 0.016 & 0.921 & $<\!1\times10^{-16}$ \\
MultiverSeg~\cite{wong2025multiversegscalableinteractivesegmentation} & 0.882 & $<\!1\times10^{-20}$ & 0.643 & $<\!1\times10^{-10}$ & 0.781 & $<\!1\times10^{-19}$ & 0.901 & $<\!1\times10^{-16}$ \\
% Dice cells are plain text throughout (the UltraSAM RADBOUD cell was in math mode).
UltraSAM~\cite{Meyer_2025} & 0.901 & $1\times10^{-3}$ & 0.746 & $1\times10^{-4}$ & 0.855 & $<\!1\times10^{-12}$ & 0.935 & $4\times10^{-4}$ \\
MedSAM2~\cite{ma2025medsam2segment3dmedical} & 0.892 & $<\!1\times10^{-20}$ & 0.751 & $4\times10^{-3}$ & 0.794 & $<\!1\times10^{-16}$ & 0.951 & $3\times10^{-3}$ \\
\midrule
\textbf{ICL-NoiseUNet (ours)} & \textbf{0.940} & -- & \textbf{0.804} & -- & \textbf{0.931} & -- & \textbf{0.961} & -- \\
\bottomrule
\end{tabular}%
}
\end{table}
\end{comment}
% Active comparison table (tab:general_sota): Dice per dataset plus a significance
% marker for the Wilcoxon signed-rank test against ICL-NoiseUNet.
% Uses booktabs rules, \multirow and \resizebox, so the preamble must load
% booktabs, multirow and graphicx.
\begin{table}[htbp]
\centering
\caption{Comparison with current state-of-the-art models across diverse medical imaging datasets. Significance levels: $\dagger$~$p < 0.05$, $\ddagger$~$p < 0.01$, $\mathsection$~$p < 0.001$. We compute the $p$-value for Dice score significance against ICL-NoiseUNet using Wilcoxon signed-rank pairwise tests.}
\label{tab:general_sota}
\Large
\resizebox{0.8\columnwidth}{!}{%
\begin{tabular}{lcccccccc}
\toprule
\multirow{2}{*}{Model} &
\multicolumn{2}{c}{CAMUS} &
\multicolumn{2}{c}{BUSI} &
\multicolumn{2}{c}{BUS-BRA} &
\multicolumn{2}{c}{RADBOUD} \\
& Dice & Sig. & Dice & Sig. & Dice & Sig. & Dice & Sig. \\
\midrule
UNEt TRansformers~\cite{unetr3d} & 0.921 & $\mathsection$ & 0.761 & $\dagger$ & 0.923 & $\dagger$ & 0.881 & $\mathsection$ \\
SwinUNet~\cite{cao2021swinunetunetlikepuretransformer} & 0.912 & $\mathsection$ & 0.642 & $\mathsection$ & 0.911 & $\ddagger$ & 0.900 & $\mathsection$ \\
% NOTE(review): nnU-Net BUSI (0.742) and MedSAM2 RADBOUD (0.940) differ from the
% disabled p-value draft earlier in this file (0.612 and 0.951) -- confirm which
% values are current.
nnU-Net~\cite{isensee2018nnunetselfadaptingframeworkunetbased} & 0.915 & $\mathsection$ & 0.742 & $\mathsection$ & 0.921 & $\dagger$ & 0.921 & $\mathsection$ \\
MultiverSeg~\cite{wong2025multiversegscalableinteractivesegmentation} & 0.882 & $\mathsection$ & 0.643 & $\mathsection$ & 0.781 & $\mathsection$ & 0.901 & $\mathsection$ \\
% NOTE(review): the disabled p-value draft lists UltraSAM at p = 1e-4 (BUSI) and
% p = 4e-4 (RADBOUD), which would map to \mathsection (p < 0.001) rather than
% \ddagger -- confirm against the latest test results.
UltraSAM~\cite{Meyer_2025} & 0.901 & $\ddagger$ & 0.746 & $\ddagger$ & 0.855 & $\mathsection$ & 0.935 & $\ddagger$ \\
MedSAM2~\cite{ma2025medsam2segment3dmedical} & 0.892 & $\mathsection$ & 0.751 & $\ddagger$ & 0.794 & $\mathsection$ & 0.940 & $\ddagger$ \\
\midrule
ICL-NoiseUNet (ours) & \textbf{0.940} & -- & \textbf{0.804} & -- & \textbf{0.931} & -- & \textbf{0.961} & -- \\
\bottomrule
% Trailing % keeps the end-of-line space out of the box that \resizebox scales.
\end{tabular}%
}
\end{table}
