\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\jmlrvolume{-- 142}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for Publication at MIDL 2026}
\title[Medical Image Segmentation with Fractional KAN and Mamba]{MedKamba: A Novel Approach Integrating State-Space Models and Fractional Kolmogorov–Arnold Networks for Medical Image Segmentation}

\usepackage[normalem]{ulem}
\usepackage{mwe} % to get dummy images
 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Amit Shakya} \orcid{0009-0007-3379-9268} \Email{amitshakya@yamaha-motor-india.com}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 
% \AND
\Name{Akanksha Yadav} \Email{akanksha@yamaha-motor-india.com}\\
\Name{Rupesh Kumar} \Email{rupeshkumar@yamaha-motor-india.com}\\
\Name{Lalit Sharma} \Email{LSharma@yamaha-motor-india.com}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
\addr Emerging Technologies and Innovation Lab, Yamaha Motor Solutions India 
}

\begin{document}

\maketitle

\begin{abstract}
Medical image segmentation plays a crucial role in healthcare, serving as a key step in disease diagnosis and treatment planning. Convolutional neural networks (CNNs) are limited by their restricted receptive fields, whereas Transformer-based models suffer from quadratic computational cost. Recent advances such as Mamba, a selective state-space model with linear complexity, and its vision-oriented variant, the Visual State Space (VSS) model, have shown strong ability to capture long-range dependencies efficiently. However, they still exhibit shortcomings in segmentation tasks, including loss of pixel-level structural information and inefficient channel utilization. To address these shortcomings, we introduce the VSSM-based Local Aware Channel Enhancement (LACE) block, which incorporates local enhancement and channel attention to better preserve spatial detail. Building on this block, we propose MedKamba, a novel U-shaped segmentation approach that employs a hybrid encoder with CNNs and LACE blocks to effectively capture both local and global contextual information. While the U-Net backbone remains highly efficient, its traditional skip connections rely on simple scale-matched fusion, limiting cross-scale interaction. To overcome this, we redesign the skip connections using Fractional Kolmogorov–Arnold Networks (\textit{f}-KANs) to generate channel-wise attention weights from features aggregated across multiple stages. Experiments on two benchmark datasets demonstrate that MedKamba consistently outperforms competing approaches and produces more visually accurate segmentation results.
Code is available at \url{https://github.com/amit-shakya-28/MedKamba-Medical-Image-Segmentation-using-KAN-and-Mamba}.

\end{abstract}

\begin{keywords}
Medical Image Segmentation, State Space Models, Fractional KANs.
\end{keywords}

\section{Introduction}
Over the past decade, medical image segmentation has become an indispensable component of computer-aided diagnosis and surgical planning. It enables precise delineation of anatomical structures and lesions, thereby improving diagnostic accuracy and treatment efficiency. Traditional segmentation methods using handcrafted features and thresholds often struggle with the complexity and variability of medical data \cite{2}. 

With the advancement of deep learning, CNNs and Transformer models have greatly improved medical image segmentation, with CNNs like U-Net \cite{ronneberger2015unet}, ResNet \cite{targ2016resnet}, nnU-Net \cite{isensee2021nnunet}, and SegResNet \cite{tang2023segresnet} effectively capturing local structures via shared kernels and pooling. However, CNNs are limited by their local receptive fields, making it difficult for them to capture global contextual information. Despite the theoretical expansion offered by dilation or deeper stacking, the effective receptive field achieved in real models is significantly smaller. Moving beyond convolutional constraints, Transformer-based architectures such as Swin Transformer \cite{liu2021swin}, UNETR \cite{hatamizadeh2022unetr}, and SwinUNETR \cite{hatamizadeh2022unetr}, leverage multi-head self-attention (MHSA) to model long-range relationships, enabling them to effectively capture global contextual information within medical images. 
However, computing attention between all token pairs causes quadratic computational and memory cost, which is challenging for high-resolution medical images. Although expressive, these models often overfit small medical datasets and need large amounts of data.

Recent advances in Structured State-Space Models (SSMs) have enabled highly efficient sequence modeling for vision tasks. Notably, Mamba~\cite{gu2023mamba}, a newly introduced SSM, stands out for its hardware-efficient design and selective scanning, enabling robust global feature modeling with linear complexity. These strengths make Mamba a strong, efficient alternative to Transformers, with studies showing comparable performance across NLP and medical imaging tasks. For example, U-Mamba~\cite{ma2024u} incorporated a hybrid module into the nnU-Net framework~\cite{isensee2021nnunet}, combining the local feature modeling strengths of CNNs with Mamba’s global representation capability, representing an early attempt to integrate Mamba blocks into medical image analysis. Building on Mamba, the Visual State Space (VSS) model~\cite{liu2024vmamba} incorporated a 2D Selective Scanning (SS2D) mechanism that scans images in four directions, enabling the extraction of richer and more diverse contextual cues. Swin-UMamba~\cite{liu2024swin} advanced earlier Mamba-based frameworks by integrating ImageNet-pretrained features, effectively merging broad visual priors with Mamba’s efficient global representation learning.
 
However, because Mamba is inherently designed for 1D sequential inputs, applying it directly to 2D images requires flattening spatial features, which disrupts neighborhood continuity and causes local pixel forgetting. Moreover, capturing long-range dependencies demands a large number of hidden states, introducing substantial channel redundancy and weakening discriminative channel learning. To mitigate these issues, we introduce the Local Aware Channel Enhancement (LACE) block, which augments Mamba with a local convolutional module to restore spatial locality lost during flattening. Additionally, a channel-attention mechanism suppresses redundant channels that arise from large hidden-state dimensions, while a learnable scaling factor modulates the skip connection for improved feature calibration. 
Many recent studies highlight the importance of incorporating multi-scale and multi-stage information in medical image segmentation. For instance, \cite{ruan2022malunet} proposed MALUNet, a lightweight U-shaped network that integrates four specialized modules. To enhance cross-stage feature interaction, MALUNet employs the Channel Attention Bridge (CAB) and Spatial Attention Bridge (SAB), which generate channel and spatial attention maps, respectively. These modules enable significant channel reduction while preserving strong segmentation performance. However, the MLP-based CAB still suffers from limited interpretability and quadratic complexity in feature dimensions.
 

To overcome the limitations discussed above, we propose MedKamba, a U-shaped architecture whose encoder combines CNN operations in the early stages with our VSSM-based LACE blocks in the deeper stages, enabling robust local–global feature modeling. Meanwhile, the skip connections are enhanced using a KAN-based Spatial–Channel Attention (\textit{f}-KSCA) module, providing interpretable and efficient cross-stage feature refinement.
Furthermore, our approach incorporates Fractional KANs (\textit{f}-KAN) with a Fractional Jacobi Activation Function (FJAF), a learnable polynomial-inspired activation with tunable fractional order. Unlike fixed polynomial bases (e.g. Chebyshev or Legendre), FJAF adapts its functional shape during training, offering more expressive nonlinear modeling and improved training stability  \cite{aghaei2025fkan}.
\textit{To the best of the authors'
knowledge, this is one of the first efforts that integrates State Space Models and fractional KANs for medical image segmentation.}

In summary, this paper makes the following contributions: 
\begin{itemize}
    \item We propose MedKamba, a U-shaped medical image segmentation model that combines a State Space Model with Fractional KAN, achieving an effective balance between accuracy and computational efficiency.
    \item We develop the Local-aware Channel Enhancement (LACE) block, which enhances Mamba’s capability by restoring local spatial details and suppressing redundant channels.
    \item We present a Fractional Kolmogorov–Arnold Spatial-channel attention block (\textit{f}-KSCA) that integrates multi-stage global contextual information during decoding while improving multi-scale local features. \textit{f}-KSCA enables the block to model global relationships and assign context-sensitive weights to feature maps.
    \item Our method demonstrates strong performance improvements on two distinct medical imaging datasets, ISIC 2018 for skin lesion segmentation and BUSI for breast ultrasound segmentation, highlighting its ability to generalize across varied anatomical structures and imaging characteristics.
\end{itemize}


\section{Methodology}
\textbf{Overview}: Figure~\ref{fig:proposed_architecture} shows the overall architecture of the proposed MedKamba, comprising an encoder, a decoder, and skip connections. The encoder includes three convolutional blocks followed by two newly introduced LACE blocks, while the decoder applies two LACE blocks first and ends with three convolutional blocks. The encoder gradually compresses feature maps by a factor of two at each stage, while the decoder symmetrically restores their size.  The number of channels at each stage, denoted as $C_1$ to $C_5$, is configurable; in our experiments, we use 16, 32, 128, 160, and 256 channels, respectively. The skip connections are enhanced by the \textit{f}-KSCA block. In the following sections, we describe each component in detail.


\subsection{Feature Modeling in MedKamba}
An input image $I$ with shape $B \times C \times H \times W$ (representing batch size, channels, height, and width) is processed by the model using three main stages: an encoder that extracts features, skip connections that preserve important information, and a decoder that reconstructs the final output. 
Within the encoder, the image first goes through a convolutional block that produces 
the feature map $x_{e,1} \in \mathbb{R}^{B \times 16 \times \frac{H}{2} \times \frac{W}{2}}$. 
It then flows through two additional convolutional modules, generating higher-level features 
$x_{e,2} \in \mathbb{R}^{B \times 32 \times \frac{H}{4} \times \frac{W}{4}}$ and 
$x_{e,3} \in \mathbb{R}^{B \times 128 \times \frac{H}{8} \times \frac{W}{8}}$. 
Each module applies two $3 \times 3$ convolutions followed by a max-pooling layer to downsample 
the spatial resolution. Here, $x_{e,i}$ denotes the feature map generated at the $i$-th encoder level.
The feature map $x_{e,3}$ is first downsampled using a Patch Embedding layer, implemented as a $2 \times 2$ convolution with stride $2$, which decreases the spatial dimensions while expanding the channel depth. The output of this operation is then passed through the VSSM-based LACE block, producing the feature map $x_{e,4} \in \mathbb{R}^{B \times 160 \times \frac{H}{16} \times \frac{W}{16}} $. A second downsampling step followed by another LACE block yields the deepest feature map $x_{e,5} \in \mathbb{R}^{B \times 256 \times \frac{H}{32} \times \frac{W}{32}}$. Section 2.2 describes the LACE block in detail. 
The skip-connection pathway employs the \textit{f}-KSCA to refine the feature maps 
$x_{e,1}$, $x_{e,2}$, $x_{e,3}$ and $x_{e,4}$ from the first four encoder stages by emphasizing the most informative channel and spatial responses. The enhanced outputs, denoted as $s_{1}$, $s_{2}$, $s_{3}$, $s_{4}$, are then passed to the decoder for feature fusion. Section 2.3 provides a comprehensive description of the \textit{f}-KSCA module. The decoder reconstructs the spatial details in a manner that parallels the encoder. 
We begin by upsampling the feature map $x_{e,5}$ using interpolation, where a linear layer increases 
the channel dimension and a rearrangement converts it to 
$\mathbb{R}^{B \times \frac{H}{16} \times \frac{W}{16} \times 256}$. 
This representation is fused with the skip-connected feature $s_{4}$ (the \textit{f}-KSCA-refined version of $x_{e,4}$) by channel concatenation. 
Repeating the same upsampling procedure generates 
$x_{d,4} \in \mathbb{R}^{B \times \frac{H}{8} \times \frac{W}{8} \times 160}$. 
Subsequently, three convolutional stages progressively increase the spatial resolution and refine the 
decoded features. A final $1 \times 1$ convolution compresses the channel dimension and generates the
segmentation output.

\subsection{Local Aware Channel Enhancement (LACE) block}
Earlier Transformer-based segmentation models generally employ a 
block structure that follows the sequence:
\textit{Norm–Attention–Norm–MLP.}
% \textit{Norm} $\rightarrow$ \textit{Attention} $\rightarrow$ \textit{Norm} $\rightarrow$ \textit{MLP}.
Although both SSMs and Attention can capture global dependencies, they exhibit 
different behaviors, and thus directly substituting Attention with an SSM leads to sub-optimal performance. This suggests that designing a new block specifically for segmentation models could be highly effective.

To this end, we introduce the Local Aware Channel Enhancement (LACE) to tailor the SSM module for medical image segmentation. As shown in Fig. \ref{fig:proposed_architecture}(a), for an input deep feature map 
$F^D \in \mathbb{R}^{B \times C \times H \times W }$, the processing starts with a  LayerNorm (LN) operation, and then uses the 
Vision State-Space Module (VSSM)~\cite{liu2024vmamba} to model long-range spatial dependencies.  We incorporate a learnable scale factor $s \in \mathbb{R}^C$ to modulate skip-connection features.
\begin{equation}
    V^l=\mathrm{VSSM}(\mathrm{LN}(F^D)) + s \cdot F^D
\end{equation}
Since SSMs handle feature maps as one-dimensional token sequences, the chosen 
flattening strategy plays a major role in determining how many neighboring pixels remain adjacent 
in the sequence. When using the four-direction unfolding strategy of~\cite{liu2024vmamba_arxiv}, each anchor pixel retains only four immediate neighbors in the 1D token sequence. 
Consequently, flattening the 2D feature map places many adjacent pixels far from each other, which may 
lead to the loss of important local pixel information. To regain neighborhood information, we apply a local convolution after the VSSM.
Specifically, we first apply LayerNorm to $V^{l}$ and then use convolutional layers to restore 
the information of the local features.
To remain computationally efficient, the convolution layer employs a bottleneck 
design: first, the channel dimension is reduced by a factor $\beta$, producing features of size 
$\mathbb{R}^{H \times W \times \frac{C}{\beta}}$, and then restored to the original number of channels. Because SSMs rely on multiple hidden states for modeling long-range interactions, they often introduce redundant channel activations, as illustrated in Fig.~\ref{fig:proposed_architecture}(a). 
To alleviate this redundancy, we integrate Channel Attention (CA)~\cite{hu2018senet} into the LACE block, enabling 
the SSM to learn diverse channel features while CA selects the most informative ones. 
Finally, a learnable scale factor $s' \in \mathbb{R}^{C}$ modulates the residual pathway
to obtain the final output:
\begin{equation}
    F^{D'} = \mathrm{CA}(\mathrm{Conv}(\mathrm{LN}(V^{l}))) + s' \cdot V^{l}.
\end{equation}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{Images/archs.png}
\caption{Overview of the proposed  architecture.}
\label{fig:proposed_architecture}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Vision State-Space Module}
Transformer-based segmentation networks frequently divide the input into tiny patches~\cite{chen2021ipt} or use shifted window attention~\cite{liang2021swinir}, which restricts global interaction throughout the image, in order to maintain computational efficiency.  As shown in Fig. \ref{fig:proposed_architecture}(b), the Vision State-Space Module (VSSM) uses a state-space formulation to model long-range dependencies.  In accordance with~\cite{liu2024vmamba_arxiv}, two parallel branches process the input feature $X \in \mathbb{R}^{H \times W \times C}$. 
The first branch uses a linear layer to increase the channel dimension to $\lambda C$. This is followed by a depth-wise convolution, SiLU activation~\cite{shazeer2020glu}, a 2D-SSM layer, and LayerNorm.  The second branch projects the channels to $\lambda C$ with a linear layer and then applies SiLU activation.  Lastly, a Hadamard (element-wise) product is employed to fuse the outputs from both branches. 

The element-wise product of the two branches is projected back 
to $C$ channels to form $X_{\text{out}}$:
% \begin{equation}
%     \[
%     \begin{aligned}
%     X_1 &= \mathrm{LN}\big(\mathrm{2D\text{-}SSM}(\mathrm{SiLU}(\mathrm{DWConv}(\mathrm{Linear}(X))))\big),\\
%     X_2 &= \mathrm{SiLU}(\mathrm{Linear}(X)),\\
%     X_{\text{out}} &= \mathrm{Linear}(X_1 \odot X_2),
%     \end{aligned}
%     \] 
% \end{equation}
\begin{equation}
\begin{aligned}
X_1   &= \mathrm{LN}\bigl(\text{2D-SSM}(\mathrm{SiLU}(\mathrm{DWConv}(\mathrm{Linear}(X))))\bigr), \\
X_2   &= \mathrm{SiLU}(\mathrm{Linear}(X)), \\
X_{\text{out}} &= \mathrm{Linear}\bigl(X_1 \odot X_2\bigr),
\end{aligned}
\end{equation}
where $\mathrm{DWConv}$ denotes depth-wise convolution and $\odot$ represents the Hadamard product.

\noindent \textbf{2D Selective Scan Module}
The traditional Mamba~\cite{gu2023mamba} can only access data from the section of the sequence that has previously been scanned since it handles inputs in a causal fashion. This behaviour presents challenges when applied to non-causal data, like images, even though it is suitable for NLP tasks. 
We use the 2D Selective Scan Module (2D-SSM) in accordance with~\cite{liu2024vmamba_arxiv} to better utilise 2D spatial structure.  The 2D feature map is flattened into four 1D sequences by scanning in four directions: top-left to bottom-right, bottom-right to top-left, top-right to bottom-left, and bottom-left to top-right. The discrete state-space equation is used to process each sequence in order to identify long-range dependencies. The original 2D layout is then restored by summing and reshaping the outputs.
 

\subsection{\textit{f}-KSCA Module}
This study integrates Kolmogorov–Arnold Networks (KANs) into the U-Net framework, leveraging their high efficiency and interpretability. Unlike standard MLPs, which rely on weight matrices $W$ and fixed activations $\sigma$:
\begin{equation}
\text{MLP}(Z) = (W_{K-1} \circ \sigma \circ \cdots \circ W_0) Z,
\end{equation}
KANs replace weights with learnable activation functions $\varphi_{q,p}$, forming each layer as:
\begin{equation}
\text{KAN}(Z) = (\Phi_{K-1} \circ \cdots \circ \Phi_0) Z, \quad 
\Phi = \{\varphi_{q,p}\}.
\end{equation}

\noindent KANs are based on the Kolmogorov-Arnold representation theorem, which states that any multivariate continuous function can be expressed as a finite sum of univariate functions composed with addition \cite{liu2024kan}. 

For $d$-dimensional input $\zeta$, a KAN can be formulated as:
\begin{equation}
\hat{\chi}(\zeta) = \sum_{q=0}^{2d} \Phi_q \Big[ \sum_{p=1}^{d} \varphi_{q,p}(\zeta_p) \Big],
\end{equation}
where $\varphi_{q,p} : [0,1] \to \mathbb{R}$ and $\Phi_q : \mathbb{R} \to \mathbb{R}$ are trainable univariate functions. Variants such as Fourier KANs \cite{zhang2025kolmogorov}, Wavelet KANs \cite{bozorgasl2405wav} and Chebyshev KANs \cite{ss2024chebyshev} replace B-splines with orthogonal polynomials or other basis functions to reduce computational cost and improve accuracy.  Despite their efficiency, polynomial-based KANs can still suffer from limited flexibility and smoothness, particularly when approximating non-polynomial or mixed-frequency signals. Although orthogonal polynomials (e.g., Chebyshev) improve numerical stability and mitigate Runge-type oscillations, they still require careful degree selection. To address these limitations, fractional Jacobi functions \cite{aghaei2025fkan} are utilized as trainable activations in the fractional KAN (\textit{f}-KAN). By introducing a learnable fractional order parameter $\gamma$, the basis functions extend classical polynomials into a continuous fractional functional space, enabling smooth interpolation between integer orders and more adaptive function approximation while preserving boundedness and closed-form derivatives.

 
The \textit{f}-KAN output is computed as:
\begin{equation}
\hat{\chi}(\zeta) = \sum_{q=0}^{Q} \Phi^{(\gamma)}_q \Big[ \sum_{p=1}^{d} \varphi_{q,p}^{(\gamma)}(\zeta_p) \Big],
\end{equation}
allowing the network to explore a fractional polynomial space, improve approximation flexibility, and maintain interpretability while reducing computational complexity compared to standard MLPs.


Our \textit{f}-KSCA module integrates multi-scale, multi-stage global context, as illustrated in Fig.~\ref{fig:proposed_architecture}(d). Unlike MALUNet \cite{ruan2022malunet}, we substitute the MLP with a KAN. In the four-stage \textit{f}-KSCA module, the process starts with the Spatial Attention Bridge (SAB), allowing the network to highlight key spatial details and ignore less relevant regions. To achieve this, we first apply max pooling and average pooling to the feature map, then combine the resulting maps by concatenating them. After that, a shared dilated convolution is used to merge these features, and a sigmoid function is applied to produce the spatial attention map. Finally, we multiply the spatial attention map element-wise with the input feature map, then add the residual information to this result to obtain the final spatial attention output. The novelty of \textit{f}-KSCA lies in its first-time incorporation of Fractional KAN into the Channel Attention Bridge (CAB) to enhance interpretability. In this approach, Fractional KANs are used to generate channel attention maps, which then guide the fusion of residual features in later layers through adaptive weight assignment. As a result, CAB improves multi-stage features by highlighting the most important channels and suppressing those with weak contextual relevance. This process is described by the following equations.

\begin{equation}
\begin{aligned}
Z &= \mathrm{Concat}\bigl(\mathrm{AvgPool}(x_1), \dots, \mathrm{AvgPool}(x_4)\bigr), \\
\mathrm{Att}_i &= \sigma\bigl(\mathrm{KAN}_i(\mathrm{Conv1D}(Z))\bigr), \quad i \in \{1,2,3,4\}, \\
\mathrm{Out}_i &= x_i + \mathrm{Att}_i \cdot x_i.
\end{aligned}
\end{equation}

AvgPool denotes global average pooling, while Conv1D represents a one-dimensional convolution operation. By leveraging the strong function-approximation capability of KAN, the proposed  \textit{f}-KSCA module efficiently combines multi-scale and multi-stage features during the decoding process, ultimately resulting in improved segmentation performance. 


\section{Experiments and Result Analysis}
\noindent\textbf{Dataset} 
The ISIC 2018 dataset~\cite{codella2019skin} is a widely adopted benchmark for skin lesion segmentation, comprising 3,694 dermoscopic images divided into 2,594 training samples, 100 validation samples, and 1,000 test samples. The images exhibit substantial variability in resolution (0.5--29\,MP) and spatial dimensions, ranging from $540 \times 576$ up to $4499 \times 6748$ pixels, making the dataset a robust and diverse testbed for evaluating segmentation models.
The BUSI dataset~\cite{aldhabyani2020breastultrasound} provides breast ultrasound images with corresponding segmentation masks for normal, benign, and malignant cases. For our experiments, we use 647 images covering benign and malignant lesions, adopting an 80:20 random split. Due to tumor variability and ultrasound noise, BUSI is a challenging benchmark for accurate breast abnormality detection and segmentation.


\noindent\textbf{Training and Implementation detail}
All images were resized to $256 \times 256$, and the model was trained for 400 epochs using Adam (lr=$1\times10^{-4}$, momentum=0.9, weight decay=$1\times10^{-4}$). For KAN components, a higher lr of $1\times10^{-2}$ with the same weight decay was applied to stabilize spline learning. All experiments were implemented in PyTorch and executed on an NVIDIA A100 GPU. We used a composite loss combining Dice loss and binary cross-entropy (BCE). Segmentation quality is measured in terms of specificity, accuracy, Dice, and mIoU.

% Two simple side-by-side tables (no extra packages needed)
\begin{table}[t] % or [!htbp]
\centering
\small                    % shrink a bit
\setlength{\tabcolsep}{2pt}
\begin{tabular}{l|cc|cccc}
\hline
\textbf{Model} & \textbf{Param.} & \textbf{GFlops} & \textbf{Spec.} & \textbf{Accu.} & \textbf{mIoU} & \textbf{Dice} \\
\hline
Unet\cite{ronneberger2015unet}       & 14.75 & 25.19  & 89.63 & 90.60 & 75.21 & 84.30 \\
% TransUnet  & 92.43 & 93.79 & 93.19 & 0.8108 & 0.8844 \\
MultiResUnet\cite{ibtehaz2020multiresunet} &7.25 & 18.76 & 90.26 & 91.30 & 76.76 & 85.65 \\
nnUNet \cite{isensee2021nnunet} & 19.1 & 412.7 & 94.32 & 92.21 & 77.03 & 86.43 \\
U-NeXt\cite{valanarasu2022unext}        & \textbf{1.47}& \textbf{0.57}  & 91.01 & 91.33 & 77.57 & 85.79 \\  
% MALU-Net \cite{}  & &  & 91.01 & 91.33 & 77.57 & 85.79 \\
%UCTransNet\cite{wang2022uctransnet}         & 91.67 & 91.90 & 91.92 & 78.1 & 87.22 \\
Swin-Unet\cite{cao2022swin}   &27.15 & 5.91 & 89.89 & 91.42 & 76.44 & 85.54 \\
ACC-Unet  \cite{ibtehaz2023acc}  & 16.77 & 45.33 & 92.51 & 92.51 & \underline{79.12} & \underline{87.71} \\
MedSA \cite{wu2023medicalsam}   & 104.3 & 52.2 & 91.09 & 92.04& 77.71 & 86.36\\
VM-UNet \cite{ruan2024vm} & 27.43& 4.11 & 95.00 & 92.18 & 75.25 & 85.88\\
% \textcolor{red}{nnUNetv2}\cite{} & & &98.39 & 96.51 & 70.10 & 79.45 \\
U-KAN\cite{li2025u}      &  9.38 & 6.89 & 90.05 & 92.07 & 78.20 & 86.41 \\
H-vmunet \cite{wu2025h} & 6.43 &  \underline{1.48}  & \underline{95.01} & \underline{92.54} & 76.38 & 86.61 \\
\textbf{MedKamba (Ours)}    & \underline{5.50} & 2.16   & \textbf{95.15 }   &\textbf{93.00}  & \textbf{79.63}& \textbf{88.24} \\
\hline
\end{tabular}
\vspace{5pt}
\caption{Quantitative comparison against state-of-the-art segmentation models on the ISIC 2018 dataset. The \textbf{best} results are in \textbf{bold}, and the second-best are \underline{underlined}.}
\label{tab:Result_quantitative_isic}
\end{table}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{2pt}
\begin{tabular}{l|cc|cccc}
\hline
\textbf{Model} & \textbf{Param.} & \textbf{GFlops} & \textbf{Spec.} & \textbf{Accu.} & \textbf{mIoU} & \textbf{Dice} \\
\hline
Unet\cite{ronneberger2015unet}        & 14.75 & 25.19 & 97.94 & 96.07 & 68.30 & 76.80 \\
MultiResUNet\cite{ibtehaz2020multiresunet}  &7.25 & 18.76 & 96.31 & 95.10 & 64.99 & 74.66 \\
nnUNet \cite{isensee2021nnunet}& 19.1 & 412.7 & \underline{98.39} & \underline{96.51} & 70.02 & 79.45 \\
U-NeXt\cite{valanarasu2022unext}        & \textbf{1.47} &\textbf{0.57} & 98.09 & 96.41 & 70.13 & 79.04 \\
% MALUNet \cite{}                        & 80.11 & 98.09 & 96.41 & 70.13 & 79.04 \\
Swin-Unet\cite{cao2022swin}         &27.15 & 5.91 & 96.90 & 96.07 & 70.10 & 78.77 \\
ACC-Unet \cite{ibtehaz2023acc}               & 16.77 & 45.33 & 95.72 & 95.00 & \underline{70.16} & 79.24 \\
MedSA \cite{wu2023medicalsam}   & 104.3 & 52.2 & 97.53 & 95.67 & 67.29& 77.16 \\
VM-UNet \cite{ruan2024vm}                   & 27.43& 4.11 & 97.59 & 95.58 & 61.36 & 76.05 \\
% \textcolor{red}{nnUNetv2}\cite{} & & &98.39 & 96.51 & 70.10 & 79.45 \\
U-KAN\cite{li2025u}          &  9.38 & 6.89 & 97.24 & 95.90 & 70.12 & \underline{80.01} \\
H-vmunet \cite{wu2025h}                         & 6.43 & \underline{1.48} & 98.19 & 96.36 & 66.55 & 79.92 \\
\textbf{MedKamba (Ours)}   &  \underline{5.50} & 2.16  & \textbf{98.54} & \textbf{96.62} & \textbf{71.07} & \textbf{82.17} \\
\hline
\end{tabular}
\vspace{5pt}
\caption{Quantitative comparison against state-of-the-art segmentation models on the BUSI dataset. The \textbf{best} results are in \textbf{bold}, and the second-best are \underline{underlined}.}
\label{tab:Result_BUSI}
\end{table}


\subsection{Result Analysis}
Tables~\ref{tab:Result_quantitative_isic} and~\ref{tab:Result_BUSI} summarize the evaluation of the proposed approach across two benchmark datasets with different imaging modalities, comparing it with various established segmentation approaches. We assessed performance against traditional convolution-based networks, including U-Net \cite{ronneberger2015unet} and MultiRes-UNet \cite{ibtehaz2020multiresunet}, as well as the efficient transformer-based model Swin-UNet \cite{cao2022swin} and the recently popular Mamba-based VM-UNet \cite{ruan2024vm} and H-vmunet \cite{wu2025h}. In addition, since KAN represents a promising alternative to conventional MLPs, we included comparisons with advanced MLP-style segmentation frameworks like U-NeXt \cite{valanarasu2022unext}, and the KAN-based U-KAN \cite{li2025u}. The results indicate that MedKamba (Ours) consistently achieves superior performance on both datasets, outperforming the competing methods in both accuracy and reliability.
We also provide a detailed visual comparison across all datasets, as shown in Fig. \ref{fig:visual_result}. The results indicate that conventional CNN-based models tend to suffer from over or under-segmentation, reflecting their limited ability to capture global context and accurately distinguish between structures. In comparison, MedKamba demonstrates a noticeable reduction in false positive predictions, highlighting its robustness against noisy outputs. When evaluated against SOTA methods, the proposed approach consistently produces segmentation results with sharper boundaries and more precise structural details. Our proposed architecture is lightweight compared to most methods while achieving higher efficiency. These findings emphasize the model’s strength in delivering high-fidelity segmentations while preserving fine-grained anatomical features, validating the effectiveness of integrating state space models and \textit{f}-KANs.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure*}
\includegraphics[width=\textwidth]{Images/qualitative_results.png}
\caption {
Visual comparison of MedKamba (ours) with state-of-the-art methods: the top two rows show ISIC 2018 results, and the bottom two rows show BUSI results.
}
% \label{fig:vis_conparison}
\label{fig:visual_result}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{Images/gradcam.png}
\caption{Explainability of MedKamba with channel activation.}
\label{fig:Explainability}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{Explainability}
We additionally investigate the interpretability benefits introduced by the KAN layers by examining their activation behaviors, as illustrated in Fig.~\ref{fig:Explainability}. When conventional MLP layers are employed (first column), the network exhibits difficulty in generating meaningful activation responses over clinically relevant regions, which is reflected in a low Plausibility IoU score. This metric, proposed in~\cite{cambrin2024kan}, quantifies the overlap between thresholded activation maps and ground-truth masks, with higher values indicating more plausible and reliable explanations. After replacing the MLP layers with fractional KAN layers (second column), a substantial improvement is observed. The activations become more coherent, clearly highlighting target structures and producing boundaries that better match the ground-truth masks (third column). These results indicate that fractional KAN layers improve explainability by aligning activations with anatomically meaningful regions, consistent with prior findings for KAN-based models.


\subsection{Ablation Study}
We conducted an ablation study to analyze the individual contributions of each block and module in MedKamba; the results are reported in Table~\ref{ablation}. Starting from the original U-Net, we retained only the first three convolutional layers and replaced the remaining deeper blocks with our proposed LACE block, resulting in a significant performance improvement. To further enhance performance and refine the skip connections, we incorporated the CAB and SAB modules from MALUNet. These modules enable the model to simultaneously capture global and local contextual information, allowing it to “see” more comprehensively. To increase interpretability and introduce learnable activation functions, we replaced the traditional feedforward network with KAN. Additionally, we experimented with replacing the spline-based activation function (see the fourth row in Table~\ref{ablation}) with a more flexible Jacobi polynomial activation function (Ours; see the fifth row in Table~\ref{ablation}), further improving the model’s adaptability and performance.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[h!]
\centering
\vspace{5pt}
\begin{tabular}{l|c|c}
\hline
\textbf{Configurations} & \textbf{mIoU} & \textbf{Dice} \\
\hline
UNet & 68.30 & 76.80 \\
UNet + LACE & 68.14 & 80.31 \\
UNet + LACE + CAB & 68.93 & 80.82 \\
UNet + LACE + KSCA & 69.63 & 81.22 \\
\textbf{UNet + LACE + \textit{f}-KSCA (Ours)} & \textbf{71.07} & \textbf{82.17} \\
\hline
\end{tabular}
\caption{Ablation study results comparing different network variants on the BUSI dataset.}
\label{ablation}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Conclusion}
This work presents MedKamba, a medical image segmentation framework that blends U-Net’s strength in capturing fine-grained local details with enhanced convolutional layers powered by the VSSM-based LACE block to model global context. To improve the quality of skip-connection information, we introduce a KAN-based module, \textit{f}-KSCA. MedKamba is among the first approaches to jointly leverage both Mamba and KAN architectures for medical image analysis. Comprehensive experiments across multiple datasets show that MedKamba delivers strong, consistent performance across various modalities and tasks. Ablation studies also highlight the complementary advantages of Mamba and KAN, leading to better optimization efficiency and higher segmentation accuracy. This study focuses on 2D medical image segmentation; extending MedKamba to 3D volumetric data, such as CT and MRI, is left for future work.

\bibliography{midl26_142}

\end{document}
