\documentclass[runningheads]{llncs}

\usepackage{marvosym}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{cite}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{tabularx}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{xcolor}

\usepackage{url}

% MICCAI-style hyperlinks
\usepackage[colorlinks=true, linkcolor=black, citecolor=black, urlcolor=black]{hyperref}

\begin{document}

\title{FedDSR: Codebook-Based Distribution Alignment for Heterogeneous Federated CT Super-Resolution}
\titlerunning{FedDSR}

\author{Suvadip Chakraborty\inst{1}\textsuperscript{(\Letter)} \and Obed Jamir\inst{1} \and
Angshuman Paul\inst{1}}
\authorrunning{Chakraborty et al.}

\institute{Indian Institute of Technology Jodhpur, Rajasthan, India\\
\textsuperscript{\Letter}Correspondence: \email{m24csa032@iitj.ac.in}
}
\maketitle

\begin{abstract}
Federated Learning (FL) enables privacy-preserving decentralized training, making it well-suited for healthcare. We study CT super-resolution (SR) in a heterogeneous FL setting, where each client holds CT scans of a different anatomical region. Generative SR models are typically large, leading to high communication overhead, while data heterogeneity across clients further degrades performance. We propose FedDSR, a two-stage lightweight federated framework for heterogeneous CT image super-resolution. In the first stage, we introduce a distribution-aware aggregation strategy that leverages client codebook embeddings to capture semantic similarity across clients, producing robust and representative global encoder updates. In the second stage, each client independently fine-tunes its local decoder on institution-specific data, enabling personalization without modifying the shared encoder. Experiments on four public CT datasets, each assigned to a separate client, show that FedDSR outperforms state-of-the-art FL baselines. Compared with FedAvg, FedDSR achieves 3.2\% higher PSNR, 0.33\% higher SSIM, and 33.5\% lower LPIPS, along with improved FID. The source code is available at \url{https://github.com/xzarulin7/FedDSR}.
\keywords{Federated learning  \and CT super-resolution \and Distribution-aware aggregation \and Heterogeneity.}
% Authors must provide keywords and are not allowed to remove this Keyword section.

\end{abstract}


% --- INTRODUCTION ---

\section{Introduction}

CT images with higher resolution may lead to improved diagnosis. Image super-resolution (SR) offers a practical way to synthesize high-resolution images from low-resolution acquisitions. Although deep generative models have achieved strong performance in SR for natural images, their application to medical imaging remains challenging due to modality-specific noise, anatomical constraints, and scanner-dependent artifacts. Diverse data from multiple institutions may be helpful in this context. However, strict privacy regulations, including HIPAA~\cite{hipaa1996} and GDPR~\cite{gdpr2016}, restrict centralized data sharing across hospitals, limiting large-scale training of generative models. Federated learning (FL) enables collaborative model training without exposing sensitive data.

Existing FL methods such as FedAvg~\cite{mcmahan2023communicationefficientlearningdeepnetworks}, FedProx~\cite{li2020federatedoptimizationheterogeneousnetworks}, and FedBN~\cite{li2021fedbnfederatedlearningnoniid} degrade under heterogeneous data distributions. Personalized approaches, including FedRep~\cite{collins2023exploitingsharedrepresentationspersonalized}, Ditto~\cite{li2021dittofairrobustfederated}, FedBABU~\cite{oh2022fedbabuenhancedrepresentationfederated}, and FedPer~\cite{arivazhagan2019federatedlearningpersonalizationlayers}, improve local adaptation but rely on uniform aggregation, ignoring semantic distribution differences across clients. Also, large generative models incur high communication costs, while variations in scanner protocols and anatomy further exacerbate cross-client misalignment. In this work, we tackle CT image super-resolution under a severe form of heterogeneity, where each client holds CT scans of a different anatomical region. In such a setting, codebook embeddings may act as compact distribution descriptors, enabling principled aggregation across clients.

\begin{figure}[!t]
\centering
\begin{minipage}{0.22\columnwidth}
    \includegraphics[width=\linewidth]{Images/LR_1.pdf}
\end{minipage}\hspace{0.01\columnwidth}%
\begin{minipage}{0.22\columnwidth}
    \includegraphics[width=\linewidth]{Images/HR_1.pdf}
\end{minipage}\hspace{0.01\columnwidth}%
\begin{minipage}{0.22\columnwidth}
    \includegraphics[width=\linewidth]{Images/SR_1.pdf}
\end{minipage}
\caption{Qualitative comparison of low-resolution (LR), ground-truth high-resolution (HR), and super-resolved (SR) CT images generated by the proposed FedDSR method on the pancreas CT~\cite{roth2016pancreasct} dataset.}
\label{fig:comparison}
\end{figure}

Hence, to address these challenges, we propose FedDSR, a lightweight FL framework for heterogeneous CT image super-resolution based on a hierarchical VQ-VAE-2~\cite{razavi2019generatingdiversehighfidelityimages} architecture. The proposed method reconstructs high-quality images while preserving anatomical structures. FedDSR introduces a distribution-aware aggregation strategy that leverages client codebook embeddings to capture semantic similarity across institutions, enabling robust global encoder learning. Furthermore, a two-stage training strategy learns a shared global encoder and a personalized local decoder, enabling institution-specific adaptation without disturbing the shared representation. Since decoder parameters are never communicated to the server, FedDSR significantly reduces communication overhead.

%a two-stage training strategy separates global encoder federation from local decoder personalization, allowing institution-specific adaptation while maintaining shared semantic representations.

Our contributions are:
(i) a federated framework for heterogeneous CT super-resolution built on the VQ-VAE-2 architecture;
(ii) a distribution-aware aggregation strategy using client codebook embeddings for semantic alignment under heterogeneous data;
(iii) a codebook vitalization mechanism with orthogonal initialization to prevent embedding collapse during non-IID~\cite{hsu2019measuringeffectsnonidenticaldata} training; and
(iv) a two-stage federated framework that combines global encoder learning with local decoder personalization to achieve robust and personalized synthesis.


% --- METHOD ---
\section{Methodology}

CT scans across institutions differ in scanner types, acquisition protocols, and patient populations. This causes locally trained models to learn client-specific patterns, leading to parameter divergence and suboptimal aggregation. We propose \textbf{FedDSR} (see Fig.~\ref{fig:fedmedsr}), a distribution-aware federated super-resolution framework. Each client in our setting trains a VQ-VAE-2 model to synthesize super-resolved images from low-resolution inputs. The learned top and bottom codebooks capture compact representations of client data characteristics. During communication, these embeddings are sent to the central server. Pairwise cosine similarity scores (Eq.~\eqref{eq:2}) are computed between the codebook-based distribution representations $c_k$ of every pair of clients, where each $c_k$ is obtained by averaging the corresponding client's top and bottom codebook embeddings. Subsequently, the similarity scores are combined with dataset-size weights to obtain the aggregation coefficients.

\begin{figure}[t]
    \centering
    \includegraphics[width=1\columnwidth,trim=25 0 20 0, clip]{Images/FedDSR_framework.pdf}
    \caption{The FedDSR framework. Client codebook embeddings capture semantic similarity across clients and are used along with dataset size for distribution-aware aggregation of the global encoder. The personalized local decoder enables client-specific adaptations.}
    \label{fig:fedmedsr}
\end{figure}


\noindent\textbf{Capturing Data Distribution.} To capture the diverse data distributions across clients, we use an enhanced VQ-VAE-2 as the generative backbone at each client because its discrete latent codebooks provide compact representations of local data distributions. The model learns hierarchical codebooks that encode structural and semantic CT information at multiple spatial scales. These codebooks represent client data characteristics and enable distribution-aware aggregation. Aggregation weights are computed using codebook similarity and dataset size.
To improve representation quality, we introduce the following:

\paragraph{(i) Efficient Cross-Attention Between Hierarchical Latents:}
In VQ-VAE-2, the top and bottom latents are learned independently. Therefore, each client learns a different distribution of global anatomy and fine texture across its two codebooks, making them unreliable for cross-client comparison. To ensure consistency, we introduce a cross-attention mechanism in which bottom-level features are guided by top-level features. Fine details are thus encoded with respect to the global structure at every client. Let $z_{\text{top}} \in \mathbb{R}^{B \times C_t \times H_t \times W_t}$ and $z_{\text{bottom}} \in \mathbb{R}^{B \times C_b \times H_b \times W_b}$ denote the top- and bottom-level latent feature maps capturing global and local CT structures, respectively. Queries, keys, and values are computed as $Q = W_q z_{\text{bottom}}$, $K = \text{Pool}(W_k z_{\text{top}})$, and $V = \text{Pool}(W_v z_{\text{top}})$, where $W_q, W_k, W_v$ are learnable projections and $\text{Pool}(\cdot)$ denotes average spatial pooling for dimension alignment. The attention output is defined as
\begin{equation}
\text{Attn}(Q, K, V) =
\text{Softmax}\!\left(\frac{Q K^\top}{\sqrt{d}}\right) V,
\end{equation}
where $d$ is the dimensionality of the key vectors used to normalize the attention scores. The output is projected and added residually to the bottom-level latent to enforce hierarchical consistency while maintaining computational efficiency.

\paragraph{(ii) Federated Codebook Vitalization and Orthogonal Initialization:}
Under non-IID FL, codebook collapse can leave only a few embeddings active,
degrading distribution representation and aggregation reliability.
To prevent this, codebook embeddings are initialized with orthogonal vectors via QR decomposition~\cite{saxe2014exactsolutionsnonlineardynamics}. During training, code usage is tracked per client. Embeddings
falling below a usage threshold are reinitialized from active encoder outputs. This ensures diversity and prevents latent space collapse during federated training.

\noindent\textbf{Distribution-aware Aggregation.} Data heterogeneity across clients makes the training of a robust global model challenging.
Standard aggregation ignores distributional differences, leading to suboptimal
convergence. To address this, we propose a distribution-aware aggregation strategy
that captures each client's data distribution from its VQ-VAE-2 codebook semantics
and integrates it with dataset size to compute the final aggregation weights.

After local training, each client's VQ-VAE-2 learns a codebook of latent vectors encoding rich semantic representations of its local data distribution. For each
client $k$, we compute a compact representation $c_k$ by averaging the mean
embeddings from both the top and bottom codebooks:
\begin{equation}
c_k = \frac{1}{2} \left(
\frac{1}{E^{\text{top}}_k} \sum_{i=1}^{E^{\text{top}}_k} e^{(i)}_{k,\text{top}} +
\frac{1}{E^{\text{bottom}}_k} \sum_{i=1}^{E^{\text{bottom}}_k} e^{(i)}_{k,\text{bottom}}
\right),
\end{equation}
where $E^{\text{top}}_k$ and $E^{\text{bottom}}_k$ denote the sizes of the top and bottom codebooks, and $e^{(i)}_{k,\text{top}}, e^{(i)}_{k,\text{bottom}} \in \mathbb{R}^{128}$ denote their $i$-th embedding vectors.
Let $N$ be the total number of clients and let $c_k, c_j \in \mathbb{R}^{128}$
be the codebook-based distribution representations of clients $k$ and $j$. The average pairwise cosine similarity of client $k$ is computed as:
\begin{equation}
\label{eq:2}
s_k = \frac{1}{N} \sum_{j=1}^{N} \frac{c_k^\top c_j}{\|c_k\|\|c_j\|}.
\end{equation}

Since the cosine similarity score $s_k$ lies in $[-1,1]$, negative scores are clipped to zero and the results are normalized to sum to one, yielding the similarity weights $w^{\text{sim}}_k = \max(s_k, 0) / \sum_{j=1}^{N} \max(s_j, 0)$. Consequently, clients with greater representational similarity receive higher weights. Dataset-size weights are computed as $w^{\text{data}}_k = \frac{|D_k|}{\sum_{j=1}^{N} |D_j|}$, ensuring clients with larger datasets have greater influence. The aggregation weight for client $k$ is:
\begin{equation}
\label{eq:4}
w^{\text{final}}_k = \gamma \, w^{\text{sim}}_k + (1-\gamma) \, w^{\text{data}}_k,
\end{equation}
where $\gamma \in [0,1]$ controls the relative contribution of semantic similarity
and dataset size. This produces a global model that accounts for both distributional
alignment and data availability across heterogeneous clients.

%Composite Super-resolution Loss
\noindent\textbf{Composite Super-resolution Loss.}
Each client is trained using a composite loss 
$\mathcal{L}_{SR}$ combining the following components, where $\hat{I}$ denotes the predicted super-resolved image and $I$ denotes the ground-truth high-resolution image.

\paragraph{1) Charbonnier Reconstruction Loss.}
The Charbonnier loss~\cite{charbonnier1997loss} is a smooth differentiable 
variant of the $\ell_1$ loss:
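\begin{equation}
\mathcal{L}_{C} = \frac{1}{M} \sum_{i=1}^{M}
\sqrt{ \left( \hat{I}_i - I_i \right)^2 + \epsilon^2 },
\end{equation}
where $M$ is the number of pixels and $\epsilon$ is a small constant that keeps the loss differentiable at zero.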

\paragraph{2) Perceptual Loss (LPIPS).}
The perceptual loss~\cite{zhang2018unreasonable} measures 
feature-space differences using a pretrained VGG network $\phi$:
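\begin{equation}
\mathcal{L}_{L} = \sum_{l}
\frac{1}{C_l H_l W_l}
\left\| \phi_l(\hat{I}) - \phi_l(I) \right\|_2^2,
\end{equation}
where $\phi_l(\cdot)$ denotes the feature map of the $l$-th layer, with $C_l$ channels and spatial size $H_l \times W_l$.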


\paragraph{3) Multi-Scale Structural Similarity Loss.}
The MS-SSIM loss~\cite{article} enforces structural consistency 
across multiple scales by comparing luminance, contrast, and structural 
information between the super-resolved and ground-truth images, defined as
$\mathcal{L}_{MS\text{-}S} = 1 - \text{MS-SSIM}(\hat{I}, I)$.

\paragraph{4) Edge-Aware Loss.}
An edge-aware loss~\cite{article1} using Sobel gradient operators 
preserves fine anatomical boundaries:
\begin{equation}
\mathcal{L}_{E} =
\left\| \sqrt{(S_x * \hat{I})^2 + (S_y * \hat{I})^2} -
\sqrt{(S_x * I)^2 + (S_y * I)^2} \right\|_1,
\end{equation}
where $S_x$ and $S_y$ are horizontal and vertical Sobel operators,
and $*$ denotes convolution.


\paragraph{5) Vector Quantization Commitment Loss.}
The VQ commitment loss~\cite{oord2018neuraldiscreterepresentationlearning}, given by $\mathcal{L}_{VQ} =
\left\| \text{sg}[z_e] - e \right\|_2^2
+
\beta
\left\| z_e - \text{sg}[e] \right\|_2^2$, 
stabilizes discrete latent learning. Here $z_e$ and $e$ denote the encoder output and its nearest codebook 
embedding, respectively, and $\text{sg}[\cdot]$ is the stop-gradient 
operator that blocks backpropagation.

\noindent The five objectives are combined into the final composite 
super-resolution loss:
\begin{equation}
\mathcal{L}_{SR} = \lambda_{1} \, \mathcal{L}_{C} 
+ \lambda_{2} \, \mathcal{L}_{L}
+ \lambda_{3} \, \mathcal{L}_{MS\text{-}S} 
+ \lambda_{4} \, \mathcal{L}_{E}
+ \lambda_{5} \, \mathcal{L}_{VQ}.
\label{eq:lsr}
\end{equation}






\noindent\textbf{Training of Clients.}%
\label{sec:client_training}
At the start of each communication round, every client initializes its local VQ-VAE-2 model $\theta_k$ with the current global parameters $\Theta$. We split the model parameters into a federated subset $\theta_k^{\text{enc}}$ (encoder and quantizer, including codebooks) and a local subset $\theta_k^{\text{dec}}$ (decoder). $\theta_k^{\text{enc}}$ is aggregated, while $\theta_k^{\text{dec}}$ stays local and is never transmitted to the server. Each client
then trains on its local dataset $D_k$ for $r{=}5$ epochs using $\mathcal{L}_{SR}$ from Eq.~\eqref{eq:lsr}. After local training, the updated federated parameters $\theta'^{\,\text{enc}}_k(t)$ and
dataset size $|D_k|$ are sent to the server.

\noindent\textbf{Aggregation at Server.}%
\label{sec:server_aggregation}
The server performs distribution-aware aggregation by combining semantic similarity weights from Eq.~\eqref{eq:2} and dataset size weights using a mixing factor $\gamma = 0.5$. Using the weights from Eq.~\eqref{eq:4}, the federated parameters are aggregated as $\Theta^{\text{enc}}(t)=\sum_{k=1}^{N} w^{\text{final}}_{k}\,\theta'^{\,\text{enc}}_k(t)$
and redistributed to all clients, while each client keeps its own decoder.

\noindent\textbf{Local Decoder Fine-Tuning.}%
\label{sec:decoder_finetuning}
After the final communication round, each client fine-tunes its decoder locally using institution-specific data. The shared encoder and quantizer are frozen, preserving the global representation. Only the decoder weights $\theta_k^{\text{dec}}$ are updated using
$\mathcal{L}_{SR}$ with a reduced learning rate of $5{\times}10^{-6}$ for ten epochs, without any server communication. This enables client-specific personalization while keeping the shared encoder intact.

% 
% --- EXPERIMENTS AND RESULTS ---
\begin{figure*}[t]
\centering
\setlength{\tabcolsep}{1pt}
\renewcommand{\arraystretch}{0.8}
\setlength{\fboxsep}{0pt}
\setlength{\fboxrule}{0.6pt}

\begin{tabular}{ccccc}

% ================= ROW 1 =================
\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/LR_2.pdf}}\\
\footnotesize (a) Input
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedPer.pdf}}\\
\footnotesize (b) FedPer
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedProx.pdf}}\\
\footnotesize (c) FedProx
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedBabu.pdf}}\\
\footnotesize (d) FedBABU
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedBN.pdf}}\\
\footnotesize (e) FedBN
\end{minipage} \\[2pt]

% ================= ROW 2 =================
\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/Ditto.pdf}}\\
\footnotesize (f) Ditto
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedAvg.pdf}}\\
\footnotesize (g) FedAvg
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedRep.pdf}}\\
\footnotesize (h) FedRep
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/FedDSR.pdf}}\\
\footnotesize (i) Ours
\end{minipage} &

\begin{minipage}{0.17\linewidth}\centering
\fbox{\includegraphics[width=\linewidth]{Images/HR_2.pdf}}\\
\footnotesize (j) GT
\end{minipage}

\end{tabular}

\caption{Qualitative comparison of CT super-resolution results on zoomed-in regions from the Pancreas CT~\cite{roth2016pancreasct} dataset for various methods.}
\label{fig:qualitative_results}
\end{figure*}

\begin{table}[!t]
\centering
\caption{Quantitative comparison of federated learning strategies on diverse CT datasets.
Results for PSNR, SSIM, LPIPS, and FID are reported as mean $\pm$ standard deviation over five runs of each method.}
\label{tab:results}
\small
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Method} & 
\textbf{PSNR $\uparrow$} & 
\textbf{SSIM ($\times 10^{-2}$)$\uparrow$} & 
\textbf{LPIPS ($\times 10^{-2}$)$\downarrow$} & 
\textbf{FID $\downarrow$} \\
\hline
FedPer & 24.76 $\pm$ 0.13 & 85.67 $\pm$ 0.12 & 17.27 $\pm$ 0.06 & 73.44 $\pm$ 0.12\\
FedProx & 32.26 $\pm$ 0.10 & 93.59 $\pm$ 0.08 & 7.10 $\pm$ 0.09 & 31.18 $\pm$ 0.10 \\
FedBABU & 36.65 $\pm$ 0.09 & 97.02 $\pm$ 0.11 & 3.60 $\pm$ 0.07 & 8.32 $\pm$ 0.10\\
FedBN & 38.23 $\pm$ 0.11 & 97.43 $\pm$ 0.08 & 4.52 $\pm$ 0.04 & 10.59 $\pm$ 0.07 \\
Ditto & 38.42 $\pm$ 0.08 & 97.47 $\pm$ 0.08 & 4.16 $\pm$ 0.06 & 9.52 $\pm$ 0.08\\
FedAvg & 39.11 $\pm$ 0.05 & 97.61 $\pm$ 0.04 & 4.30 $\pm$ 0.05 & 9.54 $\pm$ 0.06 \\
FedRep & 39.60 $\pm$ 0.08 & 97.74 $\pm$ 0.10 & 3.36 $\pm$ 0.04 & 7.51 $\pm$ 0.04\\ 
\hline
\textbf{FedDSR (Ours)} & 
\textbf{40.38 $\pm$ 0.06} & 
\textbf{97.93 $\pm$ 0.03} & 
\textbf{2.86 $\pm$ 0.04} & 
\textbf{6.56 $\pm$ 0.08 } \\
\hline
\end{tabular}
\end{table}


\section{Experiments and Results}

\noindent\textbf{Datasets.}
We use four CT datasets for our experiments. These are: COVID-19~\cite{6fd59abc29334cf1b6738df785213d76},
Pancreas~\cite{roth2016pancreasct}, Kidney~\cite{islam2022kidney}, and Brain
Stroke~\cite{koc2022stroke}. Each dataset is assigned to a separate client. Inputs are grayscale slices resized to $128{\times}128$. Models generate $256{\times}256$
super-resolved outputs. Each dataset is partitioned at the patient level into
80\% training, 10\% validation, and 10\% testing to prevent data leakage
across splits. Performance is evaluated using 
PSNR~\cite{gonzalez2008dip},
SSIM~\cite{1284395}, LPIPS~\cite{zhang2018unreasonable},
and FID~\cite{heusel2018ganstrainedtimescaleupdate}
(Tables~\ref{tab:results},~\ref{tab:ablation}).

\noindent\textbf{Implementation Details.}
All experiments are conducted on an NVIDIA RTX A6000 GPU. FedDSR uses a 
hierarchical VQ-VAE-2~\cite{razavi2019generatingdiversehighfidelityimages} 
with residual-block encoders (384 hidden channels, 6 residual layers) and two 
vector quantizers (1024 embeddings, 128 dimensions, commitment cost 0.25). 
Training runs for 30 global rounds, with each 
client performing 5 local epochs using 
Adam~\cite{kingma2017adammethodstochasticoptimization} (learning rate 
$5{\times}10^{-5}$, batch size 8) and a cosine annealing scheduler (minimum 
$1{\times}10^{-6}$). The decoder fine-tuning (Sec.~\ref{sec:decoder_finetuning}) is performed over ten epochs (learning rate $5{\times}10^{-6}$). The weights of the composite loss (Eq.~\eqref{eq:lsr}) are set as 
$\lambda_{1}{=}2.0$, $\lambda_{2}{=}0.10$, 
$\lambda_{3}{=}0.20$, $\lambda_{4}{=}0.25$, and 
$\lambda_{5}{=}0.06$ based on the validation 
set. Early stopping with patience of 15 is applied. Results are reported as mean $\pm$ standard deviation 
across five runs.

\noindent\textbf{Comparative Performances.}
Table~\ref{tab:results} summarizes results across all federated methods. FedRep
shows competitive performance with balanced PSNR and SSIM. FedDSR consistently
outperforms all baselines, achieving the highest PSNR ($40.38$~dB), SSIM
($97.93{\times}10^{-2}$), and lowest LPIPS ($2.86{\times}10^{-2}$). FedAvg attains
only $39.11$~dB PSNR, while FedBN and FedProx remain stable but underperform in
overall fidelity. These results confirm the effectiveness of distribution-aware
aggregation for robust CT image super-resolution. Qualitative comparisons are shown
in Fig.~\ref{fig:qualitative_results}.

\noindent\textbf{Downstream Task Evaluation.}
We evaluate super-resolved outputs on
three downstream tasks: COVID-19 classification using the COVIDx Net~\cite{sun2024swinctcovid}, kidney CT classification using the hybrid CNN of~\cite{sharma2025hybriddeeplearningframework}, and brain tumor
segmentation using nnU-Net~\cite{isensee2021nnunet}. As shown in Table~\ref{tab:downstream_combined}, the SR images generated by our model yield superior performance compared to the baseline LR images. This shows the utility of our model for downstream tasks.

\noindent\textbf{Communication Overhead.} FedDSR only transmits the encoder and quantizer parameters ($\sim$5.5~M of the 17.0~M total parameters, including the two codebooks of $2\times1024\times128$ embeddings, $\approx$1~MB) to the server per round. The decoder ($\sim$11.6~M parameters) never leaves the client. This reduces per-round communication by 68\% compared to transmitting the full model.

\begin{table}[!t]
\centering
\caption{Clinical evaluation of different input images for downstream tasks.}
\label{tab:downstream_combined}

\resizebox{\linewidth}{!}{
\begin{tabular}{|l|cc|cc|cc|}
\hline
\textbf{Input} 
& \multicolumn{2}{c|}{\textbf{COVID-19 Classification}} 
& \multicolumn{2}{c|}{\textbf{Kidney Classification}} 
& \multicolumn{2}{c|}{\textbf{Brain Segmentation}} \\
\hline
& \textbf{F1 (\%)} & \textbf{AUC (\%)} 
& \textbf{Acc (\%)} & \textbf{F1 (\%)} 
& \textbf{Dice (\%)} & \textbf{IoU (\%)} \\
\hline
HR CT (Upper Bound) & \textbf{90.30 $\pm$ 0.14} & \textbf{97.74 $\pm$ 0.09} 
& \textbf{99.81 $\pm$ 0.07} & \textbf{99.75 $\pm$ 0.13} 
& \textbf{93.52 $\pm$ 2.24} & \textbf{87.91 $\pm$ 4.09} \\
\hline
SR CT (Ours) & 85.30 $\pm$ 0.13 & 92.74 $\pm$ 0.11
& 95.85 $\pm$ 0.08 & 95.66 $\pm$ 0.07 
& 88.17 $\pm$ 4.83 & 79.55 $\pm$ 7.91 \\
\hline
LR CT & 72.63 $\pm$ 0.21 & 74.77 $\pm$ 0.17 & 80.35 $\pm$ 0.05 & 80.07 $\pm$ 0.11 & 79.13 $\pm$ 0.08 &  71.89 $\pm$ 0.09 \\
\hline
\end{tabular}
}
\end{table}

\begin{table}[!t]
\centering
\caption{Ablation study of FedDSR components. M1: baseline (FedAvg, no modules).
M2: M1 + distribution-aware aggregation. M3: M1 + cross-attention.
M4: full FedDSR (both modules). A dash indicates absence of a module.
PSNR, SSIM, LPIPS and FID are mean $\pm$ std across runs.}
\label{tab:ablation}

\resizebox{\linewidth}{!}{
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
\textbf{Model} & \textbf{D-aware} & \textbf{Cross-Attn} &
\textbf{PSNR (dB) $\uparrow$} & \textbf{SSIM ($\times 10^{-2}$) $\uparrow$} &
\textbf{LPIPS ($\times 10^{-2}$) $\downarrow$} & \textbf{FID $\downarrow$} \\
\hline
M1 & -- & -- & 39.11 $\pm$ 0.05 & 97.61 $\pm$ 0.04 & 4.30 $\pm$ 0.05 & 9.54 $\pm$  0.07\\
\hline
M2 & $\checkmark$ & -- & 39.65 $\pm$ 0.07 & 97.73 $\pm$ 0.05 & 3.74 $\pm$ 0.05 & 7.98 $\pm$ 0.05 \\
\hline
M3 & -- & $\checkmark$ & 39.81 $\pm$ 0.06 & 97.79 $\pm$ 0.04 & 3.21 $\pm$ 0.05 & 7.11 $\pm$ 0.06\\
\hline
\textbf{M4} & $\checkmark$ & $\checkmark$ &
\textbf{40.38 $\pm$ 0.06} &
\textbf{97.93 $\pm$ 0.03} &
\textbf{2.86 $\pm$ 0.04} &
\textbf{6.56 $\pm$ 0.08} \\
\hline
\end{tabular}
}
\end{table}


\subsection{Ablation Studies}

\noindent\textbf{Impact of Distribution-aware Aggregation and Cross-Attention.}
Table~\ref{tab:ablation} presents an ablation study evaluating the contribution of each FedDSR component. M1 is the baseline without any additional modules. M2 adds distribution-aware aggregation, improving PSNR by $0.54$~dB over M1. M3 introduces cross-attention between hierarchical latents, yielding a $0.70$~dB gain compared to the baseline. M4 is the full FedDSR combining both modules, achieving the best performance across all metrics. These results show that both components contribute independently and complementarily to robust CT super-resolution under non-IID federated settings. Codebook vitalization is enabled across all runs.

% --- CONCLUSION ---
\section{Conclusion}
We presented FedDSR, a two-stage federated framework for CT image super-resolution
under data heterogeneity. FedDSR uses VQ-VAE-2 codebook embeddings to capture
client-specific data distributions and performs distribution-aware aggregation for
robust global model training. A local decoder fine-tuning stage further enables
institution-specific personalization without server communication. Experiments on
four heterogeneous CT datasets demonstrate that FedDSR consistently outperforms
federated baselines across PSNR, SSIM, LPIPS, and FID. Ablation results confirm the independent effectiveness of distribution-aware aggregation and cross-attention between hierarchical latents. Downstream evaluations on
COVID-19 classification, kidney CT classification, and brain tumor segmentation
further validate the clinical utility of the super-resolved outputs. In the future, we will extend FedDSR to other modalities such as MRI and explore
cross-modal federated settings where clients hold images of different modalities.

\begin{credits}
\subsubsection{\ackname} This work was supported by a research grant from the Indian Institute of Technology Jodhpur.

\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}


\bibliographystyle{splncs04}
\bibliography{mybibliography}
\end{document}