\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs} % For professional tables
\usepackage{multirow}
\usepackage{float}
\usepackage{xcolor} 
\usepackage{orcidlink}


\definecolor{rowgray}{RGB}{245, 245, 245}

% Header information for the submission
\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[Probabilistic Feature Imputation and Uncertainty-Aware Multimodal Federated Aggregation]{Probabilistic Feature Imputation and Uncertainty-Aware Multimodal Federated Aggregation}

% https://orcid.org/

% \orcidlink{0009-0005-7949-9370}

\midlauthor{
    \Name{Nafis Fuad Shahid\nametag{$^{1}$}\orcidlink{0009-0005-7949-9370}} \Email{nafisfuad21@iut-dhaka.edu}\\
    \Name{Maroof Ahmed\midlotherjointauthor\nametag{$^{1}$}\orcidlink{0009-0007-9308-503X}} \Email{maroofahmed@iut-dhaka.edu}\\
    \Name{Md Akib Haider\midljointauthortext{Contributed equally}\nametag{$^{1}$}\orcidlink{0009-0008-8595-1441}} \Email{akibhaider@iut-dhaka.edu}\\
    \Name{Saidur Rahman Sagor\nametag{$^{1}$}\orcidlink{0009-0000-1820-611X}} \Email{saidurrahman@iut-dhaka.edu}\\
    \Name{Aashnan Rahman\nametag{$^{1}$}\orcidlink{0009-0001-5675-8393}} \Email{aashnanrahman@iut-dhaka.edu}\\
    \Name{Md Azam Hossain\nametag{$^{1}$}\orcidlink{0000-0002-4315-5243}} \Email{azam@iut-dhaka.edu}\\
    \addr $^{1}$ Department of Computer Science and Engineering, Islamic University of Technology (IUT), Gazipur, Bangladesh
}


\begin{document}

\maketitle

\begin{abstract}
Multimodal federated learning enables privacy-preserving collaborative model training in healthcare applications. However, a fundamental challenge arises from modality heterogeneity: many clinical sites possess only a subset of modalities due to resource constraints or workflow variations. Existing approaches address this through feature imputation networks that synthesize missing modality representations, yet these methods produce point estimates without reliability measures, forcing downstream classifiers to treat all imputed features as equally trustworthy. In safety-critical medical applications, this limitation poses significant risks. We propose the Probabilistic Feature Imputation Network (P-FIN), which outputs calibrated uncertainty estimates alongside imputed features. This uncertainty is leveraged at two levels: (1) locally, through sigmoid gating that attenuates unreliable feature dimensions before classification, and (2) globally, through Fed-UQ-Avg, an aggregation strategy that prioritizes updates from clients with reliable imputation. Experiments on federated chest X-ray classification using CheXpert, NIH Open-I, and PadChest demonstrate consistent improvements over deterministic baselines, with a +5.36\% AUC gain in the most challenging configuration. Code implementation is available at \href{https://github.com/NafisFuadShahid/PFIN-UQAVG}{https://github.com/NafisFuadShahid/PFIN-UQAVG}.
\end{abstract}

\begin{keywords}
Federated Learning, Multimodal Learning, Uncertainty Quantification, Feature Imputation, Medical Imaging
\end{keywords}



\section{Introduction}

In modern healthcare, relying on a single source of information to formulate a diagnosis is insufficient and potentially unsafe \citep{Teoh2024MultimodalFusion}. Instead, diverse data types, such as radiological images, clinical history, and textual reports, are synthesized. Research consistently shows that multimodal models, which learn from both images and text, significantly outperform models that rely on images alone \citep{huang2020fusion, acosta2022multimodal}. For instance, in chest X-ray analysis, combining the visual scan with the radiologist's textual report enables the system to capture complex medical conditions with greater accuracy \citep{pmlr-v182-zhang22a, boecking2022making}.

However, the development of these collaborative AI models is constrained by strict privacy regulations. Laws such as HIPAA and GDPR prohibit the centralization of patient data across institutions \citep{Price2019Privacy}. To address this, Federated Learning (FL) was introduced \citep{mcmahan2017communication}, allowing hospitals to collaboratively train a shared model without transferring patient data off-site. Data remains local and private, and only model updates are shared \citep{rieke2020future, sheller2020federated}.

While Federated Learning resolves privacy concerns, it encounters a practical challenge known as modality heterogeneity. In real-world deployments, resources vary significantly across institutions. Large academic medical centers often possess complete datasets comprising both X-rays and detailed reports. Conversely, smaller clinics or rural hospitals may only have access to X-ray images, lacking the infrastructure to provide structured text data \citep{rajpurkar2017chexnet, warnat2021swarm}. This disparity creates a network where some participants possess complete multimodal data, while others hold incomplete unimodal data.

To mitigate this missing data problem, early research proposed feature imputation, which trains the model to synthesize the missing modality. For example, if a clinic provides only an image, the model predicts the corresponding text report based on visual patterns \citep{ngiam2011multimodal}. Recent approaches, such as SMIL \citep{ma2021smil}, have refined this process for complex datasets. However, a critical flaw persists in these methods: they are deterministic. A deterministic model produces a single confident prediction even when guessing, potentially hallucinating a text description for an ambiguous image with high confidence. In safety-critical medical domains, such silent failures can lead to erroneous diagnoses \citep{jungo2019assessing, nair2020exploring}.

We argue that AI systems must be transparent about their limitations. Regulatory bodies, including the U.S. Food and Drug Administration (FDA), explicitly state the need to develop methods to quantify uncertainty and convey it in the device output to users \citep{fda2024uq}. This concept, known as Uncertainty Quantification (UQ), is essential for safety. Foundational work by \citet{kendall2017uncertainties} established methods to measure this uncertainty in deep learning. By integrating UQ, a model can report high uncertainty when input data is ambiguous, thereby warning clinicians not to trust the synthetic features.

In this paper, we propose the \textbf{Probabilistic Feature Imputation Network (P-FIN)}. Instead of deterministically estimating missing features, our model outputs a distribution parameterized by a mean value and a variance score. We train this network using a specialized loss function called $\beta$-NLL \citep{seitzer2022pitfalls}, which prevents the model from minimizing loss by simply predicting infinite uncertainty. We leverage this uncertainty in two distinct ways. Locally, the variance acts as a gate to suppress unreliable features before fusion. Globally, we introduce \textbf{Fed-UQ-Avg}, a novel aggregation method that prioritizes updates from hospitals with confident, high-quality data over those with high uncertainty. Our experiments on diverse chest X-ray datasets \citep{irvin2019chexpert, bustos2020padchest} demonstrate that this approach is significantly more robust than previous methods.



%==============================================================================
\section{Related Work}
\label{sec:related}
%==============================================================================

\subsection{Missing Modalities in Federated Learning}

Feature imputation networks have emerged as a solution to missing modality challenges by learning cross-modal mappings through various architectural approaches. Early work by \citet{ngiam2011multimodal} introduced deterministic autoencoders for synthesizing cross-modality representations. \citet{ma2021smil} proposed SMIL, a Bayesian meta-learning approach for severely missing modalities, while \citet{kaissis2020secure} and \citet{warnat2021swarm} adapted deterministic feature imputation for federated medical imaging. However, existing methods produce point estimates without reliability measures, precluding uncertainty-aware downstream processing.

\subsection{Uncertainty Quantification in Medical Imaging}
Uncertainty estimation is critical for safety-critical medical applications, enabling systems to flag unreliable predictions for human review. \citet{kendall2017uncertainties} distinguished aleatoric uncertainty (inherent data noise) from epistemic uncertainty (model ignorance), with heteroscedastic aleatoric modeling being particularly relevant for input-dependent reliability assessment. Bayesian approaches including Monte Carlo Dropout \citep{gal2016dropout} and variational inference have been applied to medical image segmentation \citep{jungo2019assessing, nair2020exploring} and selective prediction \citep{laves2020wellcalibrated, geifman2019selectivepred}. However, while these works primarily focus on single-modal centralized settings, uncertainty quantification remains underexplored in multimodal federated settings, where both feature imputation reliability and client contribution quality must be assessed. We address this gap by introducing probabilistic feature imputation that explicitly models uncertainty during cross-modal synthesis, and leverage these uncertainty estimates both locally for feature gating and globally for uncertainty-aware federated aggregation.

\subsection{Calibration and Loss Functions}
Training neural networks to predict calibrated uncertainty is non-trivial. \citet{guo2017calibration} demonstrated that modern deep networks are often miscalibrated, producing overconfident predictions. Standard Gaussian negative log-likelihood (NLL) training can lead to ``variance explosion,'' where models predict infinite uncertainty to minimize loss without learning meaningful representations. Recent approaches address this through various regularization strategies: \citet{NIPS2017_9ef2ed4b} proposed deep ensembles, \citet{melba:2021:008:laves} introduced $\sigma$-scaling for recalibration, and \citet{seitzer2022pitfalls} developed $\beta$-NLL loss using stop-gradient operations. We adopt $\beta$-NLL for training our probabilistic imputation network due to its effectiveness in preventing this variance explosion.

\subsection{Federated Aggregation Strategies}
FedAvg \citep{mcmahan2017communication} remains the dominant aggregation strategy, weighting client contributions proportionally to local dataset size. FedProx \citep{li2020fedprox} addresses statistical heterogeneity through proximal regularization, while other works have explored adaptive weighting based on gradient similarity or loss values \citep{sheller2020federated, dayan2021federated}. However, existing aggregation strategies do not account for feature reliability in missing modality scenarios. A client with many samples but poor imputation quality can degrade the global model through noisy gradient contributions. Our Fed-UQ-Avg explicitly incorporates imputation confidence, complementing data-based weighting with quality-aware adjustments.

%==============================================================================
\section{Methodology}
\label{sec:method}
%==============================================================================

We consider a federated learning setting with $K$ clients. A subset $\mathcal{C}_m$ (multimodal clients) possesses paired chest X-rays $x^I$ and radiology reports $x^T$, while the remaining clients $\mathcal{C}_u$ (unimodal clients) have only images. Our objective is to enable effective multimodal learning across all clients by providing reliable imputation for missing modalities.


% \begin{figure}[!ht]
%   \centering
%   \includegraphics[width=\linewidth]{images/training PFIN.png}
%   \caption{Overview of Stage 1: P-FIN Training. The architecture leverages a Transformer encoder to map image features to text embedding distributions, trained via $\beta$-NLL loss for calibrated uncertainty.}
%   \label{fig:pfin_training}
% \end{figure}

\subsection{Feature Encoders}
The image encoder $f_I$ employs a ResNet-50 backbone \citep{he2016deep} pretrained on ImageNet, with the final classification layer replaced by a linear projection:
\begin{equation}
    z^I = \frac{W_I \cdot \text{ResNet}(x^I)}{\|W_I \cdot \text{ResNet}(x^I)\|_2} \in \mathbb{R}^{256}
\end{equation}
where $W_I \in \mathbb{R}^{256 \times 2048}$ projects ResNet features to a 256-dimensional space, followed by L2 normalization. The text encoder $f_T$ uses BERT-base-uncased \citep{devlin2019bert}, extracting the [CLS] token representation with analogous projection:
\begin{equation}
    z^T = \frac{W_T \cdot \text{BERT}(x^T)_{\text{[CLS]}}}{\|W_T \cdot \text{BERT}(x^T)_{\text{[CLS]}}\|_2} \in \mathbb{R}^{256}
\end{equation}
where $W_T \in \mathbb{R}^{256 \times 768}$. Both encoders output L2-normalized features to ensure compatible representation spaces.

\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth]{images/training_PFIN.png}
  \caption{Overview of Stage 1: P-FIN Training. The architecture leverages a Transformer encoder to map image features to text embedding distributions, trained via $\beta$-NLL loss for calibrated uncertainty.}
  \label{fig:pfin_training}
\end{figure}

\subsection{Probabilistic Feature Imputation Network (P-FIN)}
Unlike deterministic approaches that output a fixed vector $\hat{z}^T$, P-FIN models the conditional distribution $p(z^T | z^I)$ as a heteroscedastic Gaussian. The architecture comprises:

\textbf{Input Projection.} Image features are projected and reshaped:
\begin{equation}
    h_0 = \text{GELU}(\text{LayerNorm}(\text{Linear}(z^I))) \in \mathbb{R}^{1 \times 256}
\end{equation}

\textbf{Learnable Query Token.} A learnable parameter $q \in \mathbb{R}^{1 \times 256}$ is combined with $h_0$, forming the input sequence $[q; h_0] \in \mathbb{R}^{2 \times 256}$.


\textbf{Transformer Encoder.} A 2-layer Transformer encoder with 4 attention heads processes the sequence:
\begin{equation}
    h_L = \text{TransformerEncoder}([q; h_0]) \in \mathbb{R}^{2 \times 256}
\end{equation}
The query output $h_L[0]$ contains the aggregated cross-modal information.

\textbf{Dual Output Heads.} Two separate MLPs predict the mean and variance:
\begin{align}
    \mu &= \text{MLP}_\mu(h_L[0]) \in \mathbb{R}^{256} \\
    \sigma^2 &= \text{MLP}_{\sigma}(h_L[0]) \in \mathbb{R}^{256}
\end{align}
The variance $\sigma^2$ represents the per-dimension \textbf{uncertainty}, which is directly output by the network and used for both gating and aggregation throughout this work.


\subsection{Calibrated Training via $\beta$-NLL}
Standard Gaussian NLL allows models to minimize loss by predicting large variances without learning meaningful features. We train P-FIN using the $\beta$-NLL loss \citep{seitzer2022pitfalls}, which applies a stop-gradient to prevent this shortcut:
\begin{equation}
    \mathcal{L}_{\beta\text{-NLL}} = \frac{1}{d} \sum_{j=1}^{d} \text{SG}(\sigma_j^{2\beta}) \left( \frac{1}{2} \log \sigma_j^2 + \frac{(z^T_j - \mu_j)^2}{2\sigma_j^{2}} \right)
\end{equation}
where $\text{SG}(\cdot)$ denotes the stop-gradient operator and $d = 256$. Setting $\beta = 0.5$ balances calibration with reconstruction quality, forcing the model to reduce prediction error rather than inflate uncertainty.

\subsection{Local Uncertainty-Aware Fusion}

On unimodal clients, directly using imputed features $\mu$ can propagate errors when imputation is unreliable. We introduce uncertainty-aware fusion that combines gating mechanisms \citep{Arevalo2017GatedMU} with cross-modal attention \citep{Lee_2018_ECCV}.

\textbf{Uncertainty Gating.} We compute a gate from the predicted variance:
\begin{equation}
    g = \text{sigmoid}(-\log \sigma^2) \in \mathbb{R}^{256}
\end{equation}
When uncertainty $\sigma^2$ is high, $\log \sigma^2$ becomes large, making $-\log \sigma^2$ strongly negative, so the gate approaches zero ($g \to 0$), suppressing unreliable features. Conversely, when uncertainty is low, the gate remains close to one.

\textbf{Cross-Modal Attention.} Image and gated text features attend to each other bidirectionally:
\begin{align}
    \hat{z}^I &= \text{LN}(z^I + \text{MHA}(Q{=}z^I, K{=}g \odot \mu, V{=}g \odot \mu)) \\
    \hat{z}^T &= \text{LN}(g \odot \mu + \text{MHA}(Q{=}g \odot \mu, K{=}z^I, V{=}z^I))
\end{align}
where LN denotes layer normalization and MHA is multi-head attention \citep{NIPS2017_3f5ee243}.


\textbf{Fusion.} The attended features are concatenated and projected:
\begin{equation}
    z_{\text{fused}} = \text{Linear}([\hat{z}^I ; \hat{z}^T]) \in \mathbb{R}^{256}
\end{equation}

This attention-guided fusion allows the model to dynamically weight contributions from each modality based on learned relevance, while the uncertainty gating ensures unreliable imputations are suppressed before fusion.

% \clearpage

\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth]{images/Inference_PFIN.png}
  \caption{Overview of Stage 2: P-FIN Inference. The gating mechanism $g$ attenuates unreliable dimensions of the imputed feature vector $\mu$ based on uncertainty $\sigma^2$ before attention-guided fusion and classification.}
  \label{fig:pfin_inference}
\end{figure}

\subsection{Global Aggregation: Fed-UQ-Avg}
Standard FedAvg weights client contributions by dataset size alone, creating vulnerability when unimodal clients produce poor imputations. Fed-UQ-Avg incorporates imputation quality through a two-component weighting scheme.

\textbf{Data Weight.} Standard size-proportional weight:
\begin{equation}
    w_{\text{data}}^{(k)} = \frac{n_k}{\sum_{j=1}^{K} n_j}
\end{equation}

\textbf{Confidence Weight.} We compute confidence using a temperature-scaled exponential of the negative mean uncertainty:
\begin{equation}
    \text{conf}_k = \exp\left(-\frac{\bar{\sigma}^2_k}{T}\right)
\end{equation}
where $\bar{\sigma}^2_k$ is client $k$'s mean imputation uncertainty and $T$ is a temperature parameter. The normalized confidence weight is:
\begin{equation}
    w_{\text{conf}}^{(k)} = \frac{\text{conf}_k}{\sum_{j=1}^{K} \text{conf}_j}
\end{equation}

\textbf{Combined Weight.} The final aggregation weight blends both components:
\begin{equation}
    W_k = (1 - \alpha) w_{\text{data}}^{(k)} + \alpha w_{\text{conf}}^{(k)}
\end{equation}
We set $\alpha = 0.6$ and $T = 0.2$, prioritizing imputation reliability while accounting for data size. Clients with low uncertainty receive higher weights, while those with high uncertainty contribute less to the global model.

\begin{algorithm2e}[H]
\small
\DontPrintSemicolon
\caption{Fed-UQ-Avg}
\label{alg:fed_uq_avg}
\KwIn{Global Model $\boldsymbol{\theta}^0$, Rounds $R$, Temperature $T$, Balance $\alpha$, Clients $\mathcal{K}$}
\KwOut{Final Model $\boldsymbol{\theta}^{R}$}

\For{$t \leftarrow 1$ \KwTo $R$}{
    Server broadcasts $\boldsymbol{\theta}^{t-1}$ to clients $\mathcal{K}$\;
    
    % Fixed: Added braces around text for colorbox
    \colorbox{rowgray}{\textit{Parallel Client Training}}\\
    \For{$k \in \mathcal{K}$ \textbf{in parallel}}{
        % Fixed: Added closing brace } after P-FIN
        \lIf{Unimodal}{Impute missing modality with P-FIN, then train}
        \lElse{Train with multimodal data}
        Return $\boldsymbol{\theta}_k^{t}, \bar{\sigma}_k^2, n_k$\;
    }

    \colorbox{rowgray}{\textit{Compute Weights and Aggregate}}\\
    % Normalizers use the summation index j to avoid clashing with the loop variable k.
    % Both sums run over all clients in \mathcal{K} (no client sampling is performed here).
    $N_{\text{tot}} \leftarrow \sum_{j \in \mathcal{K}} n_j$\;
    $Z_{\text{conf}} \leftarrow \sum_{j \in \mathcal{K}} \exp(-\bar{\sigma}_j^2 / T)$\;
    
    $\boldsymbol{\theta}^{t} \leftarrow \mathbf{0}$\;
    \For{$k \in \mathcal{K}$}{
        $w_{\text{data}} \leftarrow n_k / N_{\text{tot}}$\;
        $w_{\text{conf}} \leftarrow \exp(-\bar{\sigma}_k^2 / T) / Z_{\text{conf}}$\;
        $W_k \leftarrow (1 - \alpha) w_{\text{data}} + \alpha w_{\text{conf}}$\;
        $\boldsymbol{\theta}^{t} \leftarrow \boldsymbol{\theta}^{t} + W_k \boldsymbol{\theta}_k^{t}$\;
    }
}
\Return $\boldsymbol{\theta}^{R}$\;
\end{algorithm2e}



%==============================================================================
\section{Experiments}
\label{sec:experiments}
%==============================================================================

\subsection{Datasets}

We use three publicly available chest X-ray datasets for our experiments. CheXpert \citep{irvin2019chexpert} contains 224,316 radiographs from 65,240 patients with 14 pathology labels extracted from reports. NIH Open-I \citep{wang2017chestxray8} provides 7,470 images paired with 3,955 radiology reports, enabling multimodal learning with both visual and textual data. PadChest \citep{bustos2020padchest} comprises 160,868 images from 67,000 patients, used for external validation across different institutions and patient populations.

\subsection{Baselines}

We compare against two standard heuristic approaches that serve as performance lower bounds and a state-of-the-art deterministic imputation method representing the current standard in federated learning; we additionally report an ablation of our method that uses standard FedAvg.


\textbf{Standard Heuristics:} We employ two naive heuristics: \textbf{Zero-filling}, which replaces missing features with null vectors ($\hat{z}^T = \mathbf{0}$), and \textbf{Uniform-filling}, which substitutes the global mean embedding, representing a static baseline that ignores sample-specific visual context.


\textbf{FIN + FedAvg:} The approach proposed by \citet{poudel2025multimodal} uses a deterministic Transformer decoder to reconstruct bottleneck features via MSE loss, aggregated with standard FedAvg. Unlike our method, it produces point estimates without uncertainty quantification.


\textbf{P-FIN + FedAvg:} An ablation of our method utilizing standard FedAvg. This isolates the specific contribution of our uncertainty-aware global aggregation mechanism.


\subsection{Setup and Implementation}
We simulate federated environments using $K=10$ clients distributed in three configurations of unimodal-to-multimodal ratios (8:2, 6:4, 4:6) to reflect varying data scarcity. Data were sourced from CheXpert (unimodal) and NIH Open-I (multimodal), distributed via a Dirichlet distribution ($\alpha_{\text{Dir}} = 0.5$) to ensure realistic non-IID label heterogeneity. PadChest was reserved for external validation.

P-FIN utilizes a ResNet-50 visual backbone and BERT-base textual backbone, both projected to 256 dimensions. The imputation module consists of a 2-layer Transformer with 4 attention heads and $d_{\text{model}}=256$, trained with $\beta$-NLL ($\beta=0.5$). For attention-guided fusion, we employ a single-head attention layer with 256-dimensional queries, keys, and values. Federated training spanned 20 communication rounds with 4 local epochs per round (batch size 32, Adam optimizer, learning rate $10^{-4}$). The Fed-UQ-Avg hyperparameters were set to $\alpha=0.6$ and $T=0.2$.



\subsection{Results and Analysis}
Table~\ref{tab:main_results} summarizes classification performance (mean AUC across 14 classes) on held-out test data. P-FIN integrated with Fed-UQ-Avg demonstrated consistent superiority over all deterministic baselines across all configurations.

\begin{table}[t]
\centering
\caption{Test AUC (\%) across federated configurations with varying unimodal (I) to multimodal (M) client ratios. Best results in \textbf{bold}.}
\label{tab:main_results}
\begin{tabular}{lccc}
\toprule
\textbf{Method} & \textbf{I:M = 8:2} & \textbf{I:M = 6:4} & \textbf{I:M = 4:6} \\
\midrule
Zero-filling & 72.56 {\scriptsize $\pm$ 0.31} & 75.61 {\scriptsize $\pm$ 0.33} & 77.86 {\scriptsize $\pm$ 0.29} \\
Uniform-filling & 71.78 {\scriptsize $\pm$ 0.47} & 74.18 {\scriptsize $\pm$ 0.53} & 76.67 {\scriptsize $\pm$ 0.46} \\
FIN + FedAvg & 77.40 {\scriptsize $\pm$ 0.57} & 79.31 {\scriptsize $\pm$ 0.62} & 81.28 {\scriptsize $\pm$ 0.83} \\
P-FIN + FedAvg & 79.93 {\scriptsize $\pm$ 0.79} & 81.72 {\scriptsize $\pm$ 0.37} & 83.11 {\scriptsize $\pm$ 0.62} \\
\midrule
\textbf{Ours (P-FIN + Fed-UQ-Avg)} & \textbf{82.76} {\scriptsize $\pm$ 0.63} & \textbf{84.14} {\scriptsize $\pm$ 0.49} & \textbf{85.89} {\scriptsize $\pm$ 0.38} \\
\bottomrule
\end{tabular}
\end{table}

In the most challenging 8:2 scenario, where 80\% of clients lacked textual data, our approach yielded a 5.36\% absolute improvement over standard deterministic imputation (FIN + FedAvg). This gain emphasizes the criticality of uncertainty-aware weighting when high-quality ground truth is scarce. Convergence analysis further revealed that mean imputation uncertainty on unimodal clients decreased progressively over communication rounds, validating that P-FIN successfully learned to approximate the missing modality distributions from the multimodal minority.

\vspace{20pt}

\begin{figure}[h!]
  \centering
  \includegraphics[width=0.48\linewidth]{images/uncertainty_per_round.png}
  \hfill
  \setlength{\fboxrule}{1pt}
  \includegraphics[width=0.45\linewidth, height=5.1cm]{images/auc_result.png}
  \caption{\textbf{(Left)} Evolution of uncertainty estimates for all clients (0–9). Unimodal clients (0–7) are shown in blue, while multimodal clients (8–9) are in orange. \textbf{(Right)} AUC progression per round.}
  \label{fig:combined}
\end{figure}

The ablation study demonstrates the complementary value of both components: while P-FIN with standard FedAvg already improves over deterministic baselines by modeling imputation uncertainty, the addition of Fed-UQ-Avg yields further consistent gains of 2.83\%, 2.42\%, and 2.78\% across the three heterogeneity configurations. This confirms that uncertainty-aware aggregation is essential for fully leveraging calibrated estimates, particularly in scenarios with high modality imbalance where unreliable client updates could otherwise dominate the global model.


The attention-guided fusion mechanism proved particularly beneficial compared to simple fixed-weight baselines. By allowing the model to dynamically weight the contribution of observed versus imputed features, the attention mechanism naturally leverages the gating signal to suppress unreliable imputations during feature aggregation.

\subsection{Uncertainty Calibration Analysis}


A primary claim of P-FIN is that predicted uncertainties are calibrated and meaningful. We validate this through three analyses.

We measure calibration using Expected Calibration Error (ECE) \citep{guo2017calibration}. This quantifies the discrepancy between expected and observed coverage. Figure~\ref{fig:calibration}(a) shows the reliability diagram for P-FIN. The close alignment between the observed coverage curve and the perfect calibration diagonal indicates dependable calibration (ECE = 0.0422).



\begin{figure}[h]
\centering

% -------- First row: (a) and (b) --------
\setlength{\fboxrule}{1pt}
\begin{minipage}{0.96\linewidth}
    \centering
    \begin{minipage}{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{images/reliability_diagram.png}
        \vspace{2pt}
        (a) Reliability diagram
        \label{fig:reliability}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{images/2_binned_analysis.png}
        \vspace{2pt}
        (b) Error correlation
        \label{fig:error_correlation}
    \end{minipage}
\end{minipage}%


\caption{Uncertainty calibration analysis. (a) Reliability diagram showing dependable calibration with ECE = 0.0422. (b) Binned analysis demonstrating strong correlation between predicted uncertainty and reconstruction error.}

\label{fig:calibration}

\end{figure}

We also analyzed the relationship between predicted uncertainty and actual imputation error to verify that uncertainty meaningfully reflects imputation quality. Figure~\ref{fig:calibration}(b) presents a binned analysis where samples are grouped into deciles by predicted uncertainty. The monotonic trend, with mean imputation error increasing across uncertainty deciles, confirms that high uncertainty reliably indicates low-quality imputations. Finally, Figure~\ref{fig:calibration_2} provides qualitative examples contrasting low-uncertainty cases with high-uncertainty cases.


\begin{figure}[t]
% -------- Second row: (c), full width --------
\setlength{\fboxrule}{1pt}
\begin{minipage}{0.96\linewidth}
    \centering
    \includegraphics[width=\linewidth]{images/qualitative_examples.png}
    \vspace{2pt}
\end{minipage}%


\caption{Qualitative Examples:
Top row shows low-uncertainty cases with clear imaging; bottom row shows high-uncertainty cases with complex or ambiguous presentations.}
\label{fig:calibration_2}
\end{figure}


%==============================================================================
\section{Discussion}
\label{sec:discussion}
%==============================================================================

The results substantiate the hypothesis that probabilistic modeling offers a robust defense against modality heterogeneity in federated learning. By explicitly quantifying imputation uncertainty, P-FIN mitigates the risks of error propagation inherent in deterministic approaches. The dual-mechanism strategy, comprising local gating and global Fed-UQ-Avg, creates a synergistic effect: local gating prevents individual classifiers from overfitting to hallucinations, while global aggregation prevents the shared model from being corrupted by clients with poor imputation capabilities.

From a clinical perspective, P-FIN aligns with regulatory priorities for uncertainty quantification in AI-enabled medical devices \citep{fda2024uq}. The ability to output a confidence metric alongside a prediction is a prerequisite for human-in-the-loop workflows, allowing radiologists to trust the model when uncertainty is low and scrutinize it when uncertainty is high. While our current implementation focuses on unidirectional image-to-text imputation, future work will explore bidirectional synthesis and the integration of epistemic uncertainty estimation to further enhance reliability in out-of-distribution scenarios.

%==============================================================================
\section{Conclusion}
\label{sec:conclusion}
%==============================================================================

We presented P-FIN, a probabilistic framework for handling missing modalities in federated medical imaging. By replacing point estimates with calibrated distributions, P-FIN enables uncertainty-aware local fusion and quality-weighted global aggregation. Our evaluation on chest X-ray classification confirms that this ``learning to distrust'' paradigm significantly outperforms deterministic alternatives, particularly in data-scarce environments. This work provides a foundation for more resilient and trustworthy multimodal federated learning systems in healthcare. Future work may explore more complex multimodal application scenarios as well as alternate aggregation mechanisms.

%==============================================================================
\midlacknowledgments{We thank the contributors of the CheXpert, NIH Open-I, and PadChest datasets for making their data publicly available for research. Additionally, we wish to acknowledge Syed Rifat Raiyan and Reaz Hassan Joader, Department of Computer Science
and Engineering, IUT, for their assistance in proofreading
and offering a preliminary review of this manuscript.}

%==============================================================================
% \bibliography{midl-samplebibliography}
\bibliography{midl26_335}

\end{document}


