\setcounter{figure}{0}
\setcounter{table}{0}

% \renewcommand{\thepage}{S\arabic{page}}
\renewcommand{\thesection}{S\arabic{section}}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}

\clearpage
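\appendix
% \appendix resets \thesection (to \Alph{section} in the standard classes),
% so the S-prefix has to be (re-)applied after it to take effect.
\renewcommand{\thesection}{S\arabic{section}}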

\noindent
\textbf{\large Conditional Learned Reconstruction for Medical Imaging -- Supplementary Material}



\section{Background}
\label{app:appendix1}


\subsection{Accelerated MRI Reconstruction}
\label{app:appendix1-mri-subsampling}

\subsubsection{Subsampling Operator and Sampling Parameters}


Given an index set $\Theta \subset \Omega = \{1,\ldots,n\}$, the undersampling operator $\vec{U}_{\Theta}:\mathbb{C}^{n} \rightarrow \mathbb{C}^{n}$ retains entries indexed by $\Theta$ and sets all others to zero:
\begin{equation}
    (\vec{U}_{\Theta}(w))_i =
    \begin{cases}
        w_i, & i \in \Theta, \\
        0,   & i \notin \Theta ,
    \end{cases}
\qquad i=1,\ldots,n.
\end{equation}

The acceleration factor is defined as
\begin{equation}
    R = \frac{|\Omega|}{|\Theta|} = \frac{n}{|\Theta|},
\end{equation}
and is inversely proportional to the number of acquired $k$-space samples. Sensitivity maps are typically estimated from a fully sampled central region of $k$-space known as the autocalibration signal (ACS), denoted $\Theta_{\mathrm{acs}}\subset \Theta$, with ACS ratio
\begin{equation}
    r_{\mathrm{acs}} = \frac{|\Theta_{\mathrm{acs}}|}{n}.
\end{equation}
These parameters determine the sampling pattern and partial Fourier coverage used in the forward model.



\subsection{CBCT Acquisition Geometry and Projection Operator}\label{app:appendix1-cbct-geometry}

For monochromatic X-ray energy, the attenuation coefficient at spatial location $z \in X \subset \mathbb{R}^3$ is denoted by $x(z) \in \mathbb{R}_{\ge 0}$. The X-ray source follows a circular trajectory parameterized by $\gamma:[0,1] \to \mathbb{R}^3$, and the detector is described by a family of planes $Y(t)$, each identified with $\mathbb{R}^2$. For detector coordinate $u \in Y(t)$, let $l_{t,u}$ be the line segment from $\gamma(t)$ to $u$. The projection operator is
\begin{equation}
    \mathcal{P}(x)(t,u) = \int_{l_{t,u}} x(z)\, dz,
\end{equation}
mapping functions on $X$ to functions on $[0,1]\times\mathbb{R}^2$. The adjoint operator $\mathcal{P}^*$ is the backprojection operator.

\subsection{Fan-beam CT Geometry}
\label{app:appendix1-fanbeam-geometry}

Fan-beam CT is a two-dimensional analogue of CBCT with domain $X \subset \mathbb{R}^2$. The X-ray source trajectory is $\gamma:[0,1] \to \mathbb{R}^2$, and the detector is parameterized by lines $Y(t)$, each identified with $\mathbb{R}$. For $u \in Y(t)$, $l_{t,u}$ denotes the line segment from $\gamma(t)$ to $u$. Using this notation, the projection operator is given by
\[
    \mathcal{P}(x)(t,u) = \int_{l_{t,u}} x(z)\, dz,
\]
mapping functions on $X$ to functions on $[0,1]\times \mathbb{R}$.



\section{Methods} \label{app:appendix2}

\subsection{Generalized Modulated Convolution}\label{app:appendix2-genmodconv}
This section defines the \textit{Generalized Modulated Convolution}, a broader formulation of the modulated convolution introduced in \eqref{eq:mod_conv} and discussed in Section~\ref{sec:sec3.1}. Using the same notation, the operation is written as
% 
\begin{equation}
    \vec{o}_{m} = \sum_{k=1}^{C_{\text{in}}} \left((\vec{W}_{\boldsymbol{\theta}})_{m,k} \otimes \vec{k}_{m,k}\right) \star \vec{i}_{k}   + (\vec{b}_{\boldsymbol{\psi}})_{m}, \quad m=1,\ldots, C_{\text{out}},
\end{equation}

\noindent
where $\otimes$ denotes the tensor product, and
$\vec{W}_{\boldsymbol{\theta}}$ denotes the generalized modulation weights, produced by
% 
\begin{equation}
    \vec{W}_{\boldsymbol{\theta}} = f_{\boldsymbol{\theta}}(\vec{z}) \in \mathbb{R}^{M},
    \label{eq:gen_mod_conv}
\end{equation}
% 
\noindent
responsible for fully conditioning the convolution weight on $\vec{z}$. Different choices of $M$ allow the method to express a range of modulation strategies:

\begin{itemize}
    % \item $M = {k_1 \times k_2}$ for kernel-only modulation,
    % \item $M = {C_{\text{out}}\times C_{\text{in}}}$ for features (channe-wise) modulation (as used in \Sec{sec3.1}),
    % \item $M = {k_1 \times k_2 \times C_{\text{in}}}$ or $M = {k_1 \times k_2 \times C_{\text{out}}}$ for partial feature modulation over input or output channels.
    \item $M = k_1 \times k_2$ (2D) or $M = k_1 \times k_2 \times k_3$ (3D) for kernel-only modulation.
    
    \item $M = C_{\text{out}} \times C_{\text{in}}$ for feature (channel-wise) modulation, as used in \Sec{sec3.1}.
    
    \item $M = (k_1 \times k_2 \times C_{\text{in}})$ or $(k_1 \times k_2 \times C_{\text{out}})$ in 2D,  
    and $M = (k_1 \times k_2 \times k_3 \times C_{\text{in}})$ or $(k_1 \times k_2 \times k_3 \times C_{\text{out}})$ in 3D,  
    for partial feature modulation over input or output channels.
\end{itemize}





\subsection{Deep Learning Architectures}
\subsubsection{Iterative ADMM DL-based Accelerated MRI Reconstruction}\label{app:appendix2-vsharp}
~\\

A wide range of deep learning approaches have been proposed for accelerated MRI, with many relying on unrolled iterative schemes that embed the acquisition physics within a learned optimization procedure. Examples include gradient-descent unrolling in either image or frequency domains \cite{Hammernik2017,Lnning2019,Sriram2020,Yiasemis2022b} and first-order methods based on proximal gradient \cite{Luo2023}, conjugate gradient \cite{Kim2022}, or ADMM \cite{10.1007/978-3-031-52448-6_45}.

For our experiments in Accelerated MRI Reconstruction we employ a DL-based algorithm that exploits variable half-quadratic splitting followed by ADMM unrolled optimization spanning $J$ iterations, namely vSHARP (variable Splitting Half-quadratic ADMM algorithm for Reconstruction of inverse-Problems). Given undersampled $k$-space measurements $\tilde{\vec{y}}$, and sensitivity maps $\mat{S}$,    each unrolled iteration comprises the following steps:
% 
\begin{subequations}
\begin{gather}
    \vec{x}^{(j)} = \argmin_{\vec{x}\in\mathbb{C}^{n}} \frac{1}{2} \sum_{k=1}^{n_c}\left|\left| \mathcal{A}_{\Theta, \mat{S}^{k}}^{k}(\vec{x}) - \Tilde{\vec{y}}^{k}\right|\right|_2^2  + 
    \frac{\eta_j}{2} \big | \big | \vec{x} - \vec{w}^{(j-1)} + \frac{\vec{u}^{(j-1)}}{\eta_j} \big | \big |_2^2, 
\label{eq:admm_x} \\
    \vec{w}^{(j)} =  \mathcal{D}_{\boldsymbol{\phi}_j} (\vec{x}^{(j)}, \vec{w}^{(j-1)}, \frac{\vec{u}^{(j-1)}}{\eta_j}),
\label{eq:admm_w}\\
    \vec{u}^{(j)} = \vec{u}^{(j-1)} + \eta_j (\vec{x}^{(j)} - \vec{w}^{(j)}), \quad j=1,\cdots, J.
\label{eq:admm_u}
\end{gather}
\label{eq:admm_vsharp}
\end{subequations}
\noindent
vSHARP solves \eqref{eq:admm_x} via an iterative differentiable gradient scheme, while \eqref{eq:admm_w} is learned using trainable convolutional-based denoising modules $\mathcal{D}_{\boldsymbol{\phi}_j}$. Initial estimates of the variables are obtained as follows:
% 
\begin{equation}
    \vec{x}^{(0)} = \vec{w}^{(0)} := \sum_{k=1}^{n_c} \mat{S}_{k}^{*}\mathcal{F}^{-1} (\tilde{\vec{y}}^{k}) , \quad \vec{u}^{(0)} = \mathcal{U}_{\boldsymbol{\phi}_u} (\vec{x}^{(0)}),
\label{eq:admm_inits}
\end{equation}
% 
where $\mathcal{U}_{\boldsymbol{\phi}_u}$ represents a DL-based initializer comprising alternating sequences of dilated convolutions and replication padding, responsible for predicting a suitable initial value for the Lagrange multiplier update in \eqref{eq:admm_u}. For further details, refer to the original work~\cite{yiasemis2023vsharp}.

For the prediction of the sensitivity maps $\mat{S}$, vSHARP also employs a separate DL convolutional-based model, denoted as $\mathcal{S}_{\boldsymbol{\phi}_S}$, which takes as input sensitivities $\tilde{\mat{S}}$ estimated from ACS-sampled $k$-space data (see~\cite{Yiasemis2022b} for more details on the initial estimation) and refines them during training:
% 
\begin{equation}
    \mat{S}_k = \mathcal{S}_{\boldsymbol{\phi}_S} (\tilde{\mat{S}}_k), \quad k=1,\cdots, n_c.
\end{equation}
% 
Concerning the architecture of the denoising models $\{\mathcal{D}_{\boldsymbol{\phi}_j}\}_{j=1}^{J}$ and sensitivity module $\mathcal{S}_{\boldsymbol{\phi}_S}$, we opted for the 2D U-Net architecture \cite{ronneberger2015u}, which combines an encoder (2D convolutions and 2D max pooling), and a decoder (2D transpose convolutions) with skip connections.


\subsubsection{CBCT}
\label{app:appendix2-cbct}

For CBCT we use $\partial$U-net, a multi-scale learned iterative scheme that operates across four spatial resolutions ($1$, $1/2$, $1/4$, $1/8$). The lowest resolution reconstruction is progressively refined through successive convolutional blocks, each consisting of three convolutional layers with ReLU activations and normalization layers. The final image is produced by a high-resolution 3D U-net that integrates the intermediate multi-scale estimates.

As initialization, we use a Feldkamp--Davis--Kress (FDK) filtered backprojection reconstruction with a ramp filter and a $95\%$ frequency cut-off.
All convolutional and transposed-convolutional layers in the backbone are replaced by modulated versions.

\subsubsection{Fan-beam}
\label{app:appendix2-fanbeam}

For Fan-beam CT we adopt the Learned Primal--Dual (LPD) algorithm, which unrolls the Primal--Dual Hybrid Gradient (PDHG) method. Each iteration consists of:

\begin{itemize}
    \item a \emph{primal update} in image space, implemented by a small CNN with three convolutional layers (PReLU activations and batch normalization),
    \item a \emph{dual update} in projection space, parameterized by an analogous CNN,
\end{itemize}
% 
\noindent
with differentiable projection and backprojection operators linking the two domains.  
Because LPD is memory-intensive, it is applied only to the 2D fan-beam geometry.

All convolutional layers in both primal and dual modules are replaced by modulated convolutions.



\section{Experimental Setup}
\label{app:appendix3}

\subsection{Quantitative Evaluation Metrics}
\label{app:appendix3-metrics}

Let $x \in \mathbb{R}^{n_1 \times n_2}$ denote the reference image and $\hat{x} \in \mathbb{R}^{n_1 \times n_2}$ the reconstructed image.  
For convenience, let $N = n_1  \times n_2$ denote the total number of pixels.

\paragraph{Structural Similarity Index Measure (SSIM)}
\begin{equation}
\mathrm{SSIM}(x,\hat{x}) 
= \frac{(2\mu_x \mu_{\hat{x}} + C_1)(2\sigma_{x\hat{x}} + C_2)}
{(\mu_x^2 + \mu_{\hat{x}}^2 + C_1)(\sigma_x^2 + \sigma_{\hat{x}}^2 + C_2)},
\end{equation}
where $\mu_x, \mu_{\hat{x}}$ are local means; $\sigma_x^2, \sigma_{\hat{x}}^2$ local variances; 
$\sigma_{x\hat{x}}$ the local covariance; and $C_1, C_2$ stability constants.

\paragraph{Peak Signal-to-Noise Ratio (pSNR)}
\begin{equation}
\mathrm{MSE}(x,\hat{x}) 
= \frac{1}{N} \sum_{i=1}^{N} (x_i - \hat{x}_i)^2 ,
\end{equation}
\begin{equation}
\mathrm{pSNR}(x,\hat{x})
= 10 \log_{10}
\left(
\frac{(\max(x))^2}{\mathrm{MSE}(x,\hat{x})}
\right).
\end{equation}

\paragraph{Normalized Mean Squared Error (NMSE)}
\begin{equation}
\mathrm{NMSE}(x,\hat{x})
= \frac{\| x - \hat{x} \|_2^2}{\| x \|_2^2}.
\end{equation}

\paragraph{Mean Absolute Error (MAE / L1)}
\begin{equation}
\mathrm{MAE}(x,\hat{x})
= \frac{1}{N} \sum_{i=1}^{N} |x_i - \hat{x}_i|.
\end{equation}

\paragraph{}
Higher SSIM and pSNR values, alongside lower NMSE and MAE, indicate superior reconstruction fidelity.



\subsection{Triangular Distribution}
\label{app:appendix3-triang-dist}
During training, the acceleration factor $R$ for accelerated MRI reconstruction and the photon count $I_0$ for Computed Tomography were sampled from a (right-angle) triangular distribution on an interval $[a, b]$ with peak at $b$. The rationale is that, for instance in accelerated MRI reconstruction, higher acceleration factors are drawn more often, motivated by the fact that the reconstruction model sees less data at higher accelerations. The same idea can be transferred to Computed Tomography for $I_0$.

Below, we outline the definition of such a distribution for arbitrary choices of $a$ and $b$. The triangular distribution in the range \([a, b]\) can be characterized by a Probability Density Function (PDF) that linearly increases from \(a\) to \(b\) as follows:
\begin{equation}
    p(x) = \frac{2x}{b^2 - a^2}, \quad a \leq x \leq b.
\end{equation}
\noindent
The cumulative distribution function (CDF) and inverse CDF of $p$ are given by:

\begin{equation}
    F(x) = \frac{x^2 - a^2}{b^2 - a^2}, \quad a \leq x \leq b,
\end{equation}

\begin{equation}
    \text{and } F^{-1}(u) = \sqrt{u \cdot (b^2 - a^2) + a^2}, \quad 0 \leq u \leq 1.
\end{equation}

\noindent
The inverse CDF $F^{-1}$ can be used to sample from $p$ (inverse transform sampling) as follows:
\begin{enumerate}
    \item Generate a uniform random number $u' \sim U[0,1]$.
    \item Return $F^{-1}(u')$.
\end{enumerate}

\subsection{Accelerated MRI Reconstruction}
\label{app:appendix3-mri}

\subsubsection{Training and Optimization Details}
\label{app:training_details}

\paragraph{Model Optimization}
All models were developed in PyTorch \cite{paszke2017automatic} and optimized using Adam \cite{kingma2017adam} with parameters $(\beta_1, \beta_2) = (0.9, 0.999)$ and $\epsilon = 1\mathrm{e}{-8}$. Experiments were carried out on NVIDIA A100 or H100 GPUs. A batch size of 2 was used for static reconstruction and 1 for dynamic reconstruction. Static models were trained for 150{,}000 iterations, while dynamic models were trained for 80{,}000 iterations. The learning rate schedule began with an initial rate of $6.7\mathrm{e}{-4}$, increased linearly to $2\mathrm{e}{-3}$ over the first 1{,}000 iterations, and subsequently decayed by $20\%$ every 30{,}000 iterations.

\paragraph{Random Augmentations}
During training across all setups, random augmentations were applied to improve model robustness and learning efficacy. These included random cropping ($320 \times 320$ regions for static reconstruction and $(n_t, n_x/3, n_y/2)$ random crops for dynamic reconstruction, where $n_t$, $n_x$, and $n_y$ denote the temporal and spatial dimensions), random horizontal or vertical flipping, and random rotation.

\paragraph{Reconstruction Model Hyperparameters and Loss Function}
The vSHARP models incorporated U-Nets with four scales for both denoising and sensitivity estimation. For denoising, 2D U-Nets with 32, 64, 128, and 256 filters were used for static reconstruction, while 3D U-Nets with 16, 32, 64, and 128 filters were used for dynamic reconstruction. Sensitivity estimation employed U-Nets with 16, 32, 64, and 128 channels. For static reconstruction, 12 denoising steps and 10 data consistency steps were used, while dynamic reconstruction employed 8 denoising steps and 6 data consistency steps. The Lagrange Multiplier module was in line with the original vSHARP framework \cite{yiasemis2023vsharp}.  All remaining architectural and training choices were kept consistent with the experiments presented in \cite{yiasemis2023vsharp}. A dual-domain loss combining image-domain and $k$-space-domain loss components was employed, following the original work.


\subsubsection{Comparison and Ablation Studies}

\paragraph{Additional Results}
We show the SSIM and pSNR results of our experiments for high acceleration factors in Tab.~\ref{tab:mri_results_ssim_psnr_10_16}. Moreover, \Tab{mri_results_nmse} provides the NMSE results for the accelerated MRI reconstruction experiments of \Sec{sec4.3}, corresponding to Tab.~\ref{tab:mri_results_ssim_psnr}.

\input{tabs/mri_results_ssim_psnr_10_16x}
\input{tabs/mri_results_nmse}

\paragraph{Additional Experiments}

To further assess the flexibility of the generalized modulated convolution, we conducted additional comparisons beyond the configurations evaluated in \Sec{sec4.3}. Among the feature-modulation variants (MOD S, MOD M, MOD L), the best-performing configuration---referred to here as Feat (Best)---serves as our reference model. This variant corresponds to the strongest-performing feature-modulation setup in \Tab{mri_results_ssim_psnr} and \Tab{mri_results_nmse}, where modulation predicts a $C_{\text{out}} \times C_{\text{in}}$ feature-wise scaling matrix as defined in Eq.~\eqref{eq:mod_conv}.
 
% To evaluate the generalized modulated convolution we consider the following variants to our experiments as presented in \Sec{sec4.3}, comparing to the best performing features modulation - Feat (Best)- as presented in \Tab{mri_results_ssim_psnr} and \Tab{mri_results_nmse}:


\begin{enumerate}
    \item \textit{Partial-in} modulation using one hidden layer with 32 input and 32 output features (Part-In L).
    \item \textit{Full} modulation using one hidden layer with 32 input and 8 output features (Full S).
\end{enumerate}

\input{tabs/mri_results_additional}

The results in \Tab{mri_additional} indicate that both generalized variants perform competitively with Feat (Best) across all acceleration factors for both prostate and knee datasets. While Feat (Best) remains the strongest overall, the performance differences are small. This suggests that the primary benefit arises from including acquisition-aware conditioning itself, whereas the exact parametrization of the modulator has a secondary effect.

\input{tabs/mri_inf_times}

\newpage
\subsection{Computed Tomography}
\label{app:appendix3-ct}
\paragraph{Additional Results} \Tab{ct-table} provides the quantitative results for the Fan-beam CT experiments. Fig.~\ref{fig:axial2-large} and Fig.~\ref{fig:axial3-large} show samples from CBCT with variable projection count.


\input{tabs/ct_results}
\input{tabs/cbct_inf_times}
\input{figs/cbct2_samples}

\section{Discussion}
\label{app:appendix4}

The experimental results across accelerated MRI, Cone-beam CT, and Fan-beam CT collectively underline the central observation of this work: conditioning learned iterative reconstruction schemes on acquisition parameters offers gains in reconstruction quality. The improvements emerge consistently across modalities, architectures, and parameter ranges, supporting the broader claim that variability in acquisition settings---when left implicit---limits the representational efficiency of non-conditional networks.


A recurring finding is that the magnitude of improvement depends strongly on the interaction between the physical forward model, the available measurement information, and the architecture used to perform the iterative updates. In accelerated MRI, where signal-to-noise characteristics and aliasing structure vary substantially with acceleration factor and ACS ratio, the modulated variants consistently outperform the unmodulated baselines across nearly all tested conditions. This is visible in \Tab{mri_results_ssim_psnr} and \Tab{mri_results_nmse}, where modulated models demonstrate higher SSIM and pSNR, and lower NMSE, particularly at higher acceleration factors---precisely the regime where the underdetermined nature of the inverse problem becomes most severe. For the prostate data, the MOD L configuration provides the most stable gains, unlike the knee data, where MOD M was the best performer overall.

Furthermore, in cases of varying field strength in MRI, conditioning becomes particularly relevant, as field strength directly affects signal-to-noise characteristics and measurement statistics.
In our dynamic experiments, using the Cardiac MRI Reconstruction Challenge 2025 dataset with 1.5T and 3T acquisitions, the modulated model consistently outperforms the unmodulated baseline (\Tab{mri_ssim_psnr_results_cardiac}), with larger gains observed when field strength is explicitly provided as conditioning information.



In Cone-beam CT, where the conditioning variable is the photon count $I_0$, improvements follow a similar trend. \Tab{cbct-table} demonstrates that lower-dose settings ($I_0 = 10$k) benefit most from modulation, with the modulated $\partial$U-net producing sharper soft-tissue detail (\Fig{axial-large}) and lower mean absolute errors. In Cone-beam CT with variable projection count, where the projection count is the conditioning variable, improvements from conditioning are more pronounced, as shown in \Tab{cbct-table2}, and reach 0.8\,dB in pSNR. This is accomplished at negligible inference-time cost, as seen from Tab.~\ref{tab:3d_cbct_params_time}.


Fan-beam CT results, however, show that the gains are smaller and sometimes marginal (\Tab{ct-table}). One plausible explanation is that LPD's dual blocks operate directly on the projection data. This allows the unmodulated network to infer noise characteristics implicitly. Consequently, external conditioning provides less additional value. Although the modulated variants still tend to outperform the baseline on average, the improvements are modest compared to MRI or CBCT.


A broader limitation of the present study is that only a subset of potentially relevant acquisition parameters was explored. For MRI, the auxiliary variables were restricted to acceleration factor, ACS ratio, and field strength, but trajectory type, sequence type, and number of coils are also meaningful candidates. Similarly, in CT, conditioning was performed on photon count and, in the case of CBCT only, projection count. Exploring variables such as tube voltage may further clarify when and how conditioning is most impactful.

Another consideration is the trade-off between introducing modulation versus simply increasing model capacity. A wider U-Net or deeper CNN might recover part of the same performance gap by learning a richer set of shared filters. While exhaustively comparing these alternatives is beyond the scope of this work, the MRI results suggest that the gains from conditioning persist even when modulation capacity is relatively small (e.g., MOD S), indicating that explicit acquisition-aware adaptation cannot be trivially replaced by larger generic models.

Importantly, this distinction is further supported by the input-only modulation experiments. Modulating only the input convolution layers yields consistent improvements over the non-modulated baseline while introducing only a negligible increase in parameter count. This suggests that early, acquisition-aware adaptation of feature extraction already captures a substantial fraction of the benefit of conditioning, without requiring full modulation throughout the network. In this sense, input-level modulation acts as a lightweight yet effective mechanism for aligning the reconstruction process with acquisition-specific statistics, reinforcing that the observed gains stem from conditioning rather than from increased representational capacity alone.

It is also worth noting that our experiments rely on three specific learned iterative schemes, each selected as a representative architecture for its respective inverse problem: an ADMM-based unrolled model for MRI, $\partial$U-Net for CBCT, and Learned Primal--Dual for fan-beam CT. While many other reconstruction backbones exist, we did not attempt to evaluate modulation across the full architectural spectrum. Nevertheless, these choices comprise representative designs within their modalities, and therefore provide a reasonable basis for assessing how acquisition-aware conditioning behaves in practice.

In addition, the generalized formulation in \Appendix{appendix2-genmodconv} shows that several modulation parameterizations are possible. Under the configurations evaluated here for MRI (see \Tab{mri_additional}), our original feature-wise (channel-wise) formulation performs best overall and does so with the lowest computational overhead. It is also worth noting that the comparisons are not entirely fair: the Partial-In variant was tested with a larger MLP, whereas the Full variant used a smaller one, which makes their relative performance harder to interpret directly.

The triangular sampling strategy used during training as described in \Appendix{appendix3-triang-dist} emphasizes challenging regimes (high $R$ in MRI, low $I_0$ in CT). This likely contributes to the larger gains observed in these regions. Whether a uniform or different sampling schedule would yield different patterns---particularly at lower accelerations---remains an open question.

Lastly, beyond the specific feature-wise modulation strategy evaluated in this work, several alternative conditioning mechanisms exist. A widely used family of approaches relies on feature-wise affine transformations, such as FiLM layers \cite{perez2018film, dumoulin2018feature}, which scale and shift intermediate activations based on auxiliary variables. A different line of methods introduces conditional sparsity through learnable gating or $L_0$-regularization \cite{louizos2017learning, he2017channel}, effectively modulating the set of active channels or filters in response to the input. These strategies demonstrate that modulation can be implemented in multiple architectural forms, ranging from explicit feature-wise transformations to implicit capacity control. Comparing such alternatives with the proposed design is a natural extension for future work.


Related work has also introduced adaptive mechanisms within MRI-specific reconstruction frameworks. Examples include approaches that adapt the reconstruction model to different acquisition settings \cite{pramanik2023adapting} and hypernetwork-based methods that generate parts of the reconstruction network from coil- or scanner-specific embeddings \cite{ramanarayanan2023hypercoil}, as well as recent adaptive convolution approaches for QSM dipole inversion, where convolution kernels are generated as functions of acquisition geometry within a feed-forward U-Net architecture \cite{graf2024incorporating}.  


These methods demonstrate that acquisition-aware adaptiveness is feasible in MRI reconstruction, but differ in scope and mechanism from the present work, which introduces lightweight modulation of convolutional operators within learned iterative reconstruction schemes.



Overall, the results indicate that conditional learned iterative schemes offer a simple, architecture-agnostic mechanism for adapting deep reconstruction models to heterogeneous acquisition settings without training separate networks for each configuration. The improvements are consistent, and their strength depends on both the modality and the reconstruction backbone. Expanding the conditioning variables, exploring interactions with model capacity, and evaluating additional architectures represent natural next steps for future work.
