\section{Experiments}
\label{sec:ap4}




\subsection{Datasets Information}
\label{sec:ap4.1}
In our experiments we utilize the fastMRI Knee \cite{zbontar2019fastmri}, fastMRI Brain \cite{zbontar2019fastmri}, fastMRI Prostate \cite{tibrewala2023fastmri}, and CMRxRecon Cine \cite{cmrxrecon,cmrxrecondataset} datasets.
The characteristics and data splits are shown below in \Table{S1}.
\input{tab1}

In the comparative experiments outlined in \Section{subsec3.4},
% and the alternative configuration experiments (2) and (3) in Appendix \ref{sec:ap5.2}
we addressed the imbalance between proxy datasets (brain and knee in experiment set \textbf{A}; brain, knee, and prostate in experiment set \textbf{B}) and target datasets (prostate in experiment set \textbf{A}; cardiac cine in experiment set \textbf{B}) by oversampling the proxy data. Unless specified otherwise, this was achieved by duplicating each proxy dataset sample to ensure consistency across experiments.



\subsection{SSL Subsampling Partitioning}
\label{sec:ap4.2}
Let $\mat{M}_{i}$ denote the sampling set. Here we describe $\mat{M}_{i}$ as a sampling mask in the form of a square array of size $n = n_x \times n_y$ such that:
\begin{equation*}
    (\mat{M}_{i})_{kj} =  \Bigg\{\begin{array}{ll}
        1, & \text{if } (k, j) \text{ is sampled}\\
        0, & \text{if } (k, j) \text{ is not sampled}.\\
        \end{array}
\end{equation*}

The set $\mat{\Theta}_{i}$ is obtained by selecting elements from $\mat{M}_{i}$ using a variable density 2D Gaussian scheme with a standard deviation of $\sigma$ pixels and mean vector equal to the center of the sampling set $\mat{M}_{i}$, until the fraction of selected elements reaches a prescribed ratio $q_i = \frac{|\mat{\Theta}_{i}|}{|\mat{M}_{i}|}$, where $| \cdot |$ here denotes the cardinality. Mathematically, the selection process for $\mat{\Theta}_{i}$ from $\mat{M}_{i}$ can be described by the following algorithm:

\begin{algorithm}[H]
    \SetAlgoLined
    \KwData{Square array $\mat{M}_{i}$ of size $n_x \times n_y$, ratio $0<q_i<1$, standard deviation $\sigma$}
    \KwResult{Set $\mat{\Theta}_{i}$}
    Initialize $\mat{\Theta}_{i}$ as an array of zeros of the same size as $\mat{M}_{i}$\;
    \While{$\frac{|\mat{\Theta}_{i}|}{|\mat{M}_{i}|} < q_i$}{
        Generate $(k, j)$ from $\mathcal{N}\left([\frac{n_x}{2}, \frac{n_y}{2}], \, \sigma^2\mat{I}_2\right)$\;
        \If{$(\mat{M}_{i})_{kj} == 1$ \textbf{and} $(\mat{\Theta}_{i})_{kj} == 0$}{
            $(\mat{\Theta}_{i})_{kj} \leftarrow 1$\;
        }
    }
    \caption{Generation of $\mat{\Theta}_{i}$ using Gaussian Sampling}
\end{algorithm}

Subsequently, to partition $\mat{M}_{i}$, we set ${\mat{\Lambda}_{i}} = {\mat{M}_{i} \smallsetminus \mat{\Theta}_{i}}$. Note that selecting $q_i=0$ yields $\mat{\Theta}_{i} = \emptyset$, whereas selecting $q_i=1$ yields $\mat{\Theta}_{i} = \mat{M}_i$.

For our comparison study in \Section{subsec3.6} of the main paper, for the SSL and JSSL experiments we randomly selected the ratio $q_i$ from the set $\{0.3, 0.4, 0.5, 0.6, 0.7, 0.8\}$. For our alternative configurations study in Appendix~\ref{sec:ap5.1}, we employed the same partitioning ratio selection, except for the configuration that uses a fixed ratio of $q_i = 0.5$. In all our JSSL and SSL experiments we used $\sigma = 3.5$.

\input{fig1}

% \subsection{Reconstruction Network - vSHARP}
% \label{sec:ap4.3}
% In our main experiments, we employed the variable Splitting Half-quadratic ADMM algorithm for Reconstruction of inverse-Problems (vSHARP) as our reconstruction network, which is an unrolled physics-guided DL-based method \cite{yiasemis2023vsharp} that has previously been applied in accelerated brain, prostate and dynamic cardiac MRI reconstruction  \cite{yiasemis2023vsharp, yiasemis2023deep}. The vSHARP algorithm incorporates the half-quadratic variable splitting method to the optimization problem presented in \eqref{eq:var_problem}, introducing an auxiliary variable $\vec{z}$:
% % 
% \begin{equation}
%     \min_{\vec{x}^{'}, \vec{z}}\frac{1}{2}\left|\left| \mathcal{A}_{\vec{M}, \mat{S}}(\vec{x}^{'}) - \Tilde{\vec{y}}_{\mat{M}}\right|\right|_2^2 +  \mathcal{G}(\vec{z}) \quad \text{s.t. } \vec{x}^{'} = \vec{z}.
%     \label{eq:vsharp_var}
% \end{equation}

% Subsequently, \eqref{eq:vsharp_var} is iteratively unrolled over $T$ iterations using the Alternating Direction Method of Multipliers (ADMM). The ADMM formulation consists of three key steps: (a) a denoising step to refine the auxiliary variable $\vec{z}$, (b) data consistency for the target image $\vec{x}$, and (c) an update for the Lagrange Multipliers $\vec{u}$ introduced by ADMM:
% % 
% \begin{subequations}
%     \begin{gather}
%         \vec{z}_{t+1} =  \mathcal{H}_{\boldsymbol{\psi}_{t+1}}\left( \vec{z}_{t}, \vec{x}_{t}, \vec{u}_{t} /{\mu}_{t+1}\right),\label{eq:z_step}\\
%         \vec{x}_{t+1}  =  \arg\min_{\vec{x}^{'}} \Big|\Big| \mathcal{A}_{\vec{M}, \mat{S}}  (\vec{x}^{'})  \, -  \,  \Tilde{\vec{y}}_{\mat{M}}\Big|\Big|_2^2 +  \mu\left|\left|\vec{x}^{'} - \vec{z}_{t+1} + \vec{u}_{t}/{\mu}_{t+1} \right|\right|_2^2,
%         \label{eq:x_step}\\
%         \vec{u}_{t+1} = \vec{u}_{t} + \mu_{t+1} \left(\vec{x}_{t+1} - \vec{z}_{t+1}\right).
%         \label{eq:u_step}
%     \end{gather}
% \end{subequations}


% In \eqref{eq:z_step}, $\mathcal{H}_{\boldsymbol{\psi}_{t+1}}$ denotes a convolutional based DL image denoiser with trainable parameters $\boldsymbol{\psi}_{t+1}$, and $\eta_{t+1}$ a trainable learning rate. At each iteration, $\mathcal{H}_{\boldsymbol{\psi}_{t+1}}$ takes as input the previous predictions of the three variables and outputs an estimation of the auxiliary variable $\vec{z}$. Equation \ref{eq:x_step} is solved numerically by unrolling further a gradient descent scheme over $T_{\vec{x}}$ iterations. The last step in \eqref{eq:u_step}, involves a straightforward computation.  The initial approximations for $\vec{x}$ and $\vec{z}$ are taken as: 	 $\vec{x}_{0},\, \vec{z}_0 = \mathcal{R}_{\mat{S}} \circ \mathcal{F}^{-1} \left(\Tilde{\vec{y}}_{\mat{M}}\right).$  Moreover, for $\vec{u}_{0}$, vSHARP employs a trainable replication-padding and dilated convolutional-based network represented by $\mathcal{U}_{\boldsymbol{\phi}}$: $\vec{u}_{0} = \mathcal{U}_{\boldsymbol{\phi}}(\vec{x}_0).$

%  For all experiments, we adopted vSHARP with $T=12$ optimization steps, utilizing two-dimensional U-Nets \cite{Ronneberger2015} composed of 4 scales and 32 filters (in the first scale) for $\left\{\mathcal{H}_{\boldsymbol{\theta}_{t}}\right\}_{t=0}^{T-1}$. For the data consistency step, we set $T_{\vec{x}}=10$.
\subsection{Choice of Loss Functions}
\label{sec:ap4.4}
% In all our experiments, loss was computed as detailed in \Section{subsec2.4.4} employing the following combinations in the image and frequency domains, respectively:
% % 
% \begin{equation*}
% \begin{gathered}
%     {\mathcal{L}_{\text{I}}}^{\text{SL}},\, {\mathcal{L}_{\text{I}}}^{\text{SSL}} := 2 \left(1 - \text{SSIM}  + \mathcal{L}_{1}\right) + \text{HFEN}_1 + \text{HFEN}_2,\\
%     {\mathcal{L}_{K}}^{\text{SL}}, \, {\mathcal{L}_{K}}^{\text{SSL}} := 2 \left( \text{NMSE} + \text{NMAE} \right).
% \end{gathered}
% \end{equation*}

Below, we provide the mathematical definitions of each component of the loss function described in \Section{subsec3.3}:

\begin{itemize}
  \item Image Domain Loss Functions
  \begin{itemize}
    \item Structural Similarity Index Measure (SSIM) Loss
    
        \begin{equation}
                \mathcal{L}_\text{SSIM} := 1 - \text{SSIM}, \quad  \text{SSIM}(\vec{a},\,\vec{b}) =
            \frac{1}{N}\sum_{i=1}^{N} \frac{(2\mu_{\vec{a}_i}\mu_{\vec{b}_i} + \gamma_1)(2\sigma_{\vec{a}_i\vec{b}_i} + \gamma_2)}{({\mu^2_{\vec{a}_i}} +{\mu^2_{\vec{b}_i}} + \gamma_1)({\sigma^2_{\vec{a}_i}} + {\sigma^2_{\vec{b}_i}} + \gamma_2)},
            \label{eq:ssim_metric} 
        \end{equation}
    
        where $\vec{a}_i, \vec{b}_i, i=1,\dots,N$ represent $7\times 7$ square windows of $\vec{a}, \vec{b}$, respectively, and $\gamma_1 = 0.01$, $\gamma_2 = 0.03$. Additionally, $\mu_{\vec{a}_i}$, $\mu_{\vec{b}_i}$ denote the means of each window, and $\sigma_{\vec{a}_i}$ and $\sigma_{\vec{b}_i}$ represent the corresponding standard deviations. Lastly, $\sigma_{\vec{a}_i\vec{b}_i}$ represents the covariance between $\vec{a}_i$ and $\vec{b}_i$.
    
    \item High Frequency Error Norm (HFEN)

    \begin{equation}
        \mathcal{L}_{\text{HFEN}_k} := {\text{HFEN}_k}, \quad  {\text{HFEN}_k}(\vec{a},\,\vec{b})  = \, \frac{|| \mathcal{G}(\vec{a}) - \mathcal{G}(\vec{b}) ||_k}{||\mathcal{G}(\vec{b})||_k},
    \label{eq:hfen}
\end{equation}

 where $\mathcal{G}$ is a Laplacian-of-Gaussian filter  \cite{5617283} with kernel of size $15\times 15$ and with a standard deviation of 2.5, and $k\,=\, 1 \text{ or } 2$.

    \item Mean Absolute Error (MAE / $L_1$) Loss
    \begin{equation}
        \mathcal{L}_1(\vec{a},\,\vec{b}) = || \vec{a} - \vec{b} ||_1 = \sum_{i=1}^n |a_{i} - b_{i}|.
    \end{equation}
    \end{itemize}
    
    \item $k$-space Domain Loss Functions

    \begin{itemize}
        \item Normalized Mean Squared Error (NMSE)
        \begin{equation}
            \mathcal{L}_{\text{NMSE}} := \text{NMSE}, \quad  \text{NMSE}(\vec{a},\, \vec{b})\,= \, \frac{||\vec{a}\,-\,\vec{b}||_2^2}{||\vec{a}||_2^2}\,= \, \frac{\sum_{i=1}^n(a_{i} - b_{i})^2}{\sum_{i=1}^n a_{i}^{2}}.
            \label{eq:nmse}
        \end{equation}
        \item Normalized Mean Absolute Error (NMAE)
        \begin{equation}
            \mathcal{L}_{\text{NMAE}} := \text{NMAE}, \quad  \text{NMAE}(\vec{a},\, \vec{b})\,= \, \frac{||\vec{a}\,-\,\vec{b}||_1}{||\vec{a}||_1}\,= \, \frac{\sum_{i=1}^n |a_{i} - {b}_{i}|}{\sum_{i=1}^n |a_{i}|}.
            \label{eq:nmae}
        \end{equation}
    \end{itemize}

\end{itemize}

The rationale for the loss function components is also drawn from the literature \cite{yiasemis2023vsharp}. In the frequency domain, $ \mathcal{L}_{\text{NMSE}}$ and $ \mathcal{L}_{\text{NMAE}}$ are used to evaluate global similarity to the fully sampled $k$-space, with the former addressing larger deviations and the latter focusing on finer discrepancies. In the image domain, $\mathcal{L}_1$ and $\mathcal{L}_\text{SSIM}$ are commonly combined to optimize pixel-level accuracy and perceptual quality, while $ \mathcal{L}_{\text{HFEN}_k}$ emphasizes the preservation of edges and fine details.


% \subsection{Parameter Optimization}
% \label{sec:ap4.5}
% We optimized the model parameters using the Adam optimizer \cite{kingma2017adam}, with parameters $\epsilon=10^{-8}$, $(\beta_1, \beta_2) = (0.99, 0.999)$ and initial learning rate (lr) set to 0.003. We also employed a lr scheduler which decayed the lr by a factor of 0.8 every 150,000 training iterations. Experiments were carried out on 2 A6000 RTX GPUs, with a batch size of 2 slices of multi-coil $k$-space data assigned to each GPU employing the DIRECT toolkit \cite{DIRECTTOOLKIT}. All models were trained to convergence.

% \subsection{Statistical Testing}
% \label{sec:ap4.6}
% To determine whether the top-performing method in each category (SL methods, SSL-based methods including JSSL, SSL-based methods with different configurations) significantly outperformed the others, we conducted statistical tests. Initially, we calculated the differences in performance between the best method and the other methods within each category. The Shapiro-Wilk test \cite{SHAPIRO1965} was used to assess the normality of these differences. If the differences were normally distributed ($p > \alpha $), a paired t-test was performed, alternatively the Wilcoxon signed-rank test \cite{conover1999practical} was used. In our reported results we denote with an asterisk instances which the average best method was not found to be statistically significantly better ($p > \alpha$). Note that we set $\alpha = 0.05$ as the significance level.
