\section{JSSL Theoretical Motivation}
\label{sec:ap3}
\noindent
% In this appendix we provide theoretical motivations along with proofs for the JSSL method  presented in \Section{subsec3.8} of the main paper.
The core concept behind JSSL is to leverage both supervised and self-supervised learning to enhance MRI reconstruction of a target dataset, even when the parameters optimized on supervised proxy tasks may not be optimal for the target task. We hypothesize that introducing a supervised proxy task serves as a form of regularization, reducing the variance of our estimators due to the proxy supervised training on a `less noisy' task. We illustrate this intuition with two simplified examples in Proposition~\ref{prop:prop1} (estimating means of distributions) and Proposition~\ref{prop:prop2} (linear regression), where we assume two distributions---one that we wish to estimate but cannot obtain sufficient samples from, and a proxy distribution that is directly accessible. We demonstrate that drawing samples from both distributions (or using only the proxy distribution) can reduce our estimator's variance and risk.


\begin{prop}
    Consider two distributions $p_i, \, i=1,2$ with means and variances ${\mu}_i, {\sigma}_i^2, \, i=1,2$, with unknown ${\mu}_1$, and ${\mu}_1 \neq {\mu}_2$. If $(\mu_1 - \mu_2)^2 < c \frac{{\sigma}_1^2}{N}$ for some $c \in (0, 1)$ and $N \in \mathbb{Z}^{+}$, then $\tilde{{x}} = \frac{1}{N+K} \sum_{i=1}^{N+K} {x}^{(i)}$ has a lower mean squared error as an estimator of $\mu_1$ than $\overline{{x}} = \frac{1}{N} \sum_{i=1}^N {x}^{(i)}$, where $\left\{{x}^{(i)} \sim p_1\right\}_{i=1}^{N}$ and $\left\{{x}^{(N+i)} \sim p_2 \right\}_{i=1}^{K}$, for a sufficiently large $K \in \mathbb{Z}^{+}$.
\label{prop:prop1}
\end{prop}

\begin{proof}
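    Consider the mixture distribution (written with Gaussian components for concreteness; only the first two moments of $p_1$ and $p_2$ are used below):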
\begin{equation*}
    p_{\pi}(x) = \pi \mathcal{N}(x | {\mu}_1, {\sigma}_1^2) + (1-\pi) \mathcal{N}(x | {\mu}_2, {\sigma}_2^2).
\end{equation*}
\noindent
It is then straightforward to compute:
\begin{equation*}
    \begin{gathered}
           \mathbb E\left[p_{\pi}\right] = \pi {\mu}_1 + (1- \pi) {\mu}_2
    \end{gathered}
\end{equation*}
and,
\begin{equation*}
    \mathbb{V} \left[p_{\pi}\right] = \pi {\sigma}_1^2 + (1-\pi) {\sigma}_2^2 + \pi (1-\pi) ({\mu}_2 - {\mu}_1)^2.
\end{equation*}

\noindent
Drawing $\displaystyle \left\{{x}^{(i)} \sim p_1\right\}_{i=1}^{N}$ and $\displaystyle 
\left\{{x}^{(N+i)} \sim p_2 \right\}_{i=1}^{K}$ is approximately equivalent to drawing $N+K$ samples from the mixture $p_{\pi}$ with $\pi = \frac{N}{N+K}$. Using the bias--variance decomposition, we can compute the mean squared errors of the two estimators:

\begin{equation*}
   \mathbb E\left[(\overline{{x}} - {\mu}_1)^2 \right] = \frac{{\sigma}_1^2}{N},
\label{eq.mse_est2_2}
\end{equation*}
and,
\begin{equation*}
    \mathbb{E}\left[(\tilde{{x}} - {\mu}_1)^2 \right] = 
   (1-\pi)^2 (\mu_1 - \mu_2)^2 + \frac{\pi \sigma_1^2 + (1-\pi) \sigma_2^2 + \pi (1-\pi) (\mu_1 - \mu_2)^2}{N+K}.
\label{eq.mse_est2}
\end{equation*}


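If $(\mu_1 - \mu_2)^2 < c \frac{{\sigma}_1^2}{N}$ for some $c \in (0, 1)$, then taking the limit $K \to \infty$, and thus $\pi \to 0$, we observe that
\begin{equation*}
\mathbb{E}\left[(\tilde{{x}} - {\mu}_1)^2  \right] \to (\mu_1 - \mu_2)^2 < c \frac{{\sigma}_1^2}{N} < \frac{{\sigma}_1^2}{N} = \mathbb E\left[(\overline{{x}} - {\mu}_1)^2 \right].
\end{equation*}
Since the limit is strictly smaller than $\mathbb E\left[(\overline{{x}} - {\mu}_1)^2 \right]$, the inequality $\mathbb{E}\left[(\tilde{{x}} - {\mu}_1)^2 \right] < \mathbb E\left[(\overline{{x}} - {\mu}_1)^2 \right]$ holds for all sufficiently large $K$.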

\end{proof}

\break
\begin{prop}
\label{prop:prop2}
Let $\bm x \sim \mathcal N(\bm 0, \sigma^2 \bm I_{p})$ be an $\mathbb R^p$-valued isotropic Gaussian random vector and $y, \tilde y$ be random variables with $p(y|\bm x) = \mathcal N(y| \bm w^T \bm x, \varepsilon^2)$ and $p(\tilde y|\bm x) = \mathcal N(\tilde y| \bm{\tilde w}^T \bm x, \tilde \varepsilon^2)$ for some $\bm w, \bm{\tilde w} \in \mathbb R^p$. Let $\mathcal T = \{(\bm x_1, \tilde y_1),\dots,  (\bm x_K, \tilde y_K)\}$ be a training data set with $K > p + 1$ and consider the maximum likelihood estimator $\widehat y(\bm x; \mathcal T)$ for $y$ given $\bm x$, computed using $\mathcal T$. Then the following hold:
\begin{enumerate}
\item $\mathrm{Bias}_{\mathcal T}[\widehat y(\bm x; \mathcal T)] = (\bm{\tilde w}^T - \bm w^T) \bm x.$
\item $\mathrm{Var}_{\mathcal T}[\widehat y(\bm x; \mathcal T)] =  \frac{\tilde \varepsilon^2}{\sigma^2 (K - p - 1)} \| \bm x \|_2^2$.
\item $\mathbb E_{\mathcal T, (\bm x, y)} [\widehat y(\bm x; \mathcal T) - y]^2 \leq  p \sigma^2 \| \bm {\tilde w} - \bm w \|_2^2  + \frac{p \tilde \varepsilon^2}{K - p - 1}  + \varepsilon^2$.
\end{enumerate}
\end{prop}
\begin{proof}
Let $\bm{\tilde w}_{\mathrm{MLE}} = (\bm X^T \bm X)^{-1} \bm X^T \bm{\tilde y}$ be the MLE estimator for $\bm {\tilde w}$, where the $K$ rows of $\bm X \in \mathbb R^{K \times p}$ are given by $\bm x_1^T, \dots, \bm x_K^T$ and the vector $\bm{\tilde y}$ is defined as $\bm{\tilde y} := (\tilde y_1, \dots, \tilde y_K) \in \mathbb R^K$. Since $K > p$, matrix $\bm X$ has full column rank almost surely and thus $\bm X^T \bm X$ is almost surely invertible. Observe that 
\begin{equation*}
\mathbb E_{\mathcal T} [\bm{\tilde w}_{\mathrm{MLE}}^T] = \mathbb E_{\mathcal T} [(\bm{\tilde \varepsilon}^T + \bm{\tilde w}^T \bm X^T) \bm X (\bm X^T \bm X)^{-1}] =\bm{\tilde w}^T,
\end{equation*}
since $\bm{\tilde \varepsilon} := \bm{\tilde y} - \bm X \bm{\tilde w}$ has zero mean, is independent of the $\bm x_i$'s and the expectation $\mathbb E_{\mathcal T}[\cdot]$ can be rewritten as $\mathbb E_{\bm x_1, \dots, \bm x_K} [\mathbb E_{\bm{\tilde \varepsilon}}[\cdot]]$.
By definition of estimator bias, 
\begin{equation*}
\mathrm{Bias}_{\mathcal T}[\widehat y(\bm x; \mathcal T)] = \mathbb E_{\mathcal T}[\widehat y(\bm x; \mathcal T)] - \mathbb E_{y | \bm x} y = \mathbb E_{\mathcal T} [\bm{\tilde w}_{\mathrm{MLE}}^T] \bm x - \bm w^T \bm x =(\bm{\tilde w}^T - \bm w^T) \bm x.
\end{equation*}
Next,
\begin{align*}
&\mathrm{Var}_{\mathcal T}[\widehat y(\bm x; \mathcal T)] = \mathbb E_{\mathcal T} [\mathbb E_{\mathcal T}[\widehat y(\bm x; \mathcal T)] - \widehat y(\bm x; \mathcal T)]^2 =\\
&= \mathbb E_{\mathcal T} [ \bm{\tilde w}^T \bm x - (\bm{\tilde \varepsilon}^T + \bm{\tilde w}^T \bm X^T) \bm X (\bm X^T \bm X)^{-1} \bm x]^2 = \mathbb E_{\mathcal T} [ \bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x]^2.
\end{align*}
The scalar $(\bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x)^2$ can be equivalently written as 
\begin{equation*}
(\bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x)^T (\bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x) =
\bm x^T (\bm X^T \bm X)^{-1} \bm X^T \bm{\tilde \varepsilon} \bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x.    
\end{equation*}
Using that $\mathbb E_{\mathcal T}[\cdot] = \mathbb E_{\bm x_1, \dots, \bm x_K} [\mathbb E_{\bm{\tilde \varepsilon}}[\cdot]]$, we deduce that
\begin{align*}
&\mathbb E_{\mathcal T} [ \bm{\tilde \varepsilon}^T \bm X (\bm X^T \bm X)^{-1} \bm x]^2 
= \mathbb E_{\bm x_1, \dots, \bm x_K} [ \bm x^T (\bm X^T \bm X)^{-1} \bm X^T \mathbb E_{\bm{\tilde \varepsilon}}[\bm{\tilde \varepsilon} \bm{\tilde \varepsilon}^T] \bm X (\bm X^T \bm X)^{-1} \bm x ]=\\
&=\tilde \varepsilon^2 \mathbb E_{\bm x_1, \dots, \bm x_K} [ \bm x^T (\bm X^T \bm X)^{-1} \bm x ] = \tilde \varepsilon^2 \mathbb E_{\bm x_1, \dots, \bm x_K} [\mathrm{tr}(\bm x^T (\bm X^T \bm X)^{-1} \bm x) ] =\\
&= \tilde \varepsilon^2 \mathbb E_{\bm x_1, \dots, \bm x_K} [\mathrm{tr}(\bm x \bm x^T (\bm X^T \bm X)^{-1})] = \tilde \varepsilon^2  \mathrm{tr}(\bm x \bm x^T \mathbb E_{\bm x_1, \dots, \bm x_K} [(\bm X^T \bm X)^{-1}]),
\end{align*}
where we use the cyclic property of the trace and the fact that $z = \mathrm{tr}(z)$ for a scalar $z$. To compute $\mathbb E_{\bm x_1, \dots, \bm x_K} [(\bm X^T \bm X)^{-1}]$, we note that, by definition, $\bm X^T \bm X$ follows the Wishart distribution $\mathcal W_p(\sigma^2 \bm I_p, K)$ with $K$ degrees of freedom and thus $(\bm X^T \bm X)^{-1}$ follows the inverse Wishart distribution $\mathcal W^{-1}_p(\sigma^{-2} \bm I_p, K)$, whose mean equals $\frac{\bm I_p}{\sigma^2 (K - p - 1)}$ for $K > p + 1$. Combining this with the previous results, we conclude
\begin{align*}
&\mathrm{Var}_{\mathcal T}[\widehat y(\bm x; \mathcal T)] = \frac{\tilde \varepsilon^2}{\sigma^2 (K - p - 1)}  \mathrm{tr}(\bm x \bm x^T) = \frac{\tilde \varepsilon^2}{\sigma^2 (K - p - 1)} \| \bm x \|_2^2.
\end{align*}
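The final estimate follows from the first two identities and the bias--variance decomposition, using the Cauchy--Schwarz inequality $((\bm{\tilde w} - \bm w)^T \bm x)^2 \leq \| \bm{\tilde w} - \bm w \|_2^2 \, \| \bm x \|_2^2$ and the identity $\mathbb E \| \bm x \|_2^2 = p \sigma^2$; the term $\varepsilon^2$ is the irreducible noise of $y$ given $\bm x$.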
\end{proof}

Propositions~\ref{prop:prop1} and~\ref{prop:prop2} imply that leveraging a large number of samples from the proxy distribution ($K \to \infty$) can lead to a significant reduction in the variance of estimators trained under both supervised and self-supervised learning paradigms. Moreover, they highlight how the introduction of bias through supervised learning can be a strategic trade-off to lower variance. Additionally, Proposition~\ref{prop:prop2} sheds light on how the risk associated with our estimator can be influenced by the degree of similarity between the target and proxy distributions.