\title{Common Event Tethering to Improve Prediction of Rare Clinical Events\\(Supplementary Material)}
\maketitle
\appendix

\section{Proofs for Theorems in Section~\ref{sec: theory}}\label{apdx:proofs} 

\twoasym*
\begin{proof}
We use an approach and notation similar to that of \cite{cessie1992ridge}. In particular, we use the Newton-Raphson maximization procedure to arrive at these asymptotic properties. Let $\mathcal{L}^{(s)}(\bm{\theta}_1) = \mathcal{L}^{(s)}(\bm{\theta}_1 | \mathcal{D}_n, \hat{\bm{\theta}}_2)$ for simplicity. Before proceeding, we add the superscript $k$ to the ridge estimate of $\bm{\theta}_2$ with ridge penalty parameter $k$, making it $\hat{\bm{\theta}}^{(k)}_2$. This is done so we can allow $\hat{\bm{\theta}}_2$ to denote the unregularized estimate. We do a similar thing for the estimate of $\bm{\theta}_1$ from \sinabbr, letting $\hat{\bm{\theta}}^{(s)}_1$ be the regularized estimate and $\hat{\bm{\theta}}_1$ be the unregularized estimate. 

We take the first derivative of $\mathcal{L}^{(s)}(\bm{\theta}_1)$

\[\mathbf{U}^{(s)}(\bm{\theta}_1) = \mathbf{U}(\bm{\theta}_1) - s(\bm{\theta}_1 - \hat{\bm{\theta}}^{(k)}_2).\]

$\mathbf{U}(\bm{\theta}_1)$ is the first derivative of the unregularized $\mathcal{L}(\bm{\theta}_1)$.

We then compute the negative Hessian matrix,
\[\bm{\Omega}^{(s)}(\bm{\theta}_1) = \bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}.\]

We now derive large sample properties of our estimate, $\hat{\bm{\theta}}^{(s)}_1$, using Taylor series expansion about the true parameter, $\bm{\theta}_1$. We have

\begin{equation*}
    \begin{split}
        \mathbf{U}^{(s)}(\hat{\bm{\theta}}^{(s)}_1) = \mathbf{U}^{(s)}(\bm{\theta}_1) - (\hat{\bm{\theta}}^{(s)}_1 - \bm{\theta}_1) \bm{\Omega}^{(s)}(\bm{\theta}_1) + o\left(\|\hat{\bm{\theta}}^{(s)}_1 - \bm{\theta}_1 \|\right).
    \end{split}
\end{equation*}

We then arrive at a first-order approximation of $\hat{\bm{\theta}}^{(s)}_1$

\begin{equation*}
    \begin{gathered}
        \hat{\bm{\theta}}^{(s)}_1 = 
        \bm{\theta}_1 + \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\mathbf{U}(\bm{\theta}_1) - s(\bm{\theta}_1 - \hat{\bm{\theta}}^{(k)}_2)\right).
    \end{gathered}
\end{equation*}

Now, as stated in \cite{cessie1992ridge}, the first-order approximation of the estimate of $\bm{\theta}_1$ that maximizes the unregularized log-likelihood is $\hat{\bm{\theta}}_1 = \bm{\theta}_1 + \bm{\Omega}^{-1}(\bm{\theta}_1)\mathbf{U}(\bm{\theta}_1)$. Under certain regularity conditions, $\hat{\bm{\theta}}_1$ is asymptotically unbiased with covariance matrix $\bm{\Omega}(\bm{\theta}_1)^{-1}$. \cite{cessie1992ridge} also show that the ridge LR estimate of the true $\bm{\theta}_2$ with ridge penalty parameter $k$ is

\[\hat{\bm{\theta}}^{(k)}_2 = \left(\bm{\Omega}(\bm{\theta}_2)+k\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_2)\hat{\bm{\theta}}_2\]

where $\hat{\bm{\theta}}_2$ is again the estimate from unregularized logistic regression. From here, we get that the asymptotic bias of $\hat{\bm{\theta}}^{(k)}_2$ is  
\[-k\left(\bm{\Omega}(\bm{\theta}_2)+k\mathbf{I}\right)^{-1}\bm{\theta}_2\]
and asymptotic variance is
\[\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}
    \bm{\Omega}(\bm{\theta}_2)\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}.\]
Using these properties, we perform the following calculations to arrive at our asymptotic bias.
\begin{equation*}
    \begin{split}
    \mathbb{E}[\hat{\bm{\theta}}^{(s)}_1 - \bm{\theta}_1] = &
    \mathbb{E}\left[\bm{\theta}_1 + \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left(\mathbf{U}(\bm{\theta}_1) - s(\bm{\theta}_1 - \hat{\bm{\theta}}^{(k)}_2)\right) -  \bm{\theta}_1\right] \\ = &
    \mathbb{E}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left(\mathbf{U}(\bm{\theta}_1) - s(\bm{\theta}_1 - \hat{\bm{\theta}}^{(k)}_2)\right)\right] \\ = &
    -s\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left[\bm{\theta}_1 - \mathbb{E}[\hat{\bm{\theta}}^{(k)}_2]\right].
    \end{split}
\end{equation*}

To derive the variance, we first rewrite our estimate as

\begin{equation*}
    \begin{gathered}
        \hat{\bm{\theta}}^{(s)}_1 =
        \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1)\hat{\bm{\theta}}_1 + s\hat{\bm{\theta}}^{(k)}_2\right).
    \end{gathered}
\end{equation*}

From here, we can derive the asymptotic variance as shown below.

\begin{equation*}
    \begin{split}
    \text{Var}(\hat{\bm{\theta}}^{(s)}_1) = &
    \text{Var}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left(\bm{\Omega}(\bm{\theta}_1)\hat{\bm{\theta}}_1 + s\hat{\bm{\theta}}^{(k)}_2\right)\right] \\ = &
    \text{Var}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\hat{\bm{\theta}}_1 +
    s\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left(\bm{\Omega}(\bm{\theta}_2)+k\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_2)\hat{\bm{\theta}}_2\right]
    \end{split}
\end{equation*}
Now, let 
\[A = \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\hat{\bm{\theta}}_1\] 
and let 
\begin{equation*}
    \begin{gathered}
        B = s\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
        \left(\bm{\Omega}(\bm{\theta}_2)+k\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_2)\hat{\bm{\theta}}_2.
    \end{gathered}
\end{equation*} 
Then, we can calculate $\text{Var}(A+B)$. Note that the unregularized estimate $\hat{\bm{\theta}}_1$ is independent of the unregularized estimate $\hat{\bm{\theta}}_2$. Therefore, $\text{Var}(A+B) = \left(\text{Var}(A) + \text{Var}(B)\right)$. We conclude

\begin{equation*}
    \begin{split}
    \text{Var}(\hat{\bm{\theta}}^{(s)}_1) = & 
    \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} + \\
    &
    s^2\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}  \left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}
    \bm{\Omega}(\bm{\theta}_2)\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \\
    = & 
    \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} +
    s^2\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \textrm{Var}[\hat{\bm{\theta}}^{(k)}_2] \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \\
    = &
    \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1) + s^2\textrm{Var}[\hat{\bm{\theta}}^{(k)}_2]\right)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}.
    \end{split}
\end{equation*}
\end{proof}


\twomse*
\begin{proof}
    Begin by noting that the $\text{MSE}(\hat{\bm{\theta}}_1) = \left[\text{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)^2 + \text{Var}(\hat{\bm{\theta}}_1)\right]$. 

As shown by \cite{phrueksawatnon2021determining}, the asymptotic variance and squared bias of the ridge LR estimator $\tilde{\bm{\theta}}_1$ with ridge penalty $s$ are

\begin{equation*}
\begin{split}
    \textrm{Var}(\tilde{\bm{\theta}}_1) 
    & =
    \textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right] \\
    & = \sum_{j=1}^{p} \frac{A_{j,j}}{(A_{j,j} + s)^2}
\end{split}
\end{equation*}
and
\[
    \textrm{Bias}(\tilde{\bm{\theta}}_1, \bm{\theta}_1)^2 = s^2\sum_{j=1}^{p}\frac{a_{j}^2}{(A_{j,j} + s)^2}    
    \]

Note that we can always diagonalize $\bm{\Omega}(\bm{\theta}_1) = \mathbf{P}\mathbf{A}\mathbf{P}'$ as such because it is a real symmetric matrix.

Recall that the asymptotic variance of our two-step logistic regression estimator is

\begin{equation*}
    \begin{gathered}
    \text{Var}(\hat{\bm{\theta}}_1) = 
    \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1) + s^2\textrm{Var}[\hat{\bm{\theta}}_2]\right)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} 
    \end{gathered}
\end{equation*}

Then,
\begin{equation*}
    \begin{gathered}
    \textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}
    \left(\bm{\Omega}(\bm{\theta}_1) + s^2\textrm{Var}[\hat{\bm{\theta}}_2]\right)
    \left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\right] 
    \\
    =
    \\
    \textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\right] +
    s^2\textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}  \textrm{Var}[\hat{\bm{\theta}}_2]\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right]    
    \end{gathered}
\end{equation*}

Note the first term is equal to $\textrm{Var}(\tilde{\bm{\theta}}_1)$. Namely,

\begin{equation*}
    \begin{gathered}
        \textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_1)\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\right] = \textrm{Var}(\tilde{\bm{\theta}}_1).
    \end{gathered}
\end{equation*}

We then expand the second term, making
\begin{equation*}
    \begin{gathered}
        s^2\textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}  \textrm{Var}[\hat{\bm{\theta}}_2]\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right] 
        \\
        =
        \\
        s^2\textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}  \left(\bm{\Omega}(\bm{\theta}_2) +k\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_2)\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right]
    \end{gathered}
\end{equation*}

where $k$ is the ridge parameter used to estimate $\hat{\bm{\theta}}_2$. Now, we use the fact that $\bm{\Omega}(\bm{\theta}_1) = \mathbf{P}\mathbf{A}\mathbf{P}'$ and $\bm{\Omega}(\bm{\theta}_2) = \mathbf{P}\mathbf{B}\mathbf{P}'$ for diagonal matrices $\mathbf{A}$ and $\mathbf{B}$. In particular, we manipulate this second term similar to \cite{williams2018some} as shown below.

\begin{equation*}
    \begin{gathered}
    s^2\textrm{tr}\left[\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}  \left(\bm{\Omega}(\bm{\theta}_2) +k\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}_2)\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right] 
    \\
    =
    \\
    s^2\textrm{tr}\left[\mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_2) +k\mathbf{I}\right)^{-1}\mathbf{P}\mathbf{P}'\bm{\Omega}(\bm{\theta}_2)\mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}
    \mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \right] 
    \\
    =
    \\
    s^2\textrm{tr}\left[\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1} \mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_2) +k\mathbf{I}\right)^{-1}\mathbf{P}\mathbf{P}'\bm{\Omega}(\bm{\theta}_2)\mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_2) + k\mathbf{I}\right)^{-1}
    \mathbf{P}\mathbf{P}'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\mathbf{P} \right] 
    \\
    =
    \\
    s^2\textrm{tr}\left[\left(\mathbf{P}'\bm{\Omega}(\bm{\theta}_1)\mathbf{P} + s\mathbf{I}\right)^{-1}
    \left(\mathbf{P}'\bm{\Omega}(\bm{\theta}_2)\mathbf{P} +k\mathbf{I}\right)^{-1}\mathbf{P}'\bm{\Omega}(\bm{\theta}_2)\mathbf{P}\left(\mathbf{P}'\bm{\Omega}(\bm{\theta}_2)\mathbf{P} + k\mathbf{I}\right)^{-1}
    \left(\mathbf{P}'\bm{\Omega}(\bm{\theta}_1)\mathbf{P} + s\mathbf{I}\right)^{-1} \right] 
    \\
    =
    \\
    s^2\textrm{tr}\left[\left(\mathbf{A} + s\mathbf{I}\right)^{-1}\left(\mathbf{B} +k\mathbf{I}\right)^{-1}\mathbf{B}\left(\mathbf{B} + k\mathbf{I}\right)^{-1}\left(\mathbf{A} + s\mathbf{I}\right)^{-1} \right] 
    \\
    =
    \\
    s^2\sum_{j=1}^{p} \frac{B_{j, j}}{(B_{j, j} + k)^2(A_{j, j} + s)^2}.
    \end{gathered}
\end{equation*}

Therefore, asymptotically,

\begin{equation*}
    \begin{gathered}
        \textrm{Var}(\hat{\bm{\theta}}_1) = 
        \textrm{Var}(\tilde{\bm{\theta}}_1) +
        s^2\sum_{j=1}^{p} \frac{B_{j, j}}{(B_{j, j} + k)^2(A_{j, j} + s)^2}.
    \end{gathered}
\end{equation*}


We can then rewrite,

\begin{equation*}
    \begin{split}
        \textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)^2 = &
        \left[\textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)\right]'\left[\textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)\right] \\
        = & 
        \left[-s\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\bm{\theta}_1 - \mathbb{E}\left[\hat{\bm{\theta}}_2\right]\right) \right]'
        \left[-s\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-1}\left(\bm{\theta}_1 - \mathbb{E}\left[\hat{\bm{\theta}}_2\right]\right) \right]
    \end{split}
\end{equation*}

% From here, note that 
% \[\lim\limits_{n\rightarrow\infty}\mathbb{E}\left[\hat{\bm{\theta}}^c_{\eta^c_{ridge}}\right] = \left(\bm{\Omega}(\bm{\theta}^c_0) + 2\eta^c_{ridge}\mathbf{I}\right)^{-1}\bm{\Omega}(\bm{\theta}^c_0)\bm{\theta}^c_0.
% \]
% For notational sake, let 
% \[\mathbf{D}^c_{\eta_{ridge}} = \lim\limits_{n\rightarrow\infty}\mathbb{E}\left[\hat{\bm{\theta}}^c_{\eta^c_{ridge}}\right].\] 
% We see that this term is the astymptotic expectation of the estimated $\bm{\theta}^c$ subject to the $\eta^c_{ridge}$ penalty term. As $\eta^c_{ridge} \rightarrow 0$, $\mathbf{D}^c_{\eta_{ridge}} \rightarrow \bm{\theta}^c$ and as $\eta^c_{ridge} \rightarrow \infty$, $\mathbf{D}^c_{\eta_{ridge}} \rightarrow \bm{0}\in\mathbbm{R}^{(p)}$.

Then

\begin{equation*}
    \begin{split}
        \textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)^2 = & 
        s^2
        \left(\bm{\theta}_1 - \mathbb{E}\left[\hat{\bm{\theta}}_2\right]\right)'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-2}
        \left(\bm{\theta}_1 - \mathbb{E}\left[\hat{\bm{\theta}}_2\right]\right) \\
        = &
        s^2\bm{\theta}_1'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-2}\bm{\theta}_1 -
        2s^2\bm{\theta}_1'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-2}\mathbb{E}\left[\hat{\bm{\theta}}_2\right] +
        s^2\mathbb{E}\left[\hat{\bm{\theta}}_2\right]'\left(\bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}\right)^{-2}\mathbb{E}\left[\hat{\bm{\theta}}_2\right]
    \end{split}
\end{equation*}


Now, similar to \cite{phrueksawatnon2021determining}, we use the diagonalization of $\bm{\Omega}(\bm{\theta}_1)$ to see that 
\begin{equation*}
    \begin{gathered}
        \textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)^2
        = s^2\sum_{j=1}^{p}\frac{a_j^2}{(A_{j,j} + s)^2} -
        2s^2\sum_{j=1}^{p}\frac{a_jb_j}{(A_{j,j} + s)^2} +
    s^2\sum_{j=1}^{p}\frac{b_j^2}{(A_{j,j} + s)^2}
    \end{gathered}
\end{equation*}   
where again $\mathbf{a} = \mathbf{P}\bm{\theta}_1$ and $\mathbf{b} = \mathbf{P}\mathbb{E}\left[\hat{\bm{\theta}}_2\right]$.

Now, note again that the first term above is equal to $\textrm{Bias}(\tilde{\bm{\theta}}_1, \bm{\theta}_1)^2$. Therefore,

\begin{equation*}
    \begin{gathered}
        \textrm{Bias}(\hat{\bm{\theta}}_1, \bm{\theta}_1)^2
        = \textrm{Bias}(\tilde{\bm{\theta}}_1, \bm{\theta}_1)^2 -
        s^2\sum_{j=1}^{p}\frac{1}{(A_{j,j} + s)^2}\left[2a_jb_j - b_j^2 \right]
    \end{gathered}
\end{equation*}   

Putting this all together, this makes
\begin{equation*}
    \begin{gathered} 
    \textrm{MSE}(\hat{\bm{\theta}}_1) =
    \textrm{MSE}(\tilde{\bm{\theta}}_1) +
    s^2\sum_{j=1}^{p} \frac{1}{(A_{j, j} + s)^2}\frac{B_{j, j}}{(B_{j, j} + k)^2} -
    s^2\sum_{j=1}^{p}\frac{1}{(A_{j,j} + s)^2}\left[2a_jb_j - b_j^2 \right]
    \end{gathered}
\end{equation*}

Therefore,
\[\textrm{MSE}\left(\hat{\bm{\theta}}_1\right) < \textrm{MSE}\left(\tilde{\bm{\theta}}_1\right)\]

when

\begin{equation*}
    \begin{gathered}
        s^2\sum_{j=1}^{p}\frac{1}{(A_{j,j} + s)^2}\left[2a_jb_j - b_j^2 \right]
        >
        s^2\sum_{j=1}^{p} \frac{1}{(A_{j, j} + s)^2}\frac{B_{j, j}}{(B_{j, j} + k)^2}.
    \end{gathered}
\end{equation*}

This of course holds under the stronger (sufficient) condition that
\begin{equation*}
    \begin{gathered}
        b_{j}\left(2a_{j} - b_{j}\right) > \frac{B_{j, j}}{(B_{j, j} + k)^2}
    \end{gathered}
\end{equation*}
for all $j\in\{1, \dots, p\}$.

\end{proof}

\paragraph{Note on implications of Theorem~\ref{thm:mse}} Because $\beta_j > 0$ for all $j$, it is necessary that $|a_j| > |a_j - b_j|$ in order for the inequality in Equation~\ref{eq:mse-diff} to hold. If we further observe that $\textrm{Var}(\hat{\bm{\theta}}_2) = \sum_{j=1}^p \beta_j$, Theorem~\ref{thm:mse} shows that the degree to which $\mathbf{a}$ is closer to $\mathbf{b}$ than to $\bm{0}$ must be enough to account for the added variance of estimating $\bm{\theta}_2$. And noting that $\mathbf{a} = \mathbf{P}\bm{\theta}_1$ and $\mathbf{b} = \mathbf{P}\mathbb{E}[\hat{\bm{\theta}}_2]$, the most intuitive way for $\mathbf{a}$ to be close to $\mathbf{b}$ is for $\bm{\theta}_1$ to be close to $\mathbb{E}[\hat{\bm{\theta}}_2]$.


\subsection{Asymptotic Properties of \sinabbr}\label{apdx:asym-props-single}
\begin{theorem}\label{thm:asym-props-single}
    Let $\mathcal{L}^{(s,k)}(\bm{\theta} | \mathcal{D}_n) = \mathcal{L}^{(s)}(\bm{\theta} | \mathcal{D}_n) - \frac{1}{2}k\|\bm{\theta}\|_2^2$ be the log-likelihood of \sinabbr{} with an added $L_2$ regularization penalty on the magnitude of $\bm{\theta}$. Then, the maximizer of $\mathcal{L}^{(s,k)}(\bm{\theta} | \mathcal{D}_n)$, denoted $\hat{\bm{\theta}}^{(s,k)}$, has asymptotic bias
    \begin{equation}
        \begin{gathered}
            \mathbb{E}[\hat{\bm{\theta}}^{(s,k)} - \bm{\theta}] =
            -\left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}
            \left(k\bm{\theta} + s\begin{bmatrix}
                    \bm{\theta}_1 - \bm{\theta}_2 \\
                    \bm{\theta}_2 - \bm{\theta}_1
                \end{bmatrix}\right)
        \end{gathered}
    \end{equation}
    and asymptotic variance
    \begin{equation}
        \begin{gathered}
            \text{Var}(\hat{\bm{\theta}}^{(s,k)}) =
            \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\bm{\Omega}(\bm{\theta})\left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}.
        \end{gathered}
    \end{equation}
    Above, we recall that
    \[\bm{\theta} = \begin{bmatrix}
        \bm{\theta}_1 \\
        \bm{\theta}_2
    \end{bmatrix}\] 
    and use $\bm{\Omega}(\bm{\theta})$ and $\bm{\Omega}^{(s,k)}(\bm{\theta})$ to denote the negative of the hessian matrix from the unregularized log-likelihood and $\mathcal{L}^{(s,k)}(\bm{\theta} | \mathcal{D}_n)$ respectively.
    
    
\end{theorem}
\begin{proof}
We proceed similarly to the proof for Theorem~\ref{thm:asymptotic}. Thus,
    \begin{equation*}
        \begin{gathered}
            \mathbf{U}^{(s,k)}(\bm{\theta}) =
            \mathbf{U}(\bm{\theta}) - k\bm{\theta} - s\begin{bmatrix}
                \bm{\theta}_1 - \bm{\theta}_2 \\
                \bm{\theta}_2 - \bm{\theta}_1
            \end{bmatrix}
        \end{gathered}
    \end{equation*}
    and
    \begin{equation*}
        \begin{gathered}
            \bm{\Omega}^{(s,k)}(\bm{\theta}) =
            \bm{\Omega}(\bm{\theta}) + k\mathbf{I}^* + s\left(\mathbf{I}^* + \begin{bmatrix}
            \mathbf{0} & -\mathbf{I} \\
            -\mathbf{I} & \mathbf{0}
            \end{bmatrix}\right).
        \end{gathered}
    \end{equation*}    
    $\mathbf{I}^*$ is a $2p\times 2p$ identity matrix and
    \[
    \begin{bmatrix}
            \mathbf{0} & -\mathbf{I} \\
            -\mathbf{I} & \mathbf{0}
    \end{bmatrix}
    \]
    is a $2p\times 2p$ matrix where $\mathbf{I}$ is a $p\times p$ identity matrix and $\mathbf{0}$ is a $p\times p$ matrix of all zeros.
    
    Then, we derive the large sample properties of our estimate, $\hat{\bm{\theta}}^{(s,k)}$, with the Taylor series expansion about the true parameter $\bm{\theta}$. This gives
    \begin{equation*}
        \begin{gathered}
            \mathbf{U}^{(s,k)}(\hat{\bm{\theta}}^{(s,k)}) =
            \mathbf{U}^{(s,k)}(\bm{\theta}) - \left(\hat{\bm{\theta}}^{(s,k)} - \bm{\theta} \right)\bm{\Omega}^{(s,k)}(\bm{\theta}) +
            o\left(\|\hat{\bm{\theta}}^{(s,k)} - \bm{\theta} \|\right)
        \end{gathered}
    \end{equation*}
and the first-order approximation is
\begin{equation*}
    \begin{gathered}
        \hat{\bm{\theta}}^{(s,k)} =
        \bm{\theta} + \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\left(\mathbf{U}(\bm{\theta}) - k\bm{\theta} - s\begin{bmatrix}
                \bm{\theta}_1 - \bm{\theta}_2 \\
                \bm{\theta}_2 - \bm{\theta}_1
            \end{bmatrix} \right).
    \end{gathered}
\end{equation*}
From here, we arrive at an asymptotic bias
\begin{equation*}
    \begin{gathered}
        \mathbb{E}[\hat{\bm{\theta}}^{(s,k)} - \bm{\theta}] =
        -\left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}
        \left(k\bm{\theta} + s\begin{bmatrix}
                \bm{\theta}_1 - \bm{\theta}_2 \\
                \bm{\theta}_2 - \bm{\theta}_1
            \end{bmatrix}\right).
    \end{gathered}
\end{equation*}
Now, noting as in Theorem~\ref{thm:asymptotic} that the unregularized MLE satisfies $\hat{\bm{\theta}} = \bm{\theta} + \bm{\Omega}^{-1}(\bm{\theta})\mathbf{U}(\bm{\theta})$ to first order, we rewrite
\begin{equation*}
    \begin{split}
        \hat{\bm{\theta}}^{(s,k)} = &
        \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\left(\mathbf{U}(\bm{\theta}) + \bm{\Omega}(\bm{\theta})\bm{\theta}\right) \\
        = &
        \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\left(\mathbf{U}(\bm{\theta}) + \bm{\Omega}(\bm{\theta})\left(\hat{\bm{\theta}} - \bm{\Omega}^{-1}(\bm{\theta})\mathbf{U}(\bm{\theta})\right)\right) \\
        = &
        \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\bm{\Omega}(\bm{\theta})\hat{\bm{\theta}}.
    \end{split}
\end{equation*}

And since the asymptotic variance of $\hat{\bm{\theta}}$ is $\bm{\Omega}^{-1}(\bm{\theta})$, we have the asymptotic variance
\begin{equation*}
    \begin{gathered}
        \text{Var}(\hat{\bm{\theta}}^{(s,k)}) =
        \left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}\bm{\Omega}(\bm{\theta})\left\{\bm{\Omega}^{(s,k)}(\bm{\theta})\right\}^{-1}.
    \end{gathered}
\end{equation*}
\end{proof}



We establish Lemma~\ref{lem:block-eig-vals} for use in our proof of Theorem~\ref{thm:single-step-mse}.

\begin{lemma}\label{lem:block-eig-vals}
Let $\mathbf{A}$ be an $n\times n$ diagonalizable matrix such that $\mathbf{A} = \mathbf{Q}\bm{\Lambda}\mathbf{Q}^{-1}$ where $\{\lambda_i\}_{i=1}^n$ are the $n$ diagonal entries of $\bm{\Lambda}$ and eigenvalues of $\mathbf{A}$. Then for an $n\times n$ identity matrix $\mathbf{I}$ and any real constant $c$ we have that the eigenvalues of the $2n\times 2n$ matrix
\[
    \mathbf{B} = \begin{bmatrix}
            \mathbf{A} + c\mathbf{I} & -c\mathbf{I} \\
            -c\mathbf{I} & \mathbf{A} + c\mathbf{I}
    \end{bmatrix}
\]
are $\{\lambda_i\}_{i=1}^n \cup \{\lambda_i + 2c\}_{i=1}^n$.

\begin{proof}
    We find the eigenvalues by solving the equation
    \[\textrm{det}(\mathbf{B} - \lambda\mathbf{I}^*) = 0\]
    where $\mathbf{I}^*$ is the $2n\times 2n$ identity matrix.
    Note that
    \[
    \mathbf{B} - \lambda\mathbf{I}^* = \begin{bmatrix}
            \mathbf{A} + (c - \lambda)\mathbf{I} & -c\mathbf{I} \\
            -c\mathbf{I} & \mathbf{A} + (c - \lambda)\mathbf{I}
    \end{bmatrix}
\]
    Since $\mathbf{A} + (c - \lambda)\mathbf{I}$ and $-c\mathbf{I}$ commute with each other, by \cite{silvester2000determinants} we have that
    \begin{align}
        \begin{split}
            \textrm{det}(\mathbf{B} - \lambda\mathbf{I}^*) = & \textrm{det}\left((\mathbf{A} + (c - \lambda)\mathbf{I})^2 - (-c\mathbf{I})^2 \right) \\
             = & \textrm{det}\left(\mathbf{A} + (c - \lambda)\mathbf{I} - c\mathbf{I} \right)\\
             & \textrm{det}\left(\mathbf{A} + (c - \lambda)\mathbf{I} + c\mathbf{I} \right) \\
             = & \textrm{det}\left(\mathbf{A} - \lambda\mathbf{I} \right)\textrm{det}\left(\mathbf{A} + (2c - \lambda)\mathbf{I} \right)
        \end{split}
    \end{align}
    Therefore, setting $\textrm{det}(\mathbf{B} - \lambda\mathbf{I}^*) = 0$, and noting that the eigenvalues of $\mathbf{A}$ are $\{\lambda_i\}_{i=1}^n$, we conclude that the eigenvalues of $\mathbf{B}$ are $\{\lambda_i\}_{i=1}^n \cup \{\lambda_i + 2c\}_{i=1}^n$.
\end{proof}
\end{lemma}


\sinmse*
\begin{proof}
Start by noting that this log-likelihood is of the same form as $\mathcal{L}^{(s,k)}$ from Theorem~\ref{thm:asym-props-single} with $k=0$. Therefore, we use those same results, setting $k=0$ and dropping $k$ from the superscript. Then, with $\bm{\theta}_1 = \bm{\theta}_2$, Theorem~\ref{thm:asym-props-single} shows that the estimate $\hat{\bm{\theta}}^{(s)}$ is asymptotically unbiased for any value of $s$, since the difference $\bm{\theta}_1 - \bm{\theta}_2$ appearing in the bias vanishes.

Therefore, the asymptotic MSE is just the asymptotic variance of the estimate, $\hat{\bm{\theta}}^{(s)}$.
\begin{equation*}
    \begin{gathered}
        \text{Var}(\hat{\bm{\theta}}^{(s)}) =
        \textrm{tr}\left[\left\{\bm{\Omega}^{(s)}(\bm{\theta})\right\}^{-1}\bm{\Omega}(\bm{\theta})\left\{\bm{\Omega}^{(s)}(\bm{\theta})\right\}^{-1} \right]
    \end{gathered}
\end{equation*}
We observe that $\bm{\Omega}^{(s)}(\bm{\theta})$ has the following structure
\[
    \begin{bmatrix}
            \bm{\Omega}(\bm{\theta}_1) + s\mathbf{I} & -s\mathbf{I} \\
            -s\mathbf{I} & \bm{\Omega}(\bm{\theta}_1) + s\mathbf{I}
    \end{bmatrix}
\]
From here, using Lemma~\ref{lem:block-eig-vals} we have that
\begin{equation*}
    \begin{gathered}
        \text{MSE}(\hat{\bm{\theta}}^{(s)}) = \text{Var}(\hat{\bm{\theta}}^{(s)}) =
        \sum_{j=1}^{p} \left[\frac{1}{\lambda_j} + \frac{\lambda_j}{(\lambda_j + 2s)^2}\right]
    \end{gathered}
\end{equation*}
where $\lambda_j$ is the $j$-th eigenvalue of $\bm{\Omega}(\bm{\theta}_1)$.

Therefore, for any $s' > s \geq 0$, we have that
\[\text{MSE}(\hat{\bm{\theta}}^{(s')}) < \text{MSE}(\hat{\bm{\theta}}^{(s)}).\]

And we note that as $s\rightarrow \infty$, $\text{MSE}(\hat{\bm{\theta}}^{(s)}) \rightarrow \sum_{j=1}^{p} \frac{1}{\lambda_j}$ which is the MSE of the MLE estimator of the unregularized log-likelihood of $\bm{\theta}_1$.


\end{proof}