\section{Proofs of Useful Lemmas}
\begin{lemma}[Gaussian mechanism]
For any deterministic real-valued function $f:\calD \to \bbR^m$ with sensitivity $S_f$, we can define a randomized function by adding Gaussian noise to $f$:
$$
f^{dp}(D):=f(D) + \calN\left(\mathbf{0}, S_f^2 \sigma^2\cdot I\right),
$$
where $\calN\left(\mathbf{0}, S_f^2 \sigma^2\cdot I\right)$ is a multivariate normal distribution with mean $\mathbf{0}$ and covariance matrix $S_f^2\sigma^2 I$, with $I$ the $m\times m$ identity matrix. When $\varepsilon\in(0,1)$ and $\sigma \geq \frac{\sqrt{2\log\left(1.25 / \delta\right)} }{\varepsilon}$, $f^{dp}$ is $(\varepsilon,\delta)$-differentially private.
\end{lemma}

\begin{lemma}[JL Lemma for inner-product preserving (Bernoulli)]
Let $S$ be an arbitrary set of $l$ points in $\bbR^d$ and let $s$ be an upper bound on the maximum L2-norm of the vectors in $S$. Let $B$ be a $k\times d$ random matrix whose entries $B_{ij}$ are independent random variables taking the values $1$ and $-1$ with probability $1/2$ each.
	With probability at least $ 1 - (l+1)^2\exp\left(-k\left(\frac{\beta^2}{4} - \frac{\beta^3}{6}\right)\right)$, for all $\mathbf{u}, \mathbf{v}\in S$,
	$$
	\frac{\mathbf{u}^\top\mathbf{v}}{s^2} - 4\beta \leq \frac{\left( B\mathbf{u}/\sqrt{k}\right)^\top\left( B\mathbf{v} / \sqrt{k}\right)}{s^2} \leq \frac{\mathbf{u}^\top\mathbf{v}}{s^2} + 4\beta.
	$$
\end{lemma}

\begin{lemma}
\label{lem:mixed_lower_bound}
	\begin{enumerate}
		\item $\forall x\in[0, 1]$, $ - \log\left(1 - x\right) - x \geq  \frac{x^2}{2}$.
		\item $\forall x\in[0, 1]$, $x - \log\left(1 + x\right) \geq \frac{x^2}{4}$.
		\item $\forall x > 1$, $x - \log\left(1 + x\right) \geq \frac{x}{4}$.
	\end{enumerate}
\end{lemma}
\begin{proof}
Define $f_1(x) := - \log\left(1 - x\right) - x - \frac{x^2}{2}$. $f_1'(x) = \frac{x^2}{1-x} \geq 0.$ Thus $f_1(x)$ increases on $[0, 1]$ and $f_1(x)\geq f_1(0) = 0$.

Define $f_2(x) := x - \log\left(1 + x\right) - \frac{x^2}{4}$. $f_2'(x) = \frac{x(1-x)}{2(1+x)}$. $f_2(x)$ increases on $[0, 1]$ and $f_2(x)\geq f_2(0)=0$.

Define $f_3(x) := x - \log\left(1 + x\right) - \frac{x}{4}$. $f_3'(x) = \frac{3x-1}{4(1+x)} > 0$ for $x \geq 1$. Thus $f_3(x)$ increases on $[1, \infty)$ and $f_3(x)\geq f_3(1) = \frac{3}{4} - \log 2 > 0$.
\end{proof}

\begin{lemma}
\label{lem:reduction}
Denote $\hat{H}_n=\frac{1}{n}X^\top X$, $\hat{C}_n = \frac{1}{n}X^\top Y$, $H = \bbE_{(\bx, y)\sim\calP}\left[\bx\bx^\top\right]$ and $C=\bbE_{(\bx, y)\sim\calP}\left[\bx\cdot y\right]$. Assume $\lVert \hat{H}_n^{\sf pub} - \hat{H}_n \rVert\leq\beta, \lVert \hat{C}_n^{\sf pub} - \hat{C}_n \rVert\leq\beta$ with prob $1 - f(\beta)$. We have that when $\beta \leq \frac{b}{2\lVert  H^{-1} \rVert} = \lVert  C \rVert \lVert  H^{-1} \rVert  + 1$,
$$
\mathbb{P}_{X, \by\sim \calD, R_1, R_2}\left[\lVert\hat{\bw}_n^{\sf pub} - \bw^*\rVert \leq \beta\right] \geq 1 - h(\beta),
$$
where $\hat{\bw}_n^{\sf pub} = \left(\hat{H}_n^{\sf pub}\right)^{-1}\hat{C}_n^{\sf pub}$, $b:=2\lVert  C \rVert \lVert  H^{-1} \rVert^2  + 2 \lVert  H^{-1} \rVert$ and  $h(\beta)= f(\beta / 2b) + d^2 \exp\left( - \frac{n\beta^2}{8b^2d^2} \right) + d \exp\left( - \frac{n\beta^2}{8b^2d} \right)$.
\end{lemma}
\begin{proof}
	Hoeffding inequality and union bound together imply that  with prob. $1 - d^2 \exp\left( - \frac{n\beta^2}{2d^2} \right) - d \exp\left( - \frac{n\beta^2}{2d} \right)$,
$$
\lVert \hat{H}_n - H\rVert \leq \beta, \lVert \hat{C}_n - C\rVert \leq \beta.
$$
Thus with prob $1 - g(\beta)$, $\lVert \hat{H}_n^{\sf pub} - H\rVert \leq \beta, \lVert \hat{C}_n^{\sf pub} - C\rVert \leq \beta$, where $g(\beta) = f(\beta / 2) + d^2 \exp\left( - \frac{n\beta^2}{8d^2} \right) + d \exp\left( - \frac{n\beta^2}{8d} \right)$.

We further have
\begin{itemize}
	\item $\lVert  \hat{C}_n^{\sf pub} \rVert \leq \lVert  \hat{C}_n^{\sf pub} - C \rVert + \lVert  C \rVert \leq \lVert  C \rVert + \beta$.
	\item $\lVert \left(\hat{H}_n^{\sf pub}\right)^{-1} - H^{-1} \rVert \leq \lVert \left(\hat{H}_n^{\sf pub}\right)^{-1}\rVert \lVert H^{-1} \rVert \cdot\lVert \hat{H}_n^{\sf pub} - H\rVert \leq \left(\lVert H^{-1} \rVert + \lVert \left(\hat{H}_n^{\sf pub}\right)^{-1} - H^{-1} \rVert\right)\cdot \lVert H^{-1} \rVert \cdot \beta$, which implies that when $\beta \leq \frac{1}{2\lVert H^{-1} \rVert}$, $\lVert \left(\hat{H}_n^{\sf pub}\right)^{-1} - H^{-1} \rVert \leq \frac{\lVert H^{-1} \rVert^2 \cdot \beta}{1 - \lVert H^{-1} \rVert \cdot \beta} \leq 2\lVert H^{-1} \rVert^2 \cdot \beta$.
	\item When $\beta \leq \frac{1}{2\lVert H^{-1} \rVert}$, 
		\begin{align*}
		\lVert \left(\hat{H}_n^{\sf pub}\right)^{-1}\hat{C}_n^{\sf pub} - H^{-1}C\rVert &\leq \lVert \hat{C}_n^{\sf pub} \rVert \cdot \lVert \left(\hat{H}_n^{\sf pub}\right)^{-1} - H^{-1} \rVert + \lVert H^{-1} \rVert \cdot \lVert \hat{C}_n^{\sf pub} - C\rVert \nonumber \\
		&\leq \left( \lVert  C \rVert +  \beta \right)\cdot  2\lVert H^{-1} \rVert^2 \cdot \beta + \lVert H^{-1} \rVert \cdot \beta\\
		& \leq \left( 2\lVert  C \rVert \lVert  H^{-1} \rVert^2  + 2 \lVert  H^{-1} \rVert  \right)\beta,
		\end{align*}
		where the last step uses $2\lVert H^{-1} \rVert^2\beta^2 \leq \lVert H^{-1} \rVert\beta$ for $\beta \leq \frac{1}{2\lVert H^{-1} \rVert}$.
\end{itemize}
Let $b:= 2\lVert  C \rVert \lVert  H^{-1} \rVert^2  + 2 \lVert  H^{-1} \rVert$ and replace $\beta$ by $b^{-1}\beta$, we have that when $\beta \leq \frac{b}{2\lVert  H^{-1} \rVert} = \lVert  C \rVert \lVert  H^{-1} \rVert  + 1$
$$
\mathbb{P}_{X, \by\sim \calD, R_1, R_2}\left[\lVert\hat{\bw}_n^{\sf pub} - \bw^*\rVert \leq \beta\right] \geq 1 - h(\beta),
$$
where $h(\beta)=g(\beta / b)= f(\beta/2b) + d^2 \exp\left( - \frac{n\beta^2}{8b^2d^2} \right) + d \exp\left( - \frac{n\beta^2}{8b^2d} \right)$.
\end{proof}

\begin{lemma}
\label{lem:norm_tail}
    If $r$ is a random variable sampled from standard normal distribution, we have following concentration bound:
    $$
    \bbP\left[ |r| < \beta \right] \geq 1 - \frac{2}{\sqrt{2\pi}\beta}\exp\left(-\frac{\beta^2}{2}\right)
    $$
\end{lemma}
\begin{proof}
    See page~2 of~\cite{pollard15}.
\end{proof}

\begin{lemma}
\label{lem:prod_two_norm}
    If $r_1, r_2$ are two independent random variables sampled from the standard normal distribution, $r_1r_2$ can be written as $\frac{c_1 - c_2}{2}$, where $c_1, c_2$ are two independent random variables sampled from the chi-squared distribution with $1$ degree of freedom. Moreover, if $(r_{1, i}, r_{2, i})$, $i=1,\dots,n$, are independent copies of $(r_1, r_2)$, then $\sum_{i=1}^nr_{1, i}r_{2, i}$ can be written as $\frac{c_{1, 1:n} - c_{2, 1:n}}{2}$, where $c_{1, 1:n}, c_{2, 1:n}$ are two independent random variables sampled from the chi-squared distribution with $n$ degrees of freedom.
\end{lemma}
\begin{proof}
$r_1r_2 = \frac{ \left(\frac{r_1+ r_2}{\sqrt{2}}\right)^2 - \left(\frac{r_1- r_2}{\sqrt{2}}\right)^2}{2}$. Because $r_1, r_2$ are two independent standard normal random variables,  $\frac{r_1+ r_2}{\sqrt{2}}, \frac{r_1 - r_2}{\sqrt{2}}$ are two independent standard normal random variables as well. Setting $c_1:=\left(\frac{r_1+ r_2}{\sqrt{2}}\right)^2$ and $c_2:=\left(\frac{r_1- r_2}{\sqrt{2}}\right)^2$ completes the proof of the first part.

Applying the first part to each pair $(r_{1, i}, r_{2, i})$ gives $\sum_{i=1}^nr_{1, i}r_{2, i} = \frac{1}{2}\sum_{i=1}^n(c_{1, i} - c_{2, i}) =  \frac{1}{2}(\sum_{i=1}^nc_{1, i} - \sum_{i=1}^nc_{2, i})$. Setting $c_{1, 1:n}:=\sum_{i=1}^nc_{1, i}$ and $c_{2, 1:n}:=\sum_{i=1}^nc_{2, i}$ finishes the proof.
\end{proof}


\section{Proofs in Section 4}
We restate the assumptions and theorems for completeness.
\begin{assumption}
$D_i$, $i=1,\cdots, n$, are sampled i.i.d.\ from an underlying distribution $\calP$ over $\bbR^{d+1}$.
\end{assumption}

\begin{assumption}
	The absolute values of all attributes are bounded by $1$.
\end{assumption}

\begin{assumption}
	$\bbE_{(\bx, y)\sim \calP}\left[ \bx\bx^\top \right]$ is positive definite.
\end{assumption}

\begin{theorem}
	When $\beta\leq c$ for some variable $c$ that depends on $\sigma_{\varepsilon, \delta}$, $d$ and $\calP$, but independent of $n$,
	$$
	\bbP\left[ \lVert \hat{\bw}^{\sf dgm}_n - \bw^* \rVert > \beta \right] < \exp\left(-O\left( \beta^2 \frac{n}{\sigma_{\varepsilon, \delta}^4d^4} \right) + \tilde{O}(1)\right).
	$$
\end{theorem}
\begin{proof}[Proof of \autoref{thm:dgm_utility}]
Denote $\left(\max_{j\in[m]}d_j\right)$ by $d_{\rm max}$. Let $R\in \mathbb{R}^{n\times d}$ be a random matrix s.t. $R_{i, j}\sim \calN\left( 0, 4d_{\rm max} \sigma_{\varepsilon, \delta}^2 \right)$. We split $R$ into $R_X$ and $R_Y$, representing the additive noise to $X$ and $Y$.
$$\hat{\bw}_n^{\sf dgm} = \left( \frac{1}{n}(X + R_X)^\top (X+R_X)+ (\lambda -4d_{\rm max} \sigma_{\varepsilon, \delta}^2) I\right)^{-1} \frac{(X+R_X)^\top (Y + R_Y)}{n}.$$
\begin{enumerate}
\item For any $i\in[d]$, $\frac{1}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\left[R_X^\top R_X\right]_{i, i}$ follows a chi-squared distribution with $n$ degrees of freedom. From the cdf of the chi-squared distribution, we have the following concentration bound: 
\begin{align*}
\bbP\left[\left\lvert\left[\frac{1}{n}R_X^\top R_X\right]_{i, i} - 4d_{\rm max} \sigma_{\varepsilon, \delta}^2\right\rvert <\beta \right] &\geq 1-\exp\left(-n\cdot\left(\frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2} - \log\left(1 + \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)\right) \\
&- \exp\left(-n\cdot\left(-\log\left(1 - \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right) - \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right).	
\end{align*}

Moreover, for $i\neq j$, \autoref{lem:prod_two_norm} implies that $\frac{1}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\left[R_X^\top R_X\right]_{i, j}$ can be written as $\frac{c_{1, 1:n}- c_{2, 1:n}}{2}$, where $c_{1, 1:n}, c_{2, 1:n}$ are independent two random variables sampled from chi-squared with degree $n$. Thus 
\begin{align*}
    \bbP\left[\left\lvert\left[\frac{1}{n}R_X^\top R_X\right]_{i, j}\right\rvert <\beta \right] & = \bbP\left[\left\lvert\frac{c_{1, 1:n}- c_{2, 1:n}}{2n}\right\rvert <\frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2} \right]\\
    &\geq \bbP\left[\left\lvert c_{1, 1:n} - n\right\rvert < \frac{n\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}, \left\lvert c_{2, 1:n} - n\right\rvert < \frac{n\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right] \\
    &\geq 1 - 2\bbP\left[\left\lvert c_{1, 1:n} - n\right\rvert \geq \frac{n\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2} \right] \\
    & \geq 1 - 2\exp\left(-n\cdot\left(\frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2} - \log\left(1 + \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)\right) \\
    & - 2\exp\left(-n\cdot\left(-\log\left(1 - \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right) - \frac{\beta}{4d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)
\end{align*}

Union bound implies that
\begin{align*}
\bbP\left[ \left\lVert\frac{1}{n}R_X^\top R_X - 4d_{\rm max} \sigma_{\varepsilon, \delta}^2 \cdot I\right\rVert \leq \beta_1 \right] &\geq 1 - d^2\cdot \exp\left(-n\cdot\left(\frac{\beta_1}{4dd_{\rm max} \sigma_{\varepsilon, \delta}^2} - \log\left(1 + \frac{\beta_1}{4dd_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)\right)\\
&  - d^2\cdot\exp\left(-n\cdot\left(-\log\left(1 - \frac{\beta_1}{4dd_{\rm max} \sigma_{\varepsilon, \delta}^2}\right) - \frac{\beta_1}{4dd_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)
\end{align*}
\item $\bbP\left[ \left\lVert\frac{X^\top R_X}{n} \right\rVert \leq \beta_2 \right] \geq 1 - \frac{4\sigma_{\varepsilon, \delta} d^{3}d_{\rm max}^{1/2}}{\sqrt{2\pi n}\beta_2}\exp\left(-\frac{n\beta_2^2}{8d^2d_{\rm max} \sigma_{\varepsilon, \delta}^2} \right)$, implied by \autoref{lem:norm_tail}.

\item $\bbP\left[ \left\lVert\frac{X^\top R_Y}{n} \right\rVert \leq \beta_3 \right] \geq 1 - \frac{4\sigma_{\varepsilon, \delta} d^{3/2}d_{\rm max}^{1/2}}{\sqrt{2\pi n}\beta_3}\exp\left(-\frac{n\beta_3^2}{8dd_{\rm max} \sigma_{\varepsilon, \delta}^2} \right)$, implied by \autoref{lem:norm_tail}.

\item $\bbP\left[ \left\lVert\frac{R_X^\top Y}{n} \right\rVert \leq \beta_4 \right] \geq 1 - \frac{4\sigma_{\varepsilon, \delta} d^{3/2}d_{\rm max}^{1/2}}{\sqrt{2\pi n}\beta_4}\exp\left(-\frac{n\beta_4^2}{8dd_{\rm max} \sigma_{\varepsilon, \delta}^2} \right)$, implied by  \autoref{lem:norm_tail}.

\item Similar to 1,
\begin{align*}
	\bbP\left[ \left\lVert\frac{R_X^\top R_Y}{n} \right\rVert \leq \beta_5 \right] &\geq 1 - 2d\exp\left(-n\cdot\left(\frac{\beta_5}{4d^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2} - \log\left(1 + \frac{\beta_5}{4d^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)\right) \\
	&- 2d\exp\left(-n\cdot\left(-\log\left(1 - \frac{\beta_5}{4d^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right) - \frac{\beta_5}{4d^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right)\right)	
\end{align*}

\end{enumerate}

One can simplify $ - \log\left(1 - x\right) - x$ and $x - \log\left(1 + x\right)$ by \autoref{lem:mixed_lower_bound}.
Set $\beta_1=\frac{1}{2}\beta,~\beta_2 = \frac{1}{4}\beta, ~\beta_3=\beta_4 = \frac{1}{4}\beta, ~\beta_5=\frac{1}{2}\beta$.
The above concentrations together imply that when  $\beta < 8dd_{\rm max} \sigma_{\varepsilon, \delta}^2$, $\lVert \hat{H}_n^{\sf pub} - \hat{H}_n \rVert\leq\beta, \lVert \hat{C}_n^{\sf pub} - \hat{C}_n \rVert\leq\beta$ with prob at least $1 - f(\beta)$, where $f(\beta) = \exp\left(- O\left(n\cdot \frac{\beta^2}{{d^2d_{\rm max}^2 \sigma_{\varepsilon, \delta}^4}}\right) + \tilde{O}(1)\right)$.


With the application of \autoref{lem:reduction}: when $\beta \leq \frac{b}{2\lVert  H^{-1} \rVert}$,
$$
\mathbb{P}_{X, \by\sim \calD, R_1, R_2}\left[\lVert\hat{\bw}_n^{\sf dgm} - \bw^*\rVert \leq \beta \right] \geq 1 - h(\beta),
$$
where $h(\beta)$ is:
\begin{enumerate}
    \item when $\beta < 16bdd_{\rm max} \sigma_{\varepsilon, \delta}^2$, $h(\beta) = \exp\left(- O\left(n\cdot \frac{\beta^2}{{d^2d_{\rm max}^2 \sigma_{\varepsilon, \delta}^4}}\right) + \tilde{O}(1)\right)$;
    \item when $\beta \geq 16bdd_{\rm max} \sigma_{\varepsilon, \delta}^2$, $h(\beta) = \exp\left(-O\left(n\cdot \frac{\beta^2}{d^2d_{\rm max} \sigma_{\varepsilon, \delta}^2}\right) + \tilde{O}(1)\right);$
\end{enumerate}
where $b$ is the distribution-dependent constant defined in the proof of \autoref{lem:reduction}.
In other words, when $\beta\leq \min\left\{ 16bdd_{\rm max} \sigma_{\varepsilon, \delta}^2, \frac{b}{2\lVert  H^{-1} \rVert} \right\}$, 
$$
\mathbb{P}_{X, \by\sim \calD, R_1, R_2}\left[\lVert\hat{\bw}_n^{\sf dgm} - \bw^*\rVert \leq \beta \right] \geq 1 - \exp\left(- O\left(n\cdot \frac{\beta^2}{{d^2d_{\rm max}^2 \sigma_{\varepsilon, \delta}^4}}\right) + \tilde{O}(1)\right).
$$

\end{proof}

\begin{theorem}
	When $\beta\leq c$ for some variable $c$ that depends on $d$ and $\calP$, but independent of $n$ and $\sigma_{\varepsilon, \delta}$, $\bbP\left[ \lVert \hat{\bw}^{\methodtwoupper{}}_n - \bw^* \rVert >\beta \right] < 
	\exp\left(- O\left(\min\left\{ \frac{k\beta^2}{d^2}, \frac{n\beta}{kd^2\sigma_{\varepsilon, \delta}^2}, \frac{n^{1/2}\beta}{d^{3/2} \sigma_{\varepsilon, \delta}}\right\}\right) + \tilde{O}(1) \right)$.
	If we take $k=O\left(\frac{(nd)^{1/2}}{d_{\rm max}^{1/2}\sigma_{\varepsilon, \delta}}\right)$,
    $
	\bbP\left[ \lVert \hat{\bw}^{\methodtwoupper{}}_n - \bw^* \rVert >\beta \right] < \exp\left(- \frac{n^{1/2}\beta}{d^{3/2}d_{\rm max}^{1/2}\sigma_{\varepsilon, \delta}} \cdot O\left(\min\left\{ 1, \beta \right\}\right) + \tilde{O}(1) \right).
	$
\end{theorem}
\begin{proof}[Proof of \autoref{thm:rmgm_utility}]
\begin{align*}
	\hat{\bw}_n^{\methodtwoupper{}} = \left(\frac{1}{n}\left(X^\top\frac{B^\top B}{k}X + X^\top \frac{B^\top}{\sqrt{k}} R_X + R_X^\top \frac{B}{\sqrt{k}}X + R_X^{\top}R_X\right)\right)^{-1}\left(\frac{1}{n}\left(X^{\top}\frac{B^\top B}{k}Y + R_X^{\top}\frac{B}{\sqrt{k}}Y + X^\top\frac{B^\top}{\sqrt{k}}R_Y + R_X^{\top}R_Y\right)\right)
\end{align*}

Define $M:=\frac{1}{\sqrt{k}}B$. Then we can make the analysis one by one.
\begin{enumerate}
	\item JL-lemma applied by Bernoulli random variables implies that with probability $1 - (d+2)^2\exp\left( -k\left(\frac{\beta_1^2}{64d} - \frac{\beta_1^3}{96d\sqrt{d}} \right)\right)$,
		$$
		\left\lVert\frac{1}{n}X^{\top}M^{\top}MY - \frac{1}{n}X^{\top}Y\right\rVert \leq \beta_1.
		$$
		\begin{proof}
			    \autoref{lem:jl} implies that with prob $1 - (d+2)^2\exp\left( -k\left(\frac{\beta^2}{4} - \frac{\beta^3}{6} \right)\right)$, for any $u, v\in\{ X^{\top}_i|i\in [d]\} \cup \{Y\}$,
				$$
					\frac{(Mu)^{\top}Mv}{n} - 4\beta \leq \frac{u^{\top}v}{n} \leq \frac{(Mu)^{\top}Mv}{n}  + 4\beta. 
				$$
				This further implies that
				$$
				\left\lVert\frac{1}{n}X^{\top}M^{\top}MY - \frac{1}{n}X^{\top}Y\right\rVert \leq 4\sqrt{d}\beta.
				$$
				$\beta_1 = 4\sqrt{d}\beta$ helps finish the proof.
		\end{proof}
		
	\item JL-lemma applied by Bernoulli random variables implies that with probability $1 - (d+2)^2\exp\left( -k\left(\frac{\beta_2^2}{64d^2} - \frac{\beta_2^3}{96d^3} \right)\right)$,
		$$
		\left\lVert\frac{1}{n}X^{\top}M^{\top}M X - \frac{1}{n}X^{\top}X\right\rVert \leq \beta_2.
		$$
	
	\item With prob. $1-2kd\sqrt{\frac{2kdd_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta_3}}\exp\left( -n\frac{\beta_3}{8kdd_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)$, $\left\lVert \frac{R_X^{\top}R_X}{n} \right\rVert\leq \beta_3$.
	\begin{proof}
		To simplify the proof, let's assume $R_X$ is a standard Gaussian matrix. Because $\bbP \left(\left\lvert (R_X)_{ij}\right\rvert\leq \beta \right) \geq 1 - \frac{2}{\sqrt{2\pi}\beta}\exp\left( -\beta^2/2\right)$, as shown in \autoref{lem:norm_tail}, 
		$$
		\bbP \left( \lVert R_X^{\top}R_X \rVert \leq kd\beta  \right) \geq \bbP \left( \lVert R_X \rVert  \leq \sqrt{kd\beta} \right)  \geq 1 - \frac{2kd}{\sqrt{ 2\pi \beta}} \exp\left( -\beta/2\right).
		$$
		It's equivalent that
		$$
		\bbP \left( \left\lVert \frac{R_X^{\top}R_X}{n} \right\rVert_2\leq \beta_3 \right) \geq 1-2kd\sqrt{\frac{kd}{2\pi n\beta_3}} \exp\left( -\frac{n}{2kd}\beta_3\right).
		$$
		Plug-in the variance of $R_X$ leads to the targeted inequality.
	\end{proof}

	
		
	\item With prob. $1 - 2\left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta_4 \sqrt{nd_{\rm max}}}} d^{5/4} + 2kd\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta_4}{4\sigma_{\varepsilon, \delta} \sqrt{dd_{\rm max}}}\right)$
		$$
		\left\lVert\frac{R_X^{\top}BY}{n\sqrt{k}}\right\rVert \leq \beta_4, \left\lVert\frac{X^{\top}B^\top R_Y}{n\sqrt{k}}\right\rVert \leq \beta_4
		$$
		\begin{proof}
		Denote $\mathbf{c}:=\frac{R_X^{\top}BY}{n\sqrt{k}}$ and further $\mathbf{c}_i := \frac{\left(\left(R_X\right)_i\right)^\top \mathbf{b}}{\sqrt{k}}$, where $\left(R_X\right)_i$ is the $i$th column of $R_X$ and $\mathbf{b} = \frac{BY}{n}$. 
		\begin{align*}
			\bbP\left[ |\bc_i|\leq \beta  \right] &= \int_{\mathbf{b}}\bbP\left[ |\bc_i|\leq \beta |\mathbf{b} \right] \bbP[\mathbf{b}] d\mathbf{b}\\
			& \geq \max_{\alpha > 0}\int_{|\mathbf{b}|\leq \alpha \cdot \mathbf{1}}\bbP\left[ |\bc_i|\leq \beta |\mathbf{b} \right] \bbP[\mathbf{b}] d\mathbf{b} \\
			& \geq \max_{\alpha > 0}\int_{|\mathbf{b}|\leq \alpha \cdot \mathbf{1}}\bbP\left[ |\bc_i|\leq \beta | |\mathbf{b}| = \alpha \cdot \mathbf{1} \right] \bbP[\mathbf{b}] d\mathbf{b} \\
			& \geq \max_{\alpha > 0}\bbP \left[ |\bc_i|\leq \beta | |\mathbf{b}| = \alpha \cdot \mathbf{1} \right] \bbP[|\mathbf{b}|\leq \alpha \cdot \mathbf{1}] \\
			& \geq \max_{\alpha > 0}\left(1 - \frac{4\alpha\sigma_{\varepsilon, \delta}\sqrt{d_{\rm max}}}{\sqrt{2\pi}\beta}\exp\left(-\frac{\beta^2}{8\alpha^2\sigma_{\varepsilon, \delta}^2d_{\rm max}}\right) - 2k\exp\left( -n\cdot \frac{4\alpha^2\sigma_{\varepsilon, \delta}^2d_{\rm max}}{2} \right)\right) \\
			& \geq 1 - \left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta \sqrt{d_{\rm max}n}}} + 2k\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta}{4\sqrt{d_{\rm max}}\sigma_{\varepsilon, \delta}}\right) \qquad \text{(taking } \alpha^2 = \frac{\beta}{2\sqrt{nd_{\rm max}}\sigma_{\varepsilon, \delta}}\text{)}.
		\end{align*}
		Then
		$$
		\bbP\left[ \lVert\mathbf{c}\rVert \leq \beta  \right] \geq 1 - \sum_{i=1}^d\bbP\left[ \lVert\mathbf{c}_i\rVert > \frac{\beta}{\sqrt{d}}  \right] \geq 1 - \left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta \sqrt{d_{\rm max}n}}} d^{5/4} + 2kd\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta}{4\sigma_{\varepsilon, \delta} \sqrt{dd_{\rm max}}}\right).
		$$
		Similarly, 
		$$
		\bbP\left[ \left\lVert\frac{X^{\top}B^\top R_Y}{n\sqrt{k}}\right\rVert \leq \beta  \right] \geq 1 - \left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta \sqrt{nd_{\rm max}}}} d^{5/4} + 2kd\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta}{4\sigma_{\varepsilon, \delta} \sqrt{dd_{\rm max}}}\right).
		$$
		\end{proof}
		Union bound gives the conclusion.
		
	\item With prob. $1 - 2\left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta_5 \sqrt{d_{\rm max}n}}}d^{5/2} + 2kd^2\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta_5}{4\sigma_{\varepsilon, \delta} d d_{\rm max}^{1/2}}\right)$
		$$ 
		\left\lVert\frac{R_X^{\top}BX}{n\sqrt{k}}\right\rVert \leq \beta_5,
		$$
		which is implied similar to 4.
		
	\item With prob. $1-2k(d+1)\sqrt{\frac{2kd^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta_6}}\exp\left( -n\frac{\beta_6}{8kd^{1/2}d_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)$, $\left\lVert \frac{R_X^{\top}R_Y}{n} \right\rVert\leq \beta_6$.
	\begin{proof}
		To simplify the proof, let's assume $R_X$ and $R_Y$ are standard Gaussian matrices first. Because $\bbP \left(\left\lvert (R_X)_{ij}\right\rvert\leq \beta \right) \geq 1 - \frac{2}{\sqrt{2\pi}\beta}\exp\left( -\beta^2/2\right)$, as shown in \autoref{lem:norm_tail}, 
		$$
		\bbP \left( \lVert R_X^{\top}R_Y \rVert \leq k\sqrt{d}\beta  \right) \geq \bbP \left( \lVert R_X \rVert  \leq \sqrt{kd\beta}, \lVert R_Y \rVert  \leq \sqrt{k\beta} \right)  \geq 1 - \frac{2k(d + 1)}{\sqrt{ 2\pi \beta}} \exp\left( -\beta/2\right).
		$$
		It's equivalent that
		$$
		\bbP \left( \left\lVert \frac{R_X^{\top}R_Y}{n} \right\rVert \leq \beta_6 \right) \geq 1-2k(d+1)\sqrt{\frac{k\sqrt{d}}{2\pi n\beta_6}} \exp\left( -\frac{n}{2k\sqrt{d}}\beta_6\right).
		$$
		Plug-in the variance of $R_X$ and $R_Y$ leads to the targeted inequality.
	\end{proof}

\end{enumerate}

Define $\hat{H}_n^{\methodtwoupper{}}:=\frac{1}{n}\left(X^\top\frac{B^\top B}{k}X + X^\top \frac{B^\top}{\sqrt{k}} R_X + R_X^\top \frac{B}{\sqrt{k}}X + R_X^{\top}R_X\right)$, $\hat{H}_n:=\frac{X^{\top}X}{n}$, $\hat{C}_n^{\methodtwoupper{}} := \frac{1}{n}\left(X^{\top}\frac{B^\top B}{k}Y + R_X^{\top}\frac{B}{\sqrt{k}}Y + X^\top\frac{B^\top}{\sqrt{k}}R_Y + R_X^{\top}R_Y\right)$, $\hat{C}_n := \frac{1}{n}X^{\top}Y$.

The above analysis implies that, with prob. 
$$1  - (d+2)^2\exp\left( -k\left(\frac{\beta_1^2}{64d} - \frac{\beta_1^3}{96d\sqrt{d}} \right)\right)
- (d+2)^2\exp\left( -k\left(\frac{\beta_2^2}{64d^2} - \frac{\beta_2^3}{96d^3} \right)\right)
$$
$$
- 2kd\sqrt{\frac{2kdd_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta_3}}\exp\left( -n\frac{\beta_3}{8kdd_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)
-  \left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta_4 \sqrt{nd_{\rm max}}}} d^{5/4} + 2kd\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta_4}{4\sigma_{\varepsilon, \delta}\sqrt{dd_{\rm max}}}\right)
$$
$$
 - \left(\frac{1}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta_5 \sqrt{nd_{\rm max}}}}d^{5/2} + 2kd^2\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta_5}{4\sigma_{\varepsilon, \delta} dd_{\rm max}^{1/2}}\right) 
 - 2k(d+1)\sqrt{\frac{2kd^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta_6}}\exp\left( -n\frac{\beta_6}{8kd^{1/2}d_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)
$$
we have
$$
\lVert \hat{H}_n^{\methodtwoupper{}} - \hat{H}_n\rVert \leq \beta_2 + \beta_3 + 2\beta_5, \lVert \hat{C}_n^{\methodtwoupper{}} - \hat{C}_n\rVert \leq \beta_1 + 2\beta_4 + \beta_6.
$$
Let $\beta_2 = \frac{2\beta}{3}$, $\beta_3=\frac{\beta}{6}$,  $\beta_5 = \frac{\beta}{12}$ and $\beta_1=\beta_4 = \beta_6= \frac{\beta}{4}.$ We will have $\lVert \hat{H}_n^{\sf pub} - \hat{H}_n\rVert \leq \beta, \lVert \hat{C}_n^{\sf pub} - \hat{C}_n\rVert \leq \beta$, with prob. $1-f(\beta), ~\forall \beta \leq 4\sqrt{d}$ (implies $\beta_1 \leq \sqrt{d}$ and $\beta_2 \leq d$), where 
$$f(\beta) = (d + 2)^2 \exp\left(-\frac{k\beta^2}{768d}\right) 
+ (d + 2)^{2} \exp\left(-\frac{k\beta^2}{432d^2}\right) $$
$$
+ 2kd\sqrt{\frac{12kdd_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta}}\exp\left( -n\frac{\beta}{48kdd_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)
+  \left(\frac{2}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta \sqrt{nd_{\rm max}}}} d^{5/4} + 2kd\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta}{16\sigma_{\varepsilon, \delta}\sqrt{dd_{\rm max}}}\right)
$$
$$
 + \left(\frac{2\sqrt{3}}{\sqrt{\pi \sigma_{\varepsilon, \delta}\beta \sqrt{nd_{\rm max}}}}d^{5/2} + 2kd^2\right)\cdot\exp\left(-\sqrt{n}\cdot\frac{\beta}{48\sigma_{\varepsilon, \delta} dd_{\rm max}^{1/2}}\right) 
 + 2k(d+1)\sqrt{\frac{8kd^{1/2}d_{\rm max} \sigma_{\varepsilon, \delta}^2}{\pi n\beta}}\exp\left( -n\frac{\beta}{48kd^{1/2}d_{\rm max}\sigma_{\varepsilon, \delta}^2}\right)
$$
\autoref{lem:reduction} implies that for $\beta\leq \min\{8b\sqrt{d}, \frac{b}{2\lVert  H^{-1} \rVert} \}$, where $b$ is the constant defined in the proof of \autoref{lem:reduction}, we have 
$$
\mathbb{P}\left[\lVert\hat{\bw}_n^{\methodtwoupper{}} - \bw^*\rVert \leq \beta\right] \geq 1 - h(\beta),
$$
where $h(\beta)$ is:
$$
h(\beta) = \exp\left(-\min\left\{ O\left(\frac{k\beta^2}{d^2}\right),  O\left(n\frac{\beta}{kdd_{\rm max}\sigma_{\varepsilon, \delta}^2}\right), O\left(n^{1/2}\frac{\beta}{dd_{\rm max}^{1/2} \sigma_{\varepsilon, \delta}}\right)\right\} + \tilde{O}(1) \right),
$$
where $\tilde{O}(1)$ includes $\log$ terms of $n, d, d_{\rm max}, k, \beta$. If we take $k=O\left(\frac{(nd)^{1/2}}{d_{\rm max}^{1/2}\sigma_{\varepsilon, \delta}}\right)$,
	\begin{align*}
    h(\beta)=\exp\left(- \frac{n^{1/2}\beta}{d^{3/2}d_{\rm max}^{1/2}\sigma_{\varepsilon, \delta}} \cdot O\left(\min\left\{ 1, \beta \right\}\right) + \tilde{O}(1) \right).
	\end{align*}
\end{proof}

\begin{comment}
\section{Extra Results on Synthetic Datasets}
\autoref{fig:cvg_dis_app} shows when $\varepsilon\in\{0.1, 0.3\}$ how $\mathbb{P}\left[\lVert \bw_n - \bw^* \rVert > \beta\right]$ and $\mathbb{E}\lVert \bw_n - \bw^* \rVert$ of each algorithm changes when training set size $n$ increases. \methodtwoshort{} shows asymptotic tendencies $\plim_{n\to\infty}\mathbb{P}\left[\lVert \hat{\bw}_n^{\methodtwoupper{}} - \bw^* \rVert > \beta\right]=0$ for all settings except the most strict setting $(\varepsilon, \beta)=(0.1, 0.1)$, while \methodoneshort{} doesn't show such tendencies even when training set size $n$ is as large as $3\times 10^6$.
\begin{figure*}[t!]
\centering
\includegraphics[width=\linewidth]{figs/cdf_figs_appendix.png}
\caption{$\mathbb{P}\left[\lVert \hat{\bw}_n - \bw^* \rVert > \beta\right]$ and $\mathbb{E}\left[\lVert \hat{\bw}_n - \bw^* \rVert\right]$ as dataset size $n$ increases for different algorithms when $\varepsilon\in\{0.1, 0.3\}$.}
\label{fig:cvg_dis_app}
\end{figure*}
\end{comment}