\section{Notation and Problem Setup}
\label{sec:problem_setting}
\paragraph{Notations.} Denote $D^j$, $j=1, \cdots, m$, as data matrices for $m$ parties, where $D^j\in \mathbb{R}^{n\times d_j}$ and $m\geq 2$.
They are aligned by the same set of subjects but have different attributes and they have the same number of samples.
Define $D = \left[D^1, \cdots, D^m\right]\in \mathbb{R}^{n\times (d + 1)}$ as the collection of all datasets, where $d = d_1 + \cdots + d_m - 1$. We subtract $1$ from the total number of attributes when defining $d$ because one column is the label, which we treat separately.
Let $d_{\rm max}=\max_{j\in[m]}d_j$, and let $D_i$ denote the $i$-th row of $D$. We make the following assumption on the data distribution:
\begin{assumption}
\label{ass:data_dist}
%$D_i$ ($i=1,\cdots, n$) are i.i.d sampled from an underlying distribution $\calP: \bbR^{n\times (d+1)} \to \bbR$. 
$D_i$, $i=1,\cdots, n$, are sampled i.i.d.\ from an underlying distribution $\calP$ over $\bbR^{d+1}$.
\end{assumption}

\paragraph{Dataset release algorithm.} A private multi-party data release algorithm needs to protect both inter-party and intra-party communications.
% cross parties and local computations inside the parties. 
% Our algorithms in the later section are under the below simple framework with three steps.
The general workflow of our proposed algorithms is designed as the following:
\begin{enumerate}[leftmargin=*,nosep]
    \item Pre-generate the random variable $B$. One or more pre-generated random variables are shared among all parties.
    % \item Privatize the dataset locally with the algorithm $\calA^{\rm priv}$. Each party locally processes same privatizing algorithm $\calA^{\rm priv}$, which takes the local dataset $D^j\in\bbR^{n\times d_j}$ and the random matrix $B$ as input and output $k$ ``encrypted'' data  $\left(D^{\rm pub}\right)^j := \calA^{\rm priv}(D^j; B) \in \bbR^{k\times d_j}$. $k$ is predefined.
     \item Privatize the dataset locally with the algorithm $\calA^{\rm priv}$. Each party applies the same privatizing algorithm $\calA^{\rm priv}$ that takes the local dataset $D^j\in\bbR^{n\times d_j}$ and the random matrix $B$ as the inputs and then outputs $k$ (predefined) ``encrypted'' samples  $\left(D^{\rm pub}\right)^j := \calA^{\rm priv}(D^j; B) \in \bbR^{k\times d_j}$.
    \item Release the dataset. All parties jointly release $D^{\rm pub}=[\left(D^{\rm pub}\right)^1, \cdots, \left(D^{\rm pub}\right)^m]\in\bbR^{k\times (d+1)}$ to the public.
\end{enumerate}
% An algorithm under the above framework needs the specification to the random variables $B$ in the first step and the privatizing algorithm $\calA^{\rm priv}$ in the second step.
% Moreover, we would like to highlight the importance of the shared random variables $B$ for the above framework. They allow the existence of dependencies between the randomized output from different parties, which could be crucial to guarantee the further utility from the released dataset $D^{\rm pub}$.

Note that the random variable $B$ and the privatizing algorithm $\calA^{\rm priv}$ need to be specially designed; we introduce them in the next section. In addition, the shared random variable $B$ allows dependencies among the randomized outputs of all parties, which can be utilized to guarantee the final utility. %\kevin{check.}

\paragraph{Privacy constraint.}
Since the public will observe the released dataset $D^{\rm pub}$, for each $j\in[m]$, $\left(D^{\rm pub}\right)^j$ should not leak information about the private dataset $D^j$. Formally, we require that for all $j\in[m]$, $\calA^{\rm priv}(D^j; B)$ is differentially private, where two neighbouring datasets $D^j$ and $\left(D^j\right)'$ differ in one row (sample).

However, the multi-party setting requires more than the above guarantee, because each party $j'\neq j$ observes not only $\left(D^{\rm pub}\right)^j$ but also the shared random variable $B$. Thus we further require that, given $B$, no party $j'\neq j$ can infer information about the private dataset $D^j$ of another party. In terms of differential privacy, it is required that, conditioned on any possible value of $B$, $\calA^{\rm priv}(D^j; B)$ is $(\varepsilon, \delta)$-differentially private, \emph{i.e.}, for any two neighbouring datasets $D^j$ and $\left(D^j\right)'$, any value of $B$, and any measurable set $S$ of outputs, we have
\resizebox{\linewidth}{!}{
 \begin{minipage}{\linewidth}
 \begin{align*}
\bbP\left(\calA^{\rm priv}(D^j; B)\in S \mid B\right) \leq e^{\varepsilon}\cdot\bbP\left(\calA^{\rm priv}\left(\left(D^j\right)'; B\right)\in S \mid B\right) + \delta.
\end{align*}
 \end{minipage}
}

\paragraph{Utility target.}
We aim to guarantee the performance of an \emph{arbitrary} linear regression task (with arbitrarily selected label and features) on the joint released dataset $D^{\rm pub}=\left[\left(D^{\rm pub}\right)^1, \cdots, \left(D^{\rm pub}\right)^m\right]$.
For notational simplicity, we assume that the label in the linear regression task is the last attribute and that the features are the remaining attributes.
Under this assumption, the joint private dataset $D$ can be written as $[X, Y]$, where $X\in\bbR^{n\times d}$ is the private feature matrix and $Y\in\bbR^n$ is the private label vector. 
Similarly the public dataset $D^{\rm pub}$ can be written as $[X^{\rm pub}, Y^{\rm pub}]$, where $X^{\rm pub}\in\bbR^{k\times d}$ and $Y^{\rm pub}\in\bbR^k$.

We define the loss function by the expected squared loss:
\begin{equation}
\label{eq:obj}
    L(\bw; \calP) = \bbE_{(\bx, y) \sim \calP}\left[ (\bw^\top\bx - y)^2\right],
\end{equation}
where the data point is sampled from the distribution $\calP$ in \autoref{ass:data_dist}.
% Following the literature of linear regression~\citep{farrar1967multicollinearity,chatterjee2006regression}, we make two standard assumptions on the distribution $\calP$: standard normalization assumption and \textit{no perfect multicollinearity} assumption.
We make two more assumptions on the distribution $\calP$:
the standard normalization assumption and the \emph{no perfect multicollinearity} assumption. The latter is common in the linear regression literature~\citep{farrar1967multicollinearity,chatterjee2006regression}.
\begin{assumption}
\label{ass:bound}
	The absolute values of all attributes $|D_{ij}|$ are bounded by $1$.
\end{assumption}
\begin{assumption}
\label{ass:non-singular}
	$\bbE_{(\bx, y)\sim \calP}\left[ \bx\bx^\top \right]$ is positive definite.
\end{assumption}
Under \autoref{ass:non-singular}, setting $\nabla_{\bw}L(\bw; \calP)=0$ shows that the minimizer $\bw^*$ of the loss in \autoref{eq:obj} has the following explicit form:
$$
\bw^*=\left(\bbE_{(\bx, y)\sim \calP}\left[\bx\bx^\top\right]\right)^{-1}\bbE_{(\bx, y)\sim \calP}\left[\bx\cdot y\right].
$$
% which could be derived by making $\nabla_{\bw}L(\bw; \calP)=0$.

The utility target (for the trained linear regression model) is determined by our release algorithm $(B, \calA^{\rm priv})$. For a given public dataset $D^{\rm pub}$ released by our algorithms, we define our utility target as the existence of a training algorithm $\calA^{\rm lr}$ that achieves the asymptotic property for the trained model weights $\hat{\bw}_n:=\calA^{\rm lr}\left(D^{\rm pub}\right)$ as the dataset size $n\to\infty$. 
This asymptotic property is commonly studied in differential privacy~\citep{chaudhuri2011sample, bassily2014private, feldman2020private}, and we restate it as follows: $\hat{\bw}_n$ converges to $\bw^*$ in probability as the dataset size $n$ increases, \emph{i.e.}, $\forall \beta>0, ~\lim_{n\to \infty}\bbP\left[ \lVert\hat{\bw}_n - \bw^*\rVert > \beta\right] = 0$. The randomness in the above property comes from the data sampling from $\calP$, the dataset release algorithm $(B, \calA^{\rm priv})$, and the training algorithm $\calA^{\rm lr}$.

% is defined by the convergence of the learned weight $\hat{\bw}_n$ from any learning algorithm $\calA^{\rm lr}$ to the optimal weight $\bw^*$. 

% For the learning algorithm $\calA$, we expect the asymptotic property as the size of dataset $n$ increases, which is commonly studied by the differential privacy community \ruihan{cite}: \emph{$\hat{\bw}_n$ converges to $\bw^*$ in probability} as the size of dataset $n$ increases, \emph{i.e.} $\forall \beta>0, ~\lim_{n\to \infty}\bbP\left[ \lVert\hat{\bw}_n - \bw^*\rVert > \beta\right] = 0$. 



% We discuss the linear regression task that one of attributes is the label and all remaining attributes are features.
% Without the loss of generality, we assume the label is the last dimension.
% %Denote the pair of the feature matrix and the label vector as $[X, Y]=D$, where $X\in\mathbb{R}^{n\times (d)}$ and $Y\in \mathbb{R}^{n}$.
% Then the population loss for any linear model $\bw\in \mathbb{R}^{d}$ is defined by $L(\bw; \calP) = \bbE_{(\bx, y) \sim \calP}\left[ (\bw^\top\bx - y)^2\right]$.
% We define the minimizer for the population loss as $\bw^*$. 
% Denote $\calA_2$ and $\hat{\bw}_n$ as the learning algorithm from private data $D^{\sf pub}$ and the learned linear model.
% The learning algorithm $\calA_2$ possibly depends on $\calA_1$.
% We expect that \emph{$\hat{\bw}_n$ converges to $\bw^*$ in probability} as the size of dataset $n$ increases.
% Formally, it is defined as $\forall \beta>0, ~\lim_{n\to \infty}\bbP\left[ \lVert\hat{\bw}_n - \bw^*\rVert > \beta\right] = 0$. 

% % We make following two assumptions for our algorithms: %which are common in literature~\cite{eco1,eco2}:
% We further make two more assumptions:
% % Assumption~\ref{ass:bound} is a standard normalization assumption and
% The first assumption is the standard normalization.
% % that states attributes are bounded.
% \begin{assumption}
% \label{ass:bound}
% 	The absolute values of all attributes are bounded by $1$.
% \end{assumption}
% % We also assume \textit{no perfect multicollinearity} in the data, which is a common assumption in the literature of linear regression~\cite{farrar1967multicollinearity,chatterjee2006regression}.
% With Assumption \ref{ass:bound}, $\bbE_{(\bx, y)\sim \calP}\left[ \bx\bx^\top \right]$ exists. We further make the \textit{no perfect multicollinearity} assumption, which is common in the literature of linear regression~\cite{farrar1967multicollinearity,chatterjee2006regression}.
% \begin{assumption}
% \label{ass:non-singular}
% 	$\bbE_{(\bx, y)\sim \calP}\left[ \bx\bx^\top \right]$ is positive definite.
% \end{assumption}

% With the Assumption \ref{ass:non-singular}, $\bw^*$ has an explicit form: $\left(\bbE_{(\bx, y)\sim \calP}\left[\bx\bx^\top\right]\right)^{-1}\bbE_{(\bx, y)\sim \calP}\left[\bx\cdot y\right]$.

% % \kevin{Should we say something to justify assumption 2 and 3. e.g. for 2, we can see it's easily done be preproccessing. 3 we can cite some papers to say it's common?}

