
%Here is a list of highly related papers

%\begin{itemize}
%\item Motivation: \cite{agxr20} Federated Learning via Posterior Averaging: A New Perspective and Practical Algorithms \href{https://arxiv.org/pdf/2010.05273}{[link]}
%\item Related work: \cite{gghz20} Decentralized Stochastic Gradient Langevin Dynamics and Hamiltonian Monte Carlo (Mert) \href{https://arxiv.org/pdf/2007.00590.pdf}{[link]}
%\item \cite{rpu+20} FetchSGD: Communication-Efficient Federated Learning with Sketching \href{https://arxiv.org/pdf/2007.07682.pdf}{[link]}

%\item \cite{lhy+19} On The Convergence of Fedavg on non-iid data  \href{https://arxiv.org/pdf/1907.02189.pdf}{[link]}.
%\begin{itemize}
%    \item Since we are doing sampling, therefore, we probably don't need assumption 4 in \cite{lhy+19}
%\end{itemize}

%\item This paper \cite{dk19} can handle biased gradient.
%\href{https://arxiv.org/pdf/1710.00095.pdf}{[link]}
%\end{itemize}

\iffalse

\subsection{Definitions}


\begin{definition}[Smoothness]\label{def:smooth}
We say $L$ is $\beta$-smooth if, for all $x$ and $y$,
\begin{align*}
L(y)\leq L(x)+\langle \nabla L(x),y-x \rangle+\frac{\beta}{2}\| y-x \|^2_2
\end{align*}
\end{definition}

\Zhao{Our first step, probably will not use any semi definitions.}
\begin{definition}[semi-Smoothness]\label{def:semi-smooth}
For any $p\in [0,1]$, we say $L$ is $(\beta_1,\beta_2,p)$-semi-smooth if 
\begin{align*}
L(y)\leq L(x)+\langle \nabla L(x),y-x \rangle+ \beta_1\| y-x \|^2_2+ \beta_2 \| y-x \|^{2-2p}_2L(x)^p
\end{align*}
\end{definition}


\begin{definition}[Lipschitz]\label{def:lipschitz}
We say $L$ is $\beta$-Lipschitz if 
\begin{align*}
\| L(x)-L(y) \|^2_2\leq \beta^2\| x-y \|^2_2
\end{align*}
\end{definition}


\begin{definition}[semi-Lipschitz]\label{def:semi-lipschitz}
For any $p\in [0,1]$, we say $L$ is $(\alpha,\beta,p)$-semi-Lipschitz if 
\begin{align*}
\| L(x)-L(y) \|^2_2\leq \beta^2\| x-y \|^2_2+ \alpha \| x-y \|^{2-2p}_2\cdot L(x)^{p}
\end{align*}
Specifically, we say the function $L$ has an $(\alpha,\beta,p)$-semi-Lipschitz gradient, or that $L$ satisfies the $(\alpha,\beta,p)$-semi-Lipschitz gradient condition, if 
\begin{align}
    \| \nabla L(x)- \nabla L(y) \|^2_2\leq \beta^2\| x-y \|^2_2+ \alpha \| x-y \|^{2-2p}_2\cdot L(x)^{p}
\end{align}

\end{definition}

\begin{definition}[Strongly convex]\label{def:strong-convex}
We say $L$ is $\alpha$-strongly convex if 
\begin{align*}
L(x)\geq L(y)+\langle \nabla L(y),x-y \rangle + \frac{\alpha}{2} \| y-x \|_2^2
\end{align*}
\end{definition}

\begin{definition}[semi-Strongly convex]\label{def:semistrong-convex}
We say $L$ is $(\alpha_1, \alpha_2, p)$-semi-strongly convex if 
\begin{align*}
L(x)\geq L(y)+\langle \nabla L(y),x-y \rangle + \alpha_1 \| y-x \|_2^2- \alpha_2 \| y-x \|_2^{2-2p}\cdot L(y)^{p}
\end{align*}
\end{definition}

We define non-critical point as follows:
\begin{definition}[Non-critical point]\label{def:no_critical_point}
We say $L$ satisfies the $(\theta_1,\theta_2)$-non-critical point property if 
\begin{align*}
\theta_1^2 \cdot L(x) \leq \| \nabla L(x) \|^2 \leq \theta_2^2 \cdot L(x) .
\end{align*}
\end{definition}

The intuition behind the non-critical point property is that if $L(x)$ is large enough, then gradient descent can still make progress because $\|\nabla L(x)\|^2$ is lower bounded by $\theta_1^2 L(x)$.

\fi 

\iffalse
\subsection{An example}

We consider the gradient update
\begin{align*}
    x_{t+1} = x_t - \eta \cdot \nabla L(x) |_{x = x_t}
\end{align*}

We can compute 
\begin{align*}
    & ~ L(x_{t+1}) - L(x^*) \\
    \leq & ~ L(x_t) - L(x^*) + \langle \nabla L(x_t), x_{t+1} - x_t \rangle + \beta_1 \| x_{t+1} - x_t \|^2 + \beta_2 \| x_{t+1} - x_t \|^{2-2p} \cdot L(x_t)^{p} \\
    = & ~ L(x_t) - L(x^*) - \eta \| \nabla L(x_t) \|^2 + \beta_1 \eta^2 \| \nabla L(x_t) \|^2 + \beta_2 \eta \| \nabla L(x_t) \|^{2-2p} \cdot L(x_t)^{p} \\
    \leq & ~  L(x_t) - L(x^*) - \eta \theta_1^2 L(x_t) + \beta_1 \eta^2 \theta_2^2 L(x_t) + \beta_2 \eta \theta_2^{2-2p}  L(x_t) \\
    = & ~ (1 - \eta \theta_1^2 + \beta_1 \eta^2 \theta_2^2 + \beta_2 \eta \theta_2^{2-2p}) \cdot L(x_t) - L(x^*) \\
    \leq  & ~ (1 - (\theta_1^2 - \beta_2 \theta_2^{2-2p})\eta/2 ) \cdot L(x_t) - L(x^*) \\
    = & ~ (1-\gamma) \cdot ( L(x_t) - L(x^*) )
\end{align*}
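where the first step follows from $(\beta_1,\beta_2,p)$-semi-smoothness, the second step follows from the update rule $x_{t+1} - x_t = -\eta \nabla L(x_t)$, the third step follows from the $(\theta_1,\theta_2)$-non-critical point property, the fifth step follows from $\eta \le (\theta_1^2-\beta_2 \theta_2^{2-2p})/(2 \beta_1 \theta_2^2)$, and the sixth step follows from $\gamma = \eta(\theta_1^2- \beta_2 \theta_2^{2-2p})/2$, provided $L(x^*) = 0$. Note that $\| x_{t+1} - x_t \|^{2-2p} = \eta^{2-2p} \| \nabla L(x_t) \|^{2-2p}$, so the factor $\eta$ in the last term of the second step is exact only for $p = 1/2$; for $p \le 1/2$ and $\eta \le 1$ the second step still holds as an inequality, since then $\eta^{2-2p} \le \eta$.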



\subsection{Some Observations}
Not sure if this will be useful later, but here are some observations.

Assume that a function $L$ satisfies semi-smoothness (Definition~\ref{def:semi-smooth}) and semi-strong convexity (Definition~\ref{def:semistrong-convex}).
Then from Definition~\ref{def:semistrong-convex}, we obtain
\begin{align*}
\lrw{\nabla L(y) - \nabla L(x), y - x} \geq 2 \alpha_1 \lrn{y-x}_2^2 - \alpha_2 \lrn{y-x}_2^{2-2p}\cdot \lrp{L(y)^{p}+L(x)^p}.
\end{align*}
At the same time, for any first order fixed point $x^*$, we obtain from Definition~\ref{def:semi-smooth} that
\begin{align*}
L(y) \leq L(x^*) + \beta_1 \lrn{y-x^*}_2^2 + \beta_2\lrn{y-x^*}_2^{2-2p} \cdot L(x^*)^p.
\end{align*}
Since $p\in[0,1]$, we can use Jensen's inequality to obtain
\begin{align*}
L(y)^{p}+L(x)^p &\leq 2 \cdot 3^{1-p} \cdot L(x^*)^p + 3^{1-p} \cdot \beta_1^p \lrp{ \lrn{y-x^*}_2^{2p} + \lrn{x-x^*}_2^{2p} } \\
&+ 3^{1-p} \cdot \beta_2^p \lrp{ \lrn{y-x^*}_2^{(2-2p)p} + \lrn{x-x^*}_2^{(2-2p)p} }\cdot L(x^*)^{p^2} .
\end{align*}
For simplicity, assume that $L(x^*) = 0$. Then
\begin{align*}
\lrw{\nabla L(y) - \nabla L(x), y - x} \geq 2 \alpha_1 \lrn{y-x}_2^2 - 3^{1-p} \cdot \alpha_2 \cdot \beta_1^p \lrn{y-x}_2^{2-2p} \lrp{ \lrn{y-x^*}_2^{2p} + \lrn{x-x^*}_2^{2p} }.
\end{align*}
When $x$ and $y$ are close, there's no contraction between them for $p<1$.
However, there is always contraction between $x$ (away from any first order fixed point) and $x^*$ as long as $\alpha_2$ or $\beta_1$ is small enough (in which case it seems that the fixed point is unique if $L(x^*)=0$? We can see that by taking $y^*$ and $x^*$ as fixed points and proving that the distance between them is 0).

That means in MCMC, one cannot use synchronous coupling to prove convergence in Wasserstein 2 metric.
Can consider reflection coupling.
Alternatively, can consider what the log-Sobolev constant is for such a class of objectives and prove convergence in KL divergence.

\fi