\documentclass[accepted]{uai2022} % after acceptance, for a revised version; also before submission to see how the non-anonymous paper would look like


%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}



%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathrsfs}
\usepackage{amsfonts}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{hyperref}
\hypersetup{nolinks=true}	 




\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}



\def\Var{{\rm {Var}}}
\def\E{{\mathbb{E}}}
\def\Cov{{\rm Cov}}
\def\Tr{{\rm Tr}}
\def\Det{{\rm Det}}
\def\Diag{{\rm diag}}
\def\Vec{{\rm Vec}}
\def\ST {{\rm s.t.}}
\def\Span{{\rm span}}
\def\sign{\rm sign}
\def\prox{\text{\rm prox}}
\def\inte {\rm int}
\def\rint {\rm rint}
\def\bd {\rm bd}
\def\rbd {\rm rbd}
\def\dom {\rm \mathbf{dom}}
\newcommand*\diff{\mathop{}\!\mathrm{d}}
\newcommand*\Diff[1]{\mathop{}\!\mathrm{d^#1}}
\def\dtb{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny def}}}{=}}} % define to be
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}


\DeclareGraphicsExtensions{.pdf,.png,.jpg}

% for special symbol
\usepackage{pifont}
\makeatletter
\newcommand*{\circnum}[1]{%
	\expandafter\@circnum\csname c@#1\endcsname
}
\newcommand*{\@circnum}[1]{%
	\ifnum#1<1 %
	\@ctrerr
	\else
	\ifnum#1>20 %
	\@ctrerr
	\else
	\ding{\the\numexpr 171+(#1)\relax}%
	\fi
	\fi
}
\makeatother

\usepackage{xspace}
\newcommand{\MATLAB}{\textsc{Matlab}\xspace}

\allowdisplaybreaks


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small, redefining \footnotesize, we provide the original \footnotesize using this macro.  (Use only sparingly, e.g., in drawings, as it is quite small.)



%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\title{Convergence Analysis of Linear Coupling with Inexact Proximal Operator}



\author{Qiang Zhou$^{1,2}$\thanks{Most of this work was performed while the first author worked as a postdoc at NTU, Singapore.} 
\quad and \quad Sinno Jialin Pan$^3$\\
	$^1${Southeast University, Nanjing 211189, China}\\
	$^2${Purple Mountain Laboratories, Nanjing 211111, China}\\
	$^3${Nanyang Technological University, Singapore}\\
	{\tt\small zhouqiang@u.nus.edu, sinnopan@ntu.edu.sg}
}



\begin{document}
\maketitle

\begin{abstract}
  Linear coupling was recently proposed to accelerate first-order algorithms by linking gradient descent and mirror descent together, thereby achieving an accelerated convergence rate. This work focuses on the convergence analysis of linear coupling for convex composite minimization when a proximal operator cannot be exactly computed. It is of particular interest to study the convergence of linear coupling because it not only achieves the accelerated convergence rate for first-order algorithms but also works for generic norms. We present a convergence analysis of linear coupling that allows the proximal operator to be computed up to a certain precision. Our analysis illustrates that the accelerated convergence rate of linear coupling with an inexact proximal operator can be preserved if the error sequence of the inexact proximal operator decreases at a sufficiently fast rate. More importantly, our analysis leads to better bounds than existing works with inexact proximal operators. Experimental results on several real-world datasets verify our theoretical results.
\end{abstract}





\section{Introduction}\label{sec:introduction}


In this work, we consider convex composite minimization problems in the form of
\begin{equation}\label{eq:compositieproblem}
	\min_{\mathbf{x} \in \mathbb{R}^d} f(\mathbf{x}) \dtb g(\mathbf{x}) + h(\mathbf{x}),
\end{equation}
where $g$ is convex or $\mu$-strongly convex, and $L$-smooth, and $h$ is convex but possibly non-smooth \citep{nesterov2005smooth}. Various machine learning problems can be formulated in the form of \eqref{eq:compositieproblem}, where $g$ defines a convex loss function for training examples, and $h$ regularizes the model to promote a specified structure \citep{bach2011convex,sra2012optimization,jenatton2011structured}. For instance, it is well-known that $h(\mathbf{x}) = \|\mathbf{x}\|_1$ can be used to induce a sparse structure for $\mathbf{x}$ \citep{tibshirani1996regression}. In addition, constrained optimization problems can also be formulated in the form of \eqref{eq:compositieproblem} through reformulation. Specifically, for any convex optimization problem with constraint $\mathbf{x} \!\in\! \mathcal{C}$, it can be reformulated as \eqref{eq:compositieproblem} by defining $h(\mathbf{x}) \dtb I_{\mathcal{C}}(\mathbf{x})$ as the indicator function of the convex set $\mathcal{C}$, where $I_{\mathcal{C}}(\mathbf{x}) \!=\! 0$ if $\mathbf{x} \!\in\! \mathcal{C}$ or $I_{\mathcal{C}}(\mathbf{x}) \!=\! \infty$ otherwise. 
 


To solve \eqref{eq:compositieproblem}, first-order algorithms are the most popular choice due to their simplicity and generality \citep{sra2012optimization,nesterov2013introductory}.
First-order algorithms generally assume that the first-order gradient of a smooth function can be queried by a black-box in constant time. Therefore, the complexity of minimizing a smooth and convex function $f$ is measured by the number of times that first-order gradients are queried to produce a sequence $\{\mathbf{x}_k\}_{k=1}^T$ such that $f(\mathbf{x}_T) - f(\mathbf{x}^\star) \leq \epsilon$, where $\mathbf{x}^\star$ is the optimal solution. A gradient descent algorithm has $O(L/\epsilon)$ iteration complexity for $L$-smooth convex minimization problems, which can be improved to $O((L/\mu)\log(1/\epsilon))$ when the function is $\mu$-strongly convex and $L$-smooth \citep{nesterov2013introductory}. 
However, these complexities are not optimal, which leaves considerable room for improvement \citep{nemirovsky1983problem}. In the seminal work \citep{nesterov1983method}, Nesterov laid the foundation of accelerated gradient methods (AGDs) for convex and $L$-smooth functions (i.e., $h(\mathbf{x}) = 0$ in \eqref{eq:compositieproblem}). Specifically, the method proposed in~\citep{nesterov1983method} successfully improves the complexities of gradient descent methods to $O(\sqrt{L/\epsilon})$ and $O(\sqrt{L/\mu}\log(1/\epsilon))$ for generally convex and $\mu$-strongly convex smooth functions, respectively, which are accelerated and optimal for first-order algorithms \citep{nemirovsky1983problem,nesterov2013introductory}. Since then, accelerated first-order algorithms with the optimal convergence rate have been further developed to solve convex composite minimization problems \citep{tseng2008onaccelerated,lin2017catalyst}.







However, the interpretation of the proof of acceleration and intuitions behind the convergence analysis in Nesterov's AGDs are not clear. Many efforts have been devoted to present a clear interpretation for Nesterov's AGDs, or alternatively to develop new accelerated and interpretable variants with the optimal convergence rate \citep{bubeck2015ageometric,su2014adifferential,krichene2015accelerated,diakonikolas2018accelerated}. Recently, \cite{allenzhu2017linear} proved that the optimal convergence rate can be achieved for smooth and convex optimization problems by {\em linearly coupling} two fundamental first-order algorithms (namely gradient descent and mirror descent). Later, \cite{rodomanov2016linear} extended linear coupling to generally convex composite minimization problems. 






As a variant of Nesterov's AGDs, linear coupling (LC) elegantly extends AGDs to non-Euclidean norms, which is important for many applications. In \cite[Appendix A.1]{allenzhu2017linear}, several concrete examples have been presented to illustrate the importance of allowing non-Euclidean norms in first-order algorithms. For example, one prefers to use the $\ell_1$ norm instead of the $\ell_2$ norm gradient descent for the saddle point problem $\min_{\mathbf{x} \in \Delta_n}\max_{\mathbf{y} \in \Delta_m} \mathbf{y}^\top\mathbf{A}\mathbf{x}$, where $\Delta_n$ and $\Delta_m$ denote the unit simplices in $\mathbb{R}^n$ and $\mathbb{R}^m$, respectively, and $\mathbf{A} \in \mathbb{R}^{m \times n}$ with all the entries being in $[-1, 1]$. In this problem, there are two reasons for choosing the $\ell_1$ norm gradient descent as follows.  
\begin{itemize}[leftmargin=*,label={}]
	\item 1) In order to apply the $\ell_2$ norm gradient descent, one needs to further assume the squared $\ell_2$ norm of each row of $\mathbf{A}$ is upper bounded by $1$. Obviously, it is a stronger condition and harder to satisfy than that of the $\ell_1$ norm.
	\item 2) More importantly, even if the stronger condition is satisfied, the $\ell_2$ norm also leads to a larger value for $L$, thus the $\ell_1$ norm gradient descent has faster convergence \citep{nesterov2005smooth}. Another example is near-linear time maximum flow, in which one needs to apply $\ell_\infty$ gradient descent \citep{kelner2014an}.
\end{itemize} 





For convex composite optimization problems \eqref{eq:compositieproblem}, the proximal gradient descent \citep{parikh2014proximal} performs the following updates:
\begin{equation*}
	\mathbf{x}_{k+1} = \prox_{\eta_{k+1} h} \big(\mathbf{x}_k - \eta_{k+1} \nabla g(\mathbf{x}_k) \big), 
\end{equation*}
where $\prox_{\eta h}(\cdot)$ is the proximal operator \citep{combettes2011proximal} of $\eta h(\mathbf{x})$ defined for any scalar $\eta > 0$ as the unique solution of 
\begin{equation}\label{eq:proximaloperataor}
	\prox_{\eta h}(\mathbf{y}) = \argmin_{\mathbf{x} \in \mathbb{R}^d} \left\{\eta h(\mathbf{x}) + \frac{1}{2}\|\mathbf{x} - \mathbf{y}\|_2^2\right\}.
\end{equation}




If $h$ is considerably simple (e.g., $h(\mathbf{x}) = \|\mathbf{x}\|_1$), there is an analytical solution for $\mathbf{x}_{k+1}$ \citep{combettes2011proximal}. However, in more general cases, it is challenging to obtain the exact solution of the proximal operator possibly due to the following two reasons.
\begin{itemize}[leftmargin=*] 
	\item First, the proximal operator does not admit an analytical solution. For example, there is no closed form solution for the proximal operator if $h$ is the isotropic total variation regularization \citep{beck2009fast}. In this case, the proximal operator can only be solved by employing some optimization algorithm up to a certain precision. More details of this example can be found in \textit{Experiments}. 
	\item Second, it may be computationally expensive to obtain the exact solution. For instance, for optimization problems with an $\ell_1$ norm ball constraint (e.g., $h(\mathbf{x}) = I_{\|\mathbf{x}\|_1 \leq r}(\mathbf{x}), r \in \mathbb{R}_+$), the complexity of exactly performing the proximal operator is $O(d\log d)$~\citep{duchi2008efficient,liu2009efficient}. Therefore, it is highly demanding in computation for high-dimensional cases as $d$ is considerably large. We empirically compare the efficiency of exact and inexact proximal operator of the $\ell_1$ norm ball constraint. The result suggests that the inexact proximal operator outperforms the exact counterpart by carefully controlling the error sequences in this application. 
\end{itemize}







In \citep{allenzhu2017linear}, the objective is assumed to be convex and smooth, i.e., $h(\mathbf{x}) = 0$. Thus, the analysis of linear coupling does not involve the computation of proximal operator.
However, it is well-known that many machine learning problems can be formulated as a convex composite minimization problem due to the non-smooth regularization. Therefore, it is of particular interest to characterize the convergence behavior of linear coupling, when the proximal operator is not exactly solved. However, existing analyses \citep{schmidt2011convergence} on inexact proximal operators only cover the $\ell_2$ norm case, thus they are not applicable to linear coupling due to non-Euclidean norms. To this end, we present a complete study for the convergence rate of {\em linear coupling} with inexact proximal operators. 




Compared with existing works \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}, our focus is the convergence analysis of linear coupling with inexact proximal operators, which presents new challenges due to the generic Bregman divergence (refer to Definition~\ref{de:Bregmandivergence}). In particular, the key step for analyzing inexact proximal operators is to bound the subgradient of inexact solution. It is an easy task in the case of squared Euclidean distance as the bounding problem has an analytical solution \citep{schmidt2011convergence}. In contrast, it does not admit an analytical form in our case due to the generic Bregman divergence, e.g., the Kullback-Leibler divergence. To address it, we present a relaxation method for the bounding problem so that the subgradient still can be bounded (see Lemma~\ref{th:xisubgradientofinexactsolution}). More importantly, our analysis leads to tighter bounds (refer to Remarks~\ref{re:comparisonconvergenceconstantconvex} and \ref{re:comparisonconvergenceconstantstronglyconvex} for details) than previous works \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}. 






\section{Notation and Preliminaries}\label{sec:notationandpreliminary}
Throughout this paper, vectors and matrices are denoted by lower-case and upper-case boldface characters (e.g., $\mathbf{x}$ and $\mathbf{X}$), respectively.
Let $\mathbf{0}$ be a vector or matrix with all its entries equal to $0$. For both $\mathbf{x}, \mathbf{y} \!\in\! \mathbb{R}^d$, their inner product is
denoted by $\left\langle\mathbf{x}, \mathbf{y}\right\rangle \!=\! \sum_{i=1}^d \!x_i y_i$. Let $\|\cdot\|$ be a generic norm and its dual norm
is denoted by $\|\cdot\|_*$ that is defined as $\|\mathbf{y}\|_* = \sup_{\mathbf{x}} \big\{ \langle \mathbf{x}, \mathbf{y}\rangle ~|~ \|\mathbf{x}\| \leq 1 \big\}$.
Let $\|\cdot\|_1$ and $\|\cdot\|_2$ denote the $\ell_1$ and the $\ell_2$ norm, respectively.










\begin{definition}\label{de:Lsmooth}
	A function $f$ is $L$-smooth w.r.t. $\|\cdot\|$, if 
	\begin{equation*}
		f(\mathbf{y}) \leq f(\mathbf{x}) + \langle \nabla f(\mathbf{x}), \mathbf{y} - \mathbf{x} \rangle + \frac{L}{2}\|\mathbf{y} - \mathbf{x}\|^2, ~\forall \mathbf{x}, \mathbf{y}.
	\end{equation*}
\end{definition}



\begin{definition}\label{de:mustronglyconvex}
	A function $f$ is $\mu$-strongly convex w.r.t. $\|\cdot\|$, if  
	\begin{equation*}
		f(\mathbf{y}) \geq f(\mathbf{x}) + \langle \nabla f(\mathbf{x}), \mathbf{y} - \mathbf{x} \rangle + \frac{\mu}{2}\|\mathbf{y} - \mathbf{x}\|^2, ~\forall \mathbf{x}, \mathbf{y}.
	\end{equation*}
\end{definition}


\begin{definition}\label{de:Bregmandivergence}
	Let $\psi : \mathcal{Q} \rightarrow \mathbb{R}$ be a strictly convex and continuously differentiable function. 
	Then, the Bregman divergence is 
	\begin{equation*}
		V_{\psi}(\mathbf{y}, \mathbf{x}) \!\dtb\! \psi(\mathbf{y}) - \psi(\mathbf{x}) - \langle \nabla \psi(\mathbf{x}), \mathbf{y} - \mathbf{x} \rangle,
		\forall \mathbf{x}, \mathbf{y} \in \mathcal{Q}.
	\end{equation*}
\end{definition}
Definition~\ref{de:Bregmandivergence} implies $V_{\psi}(\mathbf{x}, \mathbf{x}) = 0$, and $V_{\psi}(\mathbf{y}, \mathbf{x}) \geq \tfrac{\rho}{2}\|\mathbf{x} - \mathbf{y}\|^2$ if $\psi$ is $\rho$-strongly convex w.r.t. $\|\cdot\|$. The Bregman divergence includes many well-known examples. 
\begin{itemize}[leftmargin=*,label={}]
	\item 1) If $\psi(\mathbf{x}) \dtb \tfrac{1}{2}\|\mathbf{x}\|_2^2$, then $V_{\psi}(\mathbf{y}, \mathbf{x})$ is the squared Euclidean distance $V_{\psi}(\mathbf{y}, \mathbf{x}) = \tfrac{1}{2}\|\mathbf{x} - \mathbf{y}\|_2^2$.
	\item 2) If $\mathcal{Q} \dtb \big\{ \mathbf{x} \in \mathbb{R}_+^d \mid \sum_i x_i = 1 \big\}$ and $\psi(\mathbf{x}) \dtb \sum_i x_i \log x_i$, then $V_{\psi}(\mathbf{y}, \mathbf{x})$ becomes the Kullback-Leibler divergence $V_{\psi}(\mathbf{y}, \mathbf{x}) = \sum_i y_i \log(\tfrac{y_i}{x_i})$ between two probability distributions $\mathbf{x}$ and $\mathbf{y}$. In particular, $\psi(\mathbf{x})$ in this case is $1$-strongly convex w.r.t. $\|\cdot\|_1$, which leads to $V_{\psi} (\mathbf{y}, \mathbf{x}) \geq \tfrac{1}{2}\|\mathbf{x} - \mathbf{y}\|_1^2$. In this case, one needs to employ optimization methods that are applicable for the $\ell_1$ norm.
	Therefore, this example illustrates the importance of convergence analysis for linear coupling with inexact proximal operators since it works for a generic norm.
\end{itemize} 





\begin{definition}\label{de:convexconjugate}
	For function $f(\mathbf{x})$, its convex conjugate is defined as 
	\begin{equation*}
		f^*(\mathbf{y}) \dtb \sup_{\mathbf{x}} \big\{ \langle \mathbf{x}, \mathbf{y} \rangle  - f(\mathbf{x}) \big\}.
	\end{equation*}
\end{definition}










\begin{algorithm}[t]
	\caption{LC with Inexact Proximal Operators}\label{alg:linearcoupling}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input}: $\mathbf{x}_0, \alpha_0, \mu, L$
		\STATE {\bfseries Initialization}: $\mathbf{y}_0\leftarrow \mathbf{x}_0, \mathbf{z}_0 \leftarrow \mathbf{x}_0$
		\FOR{$k = 0$ {\bfseries to} $T\!-\!1$}
		\IF{$\mu = 0$}
		\STATE Set $\eta_{k+1} \leftarrow  \tfrac{k+2}{2L}$ and $\tau_k \leftarrow \tfrac{1}{L\eta_{k+1}}$
		\STATE Set $\mathbf{w}_{k+1} \leftarrow  \mathbf{z}_k$
		\ELSE
		\STATE Set $\eta_{k+1} \!\leftarrow \! \tfrac{1}{L\alpha_{k+1}}$ and $\tau_k \!\leftarrow \! \tfrac{L\alpha_{k+1}  - \mu}{L - \mu}$ where $\alpha_{k+1}$ is obtained via $\alpha_{k+1}^2 \!=\! (1 - \alpha_{k+1}) \alpha_k^2 + \tfrac{\mu}{L}\alpha_{k+1}$
		\STATE Set $\mathbf{w}_{k+1} \leftarrow  \tfrac{\tau_k}{\alpha_{k+1}} \mathbf{z}_k + \big(1 - \tfrac{\tau_k}{\alpha_{k+1}}\big) \mathbf{y}_k$
		\ENDIF
		\STATE $\mathbf{x}_{k+1} \leftarrow \tau_k \mathbf{z}_k + (1 - \tau_k) \mathbf{y}_k$
		\STATE Find a $\xi_{k+1}$-suboptimal solution $\mathbf{y}_{k+1}$ for \eqref{eq:gradentdescentconvexcomposite}
		\setlength\abovedisplayskip{3pt}
		\STATE Find a $\xi_{k+1}$-suboptimal solution $\mathbf{z}_{k+1}$ for \eqref{eq:mirrordescentconvexcomposite}
		\ENDFOR
		\STATE {\bfseries Output}: $\mathbf{y}_T$
	\end{algorithmic}
\end{algorithm}









\section{Linear Coupling with Inexact Proximal Operator}\label{sec:inexactlinearcoupling}
For smooth and convex functions, the accelerated convergence rate can be obtained by two non-accelerated algorithms: linearly coupling gradient descent and mirror descent~\citep{allenzhu2017linear}. In fact, it can also be extended to solve convex composite minimization problem~\eqref{eq:compositieproblem} \citep{rodomanov2016linear}. We assume that $g$ is convex ($\mu = 0$) or $\mu$-strongly convex ($\mu > 0$), and $L$-smooth w.r.t. $\|\cdot\|$. Here, we present the extension of linear coupling (LC) for \eqref{eq:compositieproblem} when $g$ is either convex ($\mu = 0$) or strongly convex ($\mu > 0$) and summarize the high-level idea in Algorithm~\ref{alg:linearcoupling}. Missing proofs can be found in the Appendix.



 
Let $\mathbf{y}_k$ and $\mathbf{z}_k$ be the outputs of gradient descent and mirror descent in the $(k\!-\!1)$-th iteration, respectively. The key idea of linear coupling is to combine $\mathbf{y}_k$ and $\mathbf{z}_k$ together by a linear coupling rate $\tau_k$ as the starting point for the next iteration such that the accelerated convergence rate can be achieved.  



Define $\mathbf{x}_{k+1} \dtb \tau_k \mathbf{z}_k + (1 - \tau_k) \mathbf{y}_k$.
In linear coupling, the gradient descent performs the following update:
\begin{equation}\label{eq:gradentdescentconvexcomposite} 
	\mathbf{y}_{k+1} = \argmin_{\mathbf{y}} \widetilde{Q}_{k+1}(\mathbf{y};\mathbf{x}_{k+1}), 
\end{equation}
where $\widetilde{Q}_{k+1}(\mathbf{y};\mathbf{x}_{k+1})$ is defined as 
\begin{equation*}
	\widetilde{Q}_{k+1}(\mathbf{y};\mathbf{x}_{k+1}) = \langle \nabla g(\mathbf{x}_{k+1}), \mathbf{y}\rangle + h(\mathbf{y}) + \frac{L}{2} \|\mathbf{y} - \mathbf{x}_{k+1}\|^2.
\end{equation*}
With $\eta_{k+1}$, $\tau_k$, and $\mathbf{w}_{k+1}$ defined as in Algorithm~\ref{alg:linearcoupling}, the mirror descent step performs the following update:
\begin{equation}\label{eq:mirrordescentconvexcomposite}  
	\mathbf{z}_{k+1} = \argmin_{\mathbf{z}} \widehat{Q}_{k+1}(\mathbf{z};\mathbf{w}_{k+1}), 
\end{equation}
where   
\begin{equation*}
	\widehat{Q}_{k+1}(\mathbf{z};\mathbf{w}_{k+1}) \dtb \langle \nabla g(\mathbf{x}_{k+1}), \mathbf{z}\rangle + h(\mathbf{z}) + \frac{V_{\psi}(\mathbf{z},\mathbf{w}_{k+1})}{\eta_{k+1}}.
\end{equation*}

 


Unlike standard gradient descent and mirror descent, the linear coupling takes the gradient at $\mathbf{x}_{k+1}$ instead of $\mathbf{y}_k$ or $\mathbf{z}_k$ to obtain $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$. In this way, the gradient descent and mirror descent are coupled together to solve \eqref{eq:compositieproblem}, which is able to achieve the optimal convergence rate \citep{allenzhu2017linear}.


\subsection{The $\xi_{k+1}$-suboptimal Solution}

The proximal operator is involved in both \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} due to the non-smooth function $h$. It is worth noting that the proximal distances in both \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} are defined based on a generic norm $\|\cdot\|$ instead of $\|\cdot\|_2$. Specifically, they are the squared norm $\|\cdot\|^2$ and the general Bregman divergence $V_{\psi}(\cdot,\cdot)$, respectively. In contrast, existing works mainly focus on the case of the squared Euclidean distance (the $\ell_2$ norm). Therefore, \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} are more challenging to solve than the sub-problems considered in existing works. In other words, it is often the case that \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} can only be solved up to a certain precision. Therefore, it is critical to study the convergence rate of Algorithm~\ref{alg:linearcoupling} by allowing the proximal operator of the form \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} to be solved approximately. 

 




 

For the case of inexact proximal operators, we introduce $\xi_{k+1}$-suboptimal solution to analyze the convergence rate of Algorithm~\ref{alg:linearcoupling}. We assume that $h$ is equipped with an oracle such that the proximal operator can be computed up to a certain precision.
Specifically, given a non-negative $\xi_{k+1}$, the oracle is able to produce $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$ such that
\begin{align}
	&~\widetilde{Q}_{k+1}(\mathbf{y}_{k+1};\mathbf{x}_{k+1}) - \min_{\mathbf{y}} \widetilde{Q}_{k+1}(\mathbf{y};\mathbf{x}_{k+1}) \leq  \xi_{k+1}, \label{eq:suboptimalsolutiongradientdescentcomposite} 
	\\
	&~ \widehat{Q}_{k+1}(\mathbf{z}_{k+1};\mathbf{w}_{k+1}) - \min_{\mathbf{z}} \widehat{Q}_{k+1}(\mathbf{z};\mathbf{w}_{k+1}) \leq \xi_{k+1}. \label{eq:suboptimalsolutionmirrordescentcomposite}
\end{align}
If $\xi_{k+1} \!= 0$, it implies that the proximal operator is exactly solved for both \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite}. Otherwise, it means that the proximal operator is solved up to a certain precision controlled by $\xi_{k+1}$. Thus, $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$ are referred to as $\xi_{k+1}$-suboptimal solutions to \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite}, respectively.


Note that the inexactness criteria \eqref{eq:suboptimalsolutiongradientdescentcomposite} and \eqref{eq:suboptimalsolutionmirrordescentcomposite} are the same as those used in \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}. Although this type of criterion has limitations, it remains the most standard one for convergence analysis with inexact proximal operators. 


Our analysis allows for the case in which the sub-problems \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} do not admit closed-form solutions. Therefore, one needs to compute an approximate solution up to a certain accuracy with some iterative algorithm $\mathcal{M}$. Note that both the sub-problems \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} are strongly convex even if the objective $f(\mathbf{x})$ is not strongly convex. The strong convexity allows us to efficiently obtain a $\xi_{k+1}$-suboptimal solution via $\mathcal{M}$ with a linear convergence rate \citep{lin2017catalyst,kulunchakov2019}. 



 


\section{Convergence Analysis}\label{sec:convergenceanalysis}

In this section, we present the convergence analysis for Algorithm~\ref{alg:linearcoupling} when applying it to solve \eqref{eq:compositieproblem}. Specifically, we first present properties of a suboptimal solution in Section~\ref{sec:propertiesofxisuboptimalsolution}. Then, in Sections~\ref{sec:convergencerategenerallyconvex} and~\ref{sec:convergenceratestronglyconvex}, we present specific convergence results of Algorithm~\ref{alg:linearcoupling} for $\mu = 0$ and $\mu > 0$, in Theorems~\ref{th:inexactLCconvergenceconvex} and~\ref{th:inexactLCconvergencestronglyconvex}, respectively.
For convenience, we also assume that $\psi(\mathbf{x})$ is $1$-strongly convex and $\rho$-smooth w.r.t. $\|\cdot\|$. By introducing the Bregman divergence, our analysis can include more cases than existing works. In other words, it can recover many general cases. For example, the counterpart considers the squared $\ell_2$-norm that can be easily obtained by setting $\psi(\mathbf{x}) = \frac{1}{2}\|\mathbf{x}\|_2^2$ where $\psi(\mathbf{x})$ is $1$-strongly convex and $1$-smooth w.r.t. the $\ell_2$-norm.




 


\subsection{Properties of Suboptimal Solution}\label{sec:propertiesofxisuboptimalsolution}

For convergence analysis with inexact proximal operators, the key is to bound the solution of inexact proximal operator and the $\xi$-subgradient of the inexact solution. 
Thus, we first introduce the definition of $\xi$-subgradient, which is a generalization of subgradient. 
\begin{definition}\citep{bertsekas2003convex}\label{de:tausubdifferential}
	For a convex function $f: \mathbb{R}^d \rightarrow \mathbb{R}$ and a non-negative scalar $\xi$, a vector $\mathbf{v}$ is a $\xi$-subgradient
	of $f$ at $\mathbf{x}$, written $\mathbf{v} \in \partial_{\xi} f(\mathbf{x})$, if it holds that
	\begin{equation}\label{eq:tausubdifferential}
		f(\mathbf{y}) \geq f(\mathbf{x}) + \langle \mathbf{v}, \mathbf{y} - \mathbf{x} \rangle - \xi, \forall \mathbf{y} \in \mathbb{R}^d.
	\end{equation}
\end{definition}
Definition~\ref{de:tausubdifferential} implies that $\mathbf{0}$ is a $\xi$-subgradient of $f$ at $\mathbf{x}$ if $\mathbf{x}$ is a $\xi$-suboptimal solution of $f$. In the case of the $\ell_2$ norm, \citet{schmidt2011convergence} showed that $\mathbf{v}$ in Definition~\ref{de:tausubdifferential} can be easily bounded, since the squared Euclidean distance in \eqref{eq:proximaloperataor} leads to an analytical form for $\mathbf{v}$. In contrast, our case is more challenging as it involves $(\nabla \psi)^{-1}$, which generally does not have an analytical form for a generic Bregman divergence. To address this problem, we present a relaxation method that exploits the strong convexity of $\psi$ so that it still admits an analytical form.
We define 
\begin{equation*}
	Q_{k+1}(\mathbf{z};\mathbf{w}_{k+1}) \dtb \langle \nabla g(\mathbf{x}_{k+1}), \mathbf{z} - \mathbf{w}_{k+1}\rangle + h(\mathbf{z}).
\end{equation*}
If $\mathbf{z}_{k+1}$ is $\xi_{k+1}$-suboptimal to \eqref{eq:mirrordescentconvexcomposite}, Lemma~\ref{th:xisubgradientofinexactsolution} provides a $\xi_{k+1}$-subgradient for $Q_{k+1}(\cdot;\mathbf{w}_{k+1})$ at $\mathbf{z}_{k+1}$.
\begin{lemma}\label{th:xisubgradientofinexactsolution}
	For any $k \geq 0$, if $\mathbf{z}_{k+1}$ is a $\xi_{k+1}$-suboptimal solution to \eqref{eq:mirrordescentconvexcomposite} in the sense of \eqref{eq:suboptimalsolutionmirrordescentcomposite}, then there exists $\boldsymbol\beta_{k+1}$ with
	$\!\|\boldsymbol\beta_{k+1}\|_*^2 \leq 2\rho\xi_{k+1}/\eta_{k+1}$ such that
	\begin{equation*}
		\frac{\nabla \psi(\mathbf{w}_{k+1}) - \nabla \psi(\mathbf{z}_{k+1})}{\eta_{k+1}} - \boldsymbol\beta_{k+1}
		\in \partial_{\xi_{k+1}} Q_{k+1}(\mathbf{z}_{k+1};\mathbf{w}_{k+1}).
	\end{equation*}
\end{lemma} 


The proof of Lemma~\ref{th:xisubgradientofinexactsolution} is given in Appendix~A.1. By using Lemma~\ref{th:xisubgradientofinexactsolution}, the following lemma enables us to bound the intermediate results of mirror descent with the inexact proximal operator. 
\begin{lemma}\label{th:Bregmandivergencegeneralizedinequalityinexact}
	Under the same setting as in Lemma~\ref{th:xisubgradientofinexactsolution}, then there exists $\boldsymbol\beta_{k+1}$ with $\|\boldsymbol\beta_{k+1}\|_*^2 \leq 2\rho\xi_{k+1}/\eta_{k+1}$ such that
	\begin{align*}
		&~ \widehat{Q}_{k+1}(\mathbf{z}_{k+1};\mathbf{w}_{k+1}) + V_{\psi}\big(\mathbf{u}, \mathbf{z}_{k+1}\big)/\eta_{k+1} - \xi_{k+1} \\
		&~ \leq \widehat{Q}_{k+1}(\mathbf{u};\mathbf{w}_{k+1}) + \big\langle \boldsymbol\beta_{k+1}, \mathbf{u} - \mathbf{z}_{k+1} \big\rangle, \forall \mathbf{u}, k \geq 0.
	\end{align*}
\end{lemma}
The proof of Lemma~\ref{th:Bregmandivergencegeneralizedinequalityinexact} is given in Appendix~A.2.



\subsection{Convergence Rates of Convex $g$}\label{sec:convergencerategenerallyconvex}

We first focus on the case when $g$ is convex. The next lemma derives a characteristic inequality for a specific Lyapunov function for inexact linear coupling by considering the inexactness of $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$. 
\begin{lemma}\label{th:Lyapunovfunctionconvex}
	Under the same setting as in Lemma~\ref{th:xisubgradientofinexactsolution}, for any $k \geq 0$,
	if $g$ is convex and $\tau_k = 1/(L\eta_{k+1})$,
	\begin{align}\label{eq:Lyapunovfunctionconvex}
			&~ \frac{1}{\tau_k^2} \left(f(\mathbf{y}_{k+1}) - f(\mathbf{x}^\star) \right) + L V_{\psi}(\mathbf{x}^\star, \mathbf{z}_{k+1}) \nonumber \\
			&~ \leq \frac{1 - \tau_k}{\tau_k^2} \left(f(\mathbf{y}_k) - f(\mathbf{x}^\star) \right) + L V_{\psi}(\mathbf{x}^\star, \mathbf{z}_k) \nonumber \\
			&~ + \sqrt{\frac{2\rho L \xi_{k+1}}{\tau_k}} \big\|\mathbf{x}^\star - \mathbf{z}_{k+1}\big\| + \frac{1 + \tau_k}{\tau_k^2}\xi_{k+1}.
		\end{align}
\end{lemma}
The proof of Lemma~\ref{th:Lyapunovfunctionconvex} is given in Appendix~B.1. From the Lyapunov function, we obtain a general convergence result for linear coupling with inexact proximal operator.
\begin{theorem}\label{th:inexactLCconvergenceconvex}
	Under the same setting as in Lemma~\ref{th:xisubgradientofinexactsolution}, if $g$ is convex, $\eta_{k+1} = (k+2)/(2L)$ and $\tau_k = 1/(L\eta_{k+1}), \forall k \geq 0$, then $\forall T \geq 1$:
	\begin{equation}\label{eq:inexactLCconvergenceconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \frac{6\left(L V_{\psi}(\mathbf{x}^\star, \mathbf{x}_0) + \widetilde{E}_T  + \widehat{E}_T\right)}{(T+1)^2}, 
	\end{equation}
	where $\widetilde{E}_T \!\dtb\! \sum_{k=1}^T(k+2)^2\xi_k$ and $\widehat{E}_T \!\dtb\! \left(\sum_{k=1}^T \!\sqrt{2\rho(k\!+\!1)\xi_k}\right)^2$.
\end{theorem}
The proof of Theorem~\ref{th:inexactLCconvergenceconvex} is given in Appendix~B.2.
\begin{remark}
	If the proximal operator is exact (i.e., $\xi_k = 0, \forall k \geq 1$), it leads to $\widetilde{E}_T = 0$ and $\widehat{E}_T = 0$. 
	Then, Theorem~\ref{th:inexactLCconvergenceconvex} recovers the accelerated complexity $O(\sqrt{L/\epsilon})$ for convex objectives \citep{nesterov2013introductory}.
\end{remark}
\begin{corollary}\label{th:inexactLCconvergenceconstantconvex}
	Consider the same setting as Theorem~\ref{th:inexactLCconvergenceconvex}, if $\xi_k \leq \xi$ for all $k \geq 1$, then $\forall T \geq 1$:
	\begin{equation}\label{eq:inexactLCconvergenceconstantconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \frac{6 L V_{\psi}(\mathbf{x}^\star, \mathbf{x}_0)}{(T+1)^2} + \big(\theta_1 T + \theta_2\big)\xi,
	\end{equation}
	where $\theta_1 = 6 + 16\rho$ and $\theta_2 = 15 + 32\rho$.
\end{corollary}
\begin{remark}\label{re:comparisonconvergenceconstantconvex}
	Under the same setting as Corollary~\ref{th:inexactLCconvergenceconstantconvex}, based on \cite[Proposition~4]{schmidt2011convergence}, we have
	\begin{equation}\label{eq:IPMCconvergenceconstantconvex}
		\!\!\!	f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq\! \frac{6 L \|\mathbf{x}_0 - \mathbf{x}^\star\|^2}{(T+1)^2} + \left(12T^2\! + 4T + 2\right)\xi. \!\!\!\!\!
	\end{equation}
	Based on \cite[Theorem 3]{lin2017catalyst}, we have
	\begin{equation}\label{eq:CatalystCconvergenceconstantconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \frac{4 L \|\mathbf{x}_0 - \mathbf{x}^\star\|^2}{(T+1)^2} + \frac{9}{2}(T+2)^2\xi.
	\end{equation}
	Based on \cite[Proposition~4]{kulunchakov2019}, we have
	\begin{equation}\label{eq:SCOCconvergenceconstantconvex}
		\!\!\!f(\mathbf{y}_T\!) - f(\mathbf{x}^\star) \!\leq\!\! \frac{2e^{1+\gamma}L\|\mathbf{x}_0-\mathbf{x}^\star\|^2}{(T+1)^2} + \frac{e^{1+\gamma}}{2\gamma}(T+2)^2\xi.\!\!\!
	\end{equation}
	Comparing \eqref{eq:inexactLCconvergenceconstantconvex} with \eqref{eq:IPMCconvergenceconstantconvex}, \eqref{eq:CatalystCconvergenceconstantconvex} and \eqref{eq:SCOCconvergenceconstantconvex}, our analysis achieves a better bound than existing works \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}.
	Specifically, the error term caused by the inexact proximal operator grows as $O(T\xi)$ in our bound, while it grows as $O(T^2\xi)$ in existing works.
\end{remark}






Theorem~\ref{th:inexactLCconvergenceconvex} suggests that the inexact proximal operator leads to error accumulation in the convergence result. To preserve the accelerated rate $O(\tfrac{1}{T^2})$, Theorem~\ref{th:inexactLCconvergenceconvex} implies the error sequence should decrease to $0$ at a sufficiently fast rate. 
\begin{corollary}\label{th:inexactLCconvergencedecreasedconvex}
	Under the same setting as Theorem~\ref{th:inexactLCconvergenceconvex}, for any $\delta > 0$, if $\xi_k$ is chosen as
	$\xi_k \dtb \tfrac{f(\mathbf{x}_0) - f(\mathbf{x}^\star)}{(k+2)^{3+\delta}} $, then $\forall T \geq 1$, 
	\begin{equation}\label{eq:inexactLCconvergencedecreasedconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \frac{L V_{\psi}(\mathbf{x}^\star,\mathbf{x}_0)}{(T+1)^2}\left(1 + \frac{2}{\delta} + \frac{8\rho}{\delta^2}\right).
	\end{equation}
\end{corollary}
\begin{remark}
	To preserve the accelerated rate $O(1/k^2)$ for convex objectives, existing works \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019} require the error sequence to decrease at the rate of $O(1/k^{4+\delta})$. In contrast, our analysis suggests that $O(1/k^{3+\delta})$ is sufficient. This is consistent with our tighter bound shown in Corollary~\ref{th:inexactLCconvergenceconstantconvex}.
\end{remark}
As observed in \eqref{eq:inexactLCconvergencedecreasedconvex}, the objective value converges faster with a more accurate proximal operator (i.e., a larger value for $\delta$). However, a larger $\delta$ also requires more computation time for each iteration. 


 

\noindent{\bf Comparison with Alternating Direction Method of Multipliers (ADMM)} Besides the accelerated proximal method, ADMM is another popular method for solving \eqref{eq:compositieproblem} due to its simplicity and applicability to broad applications \citep{boyd2011distributed}. It also allows inexact minimization of sub-problem to some extent. However, it is well-known that ADMM converges at the rate of $O(1/T)$ for convex objectives \citep{he2012onthe}. In contrast, the convergence rate of accelerated proximal method is $O(1/T^2)$, which is more desirable for large-scale machine learning problems.  

As discussed in \citep{boyd2011distributed}, ADMM will converge even when the subproblems of each iteration are not solved exactly, as long as the approximate solutions satisfy certain suboptimality measures. In other words, ADMM also suffers from error accumulation if the sub-problems cannot be exactly solved, for example, when the proximal operator does not admit a closed-form solution. We note that our inexactness conditions \eqref{eq:suboptimalsolutiongradientdescentcomposite} and \eqref{eq:suboptimalsolutionmirrordescentcomposite} are absolute criteria, which are the same as those used in \citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}. In contrast, existing works on convergence analysis of ADMM are mainly based on a relative error accuracy \citep{he2002anew,eckstein2018relative,alves2020relative} that is generally a stronger inexactness condition than ours. If ADMM adopts the same inexactness criterion as ours, under the same setting as Corollary~\ref{th:inexactLCconvergenceconstantconvex}, one can show that the convergence rate of inexact ADMM is
\begin{equation*}
	f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \frac{\nu \|\mathbf{x}_0 - \mathbf{x}^\star\|^2}{T} + \gamma T\xi,
\end{equation*}
where $\nu$ and $\gamma$ are some constants. Comparing it with \eqref{eq:inexactLCconvergenceconstantconvex}, we can observe that both ADMM and our result have $O(T\xi)$ error accumulation. However, our result achieves an $O(1/T^2)$ convergence rate while the rate is only $O(1/T)$ for ADMM.




\subsection{Convergence Rates of Strongly Convex $g$}\label{sec:convergenceratestronglyconvex}


In the previous section, we presented the convergence results of linear coupling with inexact proximal operators for convex $g$. Next, we present the convergence results for strongly convex $g$. In this setting, we assume $\|\cdot\| = \|\cdot\|_2$, which follows the custom \citep{nesterov2013introductory} in the convergence analysis of optimization algorithms for strongly convex objectives. It is mainly used to simplify the convergence proofs. Note that whether the same properties obtained for the $\ell_2$-norm can be generalized to other norms is still an open question. However, even for the
$\ell_2$-norm, as we shall see in Remark~\ref{re:comparisonconvergenceconstantstronglyconvex}, our convergence rate is still better than those of previous works.




We first introduce the analogue of Lemma~\ref{th:Lyapunovfunctionconvex} for strongly convex $g$. By considering the inexactness of $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$, the next lemma derives a characteristic inequality for a specific Lyapunov function for strongly convex objectives.
\begin{lemma}\label{th:Lyapunovfunctionstronglyconvex}
	Under the same setting as Lemma~\ref{th:xisubgradientofinexactsolution}, 
	if $g$ is strongly convex, $\tau_k = \tfrac{L\alpha_{k+1} - \mu}{L - \mu}$ and $\eta_{k+1} \!=\! \tfrac{1}{L\alpha_{k+1}}$, then $\forall k \geq 0$,
	\begin{align}\label{eq:Lyapunovfunctionstronglyconvex}
		& f(\mathbf{y}_{k+1}) - f(\mathbf{x}^\star) + L\alpha_{k+1}^2 V_\psi(\mathbf{x}^\star, \mathbf{z}_{k+1}) \nonumber \\
	& \leq  \big(1 - \alpha_{k+1}\big)\left(f(\mathbf{y}_k) - f(\mathbf{x}^\star) + L\alpha_k^2V_\psi(\mathbf{x}^\star, \mathbf{z}_k)\right) \nonumber \\
		& + \sqrt{2\rho L\alpha_{k+1}^3\xi_{k+1}}\|\mathbf{x}^\star - \mathbf{z}_{k+1}\| +  \big(1 + \alpha_{k+1}\big) \xi_{k+1}.
	\end{align}
\end{lemma} 
The proof of Lemma~\ref{th:Lyapunovfunctionstronglyconvex} is given in Appendix~A.8. From the Lyapunov function, we obtain a general convergence result for linear coupling with inexact proximal operators.




We define $\Delta_k \dtb f(\mathbf{y}_k) - f(\mathbf{x}^\star) + L\alpha_k^2 V_{\psi}\left(\mathbf{x}^\star, \mathbf{z}_k\right), \forall k \geq 0$. Theorem~\ref{th:inexactLCconvergencestronglyconvex} presents the convergence result of linear coupling with inexact proximal operators for strongly convex $g$. 
\begin{theorem}\label{th:inexactLCconvergencestronglyconvex}
	%Let $\mathbf{y}_{k+1}$ and $\mathbf{z}_{k+1}$ be $\xi_{k+1}$-suboptimal solutions to \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite}, respectively. 
	Under the same setting as Lemma~\ref{th:xisubgradientofinexactsolution}, 
	if $g$ is strongly convex, $\tau_k = \tfrac{L\alpha_{k+1} - \mu}{L - \mu}$ and $\eta_{k+1} \!=\! \tfrac{1}{L\alpha_{k+1}}$, then $\forall T \geq 1$:
	\begin{equation}\label{eq:inexactLCconvergencestronglyconvexgeneralized}
		\!\!\Delta_T \leq \Gamma_T\left(\!\!\Delta_0 + \sum_{k=1}^T\!\frac{\sqrt{2\rho L\alpha_k^3\xi_k}\|\mathbf{x}^\star - \mathbf{z}_k\| + 2\xi_k}{\Gamma_k} \!\right), \!\!\!
	\end{equation}
	where $\Gamma_k \dtb \prod_{i=1}^k (1 - \alpha_i)$. If $\alpha_0 = \sqrt{\tfrac{\mu}{L}}$, then $\forall T \geq 1$:
	\begin{equation}\label{eq:inexactLCconvergencestronglyconvex}
		\!\!\!f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq \left(\!1 - \sqrt{\frac{\mu}{L}} \right)^{T}\left(3\widetilde{\Delta}_0 + \widetilde{R}_T + \widehat{R}_T \right),\!\!
	\end{equation}
	where $\widetilde{\Delta}_0  \!\dtb\! f(\mathbf{x}_0\!) - \!f(\mathbf{x}^\star\!), \widetilde{R}_T \dtb 3\textstyle\sum_{k=1}^T \left(1 \!-\! \sqrt{\frac{\mu}{L}} \right)^{-k}\!\! \xi_k$
	\mbox{ and } $\widehat{R}_T \dtb  6\rho\sqrt{\tfrac{\mu}{L}}\left(\textstyle\sum_{k=1}^T \left(1 - \sqrt{\frac{\mu}{L}} \right)^{-k/2} \sqrt{\xi_k}\right)^2$.
\end{theorem}
The proof of Theorem~\ref{th:inexactLCconvergencestronglyconvex} is given in Appendix~A.9.
\begin{remark}\label{re-comparsionofstronglyconvex}
	If the proximal operator is exact (i.e., $\xi_k = 0, \forall k \geq 1$), it leads to $\widetilde{R}_T = 0$ and $\widehat{R}_T = 0$. Then, \eqref{eq:inexactLCconvergencestronglyconvex} recovers the accelerated complexity $O(\sqrt{L/\mu}\log(1/\epsilon))$ for $\mu$-strongly convex objectives \citep{nesterov2013introductory}. 
\end{remark}
\begin{corollary}\label{th:inexactLCconvergenceconstantstronglyconvex}
	Consider the same setting as Theorem~\ref{th:inexactLCconvergencestronglyconvex}, if $\xi_k \leq \xi$ for all $k \geq 1$, then $\forall T \geq 1$:
	\begin{equation}\label{eq:inexactLCconvergenceconstantstronglyconvex}
		\!\!\! f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq 3\left(\!1 - \sqrt{\frac{\mu}{L}} \right)^T \!\!\widetilde{\Delta}_0 + (3+24\rho)\sqrt{\frac{L}{\mu}}\xi.\!\!
	\end{equation}
\end{corollary}
\begin{remark}\label{re:comparisonconvergenceconstantstronglyconvex}
	Under the same setting as Corollary~\ref{th:inexactLCconvergenceconstantstronglyconvex}, based on \cite[Proposition~4]{schmidt2011convergence}, we have
	\begin{equation}\label{eq:IPMCconvergenceconstantstronglyconvex}
		\!\!\!f(\mathbf{y}_T\!) - \!f(\mathbf{x}^\star\!) \leq 4\left(\!1 - \sqrt{\frac{\mu}{L}} \right)^T\!\!\! \widetilde{\Delta}_0 + \left(\frac{64L^2}{\mu^2} + 4\sqrt{\frac{\mu}{L}}\right)\xi. 
	\end{equation}
	Based on \cite[Theorem 3]{lin2017catalyst}, we have
	\begin{equation}\label{eq:CatalystCconvergenceconstantstronglyconvex} 
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq 4\left(1 - \sqrt{\frac{\mu}{L}} \right)^T \widetilde{\Delta}_0 + \frac{72L}{\mu}\xi.
	\end{equation}
	Based on \cite[Proposition~4]{kulunchakov2019}, we have
	\begin{equation}\label{eq:SCOCconvergenceconstantstronglyconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq 2\left(1 - \frac{1}{2}\sqrt{\frac{\mu}{L}} \right)^T \widetilde{\Delta}_0 + \frac{8L}{\mu}\xi.
	\end{equation}
	Comparing \eqref{eq:inexactLCconvergenceconstantstronglyconvex} with \eqref{eq:IPMCconvergenceconstantstronglyconvex}, \eqref{eq:CatalystCconvergenceconstantstronglyconvex} and \eqref{eq:SCOCconvergenceconstantstronglyconvex}, our analysis achieves a better bound than existing works. Specifically, our error bound on $\xi$ is $O(\sqrt{\tfrac{L}{\mu}}\xi)$, while it is $O(\tfrac{L^2}{\mu^2}\xi)$ for \citep{schmidt2011convergence} and $O(\tfrac{L}{\mu}\xi)$ for \citep{lin2017catalyst,kulunchakov2019}.
\end{remark}




To preserve accelerated rate for Algorithm~\ref{alg:linearcoupling}, Theorem~\ref{th:inexactLCconvergencestronglyconvex} implies that the error sequence $\{\xi_k\}_{k\geq1}$ needs to decrease to $0$ at a linear rate.
\begin{corollary}\label{th:inexactLCconvergencedecreasedstronglyconvex}
	Under the same setting as in Theorem~\ref{th:inexactLCconvergencestronglyconvex}, for any $\vartheta \in (0, \sqrt{\mu/L})$, if $\xi_k$ is chosen as
	$\xi_k \leq \frac{1}{1+2\rho}\widetilde{\Delta}_0\left(1 - \vartheta\right)^k$, then $\forall T \geq 1$
	%then the output of Algorithm~\ref{alg:linearcoupling} satisfies
	\begin{equation}\label{eq:inexactLCconvergencedecreasedstronglyconvex}
		f(\mathbf{y}_T) - f(\mathbf{x}^\star) \leq (1 - \vartheta)^{T+1}\frac{12\widetilde{\Delta}_0}{(\sqrt{\mu/L} - \vartheta)^2}.
	\end{equation}
\end{corollary}
Similar to the conclusion for convex $g$, the objective with strongly convex $g$ has a faster convergence speed when the error of inexact proximal operators decreases at a faster rate (i.e., a larger value for $\vartheta$).





\begin{figure*}[t]
	\begin{center}
		\begin{tabular}{ccc}
			\hspace{-20pt}
			\includegraphics[height = 4.30 cm]{figures/a1a_doPoly.pdf}&
			\includegraphics[height = 4.30 cm]{figures/secom_doPoly.pdf}&
			\includegraphics[height = 4.30 cm]{figures/mushroom_doPoly.pdf}
		\end{tabular}
	\end{center}
	\caption{Results of linear coupling and ADMM with inexact proximal operator for CUR-like factorization. Objective function values vs.\ number of iterations for different qualities of approximate solution of the proximal operator. From left to right: \texttt{a1a}, \texttt{secom} and \texttt{mushroom}. Best viewed on screen with zoom, as the difference between ADMM with $\xi_k \leq O(1/k^3)$ and $\xi_k \leq O(1/k^4)$ is insignificant compared to the results of linear coupling.}\label{fig:CURlikefactorization}
\end{figure*}

\section{Experiments}\label{sec:experiment}

In this section, we conduct two experiments to verify our theoretical results.




\subsection{CUR-like Factorization}\label{sec:cuklikefactorization}
We first apply Algorithm~\ref{alg:linearcoupling} to solve the CUR-like factorization optimization problem \citep{mairal2011convex}. 
For a given matrix $\mathbf{D} \in \mathbb{R}^{m \times n}$, the CUR-like factorization aims to approximate $\mathbf{D}$ by a matrix $\mathbf{X}$ with sparse rows and sparse columns.
\begin{equation*}
	\!\!\!\!\min_{\mathbf{X} \in \mathbb{R}^{m \times n}} \frac{1}{2}\left\|\mathbf{D}\mathbf{X}\mathbf{D} - \mathbf{D}\right\|_F^2 + \lambda_1\sum_{i=1}^m \left\|\mathbf{X}_{i,\cdot}\right\|_2 + \lambda_2\sum_{j=1}^n \left\|\mathbf{X}_{\cdot,j}\right\|_2,
\end{equation*}
where $\mathbf{X}_{i,\cdot}$ and $\mathbf{X}_{\cdot,j}$ denote the $i$-th row and the $j$-th column of $\mathbf{X}$, respectively. The last two terms impose the $\ell_{2,1}$ norms on both the rows and the columns of $\mathbf{X}$, which yields both sparse rows and sparse columns. However, the proximal operator of this regularizer does not admit an analytical solution. There is not even an iterative algorithm that can exactly compute the proximal operator.


Following \citep{schmidt2011convergence}, we approximately compute the proximal operator by the block coordinate descent (BCD) algorithm presented by \cite{jenatton2011structured}, which efficiently obtains an approximate solution of the proximal operator. The BCD alternates between computing the proximal operator with respect to the rows and to the columns. At each iteration, we employ the BCD to solve \eqref{eq:gradentdescentconvexcomposite} and \eqref{eq:mirrordescentconvexcomposite} until \eqref{eq:suboptimalsolutiongradientdescentcomposite} and \eqref{eq:suboptimalsolutionmirrordescentcomposite} are satisfied, which means that both $\mathbf{y}_k$ and $\mathbf{z}_k$ are $\xi_k$-suboptimal solutions.

As suggested by Corollary~\ref{th:inexactLCconvergencedecreasedconvex}, we consider decreasing error sequences $\{\xi_k\}_{k \geq 1}$ with $\xi_k \leq 1/k^\alpha$, where $\alpha$ is set to $\alpha = 1, 2, 3, 4$ in our experiments. As discussed before, the accelerated convergence rate of linear coupling can be preserved if $\alpha > 3$.



We perform experiments on four data sets\footnote{The datasets can be downloaded at \url{https://archive.ics.uci.edu/ml/datasets.php}.}: \texttt{mushroom}, \texttt{secom}, \texttt{a1a} and \texttt{musk}. We set $\lambda_1 = 0.01$ and $\lambda_2 = 0.01$ for all four datasets. Rather than assume the Lipschitz constant $L$ is known, we estimate it by line search \citep{nesterov2013introductory}. Specifically, we initialize the value of $L$ as $L = 0.5$ and double it if the following inequality is not satisfied
\begin{equation*}
	g(\mathbf{y}_k) \leq g(\mathbf{x}_k) + \langle\nabla g(\mathbf{x}_k), \mathbf{y}_k -  \mathbf{x}_k\rangle + \frac{L}{2}\|\mathbf{y}_k -  \mathbf{x}_k\|^2.
\end{equation*}
In our experiments, we observed that this strategy always performs better than a fixed but conservative value for $L$.



\begin{figure*}[t]
	\begin{center}
		\begin{tabular}{ccc}
			\hspace{-25pt}
			\includegraphics[height = 4.30 cm]{figures/lena_doPoly.pdf}&
			\includegraphics[height = 4.30 cm]{figures/boat_doPoly.pdf}&
			\includegraphics[height = 4.30 cm]{figures/football_doPoly.pdf}
		\end{tabular}
	\end{center}
	\caption{Results of linear coupling with inexact proximal operator for image deblurring with isotropic total variation. Objective function values vs.\ number of iterations for different qualities of approximate solution of the proximal operator. From left to right: \texttt{Lena}, \texttt{Boat} and \texttt{Football}.}\label{fig:imagedeblurringITV}
\end{figure*}



To demonstrate the accelerated convergence rate of linear coupling, we compare it with ADMM. It is well-known that the performance of ADMM is highly dependent on the choice of the penalty parameter $\varrho$.
To show the best performance of ADMM, we perform a grid search to find the best value of $\varrho$ and fix it for all iterations. 

Fig.~\ref{fig:CURlikefactorization} shows the objective function values versus the number of iterations of inexact linear coupling on the three data sets: \texttt{a1a}, \texttt{secom} and \texttt{mushroom}. For linear coupling, the choice of $\xi_k \leq 1/k^4$, which achieves the fastest convergence rate according to our analysis (refer to Corollary~\ref{th:inexactLCconvergencedecreasedconvex}), provides the best empirical performance across all three data sets. However, as the iterations proceed, the performance gap between the choices of $\xi_k \leq 1/k^3$ and $\xi_k \leq 1/k^4$ becomes smaller. This is consistent with our theoretical results that the error sequences only need to decrease faster than $O(1/k^3)$ instead of $O(1/k^4)$
\citep{schmidt2011convergence,lin2017catalyst,kulunchakov2019}.


Furthermore, as observed from Fig.~\ref{fig:CURlikefactorization}, linear coupling clearly outperforms ADMM on this task. Here, we show the results of ADMM with $\xi_k \leq 1/k^3$ and $\xi_k \leq 1/k^4$. In fact, we observed that the results of ADMM for $\alpha \geq 2$ are very close to each other, which is consistent with the theoretical result. Specifically, the error sequences should decrease faster than $O(1/k^2)$ if the convergence rate of the algorithm is $O(1/k)$, as is the case, for example, for the non-accelerated proximal method.   




\subsection{Image Deblurring with Isotropic Total Variation}
For the second experiment, we consider image deblurring with isotropic total variation regularization \citep{chambolle2004analgorithm,beck2009fast}, which does not admit an analytical solution for the proximal operator \citep{chambolle2011afirstorder,beck2009fast}. For $\mathbf{X} \in \mathbb{R}^{m\times n}$, the discrete gradient operator is
\begin{equation*}
	\!\!(\nabla\mathbf{X})_{ij} = \left\{\!\!\!\!\begin{array}{ll}
		(X_{ij} \!-\! X_{i+1,j}, X_{ij} \!-\! X_{i,j+1})		& \mbox{if } i < m, j < n \\
		(0, X_{ij} \!-\! X_{i,j+1})								& \mbox{if } i = m, j < n \\
		(X_{ij} \!-\! X_{i+1,j}, 0)		& \mbox{if } i < m, j = n \\
		(0, 0)		& \mbox{if } i = m, j = n
	\end{array}\right.
\end{equation*}
Then, image deblurring with isotropic total variation regularization can be written as \citep{chambolle2016introduction,beck2009fast}
\begin{equation*} 
	\min_{\mathbf{X} \in \mathcal{C}} \frac{1}{2}\big\|\mathcal{A}(\mathbf{X}) - \mathbf{B}\big\|_F^2 + \lambda\sum_{i=1}^m\sum_{j=1}^n\|(\nabla\mathbf{X})_{ij}\|,
\end{equation*}
where $\mathcal{A}: \mathbb{R}^{m \times n} \rightarrow \mathbb{R}^{m \times n}$ is a linear operator representing some blurring processing, $\mathbf{B} \in \mathbb{R}^{m \times n}$ is the blurred
and noisy image, and $\lambda > 0$ denotes a regularization parameter. Unlike the $\ell_1$ total variation, the proximal operator of isotropic total variation regularization does not admit an analytical solution. As before, we employ the BCD algorithm to compute an approximate solution. 



We conducted this experiment on three images, \texttt{Lena}, \texttt{Boat} and \texttt{Football}, from the \MATLAB{} image processing toolbox. We first resized each image to $128 \times 128$ pixels. To obtain $\mathbf{B}$, the clean image was first blurred by a $5 \times 5$ kernel matrix: $\mathbf{S} = \frac{1}{25}\mathbf{I}_{5 \times 5}$, where $\mathbf{I}_{5 \times 5} \in \mathbb{R}^{5 \times 5}$ is an identity matrix, followed by additive Gaussian noise with zero mean and standard deviation $10^{-1}$. The regularization parameter $\lambda$ was set to $0.1$ for all three images.


Fig.~\ref{fig:imagedeblurringITV} shows the objective function values versus the number of iterations of inexact linear coupling on \texttt{Lena}, \texttt{Boat} and \texttt{Football}.
Trends similar to those in the first experiment are observed, except that the performances of $\xi_k \leq 1/k^3$ and $\xi_k \leq 1/k^4$ are very close to each other.
 
 
In this experiment, we did not run ADMM because solving its subproblem associated with $\frac{1}{2}\big\|\mathcal{A}(\mathbf{X}) - \mathbf{B}\big\|_F^2$ is computationally expensive.
Nevertheless, the experiment in Section~\ref{sec:cuklikefactorization} already clearly shows the advantages of linear coupling over ADMM.
 

 
 
 
 




\section{Conclusion}\label{sec:conclusion}
Non-smooth regularizations have played irreplaceable roles in machine learning. However, many of them do not admit an analytical solution for the proximal operator. As a variant of Nesterov's AGDs, linear coupling can efficiently solve convex composite minimization with accelerated convergence rates if the proximal operator is exactly computed. In this work, we present a complete convergence analysis for linear coupling with inexact proximal operators. Our analysis suggests that inexact linear coupling still achieves the accelerated convergence rate if the error sequence of the inexact proximal operator decreases at a sufficiently fast rate. More importantly, our theoretical results are better than those of previous works. We empirically verify our theoretical analysis by employing linear coupling with inexact proximal operators to solve CUR-like factorization and image deblurring with isotropic total variation on different datasets.



\begin{acknowledgements} % will be removed in pdf for initial submission,
	% so you can already fill it to test with the
	% ‘accepted’ class option
	Qiang Zhou is supported by National Science Foundation of China under Grant 62106045 and Southeast University Startup Fund (the Fundamental Research Funds for the Central Universities) under Grant 2242021R10096. Sinno J. Pan is supported by 2020 Microsoft Research Asia collaborative research grant.
\end{acknowledgements}


%\cleardoublepage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Reference 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliography{zhou_48}



\end{document}
