﻿/**
File:		MachineLearning/Optimization/Unconstrained/L-BFGS.h

Author:		
Email:		
Site:       

Copyright (c) 2019 . All rights reserved.
*/

#pragma once

#include <MachineLearning/BaseGradientOptimizationMethod.h>

namespace NeuralEngine
{
	namespace MachineLearning
	{
		////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>	Limited-memory BFGS (L-BFGS or LM-BFGS). </summary>
		///
		/// <remarks>	
		/// 	<para>
		///			Limited-memory BFGS (L-BFGS or LM-BFGS) is an optimization algorithm in the family 
		///			of quasi-Newton methods that approximates the Broyden–Fletcher–Goldfarb–Shanno (BFGS) 
		///			algorithm using a limited amount of computer memory. It is a popular algorithm for 
		///			parameter estimation in machine learning.[1][2] The algorithm's target problem is to 
		///			minimize $f(\mathbf{x})$ over unconstrained values of the real-vector 
		///			$\mathbf{x}$ where $f$ is a differentiable scalar function.
		///		</para>
		/// 
		///		<para>
		///			Like the original BFGS, L-BFGS uses an estimation to the inverse Hessian matrix 
		///			to steer its search through variable space, but where BFGS stores a dense $n\times n$ 
		///			approximation to the inverse Hessian (n being the number of variables in the problem), 
		///			L-BFGS stores only a few vectors that represent the approximation implicitly. Due to 
		///			its resulting linear memory requirement, the L-BFGS method is particularly well suited 
		///			for optimization problems with a large number of variables. Instead of the inverse 
		///			Hessian $\mathbf{H}_k$, L-BFGS maintains a history of the past m updates of the position 
		///			$\mathbf{x}$ and gradient ∇f(x), where generally the history size $m$ can be small 
		///			(often $m<10$). These updates are used to implicitly do operations requiring the 
		///			$\mathbf{H}_k$-vector product.
		///		</para>
		/// 
		///		<para>
		/// 	  References:
		/// 	  <list type="bullet">
		///			<item>
		/// 	    	  <description><a href="http://users.iems.northwestern.edu/~nocedal/PDFfiles/limited.pdf">
		/// 				Byrd, R. and Lu, P. and Nocedal, J. and Zhu, C. (1995). "A Limited Memory
		/// 				Algorithm for Bound Constrained Optimization". SIAM Journal on Scientific Computing
		/// 				16 (5). pp. 1190-1208. doi:10.1137/0916069</a>
		/// 	       </description>
		/// 	    </item>
		/// 	    <item>
		/// 	    	  <description><a href="https://dl.acm.org/citation.cfm?id=1118871">
		/// 				Malouf, Robert (2002). "A comparison of algorithms for maximum entropy parameter 
		/// 				estimation". Proceedings of the Sixth Conference on Natural Language Learning 
		/// 				(CoNLL-2002). pp. 49–55. doi:10.3115/1118853.1118871</a>
		/// 	       </description>
		/// 	    </item>
		/// 	    <item>
		/// 	    	  <description><a href="https://www.microsoft.com/en-us/research/publication/scalable-training-of-l1-regularized-log-linear-models/?from=http%3A%2F%2Fresearch.microsoft.com%2Fapps%2Fpubs%2Fdefault.aspx%3Fid%3D78900">
		/// 				Andrew, Galen; Gao, Jianfeng (2007). "Scalable training of L₁-regularized 
		/// 				log-linear models". Proceedings of the 24th International Conference on 
		/// 				Machine Learning. doi:10.1145/1273496.1273501. ISBN 9781595937933</a>
		/// 	       </description>
		/// 	    </item>
		/// 	   </list>
		/// 	</para>
		/// 	
		/// 	HmetalT, 02.05.2019. 
		/// </remarks>
		////////////////////////////////////////////////////////////////////////////////////////////////////
		template<typename Scalar, LineSearchType LSType = MoreThuente>
		class NE_IMPEXP LBFGSSolver : public BaseGradientOptimizationMethod<Scalar, LSType>
		{
		public:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the L-BFGS optimization algorithm. </summary>
			///
			/// <remarks>	
			/// 	NOTE(review): this single-argument constructor is not marked \c explicit, so an
			/// 	\c int can be implicitly converted to a solver. Consider adding \c explicit once
			/// 	all call sites have been checked.
			/// 	
			/// 	Admin, 3/27/2017. 
			/// </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the optimization problem.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSSolver(int numberOfVariables);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Creates a new instance of the L-BFGS optimization algorithm for a user supplied callable.
			/// </summary>
			///
			/// <remarks>	 Admin, 3/27/2017. </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the function to be optimized.
			/// </param>
			/// <param name="function">
			/// 	Callable taking a <c>const af::array&amp;</c> and a mutable <c>af::array&amp;</c> and
			/// 	returning a \c Scalar. Presumably the first argument is the current point, the
			/// 	return value is the objective value and the second argument receives the
			/// 	gradient (there is no separate gradient parameter) - TODO confirm against the
			/// 	implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSSolver(int numberOfVariables,
				std::function<Scalar(const af::array&, af::array&)> function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the L-BFGS optimization algorithm. </summary>
			///
			/// <remarks>	
			/// 	NOTE(review): not marked \c explicit, so a raw
			/// 	<c>NonlinearObjectiveFunction&lt;Scalar&gt;*</c> converts implicitly to a solver.
			/// 	Ownership of the pointer is not visible from this header - verify in the
			/// 	implementation before deleting or sharing the object.
			/// 	
			/// 	Admin, 3/27/2017. 
			/// </remarks>
			///
			/// <param name="function">	The objective function and gradients whose optimum values should be found. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSSolver(NonlinearObjectiveFunction<Scalar>* function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Destructor. Declared here without an inline body. </summary>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			~LBFGSSolver();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets number of corrections. </summary>
			///
			/// <remarks>	
			/// 	The number of corrections to approximate the inverse hessian matrix.
			///		The L-BFGS routine stores the computation results of previous \ref m
			///		iterations to approximate the inverse hessian matrix of the current
			///		iteration. This parameter controls the size of the limited memories
			///		(corrections). The default value is \c 6. Values less than \c 3 are
			///		not recommended. Large values will result in excessive computing time.
			/// 			
			/// 	Hmetal T, 04/06/2019. 
			/// </remarks>
			///
			/// <param name="corrections">
			/// 	The number of past corrections to keep for the inverse Hessian approximation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetNumCorrections(int corrections);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets Delta for convergence test. </summary>
			///
			/// <remarks>	
			/// 	The algorithm stops when the following condition is met,
			/// 	\f$(f_{k-d}(x)-f_k(x))/f_k(x)<\delta\f$, where \f$f_k(x)\f$ is		
			/// 	the current function value, \f$f_{k-d}(x)\f$ is the function value				
			/// 	\f$d\f$ iterations ago (specified by the \ref past parameter).						
			/// 	The default value is \c 0.								
			/// 													
			/// 	Hmetal T, 06/06/2019. 
			/// </remarks>
			///
			/// <param name="inDelta">
			/// 	The threshold \f$\delta\f$ used by the delta-based convergence test.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetDelta(Scalar inDelta);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets the maximum number of trials for the line search. </summary>
			///
			/// <remarks>	
			/// 	This parameter controls the number of function and gradients evaluations
			/// 	per iteration for the line search routine. The default value is \c 20.		
			/// 			
			/// 	Hmetal T, 06/06/2019. 
			/// </remarks>
			///
			/// <param name="maxIter">	The maximum number of line search trials per iteration. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetMaxLinesearch(int maxIter);

		protected:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Implements the actual optimization algorithm. This method should try to minimize the
			/// 	objective function. Overrides the virtual method of the base class.
			/// </summary>
			///
			/// <remarks>	Hmetal T, 11.04.2017. </remarks>
			///
			/// <param name="cycle">
			/// 	[in,out] Optional pointer, \c nullptr by default. Its meaning is not visible from
			/// 	this header; presumably an iteration/cycle counter shared with the caller -
			/// 	TODO confirm against the base class and the implementation.
			/// </param>
			///
			/// <returns>	true if it succeeds, false if it fails. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			virtual bool Optimize(int* cycle = nullptr) override;

			// NOTE: the two line search declarations below are commented out and therefore not part
			// of the class. They are kept for reference only.

			//////////////////////////////////////////////////////////////////////////////////////////////////////
			///// <summary>	Line search by backtracking. </summary>
			/////
			///// <remarks>	Hmetal T, 04/06/2019. </remarks>
			/////
			///// <param name="fx">  	[in,out] The effects. </param>
			///// <param name="x">   	[in,out] The af::array to process. </param>
			///// <param name="grad">	[in,out] The graduated. </param>
			///// <param name="step">	[in,out] Amount to increment by. </param>
			///// <param name="drt"> 	The drt. </param>
			///// <param name="xp">  	The XP. </param>
			//////////////////////////////////////////////////////////////////////////////////////////////////////
			//void LineSearchBacktracking(Scalar& fx, af::array& x, af::array& grad, Scalar& step, const af::array& drt, const af::array& xp);

			//////////////////////////////////////////////////////////////////////////////////////////////////////
			///// <summary>	Line search bracketing. </summary>
			/////
			///// <remarks>	
			///// 	Similar to the backtracking line search
			/////		except that it actively maintains an upper and lower bound of the
			/////		current search range.
			///// 					
			///// 	Hmetal T, 06/06/2019. 
			///// </remarks>
			/////
			///// <param name="fx">  	[in,out] The effects. </param>
			///// <param name="x">   	[in,out] The af::array to process. </param>
			///// <param name="grad">	[in,out] The graduated. </param>
			///// <param name="step">	[in,out] Amount to increment by. </param>
			///// <param name="drt"> 	The drt. </param>
			///// <param name="xp">  	The XP. </param>
			//////////////////////////////////////////////////////////////////////////////////////////////////////
			//void LineSearchBracketing(Scalar& fx, af::array& x, af::array& grad, Scalar& step, const af::array& drt, const af::array& xp);

		private:
			// Solver parameters. None of them has an in-class initializer, so their initial values
			// must be set by the constructors (not visible in this header).
			int m;				// The number of corrections to approximate the inverse Hessian matrix (see SetNumCorrections).
			int past;			// Distance (in iterations) for the delta-based convergence test.
			Scalar delta;		// Delta for the convergence test (see SetDelta).
			int max_linesearch;	// The maximum number of trials for the line search (see SetMaxLinesearch).
			Scalar min_step;	// The minimum step length allowed in the line search.
			Scalar max_step;	// The maximum step length allowed in the line search.
			Scalar ftol;		// A parameter to control the accuracy of the line search routine.
			Scalar wolfe;		// Parameter of the Wolfe condition used by the line search.
		};
	}
}
