﻿/**
File:		MachineLearning/Optimization/Unconstrained/FgLBFGS.h

Author:		
Email:		
Site:       

Copyright (c) 2019 . All rights reserved.
*/

#pragma once

#include <MachineLearning/BaseGradientOptimizationMethod.h>

namespace NeuralEngine
{
	namespace MachineLearning
	{
		////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>	Limited-memory BFGS (L-BFGS or LM-BFGS). </summary>
		///
		/// <remarks>	
		/// 	<para>
		///			Limited-memory BFGS (L-BFGS or LM-BFGS) is an optimization algorithm in the family 
		///			of quasi-Newton methods that approximates the Broyden–Fletcher–Goldfarb–Shanno (BFGS) 
		///			algorithm using a limited amount of computer memory. It is a popular algorithm for 
		///			parameter estimation in machine learning. The algorithm's target problem is to 
		///			minimize $f(\mathbf{x})$ over unconstrained values of the real vector 
		///			$\mathbf{x}$ where $f$ is a differentiable scalar function.
		///		</para>
		/// 
		///		<para>
		///			Like the original BFGS, L-BFGS uses an estimate of the inverse Hessian matrix 
		///			to steer its search through variable space, but where BFGS stores a dense $n\times n$ 
		///			approximation to the inverse Hessian (n being the number of variables in the problem), 
		///			L-BFGS stores only a few vectors that represent the approximation implicitly. Due to 
		///			its resulting linear memory requirement, the L-BFGS method is particularly well suited 
		///			for optimization problems with a large number of variables. Instead of the inverse 
		///			Hessian $\mathbf{H}_k$, L-BFGS maintains a history of the past $m$ updates of the position 
		///			$\mathbf{x}$ and gradient $\nabla f(\mathbf{x})$, where generally the history size $m$ can be small 
		///			(often $m<10$). These updates are used to implicitly do operations requiring the 
		///			$\mathbf{H}_k$-vector product.
		///		</para>
		/// 
		///		<para>
		/// 	  References:
		/// 	  <list type="bullet">
		///			<item>
		/// 	    	  <description><a href="http://users.iems.northwestern.edu/~nocedal/PDFfiles/limited.pdf">
		/// 				Byrd, R. and Lu, P. and Nocedal, J. and Zhu, C. (1995). "A Limited Memory
		/// 				Algorithm for Bound Constrained Optimization". SIAM Journal on Scientific Computing
		/// 				16 (5). pp. 1190-1208. doi:10.1137/0916069</a>
		/// 	       </description>
		/// 	    </item>
		/// 	    <item>
		/// 	    	  <description><a href="https://dl.acm.org/citation.cfm?id=1118871">
		/// 				Malouf, Robert (2002). "A comparison of algorithms for maximum entropy parameter
		/// 				estimation". Proceedings of the Sixth Conference on Natural Language Learning
		/// 				(CoNLL-2002). pp. 49–55. doi:10.3115/1118853.1118871</a>
		/// 	       </description>
		/// 	    </item>
		/// 	    <item>
		/// 	    	  <description><a href="https://www.microsoft.com/en-us/research/publication/scalable-training-of-l1-regularized-log-linear-models/?from=http%3A%2F%2Fresearch.microsoft.com%2Fapps%2Fpubs%2Fdefault.aspx%3Fid%3D78900">
		/// 				Andrew, Galen; Gao, Jianfeng (2007). "Scalable training of L₁-regularized
		/// 				log-linear models". Proceedings of the 24th International Conference on
		/// 				Machine Learning. doi:10.1145/1273496.1273501. ISBN 9781595937933</a>
		/// 	       </description>
		/// 	    </item>
		/// 	   </list>
		/// 	</para>
		/// 	
		/// 	HmetalT, 02.05.2019. 
		/// </remarks>
		////////////////////////////////////////////////////////////////////////////////////////////////////
		template<typename Scalar, LineSearchType LSType = MoreThuente>
		class NE_IMPEXP LBFGSBSolver : public BaseGradientOptimizationMethod<Scalar, LSType>
		{
		public:
			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Constructs a solver from the problem dimension only. </summary>
			///
			/// <remarks>
			/// 	Admin, 3/27/2017.
			/// 	<para>No objective function is passed to this overload.</para>
			/// </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	Count of free parameters in the optimization problem.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSBSolver(int numberOfVariables);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Constructs a solver from the problem dimension and an objective callback. </summary>
			///
			/// <remarks>	Admin, 3/27/2017. </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	Count of free parameters in the function to be optimized.
			/// </param>
			/// <param name="function">
			/// 	Callable taking a const af::array and a mutable af::array and returning a Scalar.
			/// 	NOTE(review): presumably it returns the objective value at the first argument and writes
			/// 	the gradient into the second - confirm against the base class.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSBSolver(int numberOfVariables,
				std::function<Scalar(const af::array&, af::array&)> function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Constructs a solver from an objective-function object. </summary>
			///
			/// <remarks>
			/// 	Admin, 3/27/2017.
			/// 	<para>NOTE(review): ownership of the pointer is not visible from this header - confirm.</para>
			/// </remarks>
			///
			/// <param name="function">
			/// 	Pointer to the objective function (with gradients) whose optimum should be found.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			LBFGSBSolver(NonlinearObjectiveFunction<Scalar>* function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets the history size. </summary>
			///
			/// <param name="hs">
			/// 	The new history size. NOTE(review): presumably the number of past updates kept (the
			/// 	$m$ of the class description), stored in m_historySize - confirm in the implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetHistorySize(const int hs);

			/// <summary>	Destructor. </summary>
			~LBFGSBSolver();

		protected:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Runs the optimization algorithm itself; overrides the base-class hook and is expected
			/// 	to minimize the objective function.
			/// </summary>
			///
			/// <remarks>	Hmetal T, 11.04.2017. </remarks>
			///
			/// <param name="cycle">
			/// 	Optional pointer, nullptr by default. NOTE(review): its meaning is not visible in this
			/// 	header (presumably an iteration counter) - confirm against the base class.
			/// </param>
			///
			/// <returns>	true on success, false on failure. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			bool Optimize(int* cycle = nullptr) override;

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Orders (k,v) pairs by ascending v. </summary>
			///
			/// <remarks>	Hmetal T, 12/06/2019. </remarks>
			///
			/// <param name="v">	The std::vector&lt;std::pair&lt;int,Scalar&gt;&gt; of pairs to order. </param>
			///
			/// <returns>	The sorted indexes. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			std::vector<int> SortIndexes(const std::vector<std::pair<int, Scalar>>& v);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Computes the generalized Cauchy point. </summary>
			///
			/// <remarks>	Hmetal T, 12/06/2019. </remarks>
			///
			/// <param name="x">	   	Input array (read-only); presumably the current point - confirm. </param>
			/// <param name="g">	   	Input array (read-only); presumably the gradient at x - confirm. </param>
			/// <param name="x_cauchy">	[in,out] The Cauchy point. </param>
			/// <param name="c">	   	[in,out] Auxiliary array passed by mutable reference. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void GetGeneralizedCauchyPoint(const af::array& x, const af::array& g, af::array& x_cauchy, af::array& c);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Finds alpha* = max{a : a &lt;= 1 and  l_i-xc_i &lt;= a*d_i &lt;= u_i-xc_i}.
			/// </summary>
			///
			/// <remarks>	Hmetal T, 12/06/2019. </remarks>
			///
			/// <param name="x_cp">				[in,out] The cp. </param>
			/// <param name="du">				[in,out] The du. </param>
			/// <param name="FreeVariables">	[in,out] The free variables. </param>
			///
			/// <returns>	The alpha that was found. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar FindAlpha(af::array& x_cp, af::array& du, std::vector<int>& FreeVariables);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Solves the unbounded problem (subspace minimization step). </summary>
			///
			/// <remarks>	Hmetal T, 12/06/2019. </remarks>
			///
			/// <param name="x_cauchy">   	[in,out] The Cauchy point. </param>
			/// <param name="x">		  	[in,out] Array passed by mutable reference; presumably the current point - confirm. </param>
			/// <param name="c">		  	[in,out] Auxiliary array passed by mutable reference. </param>
			/// <param name="g">		  	[in,out] Array passed by mutable reference; presumably the gradient - confirm. </param>
			/// <param name="SubspaceMin">	[in,out] The subspace minimum. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SubspaceMinimization(af::array& x_cauchy, af::array& x, af::array& c, af::array& g, af::array& SubspaceMin);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Returns an optimality value computed from x and g. </summary>
			///
			/// <remarks>	Hmetal T, 12/06/2019. </remarks>
			///
			/// <param name="x">	Input array (read-only); presumably the current point - confirm. </param>
			/// <param name="g">	Input array (read-only); presumably the gradient at x - confirm. </param>
			///
			/// <returns>	The optimality. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetOptimality(const af::array& x, const af::array& g);

		private:
			// NOTE(review): W, M and theta are presumably the limited-memory matrices and scaling
			// factor of the Byrd et al. formulation - confirm in the implementation file.
			af::array W;
			af::array M;
			Scalar theta;

			// History size; see SetHistorySize.
			int m_historySize;
		};
	}
}
