/**
File:		MachineLearning/Optimization/Unconstrained/FgAdamSolver.h

Author:		
Email:		
Site:       

Copyright (c) 2019 . All rights reserved.
*/

#pragma once

#include <MachineLearning/BaseGradientOptimizationMethod.h>

namespace NeuralEngine
{
	namespace MachineLearning
	{
		////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>	Adam optimizer. </summary>
		///
		/// <remarks>	
		/// 	<para>
		///			Adam is an optimization algorithm that can be used instead of the classical stochastic 
		///			gradient descent procedure to update network weights iteratively based on training data.
		///			Adam differs from classical stochastic gradient descent. Stochastic gradient descent 
		///			maintains a single learning rate (termed alpha) for all weight updates, and that learning 
		///			rate does not change during training. With Adam, a learning rate is maintained for each 
		///			network weight (parameter) and separately adapted as learning unfolds.
		///		</para>
		/// 
		///		<para>
		///			The authors describe Adam as combining the advantages of two other extensions of 
		///			stochastic gradient descent. Specifically:
		///			<para>
		/// 			<list type="bullet">
		///					<item>
		///						Adaptive Gradient Algorithm (AdaGrad) that maintains a per-parameter 
		///						learning rate that improves performance on problems with sparse gradients
		///						(e.g. natural language and computer vision problems).
		///					</item>
		///					<item>
		///						Root Mean Square Propagation (RMSProp) that also maintains per-parameter 
		///						learning rates that are adapted based on the average of recent magnitudes 
		///						of the gradients for the weight (e.g. how quickly it is changing). This means 
		///						the algorithm does well on online and non-stationary problems (e.g. noisy).
		///					</item>
		///				</list>
		/// 		</para>
		///			Adam realizes the benefits of both AdaGrad and RMSProp. Instead of adapting the 
		///			parameter learning rates based on the average first moment (the mean) as in RMSProp, 
		///			Adam also makes use of the average of the second moments of the gradients (the uncentered 
		///			variance). Specifically, the algorithm calculates an exponential moving average of the 
		///			gradient and the squared gradient, and the parameters beta1 and beta2 control the 
		///			decay rates of these moving averages.
		///		</para>
		/// 
		///		<para>
		/// 	  References:
		/// 	  <list type="bullet">
		///			<item>
		/// 	    	  <description><a href="https://arxiv.org/pdf/1412.6980.pdf" target="_blank">
		/// 				Diederik P. Kingma and Jimmy Ba (2014). "Adam: A Method
		/// 				for Stochastic Optimization". CoRR.</a>
		/// 	       </description>
		/// 	    </item>
		/// 	   </list>
		/// 	</para>
		/// 	
		/// 	HmetalT, 02.05.2019. 
		/// </remarks>
		////////////////////////////////////////////////////////////////////////////////////////////////////
		template<typename Scalar, LineSearchType LSType = MoreThuente>
		class NE_IMPEXP AdamSolver : public BaseGradientOptimizationMethod<Scalar, LSType>
		{
		public:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the Adam optimization algorithm. </summary>
			///
			/// <remarks>	
			/// 	No objective function is supplied through this overload.
			/// 	NOTE(review): presumably the function must be provided by other means before 
			/// 	optimizing - confirm against the base class.
			/// 	
			/// 	Admin, 3/27/2017. 
			/// </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the optimization problem.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdamSolver(int numberOfVariables);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	
			/// 	Creates a new instance of the Adam optimization algorithm for a given objective callable. 
			/// </summary>
			///
			/// <remarks>	 Admin, 3/27/2017. </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the function to be optimized.
			/// </param>
			/// <param name="function">
			/// 	The function to be optimized, given as a callable that takes a read-only af::array and
			/// 	a mutable af::array and returns a Scalar.
			/// 	NOTE(review): presumably the first argument is the current parameter vector, the second
			/// 	receives the gradient and the return value is the objective value - confirm against
			/// 	the implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdamSolver(int numberOfVariables,
				std::function<Scalar(const af::array&, af::array&)> function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	
			/// 	Creates a new instance of the Adam optimization algorithm from an objective function object. 
			/// </summary>
			///
			/// <remarks>	
			/// 	NOTE(review): the pointer is passed raw; whether the solver takes ownership of it is not
			/// 	visible in this header - confirm before deleting or sharing the object.
			/// 	
			/// 	Admin, 3/27/2017. 
			/// </remarks>
			///
			/// <param name="function">	The objective function and gradients whose optimum values should be found. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdamSolver(NonlinearObjectiveFunction<Scalar>* function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Destructor. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			~AdamSolver();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets decay rate for the first moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="beta1">	Exponential decay rate for the first moment estimates (e.g. 0.9). </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetBeta1(Scalar beta1);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets decay rate for the second-moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="beta2">	Exponential decay rate for the second-moment estimates (e.g. 0.999). </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetBeta2(Scalar beta2);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets the learning rate. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="alpha">	The learning rate (alpha). </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetAlpha(Scalar alpha);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets an epsilon to avoid division by zero. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="epsilon">	Small number used to prevent division by zero. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetEpsilon(Scalar epsilon);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets initial decay rate. </summary>
			///
			/// <remarks>	
			/// 	NOTE(review): how the decay is applied during optimization is not visible in this 
			/// 	header - confirm against the implementation.
			/// 	
			/// 	, 15.08.2019. 
			/// </remarks>
			///
			/// <param name="decay">	The initial decay rate. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetDecay(Scalar decay);

			// NOTE(review): the getters below are declared non-const, so they cannot be called through a
			// const AdamSolver reference or pointer.

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets decay rate for the first moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The decay rate for the first moment estimates (beta 1). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetBeta1();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets decay rate for the second-moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The decay rate for the second-moment estimates (beta 2). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetBeta2();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the learning rate. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The learning rate (alpha). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetAlpha();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the epsilon. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The epsilon used to avoid division by zero. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetEpsilon();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the initial decay. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The initial decay rate. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetDecay();

		protected:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Implements the actual optimization algorithm. This method should try to minimize the
			/// 	objective function. Overrides the virtual method of the base optimization class.
			/// </summary>
			///
			/// <remarks>	Hmetal T, 11.04.2017. </remarks>
			///
			/// <param name="cycle">
			/// 	[in,out] Optional pointer, null by default.
			/// 	NOTE(review): its meaning is not visible in this header; presumably an iteration/cycle
			/// 	counter shared with the caller - confirm against the implementation.
			/// </param>
			///
			/// <returns>	true if it succeeds, false if it fails. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			virtual bool Optimize(int* cycle = nullptr) override;

		private:
			Scalar min_step;	// The minimum step length allowed in the line search.
			Scalar max_step;	// The maximum step length allowed in the line search.

			Scalar sAlpha;		// learning rate
			Scalar sBeta1;		// exponential decay rate for the first moment estimates (e.g. 0.9)
			Scalar sBeta2;		// exponential decay rate for the second-moment estimates (e.g. 0.999).
			Scalar sEpsilon;	// small number to prevent any division by zero in the implementation
			Scalar sDecay;		// decay rate; presumably the value behind SetDecay/GetDecay - confirm in the implementation
			Scalar delta;		// NOTE(review): purpose not visible in this header - document once confirmed
		};
	}
}