/**
File:		MachineLearning/Optimization/Unconstrained/FgAdaMaxSolver.h

Author:		
Email:		
Site:       

Copyright (c) 2019 . All rights reserved.
*/

#pragma once

#include <MachineLearning/BaseGradientOptimizationMethod.h>

namespace NeuralEngine
{
	namespace MachineLearning
	{
		////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>	AdaMax optimizer. </summary>
		///
		/// <remarks>	
		/// 	<para>
		///			AdaMax update rule. Adam is an algorithm for first-order gradient-based
		///			optimization of stochastic objective functions, based on adaptive
		///			estimates of lower-order moments. AdaMax is simply a variant of Adam based
		///			on the infinity norm. The idea with AdaMax is to look at the value v as the 
		///			L2 norm of the individual current and past gradients. We can generalize it 
		///			to an Lp update rule, but it gets pretty unstable for large values of p. But if 
		///			we use the special case of the L-infinity norm, it results in a surprisingly 
		///			stable and well-performing algorithm. Here's how to implement AdaMax with 
		///			Python:
		///		</para>
		///		<code>
		///			for t in range(1, num_iterations + 1):  # t starts at 1 so that 1 - beta_1**t is never zero
		///				g = compute_gradient(x, y)
		///				m = beta_1 * m + (1 - beta_1) * g
		///				m_hat = m / (1 - np.power(beta_1, t))
		///				v = np.maximum(beta_2 * v, np.abs(g))
		///				w = w - step_size * m_hat / v
		///		</code>
		/// 
		///		<para>
		/// 	  References:
		/// 	  <list type="bullet">
		///			<item>
		/// 	    	  <description><a href="https://arxiv.org/pdf/1412.6980.pdf">
		/// 				Diederik P. Kingma and Jimmy Ba (2014). "Adam: A Method
		/// 				for Stochastic Optimization". CoRR.</a>
		/// 	       </description>
		/// 	    </item>
		/// 	   </list>
		/// 	</para>
		/// 	
		/// 	HmetalT, 02.05.2019. 
		/// </remarks>
		////////////////////////////////////////////////////////////////////////////////////////////////////
		template<typename Scalar, LineSearchType LSType = MoreThuente>
		class NE_IMPEXP AdaMaxSolver : public BaseGradientOptimizationMethod<Scalar, LSType>
		{
		public:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the AdaMax optimization algorithm. </summary>
			///
			/// <remarks>	 Admin, 3/27/2017. </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the optimization problem.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdaMaxSolver(int numberOfVariables);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the AdaMax optimization algorithm. </summary>
			///
			/// <remarks>	 Admin, 3/27/2017. </remarks>
			///
			/// <param name="numberOfVariables">
			/// 	The number of free parameters in the function to be optimized.
			/// </param>
			/// <param name="function">
			/// 	The function to be optimized: a callable taking a const af::array&amp; and a mutable
			/// 	af::array&amp; and returning a Scalar. NOTE(review): presumably the first argument is the
			/// 	current parameter vector and the second receives the gradient (there is no separate
			/// 	gradient parameter) -- confirm against the implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdaMaxSolver(int numberOfVariables,
				std::function<Scalar(const af::array&, af::array&)> function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Creates a new instance of the AdaMax optimization algorithm. </summary>
			///
			/// <remarks>	 Admin, 3/27/2017. </remarks>
			///
			/// <param name="function">
			/// 	The objective function and gradients whose optimum values should be found.
			/// 	NOTE(review): passed as a raw pointer; ownership and required lifetime are not visible
			/// 	in this header -- confirm against the implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			AdaMaxSolver(NonlinearObjectiveFunction<Scalar>* function);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Destructor. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			~AdaMaxSolver();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets decay rate for the first moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="beta1">	Exponential decay rate for the first moment estimates (e.g. 0.9). </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetBeta1(Scalar beta1);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets decay rate for the second-moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="beta2">	Exponential decay rate for the second-moment estimates (e.g. 0.999). </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetBeta2(Scalar beta2);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets the learning rate. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="alpha">	The learning rate. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetAlpha(Scalar alpha);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets an epsilon to avoid division by zero. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="epsilon">	Small number used to prevent division by zero. </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetEpsilon(Scalar epsilon);

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Sets initial decay rate. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <param name="decay">
			/// 	The initial decay rate. NOTE(review): how the decay is applied (e.g. to the learning
			/// 	rate) is not visible in this header -- confirm against the implementation.
			/// </param>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			void SetDecay(Scalar decay);

			// NOTE(review): the getters below are declared non-const, so they cannot be called
			// through a const AdaMaxSolver reference or pointer.

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets decay rate for the first moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The decay rate for the first moment estimates (beta 1). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetBeta1();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets decay rate for the second-moment estimates. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The decay rate for the second-moment estimates (beta 2). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetBeta2();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the learning rate. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The learning rate (alpha). </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetAlpha();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the epsilon used to avoid division by zero. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The epsilon. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetEpsilon();

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>	Gets the initial decay. </summary>
			///
			/// <remarks>	, 15.08.2019. </remarks>
			///
			/// <returns>	The initial decay rate. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			Scalar GetDecay();

		protected:

			////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// 	Implements the actual optimization algorithm. This method should try to minimize the
			/// 	objective function. Overrides the virtual method declared by the base class.
			/// </summary>
			///
			/// <remarks>	Hmetal T, 11.04.2017. </remarks>
			///
			/// <param name="cycle">
			/// 	Optional pointer, defaults to nullptr. NOTE(review): its meaning is not visible in this
			/// 	header -- presumably an iteration/cycle counter; confirm against the implementation.
			/// </param>
			///
			/// <returns>	true if it succeeds, false if it fails. </returns>
			////////////////////////////////////////////////////////////////////////////////////////////////////
			virtual bool Optimize(int* cycle = nullptr) override;

		private:
			Scalar min_step;	// The minimum step length allowed in the line search.
			Scalar max_step;	// The maximum step length allowed in the line search.

			Scalar sAlpha;		// learning rate
			Scalar sBeta1;		// exponential decay rate for the first moment estimates (e.g. 0.9)
			Scalar sBeta2;		// exponential decay rate for the second-moment estimates (e.g. 0.999).
			Scalar sEpsilon;	// small number to prevent any division by zero in the implementation
			Scalar sDecay;		// decay rate; presumably the "initial decay rate" exposed via SetDecay/GetDecay -- confirm
			Scalar delta;		// NOTE(review): purpose not visible in this header; confirm against the implementation
		};
	}
}
