/**
File:		MachineLearning/Optimization/Unconstrained/FgAdaMax.cpp

Author:		
Email:		
Site:       

Copyright (c) 2019 . All rights reserved.
*/

#include <NeMachineLearningPCH.h>
#include <MachineLearning/FgAdaMaxSolver.h>
#include <cmath>
#include <math.h>
#include <limits>
#include <iomanip>

namespace NeuralEngine::MachineLearning
{
	// Explicit instantiation definitions of AdaMaxSolver for float and double,
	// one per LineSearchType value listed below. Presumably these exist so the
	// member definitions can live in this .cpp instead of the header.
	// NOTE(review): the instantiations appear before the member definitions
	// further down in this file. ISO C++ only instantiates the members that are
	// already defined at the point of explicit instantiation — confirm this
	// links on every targeted compiler, or move these lines to the end of the
	// namespace.
	template class AdaMaxSolver<float, ArmijoBacktracking>;
	template class AdaMaxSolver<float, ArmijoBracketing>;
	template class AdaMaxSolver<float, MoreThuente>;
	template class AdaMaxSolver<float, StrongWolfeBacktracking>;
	template class AdaMaxSolver<float, StrongWolfeBracketing>;
	template class AdaMaxSolver<float, WolfeBacktracking>;
	template class AdaMaxSolver<float, WolfeBracketing>;

	// Same set of line-search values, double precision.
	template class AdaMaxSolver<double, ArmijoBacktracking>;
	template class AdaMaxSolver<double, ArmijoBracketing>;
	template class AdaMaxSolver<double, MoreThuente>;
	template class AdaMaxSolver<double, StrongWolfeBacktracking>;
	template class AdaMaxSolver<double, StrongWolfeBracketing>;
	template class AdaMaxSolver<double, WolfeBacktracking>;
	template class AdaMaxSolver<double, WolfeBracketing>;

	/**
	Builds a solver from the number of variables only.

	The count is forwarded to the BaseGradientOptimizationMethod constructor and
	the hyper-parameters receive their defaults: beta1 = 0.9, beta2 = 0.999,
	alpha = 0.001, epsilon = 1e-8, decay = 0 and delta = 1e-8.

	@param numberOfVariables	Value handed to the base-class constructor.
	*/
	template<typename Scalar, LineSearchType LSType>
	AdaMaxSolver<Scalar, LSType>::AdaMaxSolver(int numberOfVariables)
		: BaseGradientOptimizationMethod<Scalar, LSType>(numberOfVariables)
		, min_step(1e-20)
		, max_step(1e+20)
		, sBeta1(0.9)
		, sBeta2(0.999)
		, sAlpha(0.001)
		, sEpsilon(1e-8)
		, delta(1e-8)
		, sDecay(0.0)
	{
	}

	/**
	Builds a solver from the number of variables and an objective callback.

	Both arguments are forwarded to the BaseGradientOptimizationMethod
	constructor. The hyper-parameters receive the same defaults as the other
	constructors of this class: beta1 = 0.9, beta2 = 0.999, alpha = 0.001,
	epsilon = 1e-8, decay = 0 and delta = 1e-8.

	@param numberOfVariables	Value handed to the base-class constructor.
	@param function				Callback taking (const af::array&, af::array&) and returning a Scalar; handed to the base-class constructor.
	*/
	template<typename Scalar, LineSearchType LSType>
	AdaMaxSolver<Scalar, LSType>::AdaMaxSolver(int numberOfVariables, std::function<Scalar(const af::array&, af::array&)> function)
		: BaseGradientOptimizationMethod<Scalar, LSType>(numberOfVariables, function),
		// sBeta2 was 0.99 here while the sibling constructors use 0.999; aligned so
		// the default does not depend on which constructor is called.
		min_step(1e-20), max_step(1e+20), sBeta1(0.9), sBeta2(0.999), sAlpha(0.001), sEpsilon(1e-8), delta(1e-8), sDecay(0.0)
	{
	}

	/**
	Builds a solver around an existing objective-function object.

	The pointer is forwarded to the BaseGradientOptimizationMethod constructor
	and the hyper-parameters receive their defaults: beta1 = 0.9, beta2 = 0.999,
	alpha = 0.001, epsilon = 1e-8, decay = 0 and delta = 1e-8.

	@param function		Objective function pointer handed to the base-class constructor.
	*/
	template<typename Scalar, LineSearchType LSType>
	AdaMaxSolver<Scalar, LSType>::AdaMaxSolver(NonlinearObjectiveFunction<Scalar> * function)
		: BaseGradientOptimizationMethod<Scalar, LSType>(function)
		, min_step(1e-20)
		, max_step(1e+20)
		, sBeta1(0.9)
		, sBeta2(0.999)
		, sAlpha(0.001)
		, sEpsilon(1e-8)
		, delta(1e-8)
		, sDecay(0.0)
	{
	}

	/**
	Destructor. Performs no work of its own beyond destroying the members.
	*/
	template<typename Scalar, LineSearchType LSType>
	AdaMaxSolver<Scalar, LSType>::~AdaMaxSolver() = default;

	/**
	Sets beta1, the decay factor Optimize() applies to the running mean of the gradient.

	@param beta1	New value, stored without validation.
	*/
	template<typename Scalar, LineSearchType LSType>
	void AdaMaxSolver<Scalar, LSType>::SetBeta1(Scalar beta1)
	{
		this->sBeta1 = beta1;
	}

	/**
	Sets beta2, the decay factor Optimize() applies to the running maximum of the absolute gradient.

	@param beta2	New value, stored without validation.
	*/
	template<typename Scalar, LineSearchType LSType>
	void AdaMaxSolver<Scalar, LSType>::SetBeta2(Scalar beta2)
	{
		this->sBeta2 = beta2;
	}

	/**
	Sets alpha, the base step size Optimize() starts from in every cycle.

	@param alpha	New value, stored without validation.
	*/
	template<typename Scalar, LineSearchType LSType>
	void AdaMaxSolver<Scalar, LSType>::SetAlpha(Scalar alpha)
	{
		this->sAlpha = alpha;
	}

	/**
	Sets epsilon, the constant Optimize() adds to the denominator of the parameter update.

	@param epsilon	New value, stored without validation.
	*/
	template<typename Scalar, LineSearchType LSType>
	void AdaMaxSolver<Scalar, LSType>::SetEpsilon(Scalar epsilon)
	{
		this->sEpsilon = epsilon;
	}

	/**
	Sets the step-size decay. When it is positive, Optimize() scales the step
	of cycle k by 1 / (1 + decay * k).

	@param decay	New value, stored without validation.
	*/
	template<typename Scalar, LineSearchType LSType>
	void AdaMaxSolver<Scalar, LSType>::SetDecay(Scalar decay)
	{
		this->sDecay = decay;
	}

	/**
	Reads back beta1, the decay factor of the running gradient mean.

	@return		The currently stored beta1.
	*/
	template<typename Scalar, LineSearchType LSType>
	Scalar AdaMaxSolver<Scalar, LSType>::GetBeta1()
	{
		return this->sBeta1;
	}

	/**
	Reads back beta2, the decay factor of the running maximum of the absolute gradient.

	@return		The currently stored beta2.
	*/
	template<typename Scalar, LineSearchType LSType>
	Scalar AdaMaxSolver<Scalar, LSType>::GetBeta2()
	{
		return this->sBeta2;
	}

	/**
	Reads back alpha, the base step size.

	@return		The currently stored alpha.
	*/
	template<typename Scalar, LineSearchType LSType>
	Scalar AdaMaxSolver<Scalar, LSType>::GetAlpha()
	{
		return this->sAlpha;
	}

	/**
	Reads back epsilon, the constant added to the denominator of the update.

	@return		The currently stored epsilon.
	*/
	template<typename Scalar, LineSearchType LSType>
	Scalar AdaMaxSolver<Scalar, LSType>::GetEpsilon()
	{
		return this->sEpsilon;
	}

	/**
	Reads back the step-size decay.

	@return		The currently stored decay.
	*/
	template<typename Scalar, LineSearchType LSType>
	Scalar AdaMaxSolver<Scalar, LSType>::GetDecay()
	{
		return this->sDecay;
	}

	/**
	Runs the AdaMax iteration starting from the current solution and stores the
	final point with SetSolution().

	Every cycle updates two accumulators from the current gradient g:
		ms = beta1 * ms + (1 - beta1) * g		(decayed mean of the gradient)
		vs = max(beta2 * vs, |g|)				(decayed infinity norm of the gradient)
	and then moves the parameters by
		x -= (step / (1 - beta1^t)) * ms / (vs + epsilon)
	where t is the 1-based cycle index and step is alpha, optionally scaled by
	1 / (1 + decay * k).

	At least one cycle is always executed. Iteration stops as soon as one of
	these holds:
		- the gradient norm is no larger than _tolerance * max(norm(x), 1),
		- maxIterations cycles have been executed,
		- the relative change of the objective, |f_prev - f| / |f|, is below delta.

	@param cycle	Optional output; when non-null it receives the 1-based index of the last cycle executed.
	@return			Always true.
	*/
	template<typename Scalar, LineSearchType LSType>
	bool AdaMaxSolver<Scalar, LSType>::Optimize(int* cycle)
	{
		const int n = GetNumberOfVariables();
		af::array x = GetSolution();

		Scalar fx = _function->Value(x);
		Scalar fpast;

		af::array m_grad = _function->Gradient(x);
		af::array ms = af::constant(0.0, n, m_dtype);		// decayed mean of the gradient
		af::array vs = af::constant(0.0, n, m_dtype);		// decayed infinity norm of the gradient

		// Both norms are refreshed at the end of every cycle, before the loop condition reads them.
		Scalar xnorm = 0;
		Scalar gnorm = 0;
		bool progressed = true;

		// print out some useful information, if specified
		if (_display)
		{
			std::cout << "Numerical Optimization via AdaMax\n=================================\n\n";
			std::cout << "Starting Value: " << fx << std::endl << std::endl;
		}

		int k = 0;
		if (cycle)
			*cycle = (k + 1);
		do
		{
			fpast = fx;

			// base step, optionally shrunk by the time-based decay schedule
			Scalar step = sAlpha;
			if (sDecay > 0) step *= (1.0 / (1.0 + sDecay * k));
			const Scalar t = static_cast<Scalar>(k + 1);

			// main computation
			ms = (sBeta1 * ms) + (1.0 - sBeta1) * m_grad;				// updates the moving average of the gradient
			vs = max(sBeta2 * vs, abs(m_grad));							// updates the exponentially weighted infinity norm

			// Only the first-moment estimate needs bias correction in AdaMax, and it
			// must be applied exactly once. The previous code multiplied the step by
			// sqrt(1 - beta2^t) / (1 - beta1^t) and then divided by (1 - beta1^t) again.
			const Scalar biasCorrection1 = 1.0 - std::pow(sBeta1, t);

			Scalar step_t = 0;											// effective step; stays 0 when the update is skipped
			if (biasCorrection1 != 0)
			{
				step_t = step / biasCorrection1;
				x -= (step_t * ms / (vs + sEpsilon));					// updates the parameters
			}

			fx = _function->Value(x);									// evaluate function and gradient
			m_grad = _function->Gradient(x);

			if (_display)
				std::cout << "Cycle: " << k + 1 << "\t\tf(x): " << fx << "\t\t\tStep Size: " << step_t << std::endl;

			if (cycle)
				*cycle = (k + 1);

			xnorm = af::norm(x);
			gnorm = af::norm(m_grad);
			k++;

			// Relative change of the objective, measured against |fx| so that a
			// negative objective value no longer ends the loop after one cycle.
			// For fx == 0 the loop continues only if the value actually changed.
			const Scalar change = std::abs(fpast - fx);
			const Scalar scale = std::abs(fx);
			progressed = (scale > 0) ? (change / scale >= delta) : (change > 0);
		} while (gnorm > _tolerance * std::max<Scalar>(xnorm, 1.0) && k < maxIterations && progressed);

		SetSolution(x);
		return true;
	}
}