function [x, cost, info, options] = rlbfgs(problem, x0, options)
% Riemannian limited memory BFGS solver for smooth objective functions.
% 
% function [x, cost, info, options] = rlbfgs(problem)
% function [x, cost, info, options] = rlbfgs(problem, x0)
% function [x, cost, info, options] = rlbfgs(problem, x0, options)
% function [x, cost, info, options] = rlbfgs(problem, [], options)
%
%
% This is a Riemannian limited memory BFGS solver (quasi-Newton method), 
% which aims to minimize the cost function in the given problem structure.
% It requires access to the gradient of the cost function.
%
% Parameter options.memory can be used to specify the number of iterations
% the algorithm remembers and uses to approximate the inverse Hessian of
% the cost. Default value is 30.
% For unlimited memory, set options.memory = Inf.
%
%
% For a description of the algorithm and theorems offering convergence
% guarantees, see the references below.
%
% The initial iterate is x0 if it is provided. Otherwise, a random point on
% the manifold is picked. To specify options whilst not specifying an
% initial iterate, give x0 as [] (the empty matrix).
%
% The two outputs 'x' and 'cost' are the last reached point on the manifold
% and its cost.
% 
% The output 'info' is a struct-array which contains information about the
% iterations:
%   iter (integer)
%       The iteration number. The initial guess is 0.
%   cost (double)
%       The corresponding cost value.
%   gradnorm (double)
%       The (Riemannian) norm of the gradient.
%   time (double)
%       The total elapsed time in seconds to reach the corresponding cost.
%   stepsize (double)
%       The size of the step from the previous to the new iterate.
%   accepted (Boolean)
%       true if step is accepted in the cautious update. 0 otherwise.
%   And possibly additional information logged by options.statsfun.
% For example, type [info.gradnorm] to obtain a vector of the successive
% gradient norms reached at each iteration.
%
% The options structure is used to overwrite the default values. All
% options have a default value and are hence optional. To force an option
% value, pass an options structure with a field options.optionname, where
% optionname is one of the following and the default value is indicated
% between parentheses:
%
%   tolgradnorm (1e-6)
%       The algorithm terminates if the norm of the gradient drops below
%       this. For well-scaled problems, a rule of thumb is that you can
%       expect to reduce the gradient norm by 8 orders of magnitude
%       (sqrt(eps)) compared to the gradient norm at a "typical" point (a
%       rough initial iterate for example). Further decrease is sometimes
%       possible, but inexact floating point arithmetic will eventually
%       limit the final accuracy. If tolgradnorm is set too low, the
%       algorithm may end up iterating forever (or at least until another
%       stopping criterion triggers).
%   maxiter (1000)
%       The algorithm terminates if maxiter iterations were executed.
%   maxtime (Inf)
%       The algorithm terminates if maxtime seconds elapsed.
%   minstepsize (1e-10)
%     The minimum norm of the tangent vector that points from the current
%     point to the next point. If the norm is less than minstepsize, the 
%     program will terminate.
%   memory (30)
%     The number of previous iterations the program remembers. This is used 
%     to approximate the inverse Hessian at the current point. Because of
%     difficulty of maintaining a representation of operators in terms of
%     coordinates, a recursive method is used. The number of steps in the
%     recursion is at most options.memory. This parameter can take any
%     integer value >= 0, or Inf, which is taken to be options.maxiter. If
%     options.maxiter has value Inf, then it will take value 10000 and a
%     warning will be displayed.
%   strict_inc_func (@(t) 1e-4*t)
%     The Cautious step needs a real function that has value 0 at t = 0,
%     and  is strictly increasing. See details in Wen Huang's paper
%     "A Riemannian BFGS Method without Differentiated Retraction for 
%     Nonconvex Optimization Problems"
%   statsfun (none)
%       Function handle to a function that will be called after each
%       iteration to provide the opportunity to log additional statistics.
%       They will be returned in the info struct. See the generic Manopt
%       documentation about solvers for further information. statsfun is
%       called with the point x that was reached last.
%   stopfun (none)
%       Function handle to a function that will be called at each iteration
%       to provide the opportunity to specify additional stopping criteria.
%       See the generic Manopt documentation about solvers for further
%       information.
%   verbosity (2)
%       Integer number used to tune the amount of output the algorithm
%       generates during execution (mostly as text in the command window).
%       The higher, the more output. 0 means silent. 3 and above includes a
%       display of the options structure at the beginning of the execution.
%   debug (false)
%       Set to true to allow the algorithm to perform additional
%       computations for debugging purposes. If a debugging test fails, you
%       will be informed of it, usually via the command window. Be aware
%       that these additional computations appear in the algorithm timings
%       too, and may interfere with operations such as counting the number
%       of cost evaluations, etc. (the debug calls get storedb too).
%   storedepth (2)
%       Maximum number of different points x of the manifold for which a
%       store structure will be kept in memory in the storedb for caching.
%       If memory usage is an issue, you may try to lower this number.
%       Profiling may then help to investigate if a performance hit was
%       incurred as a result.
%
%
% Please cite the Manopt paper as well as the research paper:
% @InBook{Huang2016,
%   title     = {A {R}iemannian {BFGS} Method for Nonconvex Optimization Problems},
%   author    = {Huang, W. and Absil, P.-A. and Gallivan, K.A.},
%   year      = {2016},
%   publisher = {Springer International Publishing},
%   editor    = {Karas{\"o}zen, B{\"u}lent and Manguo{\u{g}}lu, Murat and Tezer-Sezgin, M{\"u}nevver and G{\"o}ktepe, Serdar and U{\u{g}}ur, {\"O}m{\"u}r},
%   address   = {Cham},
%   booktitle = {Numerical Mathematics and Advanced Applications ENUMATH 2015},
%   pages     = {627--634},
%   doi       = {10.1007/978-3-319-39929-4_60}
% }
%
% We point out that, at the moment, this implementation of RLBFGS can be
% slower than the implementation in ROPTLIB by Wen Huang et al. referenced
% above. For the purpose of comparing to their work, please use their
% implementation.
%


% This file is part of Manopt: www.manopt.org.
% Original author: Changshuo Liu, July 19, 2017.
% Contributors: Nicolas Boumal
% Change log: 
%
%   Nov. 27, 2017 (Wen Huang):
%       Changed the default strict_inc_func to @(t) 1e-4*t from @(t) t.
%
%   Jan. 18, 2018 (NB):
%       Corrected a bug related to the way the line search hint was defined
%       by default.
%
%   Aug. 2, 2018 (NB):
%       Using the new storedb.remove features to keep storedb lean, and
%       reduced the default value of storedepth from 30 to 2 as a result.


    % Verify that the problem description is sufficient for the solver.
    if ~canGetCost(problem)
        warning('manopt:getCost', ...
                'No cost provided. The algorithm will likely abort.');
    end
    if ~canGetGradient(problem) && ~canGetApproxGradient(problem)
        % Note: we do not give a warning if an approximate gradient is
        % explicitly given in the problem description, as in that case the user
        % seems to be aware of the issue.
        warning('manopt:getGradient:approx', ...
           ['No gradient provided. Using an FD approximation instead (slow).\n' ...
            'It may be necessary to increase options.tolgradnorm.\n' ...
            'To disable this warning: warning(''off'', ''manopt:getGradient:approx'')']);
        problem.approxgrad = approxgradientFD(problem);
    end
    
    % This solver uses linesearch_hint as a line search algorithm. By
    % default, try a step size of 1, so that if the BFGS approximation of
    % the Hessian (or inverse Hessian) is good, then the iteration is close
    % to a Newton step.
    if ~canGetLinesearch(problem)
        problem.linesearch = @(x, d) 1;
    end
    
    % Local defaults for the program
    localdefaults.minstepsize = 1e-10;
    localdefaults.maxiter = 1000;
    localdefaults.tolgradnorm = 1e-6;
    localdefaults.memory = 30;
    localdefaults.strict_inc_func = @(t) 1e-4*t;
    localdefaults.ls_max_steps = 25;
    localdefaults.storedepth = 2;
    
    % Merge global and local defaults, then merge w/ user options, if any.
    localdefaults = mergeOptions(getGlobalDefaults(), localdefaults);
    if ~exist('options', 'var') || isempty(options)
        options = struct();
    end
    options = mergeOptions(localdefaults, options);
    
    % To make sure memory in range [0, Inf)
    options.memory = max(options.memory, 0);
    if options.memory == Inf
        if isinf(options.maxiter)
            options.memory = 10000;
            warning('rlbfgs:memory', ['options.memory and options.maxiter' ...
              ' are both Inf; options.memory has been changed to 10000.']);
        else
            options.memory = options.maxiter;
        end
    end
    
    M = problem.M;
    
    
    timetic = tic();
    
    
    % Create a random starting point if no starting point is provided.
    if ~exist('x0', 'var')|| isempty(x0)
        xCur = M.rand();
    else
        xCur = x0;
    end
    
    % Create a store database and get a key for the current x
    storedb = StoreDB(options.storedepth);
    key = storedb.getNewKey();
    
    % __________Initialization of variables______________
    % Number of iterations since the last restart
    k = 0;  
    % Total number of BFGS iterations
    iter = 0; 
    
    % This cell stores step vectors which point from x_{t} to x_{t+1} for t
    % indexing the last iterations, capped at options.memory.
    % That is, it stores up to options.memory of the most recent step
    % vectors. However, the implementation below does not need step vectors 
    % in their respective tangent spaces at x_{t}'s. Rather, it requires
    % them transported to the current point's tangent space by vector
    % transport. For details regarding the requirements on the the vector
    % transport, see the reference paper by Huang et al.
    % In this implementation, those step vectors are iteratively 
    % transported to the current point's tangent space after every
    % iteration. Thus, at every iteration, vectors in sHistory are in the
    % current point's tangent space.
    sHistory = cell(1, options.memory);
    
    % This cell stores the differences for latest t's of the gradient at
    % x_{t+1} and the gradient at x_{t}, transported to x_{t+1}'s tangent
    % space. The memory is also capped at options.memory.
    yHistory = cell(1, options.memory);
    
    % rhoHistory{t} stores the reciprocal of the inner product between
    % sHistory{t} and yHistory{t}.
    rhoHistory = cell(1, options.memory);
    
    % Scaling of direction given by getDirection for acceptable step
    alpha = 1; 
    
    % Scaling of initial matrix, Barzilai-Borwein.
    scaleFactor = 1;
    
    % Norm of the step
    stepsize = 1;
    
    % Stores whether the step is accepted by the cautious update check.
    accepted = true;
    
    % Query the cost function and its gradient
    [xCurCost, xCurGradient] = getCostGrad(problem, xCur, storedb, key);
    
    xCurGradNorm = M.norm(xCur, xCurGradient);
    
    % Line-search statistics for recording in info.
    lsstats = [];
    
    % Flag to control restarting scheme to avoid infinite loops (see below)
    ultimatum = false;
    
    % Save stats in a struct array info, and preallocate.
    stats = savestats();
    info(1) = stats;
    info(min(10000, options.maxiter+1)).iter = [];
    
    if options.verbosity >= 2
        fprintf(' iter                   cost val            grad. norm           alpha\n');
    end
    
    % Main iteration
    while true

        % Display iteration information
        if options.verbosity >= 2
        fprintf('%5d    %+.16e        %.8e      %.4e\n', ...
                iter, xCurCost, xCurGradNorm, alpha);
        end
        
        % Start timing this iteration
        timetic = tic();
        
        % Run standard stopping criterion checks
        [stop, reason] = stoppingcriterion(problem, xCur, options, ...
                                           info, iter+1);
        
        % If none triggered, run specific stopping criterion check
        if ~stop 
            if stats.stepsize < options.minstepsize
                % To avoid infinite loop and to push the search further
                % in case BFGS approximation of Hessian is off towards
                % the end, we erase the memory by setting k = 0;
                % In this way, it starts off like a steepest descent.
                % If even steepest descent does not work, then it is 
                % hopeless and we will terminate.
                if ~ultimatum
                    if options.verbosity >= 2
                        fprintf(['stepsize is too small, restarting ' ...
                            'the bfgs procedure at the current point.\n']);
                    end
                    k = 0;
                    ultimatum = true;
                else
                    stop = true;
                    reason = sprintf(['Last stepsize smaller than '  ...
                        'minimum allowed; options.minstepsize = %g.'], ...
                        options.minstepsize);
                end
            else
                % We are not in trouble: lift the ultimatum if it was on.
                ultimatum = false;
            end
        end  
        
        if stop
            if options.verbosity >= 1
                fprintf([reason '\n']);
            end
            break;
        end

        
        % Compute BFGS direction
        p = getDirection(M, xCur, xCurGradient, sHistory,...
                yHistory, rhoHistory, scaleFactor, min(k, options.memory));

        % Execute line-search
        [stepsize, xNext, newkey, lsstats] = ...
            linesearch_hint(problem, xCur, p, xCurCost, ...
                            M.inner(xCur, xCurGradient, p), ...
                            options, storedb, key);
        
        % Record the BFGS step-multiplier alpha which was effectively
        % selected. Toward convergence, we hope to see alpha = 1.
        alpha = stepsize/M.norm(xCur, p);
        step = M.lincomb(xCur, alpha, p);
        
        
        % Query cost and gradient at the candidate new point.
        [xNextCost, xNextGrad] = getCostGrad(problem, xNext, storedb, newkey);
        
        % Compute sk and yk
        sk = M.transp(xCur, xNext, step);
        yk = M.lincomb(xNext, 1, xNextGrad, ...
                             -1, M.transp(xCur, xNext, xCurGradient));

        % Computation of the BFGS step is invariant under scaling of sk and
        % yk by a common factor. For numerical reasons, we scale sk and yk
        % so that sk is a unit norm vector.
        norm_sk = M.norm(xNext, sk);
        sk = M.lincomb(xNext, 1/norm_sk, sk);
        yk = M.lincomb(xNext, 1/norm_sk, yk);
        
        inner_sk_yk = M.inner(xNext, sk, yk);
        inner_sk_sk = M.norm(xNext, sk)^2;    % ensures nonnegativity
        
        
        % If the cautious step is accepted (which is the intended
        % behavior), we record sk, yk and rhok and need to do some
        % housekeeping. If the cautious step is rejected, these are not
        % recorded. In all cases, xNext is the next iterate: the notion of
        % accept/reject here is limited to whether or not we keep track of
        % sk, yk, rhok to update the BFGS operator.
        cap = options.strict_inc_func(xCurGradNorm);
        if inner_sk_sk ~= 0 && (inner_sk_yk / inner_sk_sk) >= cap
            
            accepted = true;
            
            rhok = 1/inner_sk_yk;
            
            scaleFactor = inner_sk_yk / M.norm(xNext, yk)^2;
            
            % Time to store the vectors sk, yk and the scalar rhok.
            % Remember: we need to transport all vectors to the most
            % current tangent space.
            
            % If we are out of memory
            if k >= options.memory
                
                % sk and yk are saved from 1 to the end with the most 
                % current recorded to the rightmost hand side of the cells
                % that are occupied. When memory is full, do a shift so
                % that the rightmost is earliest and replace it with the
                % most recent sk, yk.
                for  i = 2 : options.memory
                    sHistory{i} = M.transp(xCur, xNext, sHistory{i});
                    yHistory{i} = M.transp(xCur, xNext, yHistory{i});
                end
                if options.memory > 1
                    sHistory = sHistory([2:end, 1]);
                    yHistory = yHistory([2:end, 1]);
                    rhoHistory = rhoHistory([2:end 1]);
                end
                if options.memory > 0
                    sHistory{options.memory} = sk;
                    yHistory{options.memory} = yk;
                    rhoHistory{options.memory} = rhok;
                end
                
            % If we are not out of memory
            else
                
                for i = 1:k
                    sHistory{i} = M.transp(xCur, xNext, sHistory{i});
                    yHistory{i} = M.transp(xCur, xNext, yHistory{i});
                end
                sHistory{k+1} = sk;
                yHistory{k+1} = yk;
                rhoHistory{k+1} = rhok;
                
            end
            
            k = k + 1;
            
        % The cautious step is rejected: we do not store sk, yk, rhok but
        % we still need to transport stored vectors to the new tangent
        % space.
        else
            
            accepted = false;
            
            for  i = 1 : min(k, options.memory)
                sHistory{i} = M.transp(xCur, xNext, sHistory{i});
                yHistory{i} = M.transp(xCur, xNext, yHistory{i});
            end
            
        end
        
        % Update variables to new iterate.
        storedb.removefirstifdifferent(key, newkey);
        xCur = xNext;
        key = newkey;
        xCurGradient = xNextGrad;
        xCurGradNorm = M.norm(xNext, xNextGrad);
        xCurCost = xNextCost;
        
        % iter is the number of iterations we have accomplished.
        iter = iter + 1;
        
        % Make sure we don't use too much memory for the store database
        % (this is independent from the BFGS memory.)
        storedb.purge();
        
        
        % Log statistics for freshly executed iteration
        stats = savestats();
        info(iter+1) = stats; 
        
    end

    
    % Housekeeping before we return
    info = info(1:iter+1);
    x = xCur;
    cost = xCurCost;

    if options.verbosity >= 1
        fprintf('Total time is %f [s] (excludes statsfun)\n', ...
                info(end).time);
    end

    
    % Routine in charge of collecting the current iteration stats
    function stats = savestats()
        stats.iter = iter;
        stats.cost = xCurCost;
        stats.gradnorm = xCurGradNorm;
        if iter == 0
            stats.stepsize = NaN;
            stats.time = toc(timetic);
            stats.accepted = NaN;
        else
            stats.stepsize = stepsize;
            stats.time = info(iter).time + toc(timetic);
            stats.accepted = accepted;
        end
        stats.linesearch = lsstats;
        stats = applyStatsfun(problem, xCur, storedb, key, options, stats);
    end

end




% BFGS step, see Wen's paper for details. This function takes in a tangent
% vector g, and applies an approximate inverse Hessian P to it to get Pg.
% Then, -Pg is returned.
%
% Theory requires the vector transport to be isometric and to satisfy the
% locking condition (see paper), but these properties do not seem to be
% crucial in practice. If your manifold provides M.isotransp, it may be
% good to do M.transp = M.isotransp; after loading M with a factory.
%
% This implementation operates in the tangent space of the most recent
% point since all vectors in sHistory and yHistory have been transported
% there.
function dir = getDirection(M, xCur, xCurGradient, sHistory, yHistory, ...
                            rhoHistory, scaleFactor, k)
    
    q = xCurGradient;
    
    inner_s_q = zeros(1, k);
    
    for i = k : -1 : 1
        inner_s_q(1, i) = rhoHistory{i} * M.inner(xCur, sHistory{i}, q);
        q = M.lincomb(xCur, 1, q, -inner_s_q(1, i), yHistory{i});
    end
    
    r = M.lincomb(xCur, scaleFactor, q);
    
    for i = 1 : k
         omega = rhoHistory{i} * M.inner(xCur, yHistory{i}, r);
         r = M.lincomb(xCur, 1, r, inner_s_q(1, i)-omega, sHistory{i});
    end
    
    dir = M.lincomb(xCur, -1, r);
    
end
