% For the step size we use gamma instead of 2*gamma as used in the
% manuscript.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Problem parameters:
%   d   - state dimension
%   sig - noise scale: the data generator adds sig*randn(d,1) at every step,
%         and the step-size bound below uses sig^2
%   rho - norm of A_true, the system matrix to be estimated
d = 5;
sig = 1;
rho = 0.98;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Initializes the system matrix A_true to be estimated.
% A_true is U'*diag(spectrum)*U, where the first ceil(d/2) entries of the
% spectrum equal rho and the remaining entries equal rho/3.
% (presumably rand_ortho returns a random orthogonal matrix -- it is defined
% outside this file, TODO confirm)
% Disabled alternative: a Gaussian matrix scaled to unit norm.
%A_normrnd=randn(d,d);
%A_normrnd=A_normrnd/norm(A_normrnd);
U = rand_ortho(d);
n_large = ceil(d/2);
spectrum = [rho*ones(1,n_large), (rho/3)*ones(1,d-n_large)];
A_bimod = U' * diag(spectrum) * U;
A_true = A_bimod;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sets the nonlinearity used by the data generator and by every gradient
% routine. The commented-out lines are alternative choices.
%fun=@(x)max(x,0);
%fun=@(x)tanh(x);
fun = @(x) leakyReLU(x);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Experiment sizes:
%   B        - buffer size
%   u        - gap size
%   T        - horizon (number of generated samples)
%   T_grad   - number of iterations for GLM-tron
%   T_newton - number of iterations for the Quasi Newton method
B        = 240;
u        = 10;
T        = 100000;
T_grad   = 300;
T_newton = 40;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Generates the trajectory X (one column per time step), started from the
% zero state.
X0 = zeros(d,1);
X = data_gen(fun, X0, A_true, sig, d, T);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% eta caps the step size of the stochastic gradient methods so that they
% don't become unstable.
eta = 5*(1 - rho^2) / (d * sig^2 * log(1 + T));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SGD with reverse experience replay (SGD-RER).
% The trajectory is processed in consecutive windows of B+u columns. In
% window t the B transitions (X(:,k), X(:,k+1)) with
% k = (B+u)*(t-1)+B, ..., (B+u)*(t-1)+1 are visited from the most recent to
% the oldest; the remaining columns of the window are skipped.
% After every window the current iterate is added to a running sum, and the
% Frobenius error of the averaged iterate is recorded together with the
% cumulative compute time.

% Number of complete windows. floor() keeps the array sizes valid when
% (B+u) does not divide T (zeros() rejects non-integer sizes).
n_win_rer = floor(T/(B+u));

A0_rer = zeros(d,d);
cumul_rer = zeros(d,d);
loss_rer = zeros(n_win_rer,1);
eta_rer = min(eta,10*log(T)/T);   % step size, capped by the stability bound eta
time_rer = zeros(n_win_rer,1);
elapsed_rer = 0;                  % cumulative compute time so far

for t = 1:n_win_rer
    tic;
    % Column just past the newest transition of this window; it does not
    % depend on i, so it is computed once per window.
    N = (B+u)*(t-1) + B+1;
    for i = 1:B
        grad_rer = get_stochastic_gradient(fun,X(:,N-i),X(:,N-i+1),A0_rer);
        A0_rer = A0_rer - eta_rer*grad_rer;
    end
    cumul_rer = cumul_rer + A0_rer;
    elapsed_rer = elapsed_rer + toc;
    time_rer(t) = elapsed_rer;
    loss_rer(t) = norm(cumul_rer/t-A_true,'fro');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SGD with experience replay (SGD-ER).
% The trajectory is processed in consecutive windows of B+u columns. In
% window t, B stochastic gradient steps are taken, each on a transition
% (X(:,k), X(:,k+1)) whose index k is drawn uniformly at random from the B
% columns (B+u)*(t-1)+1, ..., (B+u)*(t-1)+B.
% After every window the current iterate is added to a running sum, and the
% Frobenius error of the averaged iterate is recorded together with the
% cumulative compute time.

% Number of complete windows. floor() keeps the array sizes valid when
% (B+u) does not divide T (zeros() rejects non-integer sizes).
n_win_er = floor(T/(B+u));

A0_er = zeros(d,d);
cumul_er = zeros(d,d);
loss_er = zeros(n_win_er,1);
eta_er = min(eta,10*log(T)/T);   % step size, capped by the stability bound eta
time_er = zeros(n_win_er,1);
elapsed_er = 0;                  % cumulative compute time so far

for t = 1:n_win_er
    tic;
    % Offset of this window; it does not depend on i, so it is computed
    % once per window.
    N = (B+u)*(t-1);
    for i = 1:B
        i_rand = randi([1,B]);
        grad_er = get_stochastic_gradient(fun,X(:,N+i_rand),X(:,N+i_rand+1),A0_er);
        A0_er = A0_er - eta_er*grad_er;
    end
    cumul_er = cumul_er + A0_er;
    elapsed_er = elapsed_er + toc;
    time_er(t) = elapsed_er;
    loss_er(t) = norm(cumul_er/t-A_true,'fro');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Forward SGD.
% Takes one stochastic gradient step per transition (X(:,t), X(:,t+1)), in
% temporal order, and records after every step the Frobenius error of the
% averaged iterate together with the cumulative compute time.
eta_fwd = min(eta,10*log(T)/T);   % step size, capped by the stability bound eta
A0_fwd = zeros(d,d);
cumul_fwd = zeros(d,d);
loss_fwd = zeros(T-1,1);
time_fwd = zeros(T-1,1);
elapsed_fwd = 0;                  % cumulative compute time so far

for t = 1:T-1
    tic;
    grad_fwd = get_stochastic_gradient(fun,X(:,t),X(:,t+1),A0_fwd);
    A0_fwd = A0_fwd - eta_fwd*grad_fwd;
    cumul_fwd = cumul_fwd + A0_fwd;
    elapsed_fwd = elapsed_fwd + toc;
    time_fwd(t) = elapsed_fwd;
    % NOTE(review): cumul_fwd holds t iterates here but is divided by t+1
    % (the buffered methods divide by t) -- presumably the zero initial
    % iterate is counted in the average; confirm this is intended.
    loss_fwd(t) = norm(cumul_fwd/(t+1)-A_true,'fro');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Gradient descent / GLM-tron.
% Runs T_grad full-batch gradient steps from the zero matrix and records,
% after every step, the Frobenius error of the current iterate together
% with the cumulative compute time.
A0 = zeros(d,d);
% NOTE: this overwrites eta with a value ten times larger than the one used
% by the stochastic methods (factor 50 instead of 5).
eta = 50*(1-rho^2)/(d*(sig^2)*log(1+T));
loss_grad = zeros(T_grad,1);
time_grad = zeros(T_grad,1);
elapsed_grad = 0;   % cumulative compute time so far

for t = 1:T_grad
    tic
    grad = get_gradient(fun,X,A0,T,d);
    A0 = A0 - eta*grad;
    elapsed_grad = elapsed_grad + toc;
    time_grad(t) = elapsed_grad;
    loss_grad(t) = norm(A0-A_true,'fro');
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Generalized (quasi) Newton method.
% G is the empirical second-moment matrix of the states,
%   G = (1/T) * sum_{t=1..T} X(:,t)*X(:,t)'.
% Every iteration preconditions the full-batch gradient with G^{-1}:
%   A_newton <- A_newton - eta_newton * grad * G^{-1}.
% The Frobenius error of the iterate and the cumulative compute time are
% recorded after every step. Building G is not included in the timing.
G = (X(:,1:T)*transpose(X(:,1:T)))/T;

A_newton = zeros(d,d);
eta_newton = 0.4;
loss_newton = zeros(T_newton,1);
time_newton = zeros(T_newton,1);
elapsed_newton = 0;   % cumulative compute time so far

for t = 1:T_newton
    tic
    % Every step, including the first, uses the gradient at the current
    % Newton iterate and the Newton step size. (Previously the first step
    % used the gradient at the final GLM-tron iterate A0 and the GLM-tron
    % step size eta.)
    grad = get_gradient(fun,X,A_newton,T,d);
    % grad/G solves Z*G = grad, i.e. Z = grad*inv(G), without forming the
    % explicit inverse.
    A_newton = A_newton - eta_newton*(grad/G);
    elapsed_newton = elapsed_newton + toc;
    time_newton(t) = elapsed_newton;
    loss_newton(t) = norm(A_newton-A_true,'fro');
end

% Plots.
% Figure 1: error versus cumulative compute time for all five methods.
% Figure 2: error versus number of SGD steps for the three stochastic
%           methods.
set(groot, 'defaultTextInterpreter','latex');

% Title and y-label shared by both figures.
plot_title = {['LeakyReLU AR, randBiMod $A^*$, $\rho=',num2str(rho),'$, dimension $d=',num2str(d),'$']};
err_label = '$||A_{T,avg}-A^*||_{F}$';

loglog(time_fwd,loss_fwd,'cyan');
hold on   % hold stays on for the remaining curves of this figure
loglog(time_er,loss_er,'black')
loglog(time_rer,loss_rer,'green');
loglog(time_grad,loss_grad,'red');
loglog(time_newton,loss_newton,'blue');
title(plot_title);
xlabel('Compute Time');
ylabel(err_label);
legend({'Forward SGD','SGD-ER','SGD-RER','GLMtron','Quasi Newton'},'Interpreter','latex');

figure()
% The buffered methods report one error value per window of B+u columns.
% For the first B+u-1 steps, before the first value is available, the curve
% is padded with norm(A_true,'fro'), the error of the zero matrix.
steps_buf = [1:(B+u-1),(1:T/(B+u))*(B+u)];
loss_pad = ones(B+u-1,1)*norm(A_true,'fro');

loglog(1:T-1,loss_fwd,'cyan')
hold on   % hold stays on for the remaining curves of this figure
loglog(steps_buf,[loss_pad ;loss_er],'black')
loglog(steps_buf,[loss_pad ;loss_rer],'green')
title(plot_title);
xlabel('Number of SGD Steps');
ylabel(err_label);
legend({'Forward SGD','SGD-ER','SGD-RER'},'Interpreter','latex');
function [X] = data_gen(fun, X0, A_true, sig, d, T)
% DATA_GEN  Simulate a trajectory of the nonlinear recursion
%   X(:,k) = fun(A_true * X(:,k-1)) + sig * randn(d,1),   k = 1..T,
% where X(:,0) is taken to be X0.
%
%   fun    - function handle applied to A_true * state
%   X0     - initial state (it is not stored in the output)
%   A_true - matrix applied to the previous state
%   sig    - scale of the additive randn noise
%   d      - state dimension (number of rows of X)
%   T      - number of steps (number of columns of X)
%
%   X      - d-by-T matrix whose k-th column is the state after k steps.
%            For T = 0 the result is the empty d-by-0 matrix.
    X = zeros(d,T);
    prev = X0;   % state fed into the next step
    for k = 1:T
        X(:,k) = fun(A_true*prev) + sig*randn(d,1);
        prev = X(:,k);
    end
end

function [grad] = get_stochastic_gradient(fun,X,Y,A)
% GET_STOCHASTIC_GRADIENT  Update direction computed from a single sample.
%   Returns (fun(A*X) - Y) * X.', i.e. the prediction residual times the
%   transposed input.
%
%   fun - function handle applied to A*X
%   X   - input (the callers in this file pass one state column)
%   Y   - target (the callers in this file pass the next state column)
%   A   - current parameter estimate
    residual = fun(A*X) - Y;
    grad = residual * X.';
end

function [grad] = get_gradient(fun,X,A,T,d)
% GET_GRADIENT  Full-batch update direction over consecutive column pairs.
%   Sums (fun(A*X(:,k)) - X(:,k+1)) * X(:,k).' for k = 1..T-1 and divides
%   the sum by T.
%
%   fun - function handle applied to A*X(:,k)
%   X   - data matrix; columns 1..T are read
%   A   - current parameter estimate
%   T   - number of columns of X to use
%   d   - size of the returned d-by-d matrix
%
%   NOTE(review): T-1 terms are summed but the sum is divided by T --
%   confirm whether the normalisation should be T-1.
    acc = zeros(d,d);
    for k = 1:T-1
        x_now = X(:,k);
        residual = fun(A*x_now) - X(:,k+1);
        acc = acc + residual * x_now.';
    end
    grad = acc/T;
end