% ---- Problem setup -------------------------------------------------------
% Builds a random logistic-regression instance and the function handles
% (loss / grad / hess) shared by every optimizer run below.
rng(10)                                 % fixed seed: identical random draws on every run
format long;                            % print numbers with full precision
iterations = 5000;                      % iteration budget used by each optimizer loop

% Problem dimensions; d is an alias for the number of features.
n_samples = 2000;
p_features = 2000;
d = p_features;

% Random data. The rand calls stay in the order C, b, w_0 so the seeded
% stream yields the same values as before.
C = rand(n_samples, p_features);        % entries uniform in (0,1)
b = 2*round(rand(n_samples, 1)) - 1;    % each entry is -1 or +1
mu = 0;                                 % third argument of logistic(); presumably a regularization weight -- TODO confirm

% Objective object (class defined outside this file) and thin wrappers.
obj = logistic(C, b, mu);
loss = @(w) obj.loss(w);
grad = @(w) obj.grad(w);
hess = @(w) obj.hessian(w);

% Common starting point for all runs: uniform draws scaled by 1e4.
w_0 = 1e4*rand(p_features, 1);

%%%%%%%%%%%%%%%%%%%%%%%%%%% Quasi-Newton %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Iteration axis (starts at iteration 0) and the four convergence
% histories; every history starts at 1 and is extended by the loops below.
x_qn    = 0;
y_qn_I  = 1;
y_qn_cI = 1;
y_gd    = 1;
y_agd   = 1;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Reference run: BFGS in inverse-Hessian form, with the step length chosen
% by a derivative-free 1-D search (fminsearch). The final iterate is stored
% in w_opt and serves as the reference point for the relative errors that
% the later runs record.
w = w_0;
% Initial inverse-Hessian approximation: identity scaled by ||inv(hess(w_0))||.
H = norm(inv(hess(w)))*eye(d);
options = optimset('MaxIter',100);      % loop-invariant, so built once

for iter = 1:iterations
    if mod(iter, 100) == 0
        disp(iter);                     % progress indicator
    end
    gradient = grad(w);
    % Form H*gradient once per iteration so each objective evaluation inside
    % fminsearch costs O(d) instead of rescaling the d-by-d matrix.
    descent = H*gradient;
    step_size = @(eta) loss(w - eta*descent);
    lambda = fminsearch(step_size, 1, options);
    w_new = w - lambda*descent;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;
    sy = s'*y;
    % BFGS inverse update:
    %   H <- H - t*(H*y*s' + s*y'*H) + (t^2*y'*H*y + t)*s*s',  t = 1/(s'*y).
    % It is only well defined (and keeps H positive definite) when s'*y > 0.
    % The search above does not enforce that, so skip the update otherwise
    % (this also covers a zero step and NaN values).
    if sy > 1e-12*norm(s)*norm(y)
        t = 1.0/sy;
        Hy = H*y;
        G = t*Hy*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*Hy) + t)*K;
    end
    w = w_new;
end

% Iteration axis 0..iterations used by the plots.
x_qn = 0:iterations;

w_opt = w;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% BFGS started from H = I, with a bisection line search targeting the weak
% Wolfe conditions. Records the relative error
%   (loss(w_k) - loss(w_opt)) / (loss(w_0) - loss(w_opt))
% in y_qn_I, one entry per iteration (entry 1 is the starting value 1).
w = w_0;
H = eye(d);
alpha = 0.1;    % sufficient-decrease (Armijo) constant
beta = 0.9;     % curvature constant

% Loop-invariant reference values for the relative error.
f_opt = loss(w_opt);
f_gap = loss(w_0) - f_opt;
y_qn_I = ones(1, iterations + 1);       % preallocated; y_qn_I(1) = 1

for iter = 1:iterations
    if mod(iter, 100) == 0
        disp(iter);                     % progress indicator
    end
    gradient = grad(w);
    function_value = loss(w);
    direction = -H*gradient;
    slope = dot(gradient, direction);   % directional derivative along direction

    % Bracketing/bisection search on lambda, at most 100 trials.
    % lambda_max = Inf means "no upper bound found yet".
    lambda = 1;
    lambda_max = Inf;
    lambda_min = 0;
    for i = 1:1e2
        if loss(w + lambda*direction) > function_value + alpha*lambda*slope
            % Sufficient decrease violated: step too long, shrink bracket.
            lambda_max = lambda;
            lambda = (lambda_max + lambda_min)/2;
        elseif dot(grad(w + lambda*direction), direction) < beta*slope
            % Curvature condition violated: step too short.
            lambda_min = lambda;
            if isinf(lambda_max)
                lambda = 2*lambda;      % still unbounded above: expand
            else
                lambda = (lambda_max + lambda_min)/2;
            end
        else
            break                       % both conditions hold
        end
    end

    w_new = w + lambda*direction;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;
    sy = s'*y;
    % BFGS inverse update; skipped when s'*y is not safely positive (possible
    % if the search hit its trial limit), since 1/(s'*y) would then produce
    % Inf/NaN or destroy positive definiteness of H.
    if sy > 1e-12*norm(s)*norm(y)
        t = 1.0/sy;
        Hy = H*y;
        G = t*Hy*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*Hy) + t)*K;
    end
    w = w_new;
    y_qn_I(iter + 1) = (loss(w) - f_opt)/f_gap;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% BFGS started from H = (1/c)*I, where c is a secant curvature estimate
% taken between two random points. Uses the same bisection line search
% (weak Wolfe conditions, constants alpha and beta set earlier in the
% script) and records the relative error in y_qn_cI.
w = w_0;
w_1 = rand(d, 1);
w_2 = rand(d, 1);
gradient_1 = grad(w_1);
gradient_2 = grad(w_2);
% c = (g2 - g1)'*(w2 - w1) / ||w2 - w1||^2
c = dot(gradient_2 - gradient_1, w_2 - w_1)./dot(w_2 - w_1, w_2 - w_1);
H = (1./c)*eye(d);

% Loop-invariant reference values for the relative error.
f_opt = loss(w_opt);
f_gap = loss(w_0) - f_opt;
y_qn_cI = ones(1, iterations + 1);      % preallocated; y_qn_cI(1) = 1

for iter = 1:iterations
    if mod(iter, 100) == 0
        disp(iter);                     % progress indicator
    end
    gradient = grad(w);
    function_value = loss(w);
    direction = -H*gradient;
    slope = dot(gradient, direction);   % directional derivative along direction

    % Bracketing/bisection search on lambda, at most 100 trials.
    % lambda_max = Inf means "no upper bound found yet".
    lambda = 1;
    lambda_max = Inf;
    lambda_min = 0;
    for i = 1:1e2
        if loss(w + lambda*direction) > function_value + alpha*lambda*slope
            % Sufficient decrease violated: step too long, shrink bracket.
            lambda_max = lambda;
            lambda = (lambda_max + lambda_min)/2;
        elseif dot(grad(w + lambda*direction), direction) < beta*slope
            % Curvature condition violated: step too short.
            lambda_min = lambda;
            if isinf(lambda_max)
                lambda = 2*lambda;      % still unbounded above: expand
            else
                lambda = (lambda_max + lambda_min)/2;
            end
        else
            break                       % both conditions hold
        end
    end

    w_new = w + lambda*direction;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;
    sy = s'*y;
    % BFGS inverse update; skipped when s'*y is not safely positive (possible
    % if the search hit its trial limit), since 1/(s'*y) would then produce
    % Inf/NaN or destroy positive definiteness of H.
    if sy > 1e-12*norm(s)*norm(y)
        t = 1.0/sy;
        Hy = H*y;
        G = t*Hy*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*Hy) + t)*K;
    end
    w = w_new;
    y_qn_cI(iter + 1) = (loss(w) - f_opt)/f_gap;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% ---- Gradient descent (GD) ------------------------------------------------
% Steepest descent with the step length chosen by fminsearch. The relative
% error of each iterate is stored in y_gd, which the figure labels 'GD'.
w = w_0;
options = optimset('MaxIter',1000);     % shared by both loops in this section

% Loop-invariant reference values for the relative error.
f_opt = loss(w_opt);
f_gap = loss(w_0) - f_opt;
y_gd = ones(1, iterations + 1);         % preallocated; y_gd(1) = 1

for iter = 1:iterations
    if mod(iter, 100) == 0
        disp(iter);                     % progress indicator
    end
    gradient = grad(w);
    step_size = @(eta) loss(w - eta*gradient);
    lambda = fminsearch(step_size, 1, options);
    w_new = w - lambda*gradient;
    w = w_new;
    y_gd(iter + 1) = (loss(w) - f_opt)/f_gap;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% ---- Accelerated gradient descent (AGD) -----------------------------------
% Gradient step taken from an extrapolated point z. The running minimum of
% the relative error is stored in y_agd, which the figure labels 'AGD'.
w_prev = w_0;
w_curr = w_0;
a = 0;
% Extrapolation weight. Kept in its own variable so the label vector b
% defined at the top of the script is not overwritten.
momentum = 0;
y_agd = ones(1, iterations + 1);        % preallocated; y_agd(1) = 1

for iter = 1:iterations
    if mod(iter, 100) == 0
        disp(iter);                     % progress indicator
    end
    z = w_curr + momentum*(w_curr - w_prev);
    gradient = grad(z);
    step_size = @(eta) loss(z - eta*gradient);
    lambda = fminsearch(step_size, 1, options);
    w_new = z - lambda*gradient;
    w_prev = w_curr;
    w_curr = w_new;
    a_prev = a;
    a = (1 + sqrt(1 + 4*a*a))/2;
    % NOTE(review): the usual Nesterov/FISTA weight is (a_prev - 1)/a; this
    % script uses a_prev/a. Left unchanged -- confirm it is intentional.
    momentum = a_prev/a;
    % The iterates need not decrease monotonically, so keep the best value so far.
    y_agd(iter + 1) = min(y_agd(iter), (loss(w_curr) - f_opt)/f_gap);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% ---- Convergence plot -----------------------------------------------------
% Semilog plot of the four recorded relative-error histories against the
% iteration index. Axis range and ticks follow `iterations` so the figure
% stays consistent if the iteration budget is changed.
figure;
marker_idx = 1:500:length(y_qn_I);      % one marker every 500 points
semilogy(x_qn, y_qn_I, 'r-*', 'LineWidth', 3, 'MarkerIndices', marker_idx);
grid on
hold on
semilogy(x_qn, y_qn_cI, 'b-*', 'LineWidth', 3, 'MarkerIndices', marker_idx);
semilogy(x_qn, y_gd, 'g-*', 'LineWidth', 3, 'MarkerIndices', marker_idx);
semilogy(x_qn, y_agd, 'k-*', 'LineWidth', 3, 'MarkerIndices', marker_idx);
legend({'BFGS I', 'BFGS cI', 'GD', 'AGD'}, 'Interpreter','latex','fontsize',20,'Location','southeast')
xlim([0 iterations])
ylim([1e-20 1e1])
xticks(0:500:iterations)
ax = gca;
ax.FontSize = 15;
xlabel('number of iterations $k$','Interpreter','latex','fontsize',30)
ylabel('$\frac{f(x_k) - f(x_*)}{f(x_0) - f(x_*)}$','Interpreter','latex','fontsize',30)
set(gcf,'position',[0,0,600,400])
hold off