% COAF-TS movie recommendation experiment: data loading, feature
% extraction, ground-truth linear model and the reference threshold Gamma_tr.
clc; clear;   % plain `clear` removes workspace variables; `clear all` is not needed
close all;

%% Initializing variables: d = dimension of context, lambda = COAF parameter
rng(120,"twister") % fixed seed so that every run draws the same random numbers
% ratings   = movie ratings from the MovieLens 1M dataset
% ratings_r = full movie ratings recovered with matrix completion
load('movielens1M.mat');

% Work in single precision and rescale with x/25 - 1.
% NOTE(review): presumably this maps the stored rating range onto [-1, 1]
% (the thresholds used later are clipped to that interval) - confirm against
% the contents of movielens1M.mat.
ratings_r = single(ratings_r)/25 - 1;

num_movies = size(ratings_r,1); % rows of ratings_r are treated as movies
d = 10; % or 16
lambda = 0.02;

%% Feature extraction with PCA
% Only the coefficient matrix is needed: each row of ratings_r is projected
% onto the first d principal-component directions.
coeff = pca(ratings_r);
movie_features = ratings_r * coeff(:,1:d);

%% Fitting for ground truth model: Y = mean rating
Y = mean(ratings_r,2);
mdf = fitlm(movie_features,Y,'linear');
% Keep the d slope estimates; the first coefficient row (intercept) is dropped.
theta_star = mdf.Coefficients.Estimate(2:end);
% Reward of every movie under the fitted linear model (not used further in
% this script; kept for inspection).
w_movie = movie_features*theta_star;

%% Computing maximum mean reward per time step with Robbins-Monro algorithm
n_steps = 50000;
step_coef = 1.0;
step_power= 5.0; % robbins_monro uses this as a multiplier of the step index
%Gamma_tr: maximum mean reward per time step
Gamma_tr = robbins_monro(n_steps, step_coef, step_power,theta_star,movie_features);

%% Online movie recommendation experiment
% Runs itr independent COAF-TS episodes, records the reward obtained at each
% of the first T time steps, and accumulates the regret against Gamma_tr.
itr = 2000;
Gamma_table = zeros(itr,1);   % final threshold Gamma of every episode
T=10000;
Rewchart = zeros(itr,T);      % reward per time step, one row per episode
% NOTE(review): the log argument below is 1 + d/(T*lambda); confidence radii
% of this kind are more commonly written with 1 + T/(d*lambda) - confirm
% which form is intended before changing it.
beta = sqrt(lambda) + sqrt(2*log(T)+d*log(1 + d/T/lambda));

% Largest amounts a single coaf_ts call can add to the clock. They mirror
% the ranges drawn in cbandit (request time 5..10, 6..20 candidates).
max_wait = 10;
max_arms = 20;

for i=1:itr
    t = 0;
    %Gamma = -1.0 + 2.0 * rand();
    Gamma = 0; %fixed Gamma initialization
    step_value = 0; %%step size parameter initialization
    % The last call starts with t <= T, so rewards can be written up to index
    % T + max_wait + max_arms; allocate that much to avoid regrowing the array.
    Rewtable = zeros(T + max_wait + max_arms,1);
    iV = eye(d)/lambda;   % inverse of the regularised design matrix
    B = zeros(d,1);
    n = 0;                % total number of accepted arms in this episode
    % NOTE(review): the loop also runs when t == T, although entries 1..T are
    % already filled at that point; kept as is to preserve the results.
    while t <= T
        [Rwd, Na, Sa, Gamma, step_value, iV, B] = coaf_ts(Gamma, step_value, movie_features, theta_star, iV, B, ratings_r, beta);
        t = t + Sa;       % Sa time steps pass with zero reward
        n = n + Na;
        if Na >=1
            % One time step per accepted arm, each carrying its reward.
            Rewtable(t + 1 : t + Na)=Rwd;
        end
        t = t + Na;
    end
    Gamma_table(i)= Gamma;
    Rewchart(i,:) = Rewtable(1:T);
end

% Cumulative regret per episode: reference rate minus collected reward.
Regchart = cumsum(Gamma_tr- Rewchart,2);

%% plot regret
% Sub-sample 200 time points and plot the spread of the cumulative regret
% across episodes (rows of Regchart) together with its mean.
time = round(linspace(1,T,200));
regret = Regchart(:,time);

risk1 = 5;  % lower percentile: top edge of the darkest band
risk2 = 5;  % upper tail in percent: the middle band ends at the (100-risk2)th percentile

% Statistics are taken across episodes. The dimension is given explicitly so
% that a single-episode run (one row) is not collapsed along time instead.
p_low  = prctile(regret, risk1, 1);
p_high = prctile(regret, 100-risk2, 1);
p_max  = prctile(regret, 100, 1);

COAFts_mov_fig = figure('position', [10 10 320 280]);
% Stacked areas: [0, p_low], [p_low, p_high], [p_high, p_max].
h = area(time, [p_low; p_high - p_low; p_max - p_high]');
set(h(1),'FaceColor', 0.55*[1 1 1]);
set(h(2),'FaceColor', 0.78*[1 1 1]);
set(h(3),'FaceColor', 0.96*[1 1 1]);
hold on;  
h = plot(time, mean(regret,1), 'k','LineWidth',2);
% set(h, 'LineWidth', 1.5);
xlabel('time', 'FontSize', 10);
ylabel('regret', 'FontSize', 10);
% legend('lower 5%','middle 90%','upper 5%','mean');
% legend('Location','northwest')
ylim([0 220])
saveas(COAFts_mov_fig,'COAFts_mov.png')



%% Contextual bandit with MovieLens 1M dataset
function [Xa, Sa, La, Ida] = cbandit(movie_features)
% CBANDIT  Draw one random round of candidate movies.
%
%   Input
%     movie_features : matrix with one row of features per movie
%   Outputs
%     Xa  : La-by-size(movie_features,2) feature rows of the drawn movies
%     Sa  : arm request time, a random integer in [5, 10]
%     La  : number of candidates, a random integer in [6, 20]
%     Ida : La-by-1 row indices into movie_features, drawn uniformly with
%           replacement

    % Number of candidates offered in this round
    La = randi([6, 20]);
      
    % Random arm request time between 5 and 10
    Sa = randi([5, 10]);
    
    % Sample row indices. size(...,1) is the row count; length() would return
    % the largest dimension and could index past the last row when the
    % matrix has fewer rows than columns.
    num_rows = size(movie_features,1);
    Ida = randi([1, num_rows],La,1);
    Xa = movie_features(Ida,:);    
end

%% COAF-TS
function [Rwd, Na, Sa, Gamma_new, step_value_new, iV_new, B_new] = coaf_ts(Gamma, step_value, movie_features, theta_star, iV, B, ratings, beta)
% COAF_TS  One COAF-TS round: draw candidates, accept those whose sampled
% reward reaches the threshold, then update threshold and statistics.
%
%   Inputs
%     Gamma, step_value : current threshold and accumulated step counter
%     movie_features    : feature matrix passed on to cbandit
%     theta_star        : parameter vector used for the returned rewards
%     iV, B             : regression statistics; the posterior mean is iV*B
%     ratings           : matrix indexed as (movie, user), source of feedback
%     beta              : scale applied to the sampling covariance
%   Outputs
%     Rwd               : Xa*theta_star restricted to the accepted candidates
%     Na                : number of accepted candidates
%     Sa                : request time returned by cbandit
%     Gamma_new         : updated threshold, clipped to [-1, 1]
%     step_value_new    : step_value + Sa
%     iV_new, B_new     : statistics after adding the accepted candidates

    [Xa, Sa, ~, Ida] = cbandit(movie_features);
    step_value_new = step_value + Sa;
    true_rwd = Xa*theta_star;

    % --- Thompson sampling ---
    % Draw a parameter vector around the posterior mean and score every
    % candidate both with the draw and with the mean itself.
    noise_variance = 1;
    theta_bar = iV*B;
    theta_tilde = mvnrnd(theta_bar, beta*noise_variance*iV);
    sampled_rwd = Xa * theta_tilde';
    mean_rwd = Xa * theta_bar;

    % Accept the candidates whose sampled reward is at least Gamma.
    s_id = find(sampled_rwd >= Gamma);
    Na = numel(s_id);
    Rwd = true_rwd(s_id);
    accepted_movies = Ida(s_id);

    % Feedback for each accepted movie: its rating by a uniformly drawn user.
    num_users = size(ratings,2);
    Rwd_nois = zeros(Na,1);
    for k = 1:Na
        userid = randi([1, num_users]);
        Rwd_nois(k) = ratings(accepted_movies(k), userid);
    end

    % Threshold step driven by the mean-parameter predictions (xi = 0.5).
    y = (Sa + Na) * Gamma - sum(mean_rwd(s_id));
    Gamma_new = Gamma - y/2/(step_value_new);
    Gamma_new = max(-1.0, min(1.0, Gamma_new));

    % Rank-one update of the inverse matrix, one accepted candidate at a time.
    iV_new = iV;
    for k = 1:Na
        x = Xa(s_id(k),:);
        g = x*iV_new*x';
        temp = x*iV_new;
        iV_new = iV_new - 1/(1+g)*(temp'*temp);
    end
    B_new = B + Xa(s_id,:)'*Rwd_nois;
end

%% robbins_monro algorithm
function Gamma_tr = robbins_monro(n_steps, step_coef, step_power,theta_star,movie_features)
% ROBBINS_MONRO  Run n_steps of a stochastic-approximation recursion for the
% threshold and return its final value.
%
%   Inputs
%     n_steps        : number of iterations
%     step_coef      : numerator of the step size
%     step_power     : factor multiplying the iteration index in the step size
%     theta_star     : parameter vector giving the reward Xa*theta_star
%     movie_features : feature matrix passed on to cbandit
%   Output
%     Gamma_tr       : threshold after the last iteration, within [-1, 1]

    % Start from a uniform random point in [-1, 1].
    thr = -1.0 + 2.0 * rand();

    for k = 1:n_steps
        gain = step_coef / (k*step_power);

        % Draw a round and keep the candidates whose reward reaches thr.
        [Xa, Sa, ~, ~] = cbandit(movie_features);
        Ya = Xa*theta_star;
        accepted = Ya >= thr;

        % Time spent (request time + accepted arms) valued at thr, minus the
        % reward actually collected.
        y = (Sa + nnz(accepted)) * thr - sum(Ya(accepted));

        thr = thr - gain * y;
        thr = max(-1.0, min(1.0, thr)); % clip to [-1, 1]
    end

    Gamma_tr = thr;
end