
% Generate an instance of the reward learning problem.
%
% This section reads the following variables from the workspace; they must
% be defined before the script runs (example values are the ones the
% original author left commented out):
%   d     - length of the reward/policy coefficient vectors (e.g. 3000)
%   nvec  - vector of sample sizes  (e.g. [64, 128, 256, 400, 512, 750, 1024])
%   id    - experiment id, used as an index into nvec       (e.g. 4)
%   bp    - decay exponent for the policy space             (e.g. 2)
%   br    - decay exponent for the reward space             (e.g. 1.70)
%   tau   - noise scale (e.g. 0.1); the responses generated later in this
%           file add tau*randn, so tau acts as a standard deviation
%   const - multiplier on the number of distinct query points
%   regc  - divisor applied to the ridge regularisation parameter

b = bp - br; % difference of the decay exponents: Sp./Sr = j.^b
ratepred = -b/(b+2); % presumably the predicted rate exponent; not used below

% Set up reward and policy spaces: power-law spectra j^br and j^bp
idx = (1:d)';
Sr = idx.^br;
Sp = idx.^bp;

Srinv = 1./Sr;
Spinv = 1./Sp;

% Form the adjoint of the map M as an elementwise (diagonal) scaling.
% M itself is never built; a commented-out `M = eye(d)` in the original
% suggests it is the identity.
Mad = Spinv.*Sr;

rng(3) % fixed seed so the instance below is reproducible
% Optimal reward rstar: sorted, unit-norm, nonnegative coefficients scaled
% by Sr.^(-1/2)
z = sort(abs(randn(d,1)), 'descend');
z = z/norm(z);
rstar = sqrt(Srinv).*z;

% Sample size for this experiment
n = nvec(id);

% Select the query points
% (alternatives tried by the original author: al = 1/(b+2) and al = 1)
al = (1 - log(tau^2)/log(n))/(b+2);
% Number of distinct query points: at most n and at most d. It is clamped
% to at least 1 because nsamp = 0 would make floor(n/nsamp) infinite further
% down and break the allocation of the response vector.
nsamp = max(1, min(min(floor(const*n^al), n), d));
regp = n^(-(b+1)/(b+2))/regc; % ridge regularisation parameter
% Query i is the i-th coordinate direction scaled by Sp(i)^(-1/2)
qset = 1./sqrt(Sp(1:nsamp));

% Monte-Carlo evaluation: for each of numruns independent noise draws, fit a
% ridge-regression reward estimate from noisy responses on the query points
% and record the gap between the optimal policy and the estimated policy,
% measured as rstar' * (Sr .* (polstar - polhat)).
%
% Reads from the workspace: d, n, nsamp, qset, rstar, Sr, Sp, Mad, regp,
% tau, numruns. Writes errvec (numruns-by-1).
%
% Everything that does not depend on the noise draw is computed once, ahead
% of the loop; only the responses y and what follows from them are redone
% per run. The random stream is consumed in the same order as before.

% Each query point is repeated nquerypoint times; neff is the number of
% responses actually used (n rounded down to a multiple of nsamp).
nquerypoint = floor(n/nsamp);
neff = nquerypoint*nsamp;

% Design matrix: column block i holds nquerypoint copies of qset(i)*e_i
x = zeros(d,neff);
for i = 1:nsamp
    x(i, (i-1)*nquerypoint +1: i*nquerypoint) = qset(i);
end

% Empirical covariance and ridge system matrix. The name covmat avoids
% shadowing MATLAB's built-in cov function in the workspace.
covmat = x*x'/neff;
ridgemat = covmat*diag(Sr) + regp*eye(d);

% Policy induced by the true reward (identical for every run)
polstar = optimize_reward(rstar, Sp, Mad);

errvec = zeros(numruns, 1);
for runiter = 1:numruns
    % Obtain responses on query points: signal qset(i)*rstar(i)*Sr(i) plus
    % independent Gaussian noise of scale tau
    y = zeros(neff,1);
    for i = 1:nsamp
        y((i-1)*nquerypoint +1: i*nquerypoint) = qset(i)*rstar(i)*Sr(i) + tau*randn(nquerypoint, 1);
    end

    % Ridge regression estimate for the reward (solved with backslash
    % rather than an explicit inverse)
    xy = x*y/neff;
    rhat = ridgemat\xy;

    polhat = optimize_reward(rhat, Sp, Mad);

    % Alternative error measure left by the original author:
    %   err = norm(sqrt(Sp)*Mad*(rstar - rhat))^2;
    err = rstar'*(Sr.*(polstar - polhat));
    errvec(runiter) = err;
end

function pol = optimize_reward(r, Sp, Mad)
% OPTIMIZE_REWARD  Turn a reward vector into a policy of unit Sp-weighted norm.
%
%   pol = optimize_reward(r, Sp, Mad) scales r elementwise by Mad and
%   divides the result by its weighted norm sqrt(p' * (Sp .* p)), so that
%   pol' * (Sp .* pol) equals 1 whenever that norm is nonzero and finite.
%
%   Inputs (vectors of equal length, as used by the calling script):
%     r   - reward coefficients
%     Sp  - weights defining the policy norm
%     Mad - elementwise scaling applied to r
%
%   Output:
%     pol - normalised policy, same size as r. If Mad.*r is identically
%           zero, the zero vector is returned instead of the NaN vector
%           that 0/0 would produce.
polhat = Mad.*r;
if ~any(polhat)
    % Nothing to normalise: every direction is equally good for a zero
    % reward, so return the zero policy rather than dividing by zero.
    pol = polhat;
    return
end
normp = sqrt(polhat'*(Sp.*polhat));
pol = polhat/normp;
end


