%% implementation and simulation of Alg. 3
%
% Input: (At,Bt)        true dynamics
%        thetaUbound    upper bound on parameters (A,B)
%        thetaLbound    lower bound on parameters (A,B)
%        Q,R            defines stage cost as x' Q x + u' R u
%        Kt             feedback gain corresponding to dynamics (At,Bt)
%        T              time horizon / simulation time
%
% Output: cost          stage cost over horizon (length T)
%         cost_opt      stage cost of policy Kt over horizon (length T)
%                       with same process noise realization as cost
%         trajectories  struct with state trajectory, input trajectory,
%                       applied policies, state/input trajectory
%                       corresponding to Kt
%         estimates     struct with parameter estimate (as sampled by the
%                       algorithm), as well as mean and covariance of the
%                       underlying Gaussian distribution
%
%
function [cost,cost_opt,trajectories,estimates]=runExperiment(At,Bt,thetaUbound,thetaLbound,Q,R,Kt,T)

% extract problem size
n=size(Q,1);
nu=size(R,1);

% compute effective dimension (thetaUbound==thetaLbound) for parameters
% that are certain
p=n^2+n*nu-sum(thetaUbound==thetaLbound);

% eliminate known parameters
idx_known=thetaUbound==thetaLbound;
thetaUboundOrg=thetaUbound;
thetaLboundOrg=thetaLbound;

thetaLbound=thetaLboundOrg(~idx_known);
thetaUbound=thetaUboundOrg(~idx_known);

% compute transformation matrix such that 
% theta_org=theta_org_known + MOrg * theta.
%
% MOrg has dimension n^2+n*nu x p.
thetaOrgKnown=thetaUboundOrg;
thetaOrgKnown(~idx_known)=0;
MOrg=eye(n^2+n*nu);
MOrg=MOrg(:,~idx_known);

% random initialization
theta0=rand(p,1);
theta0=thetaLbound.*theta0+(1-theta0).*thetaUbound;

% initial matrices
[A0,B0]=getMatrices(thetaOrgKnown+MOrg*theta0,n,nu);

% initial condition
x1=zeros(n,1);

% initialize the vector of parameter estimates
theta=zeros(p,T);
theta(:,1)=theta0;

% initialize the vector of feedback policies
Ktraj=zeros(n*nu,T);
K1=-dlqr(A0,B0,Q,R);
Ktraj(:,1)=K1(:);

% mean/variance corresponding to parameter estimates
mutheta=zeros(p,T);
vartheta=zeros(p,T);

% initial variance arising from uniform distribution over parameter range
Pk=diag((thetaUbound-thetaLbound).^2/4);
vartheta(:,1)=diag(Pk);

% initialize parameters
eta=10;                 % learning rate
epsilon=sqrt((p)/T);    % packing width
sigma=1;                % process noise variance
M=5;                    % persistance of excitation parameter
sigma_u=sqrt(10/(M*eta*nu*epsilon)*(2+p)); % initial excitation
    
% initialize the vector of excitations
sigma_uk=zeros(1,T);
sigma_uk(1)=sigma_u;

% initialize state/input trajector vector
xtraj=zeros(n,T);
xtraj(:,1)=x1;
utraj=zeros(nu,T);

% initialize state trajectory vector belonging to policy Kt
xtraj_opt=zeros(n,T);
xtraj_opt(:,1)=x1;

% initialize cost vector
cost=zeros(1,T);
cost(1)=x1'*Q*x1+utraj(:,1)'*R*utraj(:,1);
cost_opt=zeros(1,T);
cost_opt(1)=x1'*Q*x1+x1'*Kt'*R*Kt*x1;

% initialize vector to store current parameter mean
mean_theta=zeros(p,1);

for k=2:T
    % generate process noise
    nk=randn(n,1);

    % simulate dynamics
    xtraj_opt(:,k)=(At+Bt*Kt)*xtraj_opt(:,k-1)+sigma*nk;
    xtraj(:,k)=At*xtraj(:,k-1)+Bt*utraj(:,k-1)+sigma*nk;    
    
    % recursive least squares update for parameters -----------------------
    %
    % 1. adjustment since subset of variables might be known
    Hls=kron([xtraj(:,k-1);utraj(:,k-1)]',eye(n));
    zls=xtraj(:,k)-Hls*thetaOrgKnown;
    Hls=Hls*MOrg;
    %
    % 2. compute correction matrix (recursive least squares)
    Kk=Pk*Hls'/(Hls*Pk*Hls'+eye(n));
    %
    % 3. update mean
    mean_theta=mean_theta+Kk*(zls-Hls*mean_theta);
    %
    % 4. update variance
    Pk=(eye(p)-Kk*Hls)*Pk*(eye(p)-Kk*Hls)'+Kk*Kk';
    %
    % 5. store parameters
    mutheta(:,k)=mean_theta;
    vartheta(:,k)=diag(Pk);
    % ---------------------------------------------------------------------

    % determine uk (input) by sampling from pk
    if mod(k-1,M)==0

        % sample from "posterior" over models -----------------------------
        %
        % -> truncated Gaussian with mean mean_theta and standard deviation 
        % CovRoot
        CovRoot=chol(1/eta*(Pk+Pk')+1e-4*eye(p))';
        theta(:,k)=mean_theta+CovRoot*randn(p,1);
        %
        % perform rejection sampling (for at most 10 steps)
        ncount=0;
        while (sum(theta(:,k)<thetaLbound)>0 || sum(theta(:,k)>thetaUbound)>0) && ncount<10
            theta(:,k)=mean_theta+CovRoot*randn(p,1);
            ncount=ncount+1;
        end
        %
        % truncate in case the above loop did not terminate
        idxviol=theta(:,k)<thetaLbound;
        theta(idxviol,k)=thetaLbound(idxviol);
        idxviol=theta(:,k)>thetaUbound;
        theta(idxviol,k)=thetaUbound(idxviol);
        % -----------------------------------------------------------------

        % extract system matrices
        [A0,B0]=getMatrices(thetaOrgKnown+MOrg*theta(:,k),n,nu);

        % compute feedback policy by solving LQR
        K1=-dlqr(A0,B0,Q,R);

        % store feedback policy
        Ktraj(:,k)=K1(:);

        % compute amount of excitation
        sigma_u=sqrt(10/(M*eta*nu*epsilon)*(2/ceil(k/M)+(p)/ceil(k/M)^2));
    else
        theta(:,k)=theta(:,k-1);
        Ktraj(:,k)=Ktraj(:,k-1);
    end
    
    % store amount of excitation
    sigma_uk(k)=sigma_u;

    % extract feedback policy
    K1=reshape(Ktraj(:,k),nu,n);

    % apply policy and add excitation
    utraj(:,k)=K1*xtraj(:,k)+randn*sigma_uk(k);
    
    % compute statge costs
    cost(:,k)=xtraj(:,k)'*Q*xtraj(:,k)+utraj(:,k)'*R*utraj(:,k);
    cost_opt(:,k)=xtraj_opt(:,k)'*Q*xtraj_opt(:,k)+xtraj_opt(:,k)'*Kt'*R*Kt*xtraj_opt(:,k);
end

% store the estimates and trajectories in corresponding structs
estimates=struct('theta',theta,'mutheta',mutheta,'vartheta',vartheta);
trajectories=struct('xtraj',xtraj,'utraj',utraj,'xtraj_opt',xtraj_opt,'sigma_uk',sigma_uk,'Ktraj',Ktraj);

end


%% the function to retrieves (A,B) system matrices from theta
%
% input:    theta   parameters
%           nx,nu   state and input dimension
% output:   A,B     system matrices
%
function [A,B]=getMatrices(theta,nx,nu)
    A=theta(1:nx^2);
    A=reshape(A,nx,nx);
    B=theta(nx^2+1:end);
    B=reshape(B,nx,nu);
end

