%Cascaded Tanks derivatives Random Forest five-fold verification Comparison

%Read the two data files and keep transposed copies of their u and y fields
S = load('Tank1.mat');
u1 = S.u';
y1 = S.y';
S2 = load('Tank2.mat');
u2 = S2.u';
y2 = S2.y';

%Time vectors: sample index multiplied by 5 (data set 1) or by 4 (data set 2)
t1 = 5*(1:length(u1));
t2 = 4*(1:length(u2));

%First and second column of each y matrix (named hXY: data set X, column Y)
h11 = y1(:,1);
h21 = y2(:,1);
h12 = y1(:,2);
h22 = y2(:,2);

%Derivative of every h signal with respect to its own time vector
derivh11 = center_difference(h11,t1);
derivh21 = center_difference(h21,t2);
derivh12 = center_difference(h12,t1);
derivh22 = center_difference(h22,t2);

%Lowess smoothing of the derivatives; the input u is smoothed with the
%default method of smooth
smderivh11 = smooth(derivh11,'lowess');
smderivh21 = smooth(derivh21,'lowess');
smderivh12 = smooth(derivh12,'lowess');
smderivh22 = smooth(derivh22,'lowess');
smu1 = smooth(u1);
smu2 = smooth(u2);

%Stack data set 1 on top of data set 2 (split into test/train folds later)
h1 = vertcat(h11,h21);
h2 = vertcat(h12,h22);
u = vertcat(u1,u2);
smu = vertcat(smu1,smu2);
smderivh1 = vertcat(smderivh11,smderivh21);
smderivh2 = vertcat(smderivh12,smderivh22);

%Predictor matrices: the dh1/dt model sees [h1, h2, smoothed u],
%the dh2/dt model sees [h1, h2, raw u]
inputs_h1 = horzcat(h1,h2,smu);
inputs_h2 = horzcat(h1,h2,u);

%Set up Random Forest Parameters
numTrees = 100;               %number of trees grown per forest
isCategorical = zeros(3,1);   %one flag per predictor column; none is categorical
leaf = 5;                     %value passed to TreeBagger as 'MinLeafSize'
%Loop-invariant list of categorical predictor columns (empty here)
categoricalIdx = find(isCategorical == 1);

%Ordered (contiguous, unshuffled) k-fold layout derived from the actual data
%length instead of hard-coded 2000/8000 row counts. For 10000 samples this
%gives exactly the five blocks of 2000 rows; for any other length every
%sample is still used as test data exactly once.
numFolds = 5;
numSamples = size(inputs_h1,1);
if size(inputs_h2,1) ~= numSamples || size(smderivh1,1) ~= numSamples ...
        || size(smderivh2,1) ~= numSamples
    error('CascadedTanksRF:sizeMismatch', ...
        'inputs_h1, inputs_h2, smderivh1 and smderivh2 must have the same number of rows.');
end
if numSamples < numFolds
    error('CascadedTanksRF:tooFewSamples', ...
        'At least %d samples are required for %d-fold verification.',numFolds,numFolds);
end
%foldEdges(i)+1 : foldEdges(i+1) are the test rows of fold i
foldEdges = round(linspace(0,numSamples,numFolds+1));

%Initialize measurement values
deriv_h1_fit = cell(numFolds,1);   %per-fold predictions of dh1/dt
deriv_h2_fit = cell(numFolds,1);   %per-fold predictions of dh2/dt
err_h1 = zeros(numFolds,1);        %per-fold MAE for dh1/dt
err_h2 = zeros(numFolds,1);        %per-fold MAE for dh2/dt

T_Train = zeros(numFolds,1);       %per-fold time to build both forests
T_Predict = zeros(numFolds,1);     %per-fold time to predict with both forests

for i=1:numFolds
    %Logical mask of the held-out rows; all remaining rows are training data.
    %Logical indexing keeps the original row order.
    isTest = false(numSamples,1);
    isTest(foldEdges(i)+1:foldEdges(i+1)) = true;

    test_inputs_h1 = inputs_h1(isTest,:);
    test_inputs_h2 = inputs_h2(isTest,:);
    test_derivh1 = smderivh1(isTest,:);
    test_derivh2 = smderivh2(isTest,:);

    train_inputs_h1 = inputs_h1(~isTest,:);
    train_inputs_h2 = inputs_h2(~isTest,:);
    train_derivh1 = smderivh1(~isTest,:);
    train_derivh2 = smderivh2(~isTest,:);

    %Model building using training data (and timing). The timed section
    %covers both forests, including the out-of-bag predictor importance
    %requested via 'OOBPredictorImportance'.
    %NOTE(review): TreeBagger resamples randomly, so results presumably vary
    %between runs unless the random generator is seeded beforehand.
    T_TrainS = tic;
    bh1 = TreeBagger(numTrees,train_inputs_h1,train_derivh1,'Method','regression', ...
        'OOBPredictorImportance','On', ...
        'CategoricalPredictors',categoricalIdx, ...
        'MinLeafSize',leaf);
    bh2 = TreeBagger(numTrees,train_inputs_h2,train_derivh2,'Method','regression', ...
        'OOBPredictorImportance','On', ...
        'CategoricalPredictors',categoricalIdx, ...
        'MinLeafSize',leaf);
    T_Train(i) = toc(T_TrainS);


    %Predictions of test set (and timing, again for both models together)
    T_PredictS = tic;
    deriv_h1_fit{i} = predict(bh1,test_inputs_h1);
    deriv_h2_fit{i} = predict(bh2,test_inputs_h2);
    T_Predict(i) = toc(T_PredictS);


    %Calculation of the MAE for each fold:
    err_h1(i) = mean(abs(deriv_h1_fit{i}-test_derivh1));
    err_h2(i) = mean(abs(deriv_h2_fit{i}-test_derivh2));
end

%Summary over all folds. The statements below deliberately have no trailing
%semicolon so that each result is echoed to the Command Window.

%Average value of the MAEs over all folds:
MAE_h1 = mean(err_h1)
MAE_h2 = mean(err_h2)

%Average times for model building and predictions over all folds
%(each per-fold time covers both the dh1/dt and the dh2/dt model):
Mean_Train_Time = mean(T_Train)
Mean_Predict_Time = mean(T_Predict)

%Standard deviation of the MAE over the folds. The weight flag 1 makes std
%normalise by the number of folds N (population form) rather than N-1, and
%the divisor follows the length of err_h1/err_h2 instead of a hard-coded 5.
MAE_h1_stdev = std(err_h1,1)
MAE_h2_stdev = std(err_h2,1)