%% main_har_10chunks.m
% Streaming PCA experiments for the HAR dataset using a 10-chunk sequential split.
% The training data (from train/X_train.txt) is divided into 9 contiguous chunks and the test data (from test/X_test.txt)
% is used as the 10th chunk. Ground truth is computed from the entire training data.
% In addition, within each chunk every entry is independently zeroed with probability 0.10
% (so roughly 10% of the features per datapoint, on average) to simulate missing data.
%
% Requirements:
%   - "UCI HAR Dataset/train/X_train.txt" and "UCI HAR Dataset/test/X_test.txt" must be reachable from the current folder (these are the paths passed to load).
%   - All required function files (e.g., get_oja_vec.m, get_var_estimates_subsampling.m, get_learning_rate.m, etc.) are in the MATLAB path.

%% 1. Load and Preprocess the HAR Dataset
% Reads the train and test feature matrices (one sample per row) and
% standardizes every feature column to zero mean and unit standard deviation.
fprintf('Loading HAR dataset...\n');
trainData = load('UCI HAR Dataset/train/X_train.txt');  % rows are feature vectors
testData = load('UCI HAR Dataset/test/X_test.txt');

[n_train, d] = size(trainData);
[n_test, d_test] = size(testData);
% The rest of the script sizes everything by d, so both splits must agree.
if d_test ~= d
    error('main_har_10chunks:featureMismatch', ...
        'Train and test data must have the same number of features (%d vs %d).', d, d_test);
end
fprintf('Training data: %d samples, %d features; Test data: %d samples, %d features.\n', n_train, d, n_test, d);

% Preprocessing: Although features are normalized per the README, we center and scale.
% Each split is standardized with its own column statistics.
% A constant column has zero spread; dividing by its std would yield NaN (0/0)
% and poison the covariance/SVD computed later, so such columns keep scale 1.
trainData = trainData - mean(trainData, 1);
train_scale = std(trainData, 0, 1);
train_scale(max(trainData, [], 1) == min(trainData, [], 1)) = 1;
trainData = trainData ./ train_scale;

testData = testData - mean(testData, 1);
test_scale = std(testData, 0, 1);
test_scale(max(testData, [], 1) == min(testData, [], 1)) = 1;
testData = testData ./ test_scale;

%% 2. Compute Ground Truth from the Entire Training Data
% The leading singular vector of the training covariance is used as the
% reference direction; the gap between the two largest singular values is
% recorded alongside it.
Sigma_train = cov(trainData);
[U_train, S_train, ~] = svd(Sigma_train);
groundTruth = U_train(:,1);
eig_vals = diag(S_train);
if length(eig_vals) > 1
    eigengap = eig_vals(1) - eig_vals(2);
else
    % Only one feature: there is no second value to compare against.
    eigengap = eig_vals(1);
end

% Build the symmetric square root from the decomposition already computed
% instead of calling sqrtm: a covariance matrix can be (numerically) singular,
% in which case sqrtm may warn or return entries with spurious imaginary parts.
% Singular values are nonnegative, so sqrt(eig_vals) is always real.
Sigma_train_sqrt = U_train * diag(sqrt(eig_vals)) * U_train';
Sigma_train_sqrt = (Sigma_train_sqrt + Sigma_train_sqrt') / 2;  % remove round-off asymmetry

% Parameter bundle handed to the estimator for every chunk.
data_params = struct();
data_params.n = n_train;
data_params.d = d;
data_params.Sigma_true = Sigma_train;
data_params.Sigma_true_sqrtm = Sigma_train_sqrt;
data_params.trueV = groundTruth;  % Ground truth eigenvector computed on training data
data_params.eigengap = eigengap;
data_params.c = NaN;  % not set by this script
data_params.b = NaN;  % not set by this script

fprintf('Computed ground truth eigenvector from training data (eigengap = %.4f).\n', eigengap);

%% 3. Divide Training Data into 9 Sequential Chunks and Use Test Data as the 10th Chunk
% The training rows are cut into num_chunks_train contiguous, near-equal
% pieces; the full test set is appended as one extra, final chunk.
num_chunks_train = 9;
num_chunks = num_chunks_train + 1;  % training chunks plus the test chunk

% With fewer rows than chunks some chunks would be empty.
if n_train < num_chunks_train
    error('main_har_10chunks:tooFewSamples', ...
        'Need at least %d training samples to form %d chunks (got %d).', ...
        num_chunks_train, num_chunks_train, n_train);
end

% Boundaries run from 1 to n_train+1 so that chunk i covers
% chunk_boundaries(i) : chunk_boundaries(i+1)-1 with no gaps or overlaps.
chunk_boundaries = round(linspace(1, n_train+1, num_chunks_train+1));
chunks = cell(num_chunks,1);
for i = 1:num_chunks_train
    idx_start = chunk_boundaries(i);
    idx_end = chunk_boundaries(i+1) - 1;
    chunks{i} = trainData(idx_start:idx_end, :);
end
chunks{num_chunks} = testData;  % Use test data as the last chunk

%% 4. For Each Chunk, Induce Missing Data (10% dropout) and Estimate the Eigenvector
% For every chunk: zero out a random subset of entries, run the subsampling
% estimator, sign-align its eigenvector estimate with the ground truth, and
% record per-coordinate confidence-interval coverage.

% Probability with which each individual entry is dropped (10%)
drop_rate = 0.10;

alpha = 5;      % Learning rate parameter for the streaming PCA algorithm
verbose = 1;
z_val = 1.645;  % z-value for a 90% confidence interval

% Preallocate arrays to store per-chunk estimates:
est_eigen_chunks = zeros(num_chunks, d);
est_variance_chunks = zeros(num_chunks, d);
coverage_chunks = zeros(num_chunks, d);  % This will store binary indicators

% Subsampling estimator parameter that does not depend on the chunk
m1_local = 3;

for j = 1:num_chunks
    chunk_data = chunks{j};
    n_chunk = size(chunk_data, 1);
    
    % Induce missing data: every entry is zeroed independently with
    % probability drop_rate, so each row loses about 10% of its entries on
    % average (not an exact 10% per row).
    dropout_mask = rand(size(chunk_data)) < drop_rate;
    chunk_data_dropout = chunk_data;
    chunk_data_dropout(dropout_mask) = 0;
    
    % Chunk-dependent subsampling estimator parameters
    m2_local = floor(max(log(n_chunk), log(d)));
    B_local = floor(n_chunk / (m1_local * m2_local));
    
    % Run the subsampling estimator on the chunk with dropout
    result = get_var_estimates_subsampling(chunk_data_dropout, n_chunk, d, alpha, data_params, m1_local, m2_local, B_local, verbose);
    
    % Retrieve and align the estimated eigenvector. An eigenvector is only
    % defined up to sign, so flip it to point the same way as the ground truth.
    % sign() returns 0 for an exactly zero inner product, which would wipe out
    % the whole estimate; keep the vector unchanged in that case.
    est_vec = result.oja_vec;
    orientation = sign(est_vec' * groundTruth);
    if orientation == 0
        orientation = 1;
    end
    est_vec = est_vec * orientation;
    est_eigen_chunks(j, :) = est_vec;
    est_variance_chunks(j, :) = result.variance;
    
    % For each coordinate, compute a 90% confidence interval and check coverage.
    % result.variance is reshaped to a column so it lines up with est_vec and
    % groundTruth regardless of its stored orientation.
    half_width = z_val * sqrt(result.variance(:));
    ci_lower = est_vec - half_width;
    ci_upper = est_vec + half_width;
    coverage_chunks(j, :) = (groundTruth >= ci_lower) & (groundTruth <= ci_upper);
    
    fprintf('Processed chunk %d / %d\n', j, num_chunks);
end

%% 5. Print Coverage for Each Coordinate (Across the 10 Chunks)
% Per coordinate, the fraction of chunks whose interval contained the
% ground-truth value, reported as a percentage.
coverage_per_coord = mean(coverage_chunks, 1);  % one value per coordinate
fprintf('\nCoverage for each coordinate (HAR, Subsampling Estimator with 10%% dropout):\n');
% fprintf walks the 2-by-d table column by column, so every column supplies
% one (index, percentage) pair and produces one output line.
coverage_table = [1:d; coverage_per_coord * 100];
fprintf('  Coordinate %d: %.2f%%\n', coverage_table);

%% 6. Plot the Average Estimated Eigenvector with Shaded Standard Deviation and Ground Truth Overlay
% Draws the across-chunk mean of the aligned estimates, a shaded band of
% z_val standard deviations (across chunks) around it, and the ground truth.
avg_est_eigen = mean(est_eigen_chunks, 1);
std_est_eigen = std(est_eigen_chunks, 0, 1);

figure;
hold on;
coords = 1:d;
% Closed polygon for fill: lower edge left-to-right, then upper edge right-to-left.
band_half_width = z_val * std_est_eigen;
x_patch = [coords, fliplr(coords)];
y_patch = [avg_est_eigen - band_half_width, fliplr(avg_est_eigen + band_half_width)];
% The band spans z_val standard deviations, not one, so the legend says so.
band_label = sprintf('Avg ± %.3f Std', z_val);
fill(x_patch, y_patch, [0.8 0.8 1], 'EdgeColor', 'none', 'FaceAlpha', 0.5, 'DisplayName', band_label);
plot(coords, avg_est_eigen, 'b-', 'LineWidth', 2, 'DisplayName', 'Avg Estimated Eigenvector');
plot(coords, groundTruth, 'ro', 'MarkerSize', 6, 'LineWidth', 2, 'DisplayName', 'Ground Truth Eigenvector');
xlabel('Coordinate Index');
ylabel('Eigenvector Value');
title('HAR: Estimated Eigenvector Across 10 Chunks (90% CI) with 10% Dropout');
legend('Location', 'best');
grid on;
hold off;

fprintf('\nHAR streaming PCA experiments complete.\n');
