function error_iter = mean_valid_msda(data, K, T, values)
% MEAN_VALID_MSDA  Iteratively refine a discriminant direction estimate and
% report the test misclassification rate after every refinement step.
%
%   error_iter = mean_valid_msda(data, K, T, values)
%
%   Inputs
%     data   - cell array with M+2 rows. Rows 1..M hold the training sets,
%              row M+1 the validation set and row M+2 the test set.
%              Column 1 is the feature matrix (samples in rows), column 2
%              the label vector. Labels are compared against 1..K.
%     K      - number of classes.
%     T      - number of refinement iterations.
%     values - (optional) vector of candidate lambda values tried at each
%              step; defaults to linspace(0.5, 2, 10) when omitted or empty.
%
%   Output
%     error_iter - 1-by-(T+1) row vector. error_iter(1) is the error returned
%                  by valid_initial; error_iter(t+1) is the test error after
%                  refinement step t.
%
%   At every step each candidate lambda is passed to ISTA, the candidate with
%   the lowest validation error is kept (first one on ties), and the test
%   error of that candidate is recorded.
%
%   NOTE(review): valid_initial and ISTA are defined outside this file; the
%   meaning of ISTA's fourth argument (0.01) is not visible here - presumably
%   a step size or tolerance, confirm against its definition.

    % Fall back to the default grid only when the caller did not supply one.
    if nargin < 4 || isempty(values)
        values = linspace(0.5, 2, 10);
    end

    M = size(data, 1) - 2;          % number of training machines
    test_X  = data{M+2, 1};         % last row: test set
    test_Y  = data{M+2, 2};
    valid_X = data{M+1, 1};         % second to last row: validation set
    valid_Y = data{M+1, 2};
    p = size(test_X, 2);

    % Pool all training machines; features and labels both come from the
    % same rows (columns 1 and 2 respectively) so they stay aligned.
    X_t = vertcat(data{1:M, 1});
    Y_t = vertcat(data{1:M, 2});

    % Global class means, class counts and pooled within-class covariance.
    [Sigma, mu_hat, class_count] = pooled_class_stats(X_t, Y_t, K);
    pi_hat = class_count / sum(class_count);        % 1-by-K class priors
    log_prior = log(pi_hat);

    % Mean differences of classes 2..K relative to class 1 (p-by-(K-1)).
    delta_hat = mu_hat(:, 2:K) - mu_hat(:, 1);

    % Pooled within-class covariance of the first machine only.
    Sigma_1 = pooled_class_stats(data{1, 1}, data{1, 2}, K);

    % Initial estimate and its error.
    [theta_current, error_initial] = valid_initial(data, K, values);
    error_iter = zeros(1, T + 1);
    error_iter(1) = error_initial;

    n_lambda = length(values);
    theta_CV = cell(1, n_lambda);
    error_cv = zeros(1, n_lambda);

    for t = 1:T
        % Right-hand side for this step, built from the previous estimate.
        delta = (Sigma_1 - Sigma) * theta_current + delta_hat;

        % Try every candidate lambda and score it on the validation set.
        for i = 1:n_lambda
            theta_CV{i} = ISTA(Sigma_1, delta, values(i), 0.01);
            error_cv(i) = misclass_rate(valid_X, valid_Y, theta_CV{i}, log_prior);
        end

        % min returns the first minimiser, so ties resolve to the smallest
        % index and exactly one candidate is selected.
        [~, best] = min(error_cv);
        theta_current = theta_CV{best};

        % Test error of the selected estimate.
        error_iter(t + 1) = misclass_rate(test_X, test_Y, theta_current, log_prior);
    end
end

function [Sigma, mu, counts] = pooled_class_stats(X, Y, K)
% Per-class means (p-by-K), per-class sample counts (1-by-K) and the
% count-weighted average of the per-class sample covariances (p-by-p).
    p = size(X, 2);
    mu = zeros(p, K);
    counts = zeros(1, K);
    scatter = zeros(p, p);
    for k = 1:K
        X_class = X(Y == k, :);
        n_k = size(X_class, 1);
        counts(k) = n_k;
        mu(:, k) = mean(X_class, 1)';
        % cov() treats a single row as a vector and returns a scalar, which
        % would corrupt the p-by-p accumulator; a class with fewer than two
        % samples contributes no within-class scatter.
        if n_k > 1
            scatter = scatter + n_k * cov(X_class);
        end
    end
    Sigma = scatter / size(X, 1);
end

function err = misclass_rate(X, Y, theta, log_prior)
% Fraction of rows of X whose highest-scoring class differs from Y.
% Class 1 is the reference class and gets an all-zero coefficient column.
    scores = X * [zeros(size(theta, 1), 1) theta] + log_prior;
    [~, predicted] = max(scores, [], 2);
    err = 1 - mean(predicted == Y);
end