function [error_iter, ell_2, F_score] = mean_valid_msda(data, K, T, values, theta_true, eta)
%MEAN_VALID_MSDA Iteratively refine a sparse discriminant estimate, tuning lambda on a validation set.
%
%   [error_iter, ell_2, F_score] = mean_valid_msda(data, K, T, values, theta_true)
%   [error_iter, ell_2, F_score] = mean_valid_msda(..., eta)
%
%   Inputs
%     data        cell array with M+2 rows; column 1 holds feature matrices and
%                 column 2 the label vectors. Rows 1..M are the training
%                 machines, row M+1 is the validation set, row M+2 the test set.
%     K           number of classes; labels are compared against 1..K.
%     T           number of refinement iterations.
%     values      candidate regularisation parameters (lambda) to try at each
%                 iteration.
%     theta_true  reference coefficients, forwarded to avg_ell2.
%     eta         (optional) fourth argument forwarded to lasso_ISTA.
%                 NOTE(review): assumed to be the ISTA gradient step size; the
%                 default 1/norm(Sigma_1) is the usual 1/L choice for that
%                 interpretation -- confirm against lasso_ISTA.
%
%   Outputs ((T+1)-by-1 each; entry 1 is the initial estimate from
%   valid_initial, entry t+1 the estimate after iteration t)
%     error_iter  misclassification rate on the test set.
%     ell_2       value returned by avg_ell2(theta, theta_true).
%     F_score     value returned by F1_score(2*K, theta).

    M = size(data, 1) - 2;  % number of training machines
    if M < 1
        error('mean_valid_msda:notEnoughData', ...
              'data must contain at least one training row plus validation and test rows.');
    end
    if isempty(values)
        error('mean_valid_msda:noLambda', ...
              'values must contain at least one candidate lambda.');
    end

    valid_X = data{M+1, 1};
    valid_Y = data{M+1, 2};
    test_X  = data{M+2, 1};  % last row: test dataset
    test_Y  = data{M+2, 2};
    p = size(test_X, 2);

    % NOTE(review): the original code passed 2*K for the initial estimate but
    % a hard-coded 11 inside the iteration loop. A single value is used here so
    % that every entry of F_score is computed the same way; confirm which value
    % F1_score actually expects.
    n_support = 2 * K;

    % evaluation indices
    error_iter = zeros(T+1, 1);
    ell_2      = zeros(T+1, 1);
    F_score    = zeros(T+1, 1);

    % initial estimate and its metrics
    [theta_current, error_initial] = valid_initial(data, K, values);
    error_iter(1) = error_initial;
    ell_2(1)      = avg_ell2(theta_current, theta_true);
    F_score(1)    = F1_score(n_support, theta_current);

    % pool all training machines
    X_t = vertcat(data{1:M, 1});
    Y_t = vertcat(data{1:M, 2});

    % class means and class proportions over the pooled training data
    mu_hat = zeros(p, K);
    pi_hat = zeros(1, K);
    for k = 1:K
        X_class = X_t(Y_t == k, :);
        mu_hat(:, k) = mean(X_class, 1)';
        pi_hat(k)    = size(X_class, 1);
    end
    pi_hat = pi_hat / sum(pi_hat);

    % mean differences relative to class 1 (p-by-(K-1))
    delta_hat = mu_hat(:, 2:end) - mu_hat(:, 1);

    % within-class covariance: pooled over all machines, and on machine 1 only
    Sigma   = local_within_class_cov(X_t, Y_t, K);
    Sigma_1 = local_within_class_cov(data{1, 1}, data{1, 2}, K);

    if nargin < 6 || isempty(eta)
        % max(..., eps) guards against a zero covariance matrix
        eta = 1 / max(norm(Sigma_1), eps);
    end

    n_lambda = length(values);
    for t = 1:T
        % right-hand side of the local surrogate problem: corrects the local
        % covariance with the pooled one at the current estimate
        delta = (Sigma_1 - Sigma) * theta_current + delta_hat;

        % fit one candidate per lambda and score it on the validation set
        theta_CV = cell(n_lambda, 1);
        error_cv = zeros(1, n_lambda);
        for i = 1:n_lambda
            theta_CV{i} = lasso_ISTA(Sigma_1, delta, values(i), eta);
            error_cv(i) = local_error_rate(valid_X, valid_Y, theta_CV{i}, mu_hat, pi_hat, K);
        end

        % keep the first lambda attaining the smallest validation error
        [~, best] = min(error_cv);
        theta_current = theta_CV{best};

        % metrics of the t-th step
        error_iter(t + 1) = local_error_rate(test_X, test_Y, theta_current, mu_hat, pi_hat, K);
        ell_2(t + 1)      = avg_ell2(theta_current, theta_true);
        F_score(t + 1)    = F1_score(n_support, theta_current);
    end
end

function S = local_within_class_cov(X, Y, K)
%LOCAL_WITHIN_CLASS_COV Sum over classes of n_k * cov(X_k), divided by the total row count of X.
    p = size(X, 2);
    S = zeros(p, p);
    for k = 1:K
        X_k = X(Y == k, :);
        n_k = size(X_k, 1);
        % Classes with fewer than two rows carry no scatter. They must be
        % skipped: cov of a single row is a scalar (the row is treated as a
        % vector) and cov of an empty matrix is NaN, either of which would
        % corrupt the p-by-p sum.
        if n_k >= 2
            S = S + n_k * cov(X_k);
        end
    end
    S = S / size(X, 1);
end

function err = local_error_rate(X, Y, theta, mu_hat, pi_hat, K)
%LOCAL_ERROR_RATE Misclassification rate of the rule given by theta on (X, Y).
    % prepend a zero column for the reference class (class 1)
    theta_full = [zeros(size(theta, 1), 1) theta];
    scores = bayes_value(X, theta_full, mu_hat, pi_hat, K);
    % predicted class = column with the largest score in each row
    [~, predicted] = max(scores, [], 2);
    % Y(:) makes the comparison element-wise even if labels are stored as a row
    err = 1 - mean(predicted == Y(:));
end