function [theta_initial, error_initial,lambda_initial] = valid_initial(data, K, values)
%VALID_INITIAL Fit a group-sparse discriminant estimate on the first machine
%and pick the penalty level by validation error.
%
%   [theta_initial, error_initial, lambda_initial] = valid_initial(data, K, values)
%
%   Inputs
%     data   - cell array with M+2 rows, where M = size(data,1) - 2.
%              data{1,1} / data{1,3} : features / labels used for fitting.
%              data{M+1,1} / data{M+1,2} : test features / labels.
%              data{M+2,1} / data{M+2,2} : validation features / labels.
%              Labels are compared against the class indices 1..K.
%     K      - number of classes.
%     values - candidate penalty levels (lambda grid); must be non-empty.
%
%   Outputs
%     theta_initial  - p-by-(K-1) coefficient matrix of the selected lambda.
%     error_initial  - misclassification rate of that model on the test set.
%     lambda_initial - selected lambda (scalar). When several candidates
%                      reach the same minimal validation error, the first
%                      one in VALUES is chosen.

    if isempty(values)
        error('valid_initial:emptyGrid', ...
              'VALUES must contain at least one candidate lambda.');
    end

    X = data{1,1};  % machine 1
    Y = data{1,3};
    M = size(data, 1) - 2; % number of machines of training set
    test_X = data{M+1, 1};
    test_Y = data{M+1, 2};
    valid_X = data{M+2, 1};
    valid_Y = data{M+2, 2};
    p = size(X, 2);

    % Class means, class proportions and the size-weighted within-class
    % covariance computed from the first machine only.
    % NOTE(review): a class with zero or one sample makes cov() return
    % NaN / a scalar here; presumably every class has >= 2 samples - confirm.
    mu_hat_1 = zeros(p, K);
    pi_hat_1 = zeros(1, K);
    Sigma = zeros(p, p);
    for k = 1:K
        X_class = X(Y == k, :);
        n_k = size(X_class, 1);
        mu_hat_1(:,k) = mean(X_class, 1)';
        pi_hat_1(k) = n_k;
        Sigma = Sigma + n_k * cov(X_class);
    end
    Sigma = Sigma/size(X,1);
    pi_hat_1 = pi_hat_1/sum(pi_hat_1);
    log_prior = log(pi_hat_1);

    % Mean differences relative to class 1; the class-1 column is dropped.
    delta_hat_1 = mu_hat_1 - mu_hat_1(:,1);
    delta_hat_1(:,1) = [];

    n_grid = numel(values);
    theta_CV = cell(1, n_grid);
    error_cv = zeros(1, n_grid);

    for i = 1:n_grid
        lambda = values(i);

        % Blockwise (row-wise) coordinate descent for theta: at most 100
        % sweeps, stopping once the largest entry change is below 1e-5.
        theta_update = zeros(p, K-1);
        iter = 0;
        while iter < 100
            dif = 0;
            for j = 1 : p
                % NOTE(review): only the delta term is divided by
                % Sigma(j,j); kept exactly as in the original - verify
                % against the intended update rule.
                theta_bar = delta_hat_1(j,:)/Sigma(j,j) - Sigma(j,:) * theta_update + Sigma(j,j) .* theta_update(j,:);
                theta_tmp = theta_update(j,:);

                % Group soft-thresholding of the whole row.
                bar_norm = norm(theta_bar, 2);
                v = bar_norm - lambda/Sigma(j,j);
                if v>0
                    theta_update(j,:) = theta_bar .* (v/bar_norm);
                else
                    theta_update(j,:) = zeros(1,K-1);
                end

                d = theta_update(j,:) - theta_tmp;
                dif = max(dif, max(abs(d)));
            end
            iter = iter + 1;
            if dif < 1e-5
                break
            end
        end

        % Validation error for this lambda.
        theta_CV{i} = theta_update;
        error_cv(i) = valid_initial_error_rate(valid_X, valid_Y, theta_update, log_prior);
    end

    % min() returns the index of the FIRST minimiser, so ties between
    % candidate lambdas yield one consistent (lambda, theta) pair instead
    % of a vector of lambdas and a comma-separated list of thetas.
    [~, best] = min(error_cv);
    lambda_initial = values(best);
    theta_initial = theta_CV{best};

    % Test error of the selected model.
    error_initial = valid_initial_error_rate(test_X, test_Y, theta_initial, log_prior);
end

function err = valid_initial_error_rate(X_eval, Y_eval, theta, log_prior)
% Misclassification rate of the rule argmax_k( X_eval*[0 theta] + log_prior ).
    scores = X_eval * [zeros(size(theta, 1), 1) theta] + log_prior;
    [~, predicted] = max(scores, [], 2);   % one predicted class per row
    % Y_eval(:) makes the comparison independent of label orientation.
    err = 1 - mean(predicted == Y_eval(:));
end
            
           
    
    