function [theta_initial, error_iter] = initial(X, Y, K, test_X, test_Y, values)
%INITIAL Pick a penalty by repeated hold-out validation and fit an initial theta.
%
%   [theta_initial, error_iter] = initial(X, Y, K, test_X, test_Y, values)
%
%   Inputs
%     X       n-by-p training matrix (rows are observations).
%     Y       n training labels, compared against 1..K. Row or column vector.
%     K       number of classes.
%     test_X  test matrix with p columns.
%     test_Y  test labels, compared against 1..K. Row or column vector.
%     values  candidate penalty values passed to ISTA as its third argument.
%
%   Outputs
%     theta_initial  the matrix returned by ISTA for the selected penalty on
%                    the full training data (presumably p-by-(K-1); the shape
%                    is determined by ISTA, which is not defined here).
%     error_iter     misclassification rate on (test_X, test_Y).
%
%   For every candidate penalty, five random hold-out splits are drawn
%   (round(n/5) validation rows each, sampled independently per split, so the
%   validation sets may overlap). The penalty with the highest mean validation
%   accuracy is kept; ties go to the earliest entry of VALUES.
%
%   Class 1 is the reference class: its score column is fixed at zero.

    if isempty(values)
        error('initial:emptyValues', ...
              'values must contain at least one candidate penalty.');
    end

    % Force column labels so that comparisons against predicted column
    % indices stay element-wise instead of expanding to a matrix.
    Y = Y(:);
    test_Y = test_Y(:);

    n = size(X, 1);
    p = size(X, 2);
    n_splits = 5;

    lambda_accuracy = zeros(length(values), 1);
    for i = 1:length(values)
        lambda = values(i);
        cv_accuracy = zeros(n_splits, 1);
        for j = 1:n_splits
            % Random validation rows; every remaining row is used for training.
            rowrank = randperm(n, round(n/5));
            held_out = false(n, 1);
            held_out(rowrank) = true;
            valid_X = X(rowrank, :);
            valid_Y = Y(rowrank);
            train_X1 = X(~held_out, :);
            train_Y1 = Y(~held_out);

            [Sigma_1, delta_train, pi_k] = initial_class_stats(train_X1, train_Y1, K);

            % Fourth ISTA argument is 0.05 during selection and 0.01 for the
            % final fit; its meaning is defined by ISTA.
            x = ISTA(Sigma_1, delta_train, lambda, 0.05);

            % Reference class gets a zero coefficient column.
            theta = [zeros(p, 1) x];
            pred_value = valid_X * theta + log(pi_k);
            [~, index] = max(pred_value, [], 2);
            cv_accuracy(j) = mean(index == valid_Y);
        end
        lambda_accuracy(i) = mean(cv_accuracy);
    end

    % max returns the first maximiser, i.e. the earliest best candidate.
    [~, best] = max(lambda_accuracy);
    lambda_choose = values(best);

    % Refit on the full training set. The pooled covariance is normalised by
    % the full sample size, not by the size of the last hold-out training set.
    [Sigma_1, delta_train, pi_k] = initial_class_stats(X, Y, K);

    x = ISTA(Sigma_1, delta_train, lambda_choose, 0.01);
    theta_choose = [zeros(p, 1) x];
    tpred_value = test_X * theta_choose + log(pi_k);
    [~, index] = max(tpred_value, [], 2);
    error_iter = 1 - mean(index == test_Y);

    % Return the coefficients without the zero reference column.
    theta_initial = theta_choose;
    theta_initial(:, 1) = [];
end

function [Sigma, delta, pi_k] = initial_class_stats(X, Y, K)
%INITIAL_CLASS_STATS Pooled covariance, mean contrasts and class proportions.
%   Sigma  p-by-p: sum over classes of n_k * cov(X_k), divided by size(X,1).
%   delta  p-by-(K-1): class means minus the class-1 mean, class 1 dropped.
%   pi_k   1-by-K: fraction of rows of X belonging to each class.
    n = size(X, 1);
    p = size(X, 2);
    mu = zeros(K, p);
    scatter = zeros(p, p);
    pi_k = zeros(1, K);
    for k = 1:K
        X_class = X(Y == k, :);
        n_k = size(X_class, 1);
        mu(k, :) = mean(X_class, 1);
        % cov() of a single row treats it as one variable and returns a
        % scalar; a class with fewer than two rows adds no scatter.
        if n_k > 1
            scatter = scatter + n_k * cov(X_class);
        end
        pi_k(k) = n_k / n;
    end
    Sigma = scatter / n;
    delta = (mu - mu(1, :))';
    delta(:, 1) = [];
end