function [error_iter, error_iter_1] = MNIST_run(data, T)
% MNIST_RUN  Iteratively refine a sparse discriminant estimate and track test error.
%
%   [error_iter, error_iter_1] = MNIST_run(data, T)
%
%   Inputs
%     data : cell array with M+1 rows. For rows 1..M, column 1 is a feature
%            matrix (one sample per row) and column 3 is the label vector
%            used for all per-machine statistics. Row 1, column 2 is the
%            label vector used to build the reference covariance Sigma_1.
%            Row M+1 holds the test features (column 1) and test labels
%            (column 2). Labels are compared against 1..K with K = 10.
%     T    : number of refinement iterations.
%
%   Outputs
%     error_iter   : (T+1)-by-1 test error of the update that uses the
%                    pooled covariance and pooled class means.
%     error_iter_1 : (T+1)-by-1 test error of the update that uses
%                    coordinate-wise medians of the per-machine quantities.
%     Entry 1 of both vectors is the error returned by initial().
%
%   Requires CVX and the project functions initial, mean_iter_CV and
%   median_iter_CV.
%
%   NOTE(review): Sigma_1 is built from data{1,2} while initial() and the
%   per-machine statistics use data{1,3}; confirm this is intentional.
%   NOTE(review): theta returned by initial() is assumed to be p-by-(K-1)
%   and the lambdas returned by the CV helpers are assumed scalar - confirm.

    M = size(data, 1) - 1;
    test_X = data{M+1, 1};
    test_Y = data{M+1, 2};
    K = 10; p = size(test_X, 2);

    % Initial estimate, shared by both iteration schemes.
    [theta_initial, error_initial] = initial(data{1, 1}, data{1, 3}, test_X, test_Y);
    error_iter = zeros(T+1, 1);
    error_iter_1 = zeros(T+1, 1);
    error_iter(1) = error_initial;
    error_iter_1(1) = error_initial;
    theta_initial_0 = theta_initial;

    % ---- Pooled statistics: Sigma, class means and class proportions ----
    X_t = vertcat(data{1:M, 1});
    Y_t = vertcat(data{1:M, 3});
    mu_hat = zeros(p, K);
    pi_hat = zeros(1, K);   % row vector so it broadcasts over test samples
    for k = 1:K
        in_class = (Y_t == k);
        mu_hat(:, k) = mean(X_t(in_class, :), 1)';
        pi_hat(k) = nnz(in_class);
    end
    Sigma = mnist_pooled_cov(X_t, Y_t, K);
    pi_hat = pi_hat / sum(pi_hat);
    % Mean differences relative to class 1; the class-1 column is dropped.
    delta_hat = mu_hat(:, 2:K) - mu_hat(:, 1);

    % ---- Reference covariance from the first machine ----
    Sigma_1 = mnist_pooled_cov(data{1, 1}, data{1, 2}, K);

    % ---- Iteration with pooled (mean-aggregated) statistics ----
    for t = 1:T
        delta = (Sigma_1 - Sigma) * theta_initial + delta_hat;
        lambda = mean_iter_CV(data, theta_initial);
        theta_initial = mnist_solve_l1_qp(Sigma_1, delta, lambda);
        error_iter(t+1) = mnist_test_error(theta_initial, log(pi_hat), test_X, test_Y);
    end

    % ---- Median aggregation of per-machine means and proportions ----
    mu_store = zeros(p, K, M);
    pi_store = zeros(M, K);
    for m = 1:M
        X = data{m, 1};
        Y = data{m, 3};
        for k = 1:K
            in_class = (Y == k);
            mu_store(:, k, m) = mean(X(in_class, :), 1)';
            pi_store(m, k) = nnz(in_class);
        end
        pi_store(m, :) = pi_store(m, :) / sum(pi_store(m, :));
    end
    % A machine with no samples of class k yields a NaN local mean; ignore
    % it rather than letting it turn the median into NaN.
    mu_tilde = median(mu_store, 3, 'omitnan');
    pi_tilde = median(pi_store, 1);
    delta_tilde = mu_tilde(:, 2:K) - mu_tilde(:, 1);

    % ---- Iteration with median-aggregated statistics ----
    theta_initial_1 = theta_initial_0;
    for t = 1:T
        % One slice per machine so a multi-column theta is supported.
        % The local covariances do not depend on t; they are recomputed
        % each iteration to avoid holding M p-by-p matrices in memory.
        grad_store = zeros(p, size(theta_initial_1, 2), M);
        for m = 1:M
            Sigma_local = mnist_pooled_cov(data{m, 1}, data{m, 3}, K);
            grad_store(:, :, m) = Sigma_local * theta_initial_1;
        end
        grad_tilde = median(grad_store, 3);
        delta = Sigma_1 * theta_initial_1 - grad_tilde + delta_tilde;
        lambda_1 = median_iter_CV(data, theta_initial_1);
        theta_initial_1 = mnist_solve_l1_qp(Sigma_1, delta, lambda_1);
        error_iter_1(t+1) = mnist_test_error(theta_initial_1, log(pi_tilde), test_X, test_Y);
    end
end

function S = mnist_pooled_cov(X, Y, K)
% Size-weighted sum of per-class sample covariances, divided by size(X,1).
    p = size(X, 2);
    S = zeros(p, p);
    for k = 1:K
        X_class = X(Y == k, :);
        n_k = size(X_class, 1);
        % cov() of a single row treats it as a vector and returns a scalar,
        % and cov() of an empty matrix returns NaN; such classes carry no
        % within-class scatter, so they are skipped.
        if n_k >= 2
            S = S + n_k * cov(X_class);
        end
    end
    S = S / size(X, 1);
end

function theta = mnist_solve_l1_qp(Sigma_ref, delta, lambda)
% Solve min_x 0.5*x'*Sigma_ref*x - d'*x + lambda*||x||_1 for each column d of delta.
%
% The objective is separable across columns, so solving column by column
% is equivalent to a joint solve with an element-wise L1 penalty.
    [p, n_col] = size(delta);
    theta = zeros(p, n_col);
    for j = 1:n_col
        d_j = delta(:, j);
        cvx_begin
            variable x(p)
            minimize(0.5 .* x.' * Sigma_ref * x - d_j' * x + lambda * norm(x, 1));
        cvx_end
        theta(:, j) = x;
    end
end

function err = mnist_test_error(theta, log_prior, test_X, test_Y)
% Misclassification rate of the rule argmax_k (x*theta_k + log_prior_k),
% with the class-1 direction fixed at zero.
    scores = test_X * [zeros(size(theta, 1), 1) theta] + log_prior;
    [~, predicted] = max(scores, [], 2);
    err = 1 - mean(predicted == test_Y(:));
end