import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif, f_classif
from skfeature.function.similarity_based import fisher_score, lap_score, reliefF, SPEC
from skfeature.function.statistical_based import gini_index, t_score
from skfeature.utility.construct_W import construct_W
import os
from joblib import Parallel, delayed

# Generate the synthetic "Hypercube" benchmark: 2000 samples, 200 features,
# of which only the first `n_relevant_features` columns carry class signal.
n_relevant_features = 10
X, y = make_classification(
    n_samples=2000,
    n_features=200,
    # Tied to n_relevant_features (instead of a second literal 10) so the
    # ground-truth index set used for evaluation cannot drift from the data.
    n_informative=n_relevant_features,
    n_redundant=0,   # no linear combinations of the informative features
    n_clusters_per_class=2,
    flip_y=0.01,     # fraction of labels assigned at random (label noise); library default
    class_sep=1,     # class separation factor; larger spreads the clusters apart (library default)
    scale=1.0,       # multiplicative factor applied to the features (library default)
    # Keep the generator's column order: with shuffle=False the informative
    # features come first, so columns 0..n_relevant_features-1 are the
    # relevant ones and every remaining column is noise.
    shuffle=False,
    random_state=42
)



# ================== experimental parameters ==================
# Candidate neighbourhood sizes (not referenced in the visible part of this script).
k_neighbors_grid = [1, 3, 5, 10, 15, 20, 30, 50, 100]

# RBF-SVM search grid: powers of two whose exponents advance in steps of 3.
svm_param_grid = dict(
    C=[2**exponent for exponent in range(-5, 14, 3)],        # 2^-5 ... 2^13
    gamma=[2**exponent for exponent in range(-15, 4, 3)],    # 2^-15 ... 2^3
)

# Number of outer repetitions, and folds of the inner cross-validation.
n_outer_iter, n_inner_folds = 50, 10

# Ground-truth indices of the relevant features: 0 .. n_relevant_features - 1.
true_features = {*range(n_relevant_features)}

# Make sure the output directory exists before any result is written.
os.makedirs('/Hypercube/baseline', exist_ok=True)



# ================== Parallel task encapsulation ==================
def process_iteration(outer_iter: int) -> dict:
    """Run one outer repetition of the baseline feature-selection experiment.

    Steps:
      1. Hold out 500 stratified samples of the module-level ``X``/``y`` as
         the final test set (split seeded with ``outer_iter``).
      2. Draw 50 distinct rows from the remaining samples and score every
         feature on that small subset only.
      3. Keep the top-scoring features (as many as there are ground-truth
         relevant features).
      4. Tune an RBF-SVM with a stratified inner cross-validation on *all*
         remaining (non-test) samples, restricted to the selected columns.
      5. Score the best estimator on the held-out test set.

    Args:
        outer_iter: Zero-based repetition index; also used as the random seed
            for the outer split and for the 50-sample draw.

    Returns:
        A dict with keys ``iteration`` (1-based, ``outer_iter + 1``),
        ``correct_features`` (number of selected indices that are in
        ``true_features``), ``test_accuracy``, ``selected_features`` (index
        array, ascending by score), ``best_C``, ``best_gamma`` and
        ``test_data`` (tuple of the held-out features restricted to the
        selected columns, and the held-out labels).
    """
    # A private generator instead of np.random.seed(): it produces exactly the
    # same draws as the seeded global stream, but does not overwrite the
    # process-wide NumPy RNG state of the worker.
    rng = np.random.RandomState(outer_iter)

    # outer division: retain 500 samples as the final test set
    X_temp, X_test_final, y_temp, y_test_final = train_test_split(
        X, y, test_size=500, random_state=outer_iter, stratify=y
    )

    # draw 50 distinct training samples (without replacement); feature
    # scoring only ever sees this small subset
    sample_idx = rng.choice(X_temp.shape[0], 50, replace=False)
    X_train_sub = X_temp[sample_idx]
    y_train_sub = y_temp[sample_idx]

    # feature selection: ANOVA (higher score = more relevant)
    scores, _ = f_classif(X_train_sub, y_train_sub)

    # --- alternative scorers: swap in for the ANOVA line above ---
    # feature selection: Pearson
    # df_X = pd.DataFrame(X_train_sub)
    # df_y = pd.Series(y_train_sub)
    # scores = df_X.apply(lambda x: x.corr(df_y), axis=0)
    # scores = scores.abs().to_numpy()

    # feature selection: Gini-index (negated so that larger is better)
    # scores = gini_index.gini_index(X_train_sub, y_train_sub)
    # scores = -scores

    # feature selection: t-test
    # scores = t_score.t_score(X_train_sub, y_train_sub)

    # feature selection: Fisher
    # scores = fisher_score.fisher_score(X_train_sub, y_train_sub)

    # feature selection: Laplacian (negated so that larger is better)
    # W = construct_W(X_train_sub, y=y_train_sub, metric='euclidean', weight_mode='heat_kernel', t=3)
    # scores = lap_score.lap_score(X_train_sub, W=W)
    # scores = -scores

    # np.argsort places NaN last, so an undefined score (e.g. a feature that
    # is constant within the 50-sample subset) would otherwise be picked as
    # one of the "best" features. Rank NaN as the worst possible score.
    scores = np.asarray(scores, dtype=float)
    ranking = np.where(np.isnan(scores), -np.inf, scores)

    # Select as many features as there are ground-truth relevant ones
    # (previously a hard-coded 10 that had to be kept in sync by hand).
    n_select = len(true_features)
    selected_features = np.argsort(ranking)[-n_select:]

    # SVM parameter tuning
    # NOTE(review): this function is itself dispatched through an outer
    # Parallel(n_jobs=8); the nested n_jobs=8 here may oversubscribe the CPU.
    # Consider n_jobs=1 for the inner search if that is observed.
    grid_search = GridSearchCV(
        SVC(kernel='rbf'),
        param_grid=svm_param_grid,
        cv=StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=42),
        scoring='accuracy',
        n_jobs=8
    )
    grid_search.fit(X_temp[:, selected_features], y_temp)

    test_acc = grid_search.best_estimator_.score(X_test_final[:, selected_features], y_test_final)
    correct = len(set(selected_features) & true_features)
    # The log line uses the 0-based index; the returned 'iteration' is 1-based.
    print(f'iter_{outer_iter}: test_acc={test_acc}, correct={correct}')

    return {
        'iteration': outer_iter + 1,
        'correct_features': correct,
        'test_accuracy': test_acc,
        'selected_features': selected_features,
        'best_C': grid_search.best_params_['C'],
        'best_gamma': grid_search.best_params_['gamma'],
        'test_data': (X_test_final[:, selected_features], y_test_final)
    }




# ================== Parallel Execution ==================
# Run the outer repetitions in 8 worker processes; each call returns one
# result dict (see process_iteration).
results = Parallel(n_jobs=8, verbose=10)(
    delayed(process_iteration)(i) for i in range(n_outer_iter)
)

# ================== Post-processing ==================
# `df` keeps every returned field (including the array-valued ones) in memory.
df = pd.DataFrame(results)

# A CSV cell can only hold text. Writing the raw frame would stringify the
# 'test_data' arrays, which NumPy abbreviates with "..." for large arrays and
# spreads over several lines, so the column was both lossy and unparseable.
# Write a CSV-safe view instead: drop 'test_data' and store the selected
# feature indices as plain Python int lists (e.g. "[3, 7, 12]").
csv_df = df.drop(columns=['test_data'])
csv_df['selected_features'] = df['selected_features'].map(
    lambda feature_idx: np.asarray(feature_idx).tolist()
)
csv_df.to_csv('/Hypercube/baseline/final_results_baseline.csv', index=False)

# Summary over all outer repetitions.
avg_test_accuracy = df['test_accuracy'].mean()
avg_correct = df['correct_features'].mean()
print(f'avg_test_accuracy: {avg_test_accuracy}')
print(f'avg_correct: {avg_correct}')



