import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif, f_classif
from skfeature.function.similarity_based import fisher_score, lap_score, reliefF, SPEC
from skfeature.function.statistical_based import gini_index, t_score
from skfeature.utility.construct_W import construct_W
import os
from joblib import Parallel, delayed

# generate Hypercube dataset
# Number of truly informative features. It is the single source of truth:
# the generator below and the ground-truth index set built later both use it.
n_relevant_features = 10
X, y = make_classification(
    n_samples=2000,
    n_features=200,
    n_informative=n_relevant_features,  # tied to the constant above so the two cannot drift apart
    n_redundant=0,
    n_clusters_per_class=2,
    flip_y=0.01,     # label noise ratio (0.01 is the library default)
    class_sep=1,     # class separation factor (1 is the library default; raise it for an easier task)
    scale=1.0,       # multiplicative feature scale (1 is the library default)
    shuffle=False,   # keep column order so the first n_relevant_features columns are the informative ones
    random_state=42
)



# ================== experimental parameters ==================
# Candidate neighbour counts tried when tuning the feature-selection step.
k_neighbors_grid = [1, 3, 5, 10, 15, 20, 30, 50, 100]

# RBF-SVM search grid: powers of two whose exponents advance in steps of 3
# (C: 2^-5 .. 2^13, gamma: 2^-15 .. 2^3).
svm_param_grid = {
    'C': [2**exponent for exponent in range(-5, 14, 3)],
    'gamma': [2**exponent for exponent in range(-15, 4, 3)],
}

# Number of outer repetitions and of folds in the inner cross-validation.
n_outer_iter, n_inner_folds = 50, 10

# Ground-truth indices of the relevant features (the leading columns of X).
true_features = {*range(n_relevant_features)}

# Make sure the output directory exists before any result is written.
os.makedirs('/Hypercube/baseline', exist_ok=True)



# ================== Parallel task encapsulation ==================
def _select_top_features(X_sub, y_sub, k, n_select):
    """Score every feature on ``(X_sub, y_sub)`` and return the top ``n_select``.

    This is the single place where the feature-selection scorer is chosen, so
    the parameter search and the final selection cannot end up using
    different scorers.

    Args:
        X_sub: sample-by-feature array used for scoring.
        y_sub: class labels aligned with the rows of ``X_sub``.
        k: neighbour count handed to the scorer.
        n_select: number of feature indices to return.

    Returns:
        Array with the ``n_select`` best column indices, in ascending score
        order.
    """
    # To benchmark ReliefF instead, swap the scorer here (and only here):
    #     scores = reliefF.reliefF(X_sub, y_sub, k=k)

    # feature selection: IG
    scores = mutual_info_classif(X_sub, y_sub, n_neighbors=k)
    return np.argsort(scores)[-n_select:]


def process_iteration(outer_iter):
    """Run one outer repetition: split, select features, tune the SVM, test.

    Steps:
      1. Hold out 500 samples as the final test set (stratified split seeded
         with ``outer_iter``).
      2. Draw 50 of the remaining samples and use them to score features.
      3. For each ``k`` in ``k_neighbors_grid``, keep the top-scoring features
         and rate them with a default RBF-SVM under stratified inner
         cross-validation; remember the best ``k`` and its feature subset.
      4. Grid-search the SVM hyper-parameters on that subset and evaluate the
         refitted best estimator on the held-out test set.

    Args:
        outer_iter: zero-based repetition index; also used as the random seed.

    Returns:
        dict with the keys ``iteration`` (1-based), ``correct_features``
        (how many selected indices are in ``true_features``),
        ``test_accuracy``, ``selected_features``, ``best_k``, ``best_C``,
        ``best_gamma`` and ``test_data`` (test matrix restricted to the
        selected columns, plus the test labels).

    Raises:
        ValueError: if no candidate ``k`` could be evaluated.
    """
    # Seeds NumPy's global RNG, which drives the subsample draw below.
    np.random.seed(outer_iter)

    # outer division: retain 500 samples as the final test set
    X_temp, X_test_final, y_temp, y_test_final = train_test_split(
        X, y, test_size=500, random_state=outer_iter, stratify=y
    )

    # draw 50 training samples (without replacement) for feature selection
    sample_idx = np.random.choice(X_temp.shape[0], 50, replace=False)
    X_train_sub = X_temp[sample_idx]
    y_train_sub = y_temp[sample_idx]

    # Select as many features as are truly relevant, so that
    # ``correct_features`` can reach its maximum.
    n_select = n_relevant_features

    # ===== feature selection parameter tuning =====
    best_k = None
    best_fs_score = -np.inf
    selected_features = None

    # The splitter has a fixed integer seed, so every k is rated on the same
    # folds; build it once instead of once per k.
    inner_kf = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=42)

    for k in k_neighbors_grid:
        selected = _select_top_features(X_train_sub, y_train_sub, k, n_select)

        # inner cross validation
        # NOTE(review): the folds are drawn from X_temp, which still contains
        # the 50 samples used for scoring -- confirm this is intended.
        inner_scores = []
        for inner_train_idx, inner_val_idx in inner_kf.split(X_temp, y_temp):
            X_inner_train, X_inner_val = X_temp[inner_train_idx], X_temp[inner_val_idx]
            y_inner_train, y_inner_val = y_temp[inner_train_idx], y_temp[inner_val_idx]

            svm = SVC(kernel='rbf')
            svm.fit(X_inner_train[:, selected], y_inner_train)
            inner_scores.append(svm.score(X_inner_val[:, selected], y_inner_val))

        mean_score = np.mean(inner_scores)
        print(f'iter_{outer_iter}, k={k}: mean_score={mean_score}')
        if mean_score > best_fs_score:
            best_fs_score = mean_score
            best_k = k
            # Keep the exact subset that earned this score. Re-scoring later
            # (previously done with a different scorer) could yield features
            # that were never validated.
            selected_features = selected

    if selected_features is None:
        raise ValueError(
            'no feature-selection parameter could be evaluated; '
            'check k_neighbors_grid'
        )

    # ===== SVM tuning after fixing FS parameters =====
    # NOTE(review): n_jobs=8 here runs inside an outer joblib worker; confirm
    # the resulting nesting is what you want on the target machine.
    grid_search = GridSearchCV(
        SVC(kernel='rbf'),
        param_grid=svm_param_grid,
        cv=StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=42),
        scoring='accuracy',
        n_jobs=8
    )
    grid_search.fit(X_temp[:, selected_features], y_temp)

    test_acc = grid_search.best_estimator_.score(X_test_final[:, selected_features], y_test_final)
    correct = len(set(selected_features) & true_features)
    print(f'iter_{outer_iter}: test_acc={test_acc}, correct={correct}')

    return {
        'iteration': outer_iter + 1,
        'correct_features': correct,
        'test_accuracy': test_acc,
        'selected_features': selected_features,
        'best_k': best_k,
        'best_C': grid_search.best_params_['C'],
        'best_gamma': grid_search.best_params_['gamma'],
        'test_data': (X_test_final[:, selected_features], y_test_final)
    }


# ================== Parallel execution ==================
# One delayed call per outer repetition, spread over 8 joblib workers;
# results come back in submission order.
results = Parallel(n_jobs=8, verbose=10)(
    map(delayed(process_iteration), range(n_outer_iter))
)

# ================== Post-processing ==================
# One row per repetition, persisted before the summary is printed.
df = pd.DataFrame(results)
df.to_csv('/Hypercube/baseline/final_results_baseline.csv', index=False)

# Averages over all repetitions.
avg_test_accuracy, avg_correct = (
    df[column].mean() for column in ('test_accuracy', 'correct_features')
)
print(f'avg_test_accuracy: {avg_test_accuracy}')
print(f'avg_correct: {avg_correct}')

