import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif, f_classif
from skfeature.function.similarity_based import fisher_score, lap_score, reliefF, SPEC
from skfeature.function.statistical_based import gini_index, t_score
from skfeature.utility.construct_W import construct_W
import os
from joblib import Parallel, delayed
from XOR.ManiFeSt import ManiFeSt
import time

# Generate the synthetic "Hypercube" classification dataset.
# n_relevant_features is the single source of truth for the number of
# informative columns: it is passed to make_classification here and is also
# what the ground-truth feature set is built from further down the script.
n_relevant_features = 10
X, y = make_classification(
    n_samples=2000,
    n_features=200,
    n_informative=n_relevant_features,
    n_redundant=0,
    n_clusters_per_class=2,
    flip_y=0.01,     # fraction of labels assigned at random, i.e. label noise (library default 0.01)
    class_sep=1,     # class separation factor, left at the library default of 1
    scale=1.0,       # multiplicative feature scale, left at the library default of 1
    # Keep the generated column order so that the informative features are
    # the first n_relevant_features columns (indices 0..n_relevant_features-1).
    shuffle=False,
    random_state=42
)



# ================== experimental parameters ==================
# Candidate values for the `percentile` argument of the feature selector.
percentile_grid = [5, 10, 30, 50, 70, 90, 95]

# RBF-SVM hyper-parameter grid: powers of two whose exponents advance in
# steps of 3 (C: 2^-5 .. 2^13, gamma: 2^-15 .. 2^3), seven values each.
svm_param_grid = dict(
    C=[2**exponent for exponent in range(-5, 14, 3)],
    gamma=[2**exponent for exponent in range(-15, 4, 3)],
)

# Number of outer repetitions and number of folds in the inner cross validation.
n_outer_iter = 50
n_inner_folds = 10

# Ground-truth relevant features: column indices 0 .. n_relevant_features-1.
true_features = set(range(n_relevant_features))

# Seed candidates, one single-element list per seed name.
# NOTE(review): not referenced anywhere else in the code shown here.
seed_grid = dict(seed1=[2333], seed2=[56], seed3=[42])

# Output directory for the result files (created if missing).
# NOTE(review): this is an absolute path under the filesystem root - confirm
# that is intended rather than a path relative to the project.
os.makedirs('/Hypercube/baseline', exist_ok=True)



# ================== Parallel task encapsulation ==================
def process_iteration(seed_param=None, outer_iter=None):
    """Run one outer iteration of the feature-selection / SVM experiment.

    Steps:
      1. Hold out 500 samples of the module-level ``X``/``y`` as the final
         test set (stratified split).
      2. Draw 50 of the remaining samples and use them for ManiFeSt feature
         scoring.
      3. For each value in ``percentile_grid``, keep the 10 highest-scoring
         features and evaluate a default RBF SVM on them with stratified
         cross validation over the remaining (non-test) samples; keep the
         percentile with the best mean accuracy.
      4. Re-run ManiFeSt with that percentile, grid-search ``C``/``gamma`` of
         an RBF SVM on the selected features, and score the best estimator
         on the held-out test set.

    Relies on the module-level names ``X``, ``y``, ``percentile_grid``,
    ``svm_param_grid``, ``n_inner_folds`` and ``true_features``.

    Args:
        seed_param: Mapping with the keys ``'seed1'`` (seed for the global
            NumPy RNG, used for the 50-sample draw), ``'seed2'`` (base seed
            of the outer split; ``outer_iter`` is added to it) and
            ``'seed3'`` (seed for shuffling the cross-validation folds).
            Defaults to 42 for all three.
        outer_iter: Zero-based index of the outer iteration. ``None`` is
            treated as 0.

    Returns:
        dict with the keys ``'iteration'`` (``outer_iter + 1``),
        ``'correct_features'`` (number of selected features that are in
        ``true_features``), ``'test_accuracy'``, ``'selected_features'``,
        ``'best_percentile'``, ``'best_C'``, ``'best_gamma'`` and
        ``'test_data'`` (tuple of the test matrix restricted to the selected
        features and the test labels).
    """
    # Build the default per call instead of sharing one mutable dict object
    # across all calls through the signature.
    if seed_param is None:
        seed_param = {'seed1': 42, 'seed2': 42, 'seed3': 42}
    # The seed arithmetic and the reported iteration number need an int.
    if outer_iter is None:
        outer_iter = 0

    n_fs_samples = 50   # samples handed to the feature selector
    n_selected = 10     # number of top-scoring features that are kept

    # Seeds the *global* NumPy RNG on purpose: besides the draw below, any
    # downstream code that uses np.random would inherit this state.
    np.random.seed(seed_param['seed1'])

    # outer division: retain 500 samples as the final test set
    X_temp, X_test_final, y_temp, y_test_final = train_test_split(
        X, y, test_size=500, random_state=seed_param['seed2'] + outer_iter, stratify=y
    )

    # draw the training samples used for feature selection (without replacement)
    sample_idx = np.random.choice(X_temp.shape[0], n_fs_samples, replace=False)
    X_train_sub = X_temp[sample_idx]
    y_train_sub = y_temp[sample_idx]

    # The folds depend only on the data and the integer seed, so they are the
    # same for every percentile: compute them once instead of once per
    # percentile.
    inner_kf = StratifiedKFold(
        n_splits=n_inner_folds, shuffle=True, random_state=seed_param['seed3']
    )
    inner_splits = list(inner_kf.split(X_temp, y_temp))

    # ===== feature selection parameter tuning =====
    best_percentile = None
    best_fs_score = -np.inf

    for percentile in percentile_grid:
        # feature selection: ManiFeSt (only its first return value is used)
        scores, _, _ = ManiFeSt(
            X_train_sub, y_train_sub,
            kernel_scale_factor=1, use_spsd=True, percentile=percentile
        )
        # indices of the n_selected largest scores
        selected = np.argsort(scores)[-n_selected:]

        # inner cross validation with a default-parameter RBF SVM
        inner_scores = []
        for inner_train_idx, inner_val_idx in inner_splits:
            X_inner_train, X_inner_val = X_temp[inner_train_idx], X_temp[inner_val_idx]
            y_inner_train, y_inner_val = y_temp[inner_train_idx], y_temp[inner_val_idx]

            svm = SVC(kernel='rbf')
            svm.fit(X_inner_train[:, selected], y_inner_train)
            inner_scores.append(svm.score(X_inner_val[:, selected], y_inner_val))

        mean_score = np.mean(inner_scores)
        print(f'iter_{outer_iter}, percentile={percentile}: mean_score={mean_score}')
        # strict '>' keeps the first percentile in grid order on ties
        if mean_score > best_fs_score:
            best_fs_score = mean_score
            best_percentile = percentile

    # ===== SVM tuning after fixing FS parameters =====
    scores, _, _ = ManiFeSt(
        X_train_sub, y_train_sub,
        kernel_scale_factor=1, use_spsd=True, percentile=best_percentile
    )
    selected_features = np.argsort(scores)[-n_selected:]

    # SVM parameter tuning; the fold shuffling uses the same configurable
    # seed as the inner cross validation above rather than a fixed constant.
    grid_search = GridSearchCV(
        SVC(kernel='rbf'),
        param_grid=svm_param_grid,
        cv=StratifiedKFold(
            n_splits=n_inner_folds, shuffle=True, random_state=seed_param['seed3']
        ),
        scoring='accuracy',
        n_jobs=8
    )
    grid_search.fit(X_temp[:, selected_features], y_temp)

    test_acc = grid_search.best_estimator_.score(X_test_final[:, selected_features], y_test_final)
    # how many of the selected features are ground-truth relevant ones
    correct = len(set(selected_features) & true_features)
    print(f'iter_{outer_iter}: test_acc={test_acc}, correct={correct}')

    return {
        'iteration': outer_iter + 1,
        'correct_features': correct,
        'test_accuracy': test_acc,
        'selected_features': selected_features,
        'best_percentile': best_percentile,
        'best_C': grid_search.best_params_['C'],
        'best_gamma': grid_search.best_params_['gamma'],
        'test_data': (X_test_final[:, selected_features], y_test_final)
    }


# ================== Parallel execution ==================
# Each outer iteration is an independent task; they are distributed over
# 8 joblib workers and all share the same seed configuration.
start_time = time.time()
seed_param = dict(seed1=2333, seed2=56, seed3=42)
run_iteration = delayed(process_iteration)
results = Parallel(n_jobs=8, verbose=10)(
    run_iteration(seed_param, i) for i in range(n_outer_iter)
)
print(f"Total time taken: {time.time() - start_time:.2f} seconds")

# ================== Post-processing ==================
# One row per outer iteration, columns taken from the result dictionaries.
# NOTE(review): the 'selected_features' and 'test_data' entries hold NumPy
# arrays, so the CSV stores their string representation - confirm that this
# is what downstream consumers expect.
df = pd.DataFrame(results)
df.to_csv('/Hypercube/baseline/final_results_baseline.csv', index=False)

# Averages over all outer iterations.
avg_test_accuracy, avg_correct = (
    df[column].mean() for column in ('test_accuracy', 'correct_features')
)
print(f'avg_test_accuracy: {avg_test_accuracy}')
print(f'avg_correct: {avg_correct}')

