# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from itertools import combinations
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # to split the dataset for training and testing
# Import StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
# Import RandomForestClassifier for trying a different algorithm
from sklearn.ensemble import RandomForestClassifier

# Read the full data table from the Excel workbook
data_DC = pd.read_excel(r'./data/DC_CN_SMC_MCI_all_for_Python_Classifier.xlsx')
data_DC = pd.DataFrame(data_DC)

# Partition the rows by the value held in the last column (the class label):
# 1 -> data_DC_CN, 2 -> data_DC_SMC, 3 -> data_DC_MCI
data_DC_CN, data_DC_SMC, data_DC_MCI = (
    data_DC[data_DC.iloc[:, data_DC.shape[1] - 1] == label_value]
    for label_value in (1, 2, 3)
)

# Second DataFrame object wrapping the loaded table; used for modelling below
data_DC_new = pd.DataFrame(data_DC)

# Features are every column except the last; the last column is the target
x, y = data_DC_new.iloc[:, :-1], data_DC_new.iloc[:, -1]

# Hold out 20% of the rows for testing.
# NOTE: no random_state is passed, so the partition is not fixed between runs.
X_train_DC, X_test_DC, y_train_DC, y_test_DC = train_test_split(x, y, test_size=0.2)

# Standardise the features; the scaling statistics come from the training split only
scaler = StandardScaler().fit(X_train_DC)
X_train_DC_scaled = scaler.transform(X_train_DC)
X_test_DC_scaled = scaler.transform(X_test_DC)

# Random Forest classifier with a fixed seed, used as the base estimator
rf = RandomForestClassifier(random_state=1, n_estimators=100)

# Define a class for Sequential Forward Selection (SFS)
class SequentialForwardSelection():
    """Greedy forward feature selection scored by hold-out accuracy.

    Starting from an empty set, each round tries every feature that has not
    been selected yet, fits the estimator on the current selection plus that
    candidate, and keeps the candidate with the highest accuracy on the
    evaluation data. Ties go to the lowest column index (``np.argmax``).

    NOTE: the data passed as ``X_test``/``y_test`` drives the selection, so the
    recorded scores are optimistic if that same data is later reported as a
    test result.

    Attributes set by ``fit``:
        indices_: column indices of the final selection, in selection order.
        subsets_: the selection after each round (one list per round).
        scores_: best accuracy after each round, aligned with ``subsets_``.
        k_score_: accuracy of the final selection (``scores_[-1]``).
    """

    def __init__(self, estimator, k_features):
        """Store a clone of ``estimator`` and the target number of features.

        Args:
            estimator: estimator exposing ``fit`` and ``predict``; it is cloned,
                so the caller's object is never fitted by this class.
            k_features: number of features to select. Values of 1 or less
                still select a single feature.
        """
        self.estimator = clone(estimator)
        self.k_features = k_features

    def fit(self, X_train, X_test, y_train, y_test):
        """Run the greedy search until ``k_features`` columns are selected.

        Args:
            X_train: 2-D training features (DataFrame or array-like).
            X_test: 2-D evaluation features with the same columns as ``X_train``.
            y_train: training labels (Series or array-like).
            y_test: evaluation labels (Series or array-like).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X_train`` has no columns, or ``k_features``
                exceeds the number of columns.
        """
        # Convert once up front; works for DataFrames/Series and plain arrays
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        total_features_count = X_train.shape[1]
        # Fail early with a clear message instead of an "empty sequence"
        # error from np.argmax after many model fits.
        if total_features_count == 0:
            raise ValueError("X_train has no feature columns to select from")
        if self.k_features > total_features_count:
            raise ValueError(
                "k_features=%s exceeds the number of available features (%d)"
                % (self.k_features, total_features_count)
            )

        self.subsets_ = []
        self.scores_ = []
        self.indices_ = []

        # At least one round always runs, so k_features <= 1 selects one feature
        while True:
            # Every way of extending the current selection by one unused column
            candidates = [
                self.indices_ + [idx]
                for idx in range(total_features_count)
                if idx not in self.indices_
            ]
            scores = [
                self._calc_score(X_train, X_test, y_train, y_test, candidate)
                for candidate in candidates
            ]

            best_score_index = int(np.argmax(scores))
            self.scores_.append(scores[best_score_index])
            self.indices_ = candidates[best_score_index]
            self.subsets_.append(self.indices_)

            if len(self.indices_) >= self.k_features:
                break

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array.

        Args:
            X: 2-D features (DataFrame or array-like) with the same column
                layout as the data passed to ``fit``.
        """
        return np.asarray(X)[:, self.indices_]

    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        """Fit on the given columns and return accuracy on the evaluation data.

        Args:
            X_train, X_test: 2-D NumPy arrays.
            y_train, y_test: label arrays.
            indices: column indices to train and predict on.
        """
        self.estimator.fit(X_train[:, indices], np.ravel(y_train))
        y_pred = self.estimator.predict(X_test[:, indices])
        return accuracy_score(y_test, y_pred)

# Main execution block
if __name__ == "__main__":
    # Evaluate subset sizes 1..10, capped at the number of feature columns.
    # Counting starts at 1: asking the selector for 0 features still selects
    # one, which would duplicate the first point and misalign the x-axis.
    max_features = min(10, X_train_DC.shape[1])
    if max_features < 1:
        raise ValueError("The dataset has no feature columns to select from")
    data_features = list(range(1, max_features + 1))

    # One forward-selection pass up to the largest size is enough: the search
    # is greedy and rf has a fixed random_state, so the best subset of size k
    # is the first k picks of this run and scores_[k - 1] is its accuracy on
    # the held-out split. Re-running the search for every k repeats that work.
    sfs_DC = SequentialForwardSelection(rf, max_features)
    sfs_DC.fit(X_train_DC, X_test_DC, y_train_DC, y_test_DC)

    # Accuracy for each subset size, in order of increasing size
    accuracy_result_DC = [float(score) for score in sfs_DC.scores_]
    for score in accuracy_result_DC:
        print("accuracy_DC: ", '%.4f' % score)

    # Print the accuracy results for all subset sizes
    print("accuracy_DC: ", accuracy_result_DC)

    # Plot accuracy against the number of selected features
    plt.figure(figsize=(10, 6))
    plt.plot(data_features, accuracy_result_DC, marker='o', label='SMC', color='blue')

    # Add labels and title
    plt.xlabel('Number of Features Selected')
    plt.ylabel('Accuracy')
    plt.title('Classification Accuracy')

    # Show the plot
    plt.grid(True)
    plt.show()