import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import shap
import math
import itertools
import sys

from method import sobol_total_indices, calculate_sobol_total_indices_gpt, shap_with_model, sobol_total_with_model

from evaluation import classifier_evaluation, regressor_evaluation

# Raise the interpreter's recursion ceiling to one million frames.
# NOTE(review): nothing visible in this file recurses deeply, and a limit this
# high can let runaway recursion exhaust the C stack instead of raising
# RecursionError -- confirm it is still needed.
sys.setrecursionlimit(1_000_000)


def sobol_first_order_effect(X, Y, subset_features):
    """
    Computes the Sobol' first-order effect for a subset of features.

    The conditional expectation E(Y | X_subset) is estimated row by row in a
    leave-one-out fashion: for each sample it is the mean of Y over all *other*
    samples whose values in ``subset_features`` are exactly equal. A sample
    with no such peer (including any sample with a missing value in the
    subset, since NaN never compares equal) falls back to the overall mean
    of Y. The effect is the population variance of these estimates divided by
    the population variance of Y.

    Args:
        X (pandas.DataFrame): The input data containing the features.
        Y (pandas.Series): The target variable; it may be unnamed.
        subset_features (list): The subset of feature names for which to calculate the Sobol' first-order effect.

    Returns:
        float: The Sobol' first-order effect for the subset of features.
            NaN when Y has zero variance or is empty.
    """
    # Use positional indices so rows of X and Y line up regardless of the
    # labels the caller passed in.
    X = X.reset_index(drop=True)
    Y = Y.reset_index(drop=True)

    # Calculate the variance of the output Y
    var_Y = Y.var(ddof=0)
    overall_mean = Y.mean()

    X_subset = X[subset_features]

    # Every row starts at the fallback value; rows that have peers are
    # overwritten below.
    cond_exp = pd.Series(overall_mean, index=Y.index, dtype=float)

    # Rows with a missing feature value can never match another row, so they
    # keep the fallback and are left out of the grouping.
    complete = X_subset.notna().all(axis=1)
    if complete.any():
        y_complete = Y[complete]
        keys = [X_subset.loc[complete, name].values for name in subset_features]
        grouped = y_complete.groupby(keys, sort=False)

        group_size = grouped.transform("size")    # rows sharing the feature values
        group_sum = grouped.transform("sum")      # NaN targets count as 0
        group_count = grouped.transform("count")  # non-missing targets only

        # Leave-one-out mean: remove the row's own contribution from its group.
        # This is algebraically the mean of the peers' targets; 0/0 gives NaN,
        # which is either masked out below (no peers) or is the correct answer
        # (all peers have a missing target).
        peers_sum = group_sum - y_complete.fillna(0)
        peers_count = group_count - y_complete.notna().astype(int)
        loo_mean = peers_sum / peers_count

        has_peers = group_size > 1
        cond_exp.loc[complete] = loo_mean.where(has_peers, overall_mean).values

    cond_exp_variance = cond_exp.var(ddof=0)

    # Calculate the Sobol' first-order effect for the subset
    first_order_effect = cond_exp_variance / var_Y

    return first_order_effect


def calculate_sobol_first_order_effect_subset(X, Y, subset_indices):
    """
    Calculate the Sobol first-order effect of a subset of features as a whole.

    The effect is Var(E[Y | X_subset]) / Var(Y). E[Y | X_subset] is estimated
    by the mean of Y over all samples that share exactly the same values in
    the subset columns, and both variances are population variances (ddof=0).
    Because groups are formed by exact equality, the estimate is meaningful
    for discrete/binned features; when every row is unique it equals 1.
    Assumes the subset columns contain no missing values.

    Parameters:
        X (DataFrame): Input features.
        Y (Series or single-column DataFrame): Output, one value per row of X.
        subset_indices (list): Indices of the features in the subset.

    Returns:
        sobol_first_order_effect_subset (float): Sobol first-order effect for the subset of features as a whole.
            NaN when Y has zero variance.

    Raises:
        ValueError: If Y does not hold exactly one value per row of X.
    """
    # Flatten the output to a plain positional series so it lines up with the
    # rows of X whatever index either object carries.
    y = pd.Series(np.asarray(Y, dtype=float).ravel())
    if len(y) != X.shape[0]:
        raise ValueError("Y must contain exactly one value per row of X")

    # Select the subset of features from the input features
    X_subset = X.iloc[:, subset_indices]

    # Conditioning on nothing leaves only the constant E[Y], whose variance is 0.
    if X_subset.shape[1] == 0:
        return 0.0

    # Variance of the conditional expectation E[Y | X_subset]
    keys = [X_subset.iloc[:, pos].values for pos in range(X_subset.shape[1])]
    conditional_mean = y.groupby(keys, sort=False).transform("mean")
    var_Y_subset = conditional_mean.var(ddof=0)

    # Calculate the total output variance
    total_var_Y = y.var(ddof=0)

    # Calculate the Sobol first-order effect for the subset of features as a whole
    sobol_first_order_effect_subset = var_Y_subset / total_var_Y

    return sobol_first_order_effect_subset

def factorial(n):
    """Return n! as the product 1 * 2 * ... * n.

    Any n below 1 (zero or negative) produces an empty product and therefore 1.
    """
    return math.prod(range(1, n + 1))

def _explained_variance(X, y, positions):
    """Return Var(E[y | X[:, positions]]) estimated from exact-match group means (ddof=0)."""
    if not positions:
        # Conditioning on nothing leaves the constant E[y]; its variance is 0.
        return 0.0
    keys = [X.iloc[:, pos].values for pos in positions]
    conditional_mean = y.groupby(keys, sort=False).transform("mean")
    return float(conditional_mean.var(ddof=0))


def calculate_shapley_values_variance_decomposition(X, Y):
    """
    Calculate Shapley values for variance decomposition.

    The value of a coalition S of features is v(S) = Var(E[Y | X_S]), with
    E[Y | X_S] estimated by the mean of Y over samples sharing the same values
    in the columns of S (population variance, ddof=0). Each feature i receives

        phi_i = sum over S not containing i of
                |S|! (p - |S| - 1)! / p!  *  (v(S + {i}) - v(S))

    where p is the number of features, so the values sum to v(all features).
    All 2**p coalitions are evaluated, which is only practical for a small
    number of features. Assumes X contains no missing values.

    Parameters:
        X (DataFrame): Input features.
        Y (Series or single-column DataFrame): Output, one value per row of X.

    Returns:
        shapley_values (Series): Shapley values for each feature.

    Raises:
        ValueError: If Y does not hold exactly one value per row of X.
    """
    num_samples, num_features = X.shape[0], X.shape[1]

    # Flatten the output to a plain positional series so it lines up with the
    # rows of X whatever index either object carries.
    y = pd.Series(np.asarray(Y, dtype=float).ravel())
    if len(y) != num_samples:
        raise ValueError("Y must contain exactly one value per row of X")

    # Work with column positions so duplicate column labels cannot collide.
    positions = range(num_features)

    # Evaluate every coalition once; each value is reused by several features.
    coalition_value = {
        coalition: _explained_variance(X, y, coalition)
        for size in range(num_features + 1)
        for coalition in itertools.combinations(positions, size)
    }

    total_orderings = math.factorial(num_features)
    shapley = []
    for pos in positions:
        others = [other for other in positions if other != pos]
        contribution = 0.0
        for size in range(num_features):
            # Fraction of feature orderings in which exactly `size` given
            # features precede the current one.
            weight = (
                math.factorial(size)
                * math.factorial(num_features - size - 1)
                / total_orderings
            )
            for subset in itertools.combinations(others, size):
                with_feature = tuple(sorted(subset + (pos,)))
                contribution += weight * (
                    coalition_value[with_feature] - coalition_value[subset]
                )
        shapley.append(contribution)

    shapley_values = pd.Series(shapley, index=X.columns, dtype=float)

    return shapley_values

def _eliminate_least_important(classifier, X, Y, importance_fn, line_template):
    """Repeatedly drop the lowest-importance feature, refit and print the 10-fold CV score."""
    remaining = X.copy()
    model = classifier
    # One feature is removed per round until a single feature is left.
    for _ in range(len(X.columns) - 1):
        importances = importance_fn(remaining, Y, model)
        for feature, value in zip(remaining.columns, importances):
            print(line_template.format(feature=feature, value=value))
        weakest_feature = remaining.columns[np.argmin(importances)]
        remaining = remaining.drop(weakest_feature, axis=1)
        model.fit(remaining, Y)
        scores = cross_val_score(model, remaining, Y, cv=10)
        print(f"Feature {weakest_feature} dropped, new score: {scores.mean()}")
        print("-"*10)


def classifier_evaluation(classifier, X, Y):
    """
    Compare Sobol'-total and SHAP feature rankings by backward elimination.

    Prints the 10-fold cross-validated baseline score of ``classifier`` on all
    features, then runs two elimination passes. In each pass the feature with
    the smallest importance (from ``sobol_total_with_model`` in the first pass
    and ``shap_with_model`` in the second) is dropped, the classifier is
    refitted and the new cross-validated score is printed, until one feature
    remains. All results are printed; nothing is returned.

    NOTE: this definition replaces the ``classifier_evaluation`` name imported
    from ``evaluation`` at the top of the file.

    Parameters:
        classifier: Estimator exposing ``fit``; it is fitted in place, so the
            caller's object is left fitted on the last reduced feature set.
        X (DataFrame): Input features.
        Y (Series): Target variable.
    """
    # baseline evaluation
    model = classifier
    model.fit(X, Y)
    scores = cross_val_score(model, X, Y, cv=10)
    print(f"Baseline score: {scores.mean()}")
    print("-"*20)

    # sobol total evaluation
    _eliminate_least_important(
        model, X, Y, sobol_total_with_model,
        "Feature: {feature}, Sobol' Total Index: {value:.4f}",
    )

    print("Sobol total evaluation done")
    print("-"*20)

    # The pass above leaves the classifier fitted on a single remaining
    # feature; refit on the full matrix so the SHAP pass starts from a model
    # that matches the columns it is given.
    model.fit(X, Y)

    # shapley evaluation
    _eliminate_least_important(
        model, X, Y, shap_with_model,
        "Feature: {feature}, SHAP Value: {value}",
    )

    print("Shapley evaluation done")

# Example usage

# Read the dataset and separate the label column from the predictors
data = pd.read_csv('diabetes.csv')
Y = data['Outcome']  # Target variable
X = data.drop(columns='Outcome')  # Input features

model = RandomForestClassifier()

# Sobol' total indices for every feature
# NOTE(review): this assignment rebinds `sobol_total_indices`, which is also
# the name of a function imported from `method` at the top of the file.
sobol_total_indices = sobol_total_with_model(X, Y, model)
print(sobol_total_indices)

# Shapley values for every feature
shap_values = shap_with_model(X, Y, model)
print(shap_values)

# Run the feature-elimination evaluation of the classifier
classifier_evaluation(model, X, Y)

