import re
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shutil
import pandas as pd

# Staging folder that receives every generated feature-subset CSV.
destination_folder = '../data_real/generated_temp/'
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(destination_folder, exist_ok=True)

# Folder that receives the datasets selected for the final run.
final_folder = '../data_real/generated/'
os.makedirs(final_folder, exist_ok=True)
    

def get_files(folder_path):
    """Return the paths of the ``.csv`` entries directly inside ``folder_path``.

    Each path is ``folder_path`` joined with the entry name. The result is
    sorted because ``os.listdir`` yields entries in arbitrary order, which
    would otherwise make everything derived from this list depend on the
    file system. The list is also printed before it is returned.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A sorted list of path strings, empty when no entry ends with ``.csv``.
    """
    file_list = sorted(
        os.path.join(folder_path, file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')  # only CSV files are of interest
    )

    print(file_list)
    return file_list

def generate_dataset(df, cols, file_name, id_f):
    """Write one feature-subset CSV derived from a source dataset.

    The output keeps the columns of ``df`` at the positions listed in
    ``cols`` (in that order) followed by the last column of ``df``. It is
    saved under the module-level ``destination_folder`` as
    ``<dataset>_4_<len(cols)>_<id_f>.csv``, without the index.

    Args:
        df: Source DataFrame; its last column is always appended.
        cols: Positional indices of the columns to keep.
        file_name: Path of the source CSV. Only its base name without the
            extension is used, so names containing dots are kept whole.
        id_f: Identifier embedded in the output file name.
    """
    # One positional selection replaces the repeated pd.concat calls.
    res = df.iloc[:, list(cols) + [-1]]
    dataset_name = os.path.splitext(os.path.basename(file_name))[0]
    res.to_csv(destination_folder + "{}_4_{}_{}.csv".format(dataset_name, len(cols), id_f), index=False)
        
def generate_datasets(data, file_name, max_num):
    """Emit one subset dataset for every bitmask in ``1 .. max_num - 1``.

    Each mask is read as a set of column positions: bit ``j`` being set means
    position ``j`` is selected. The selected positions, in increasing order,
    are handed to ``generate_dataset`` together with the mask value, which
    serves as the subset identifier.
    """
    print("generating {}".format(max_num))
    for mask in range(1, max_num):
        selected = [pos for pos in range(mask.bit_length()) if (mask >> pos) & 1]
        generate_dataset(data, selected, file_name, mask)
        


# Expand every normalized source dataset into one CSV per non-empty subset of
# its columns other than the last one (the last column is always kept).
source_files = get_files('../data_real/datasets_real_normalized/')
for filename in source_files:
    data = pd.read_csv(filename)
    n_masks = 2**(data.shape[1]-1)
    generate_datasets(data, filename, n_masks)
    

# Identify the files to run.
# If the ERM of data of the same complexity is not in the Rashomon set, there
# is no reason to run this dataset.

files = get_files(destination_folder)
theta = 0.03  # maximum accuracy gap to the best subset, read by filter_accuracy

scores = []
code_name = []
number_of_features = []

for filename in files:
    data = pd.read_csv(filename)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    # NOTE(review): the string 'none' is only accepted by older scikit-learn
    # releases; newer ones expect penalty=None. Confirm against the installed
    # version before changing it.
    model = LogisticRegression(penalty='none')
    model.fit(X, y)

    # Accuracy on the training data itself, computed once and reused.
    accuracy = model.score(X, y)
    print(filename, accuracy)
    scores.append(accuracy)

    # Generated names look like <dataset>_4_<n_features>_<subset_id>.csv.
    # Splitting from the right keeps dataset names that contain '_' or '.'
    # intact instead of picking the wrong field.
    stem = os.path.splitext(os.path.basename(filename))[0]
    dataset, _marker, n_features, _subset_id = stem.rsplit('_', 3)
    code_name.append(dataset + '_' + n_features)
    number_of_features.append(int(n_features))


# One row per generated file; code_name groups the subsets that come from the
# same dataset and use the same number of features.
df = pd.DataFrame({'dataset_f': files
                   ,'accuracy': scores
                   , 'code_name': code_name
                   , 'num_features': number_of_features})


#df.to_csv("acc_datasets.csv")

# Function to filter rows based on the specified accuracy difference
def filter_accuracy(group, max_gap=None):
    """Flag the rows of ``group`` whose accuracy is close to the group's best.

    Args:
        group: DataFrame with an ``accuracy`` column.
        max_gap: Largest allowed difference between the best accuracy in
            ``group`` and a row's accuracy. Defaults to the module-level
            ``theta`` when omitted.

    Returns:
        A copy of ``group`` with an extra boolean ``keep`` column. The input
        frame is left untouched, because mutating the object handed over by
        ``groupby(...).apply`` is not safe.
    """
    if max_gap is None:
        max_gap = theta
    max_accuracy = group['accuracy'].max()
    return group.assign(keep=(max_accuracy - group['accuracy']) <= max_gap)

# Within each code_name group, flag the rows whose accuracy is close enough to
# the group's best, then keep only the flagged rows.
filtered_df = df.groupby('code_name').apply(filter_accuracy).reset_index(drop=True)
# drop() returns a new frame, so its result has to be assigned for the helper
# column to actually be removed before the table is written out.
filtered_df = filtered_df[filtered_df['keep']].drop(columns='keep')

# Save the list of retained datasets.
filtered_df.to_csv("datasets_ro_run.csv")


# Copy every retained dataset file into the final folder under its own base
# name (the part of the path after the last '/').
for file_name in filtered_df['dataset_f']:
    file = file_name.split("/")[-1]
    destination_path = os.path.join(final_folder, file)
    shutil.copy2(file_name, destination_path)
    
    
import numpy as np

def read_dataset(folder):
    """Load the three arrays stored in ``folder``.

    Reads ``X_data.npy``, ``Y_data0.npy`` and ``Y_data1.npy`` from ``folder``
    and returns them as the tuple ``(X, Y0, Y1)``.
    """
    suffixes = ("/X_data.npy", "/Y_data0.npy", "/Y_data1.npy")
    X, Y0, Y1 = (np.load(folder + suffix) for suffix in suffixes)
    return X, Y0, Y1

def save_dataset(folder, X, Y0, Y1):
    """Store ``X``, ``Y0`` and ``Y1`` as ``.npy`` files inside ``folder``.

    The arrays are written to ``X_data``, ``Y_data0`` and ``Y_data1``
    respectively; ``np.save`` appends the ``.npy`` extension.
    """
    for stem, array in (("/X_data", X), ("/Y_data0", Y0), ("/Y_data1", Y1)):
        np.save(folder + stem, array)
    

# Convert each selected CSV into a folder of .npy arrays: the feature matrix,
# the original labels, and a copy of the labels with every 0 replaced by -1.
for file in get_files(final_folder):
    df = pd.read_csv(file)
    X = df.iloc[:, :-1].values
    Y0 = df.iloc[:, -1].values
    Y1 = Y0.copy()
    Y1[Y1 == 0] = -1
    # Base name without extension. This replaces a regex written with a
    # non-raw '\.' escape and keeps dataset names that contain dots intact.
    dataset_name = os.path.splitext(os.path.basename(file))[0]
    dir_name = os.path.join(final_folder, dataset_name)
    # exist_ok=True lets the script be re-run over an already populated folder.
    os.makedirs(dir_name, exist_ok=True)
    save_dataset(dir_name, X, Y0, Y1)