import numpy as np
import arff as ARFF
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import os
import re
from scipy.io import arff
from datetime import datetime

def make_arff(X, y, name, path):
    """Write a numeric feature matrix and integer class labels to an ARFF file.

    Every column of ``X`` becomes a ``NUMERIC`` attribute named ``attr1``,
    ``attr2``, ...; the labels become a nominal ``class`` attribute whose
    allowed values are the distinct labels (cast to int, then to str).

    Args:
        X: 2-D array-like with a ``shape`` attribute (samples x features).
        y: 1-D array-like of labels that can be cast to ``int``.
        name: Output file name (e.g. ``"dataset.arff"``). It is used verbatim
            as the ARFF description; the relation name is ``name`` with its
            extension removed.
        path: Directory in which the file is created.
    """
    # Cast once up front so the written labels and the declared nominal
    # values are derived from exactly the same integers.
    labels = np.asarray(y).astype(int)

    feature_names = ['attr{}'.format(i + 1) for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    df['class'] = labels.astype(str)

    attributes = [(feature, 'NUMERIC') for feature in feature_names]
    # Deduplicate *after* the int cast: labels such as 1.2 and 1.7 both
    # become "1" and must not be declared twice in the nominal set.
    attributes.append(('class', list(np.unique(labels).astype(str))))

    arff_data = {
        'description': name,
        # Strip the real extension instead of blindly cutting five characters,
        # which mangled any name that did not end in ".arff".
        'relation': os.path.splitext(name)[0],
        'attributes': attributes,
        'data': df.values.tolist(),
    }
    with open(os.path.join(path, name), 'w') as f:
        ARFF.dump(arff_data, f)

def preprocess(df):
    """Turn a DataFrame with a ``class`` column into scaled features and labels.

    Steps, in order:
      1. Split off the ``class`` column as the target; every other column is
         a feature.
      2. Drop feature columns that consist entirely of missing values.
      3. Label-encode each remaining object-dtype feature column.
      4. Min-max scale all feature columns.
      5. Label-encode the target.

    Args:
        df: pandas DataFrame containing a column named ``class``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: the scaled feature matrix and the
        encoded labels.
    """
    feature_names = [name for name in df.columns if name != 'class']
    features = df[feature_names]
    target = df['class']

    # Columns with no observed value at all carry no information.
    features = features.dropna(axis=1, how='all')

    # NOTE: no imputation is performed. An earlier draft imputed object
    # columns with the most frequent value and float columns with the mean,
    # but that code is disabled; missing values reach the scaler as-is.
    categorical = features.select_dtypes(include=['object']).columns
    for name in categorical:
        features[name] = LabelEncoder().fit_transform(features[name])

    scaled = MinMaxScaler().fit_transform(features)
    encoded_target = LabelEncoder().fit_transform(target)

    return np.array(scaled), np.array(encoded_target)

def preprocess_arff(file_path, temp_file_path='temp_preprocessed.arff'):
    """Write a whitespace-normalised copy of an ARFF file and return its path.

    Two normalisations are applied:

    * On ``@attribute`` lines, every space inside a ``{...}`` group is
      removed (``{ a, b }`` -> ``{a,b}``). This also removes spaces inside
      quoted nominal values.
    * After the ``@data`` line, each line is stripped, whitespace around
      commas is removed, and the line is terminated with a single newline.

    All other lines are copied unchanged.

    Args:
        file_path: Path of the ARFF file to read.
        temp_file_path: Where to write the normalised copy. Defaults to
            ``temp_preprocessed.arff`` in the current working directory;
            pass an explicit path to avoid clobbering when several files
            are processed concurrently.

    Returns:
        ``temp_file_path``.
    """
    brace_group = re.compile(r'\{([^}]*)\}')
    comma_padding = re.compile(r'\s*,\s*')

    def _squeeze_braces(match):
        # Rebuild the nominal specification without any spaces.
        return '{' + match.group(1).replace(' ', '') + '}'

    with open(file_path, 'r') as file:
        lines = file.readlines()

    processed_lines = []
    data_section = False
    for line in lines:
        stripped = line.strip()
        if stripped.lower().startswith('@attribute'):
            line = brace_group.sub(_squeeze_braces, line)
        elif data_section:
            # Normalise commas BEFORE re-adding the newline; otherwise the
            # trailing `\s*` swallows the newline of a row ending in a comma
            # and that row gets merged with the next one.
            line = comma_padding.sub(',', stripped) + '\n'
        elif stripped.lower() == '@data':
            data_section = True
        processed_lines.append(line)

    with open(temp_file_path, 'w') as file:
        file.writelines(processed_lines)

    return temp_file_path

def read_arff(file_path, filename):
    """Load an ARFF file and return its preprocessed features and labels.

    The file is parsed with ``scipy.io.arff.loadarff``, its last attribute is
    renamed to ``class`` so it is treated as the target, a timestamped
    progress line is printed, and the frame is handed to ``preprocess``.

    Args:
        file_path: Path of the ARFF file to load.
        filename: Unused; kept so existing callers keep working.

    Returns:
        The ``(X, y)`` tuple produced by ``preprocess``.
    """
    data, _meta = arff.loadarff(file_path)
    df = pd.DataFrame(data)

    # The last attribute is the target; give it the name `preprocess` expects.
    df = df.rename(columns={df.columns[-1]: 'class'})

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # format() rather than `+` so path-like objects are accepted too.
    print('{} {}'.format(timestamp, file_path))

    return preprocess(df)
