import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils import read_data, check_data, adjust_dataset_size, split_labels, ColInfo, contruct_col_info
import category_encoders


def std_based(col_name, df, n_std=3):
    """Drop rows whose ``col_name`` value is an outlier by the n-sigma rule.

    A row is kept only when its value lies strictly between
    ``mean - n_std * std`` and ``mean + n_std * std``, where the mean and
    (sample) standard deviation are computed over the column itself.

    Args:
        col_name: Name of the column in ``df`` to test.
        df: Input DataFrame; it is not modified.
        n_std: Number of standard deviations that defines the cut-off.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        A new DataFrame containing only the rows that pass the filter.
    """
    values = df[col_name]
    mean = values.mean()
    std = values.std()

    # A zero spread (constant column) or an undefined one (NaN std for a
    # single row, an empty frame, or an all-NaN column) means no value can be
    # called an outlier. Without this guard the strict inequalities below
    # would reject every row and silently empty the dataset.
    if not std > 0:
        return df.copy()

    cut_off = std * n_std
    lower, upper = mean - cut_off, mean + cut_off
    return df[(values < upper) & (values > lower)]


def data_preprocess_global(dst, selected_labels, y_name):
    """Clean the diabetes DataFrame: rename, fill zeros, drop outliers.

    Steps, in order:
      1. Rename the ``DiabetesPedigreeFunction`` column to ``DPF``.
      2. Replace zeros in selected columns with that column's median or mean
         (the statistic is computed over the column as-is, zeros included).
      3. Filter outliers column by column with ``std_based``.

    Note that steps 1 and 2 modify the passed-in ``dst`` object in place;
    the filtered result is returned as a separate frame.

    Args:
        dst: DataFrame holding the raw data.
        selected_labels: Not used by this function; kept in the signature.
        y_name: Not used by this function; kept in the signature.

    Returns:
        The DataFrame that remains after outlier filtering.
    """
    dst.rename(columns={'DiabetesPedigreeFunction': 'DPF'}, inplace=True)

    # (column, statistic) pairs: zeros are replaced by the named statistic.
    zero_fill_plan = (
        ('BloodPressure', 'median'),
        ('SkinThickness', 'mean'),
        ('Insulin', 'median'),
        ('BMI', 'mean'),
    )
    for column, statistic in zero_fill_plan:
        fill_value = getattr(dst[column], statistic)()
        dst[column] = dst[column].replace(0, fill_value)

    # Each pass recomputes mean/std on the rows that survived earlier passes,
    # so the order of the columns matters.
    outlier_columns = (
        'Pregnancies',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'DPF',
        'Age',
    )
    for column in outlier_columns:
        dst = std_based(column, dst)

    return dst


def get_diabetes_data(file_path):
    """Load the diabetes dataset, preprocess it and split off the target.

    The data is read with ``read_data``, cleaned by
    ``data_preprocess_global``, and split into features and the ``Outcome``
    target with ``split_labels``. The target's value counts are printed.

    Args:
        file_path: Location of the dataset, passed straight to ``read_data``.

    Returns:
        A tuple ``(dst_x, dst_y, col_info)``: the features, the target, and
        the object returned by ``contruct_col_info``.
    """
    label_column = 'Outcome'
    class_labels = [0, 1]
    # Second argument to contruct_col_info; the first is an empty list.
    # NOTE(review): presumably categorical vs. numeric column names --
    # confirm against utils.contruct_col_info.
    feature_columns = [
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DPF', 'Age',
    ]

    raw_frame = read_data(file_path)
    clean_frame = data_preprocess_global(raw_frame, class_labels, y_name=label_column)

    features, target = split_labels(clean_frame, y_name=label_column)
    print(target.value_counts())

    col_info = contruct_col_info([], feature_columns, label_column, features)
    return features, target, col_info
