import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils import read_data, check_data, adjust_dataset_size, split_labels, ColInfo, contruct_col_info
import category_encoders


def data_preprocessing_local(data, columns=None):
    """Standardize selected columns of ``data`` to zero mean and unit variance.

    Each selected column is replaced by ``(x - mean(x)) / std(x)``, using the
    population standard deviation that ``np.std`` computes (``ddof=0``).
    The frame is modified in place and also returned, so callers may use
    either the return value or the object they passed in.

    Args:
        data: DataFrame that contains every column listed in ``columns``.
        columns: Optional iterable of column names to scale. When omitted,
            the six columns 'height', 'lenght', 'area', 'blackpix',
            'blackand' and 'wb_trans' are scaled; any other column is left
            untouched.

    Returns:
        The same ``data`` object, with the selected columns standardized.

    Raises:
        KeyError: If a requested column is missing from ``data``.
    """
    if columns is None:
        # 'lenght' is spelled this way on purpose: it is the column key used
        # by the rest of this module.
        columns = ('height', 'lenght', 'area', 'blackpix', 'blackand', 'wb_trans')

    for name in columns:
        values = data[name]
        centered = values - np.mean(values)
        spread = np.std(values)
        # A constant column has zero spread; dividing would turn the whole
        # column into NaN (0 / 0), so keep it centered at 0 instead.
        data[name] = centered / spread if spread != 0 else centered
    return data


def get_pageblocks_data(file_path):
    """Read a dataset, scale its features and encode its target column.

    The data is read with ``read_data``, split into features and target with
    ``split_labels`` (target column ``'class'``), the features are passed
    through ``data_preprocessing_local``, and the target labels 1..5 are
    mapped to 0..4 with an ordinal encoder. The target value counts and the
    feature shape are printed along the way.

    Args:
        file_path: Location of the dataset, forwarded unchanged to
            ``read_data``.

    Returns:
        A ``(dst_x, dst_y, col_info)`` tuple: the preprocessed features, the
        encoded ``'class'`` column, and the object built by
        ``contruct_col_info``.
    """
    target_col_name = 'class'
    selected_labels = [1, 2, 3, 4, 5]

    # No rows are filtered by label here; the whole dataset is used.
    dataset = read_data(file_path)
    dst_x, dst_y = split_labels(dataset, y_name=target_col_name)

    print(dst_y.value_counts())
    print("total data num", dst_x.shape)

    dst_x = data_preprocessing_local(dst_x)

    # Map each original label to its position in selected_labels (1 -> 0, ...).
    label_to_index = {label: index for index, label in enumerate(selected_labels)}
    encoder_mapping = [{'col': target_col_name, 'mapping': label_to_index}]
    target_enc = category_encoders.OrdinalEncoder(cols=[target_col_name],
                                                  mapping=encoder_mapping)
    encoded_target = target_enc.fit_transform(dst_y)
    # Keep only the encoded target column from the encoder output.
    dst_y = encoded_target[target_col_name]

    column_names = [
        'height', 'lenght', 'area', 'eccen', 'p_black', 'p_and', 'mean_tr',
        'blackpix', 'blackand', 'wb_trans',
    ]
    col_info = contruct_col_info([], column_names, target_col_name, dst_x)

    return dst_x, dst_y, col_info