import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import pickle
import json
import os

def read_berka(path):
    """Load a semicolon-delimited text file into a DataFrame of strings.

    The first non-blank line supplies the column names; every following
    non-blank line becomes one row. No type conversion is performed, so all
    cells are strings.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` whose columns are taken from the header line.

    Raises:
        IndexError: If the file contains no non-blank lines.
    """
    records = []
    with open(path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines (e.g. stray newlines at the end of the file) would
            # otherwise turn into junk rows holding a single empty string.
            if not line:
                continue
            # Every double quote is dropped before splitting, so a quoted
            # field that itself contains ';' is still split at that point.
            records.append(line.replace('"', '').split(';'))

    header = records[0]
    return pd.DataFrame(records[1:], columns=header)

def calculate_days_since_earliest_date(dates):
    """Express ``YYMMDD`` date strings as day offsets from the earliest one.

    Args:
        dates: Iterable of strings parseable with the ``'%y%m%d'`` format.

    Returns:
        A ``(days_since, earliest)`` tuple. ``days_since`` lists, in input
        order, the whole-day difference between each date and the earliest
        date; ``earliest`` is that earliest date formatted as ``'%y%m%d'``.
    """
    fmt = '%y%m%d'
    parsed = [datetime.strptime(text, fmt) for text in dates]
    origin = min(parsed)
    return [(moment - origin).days for moment in parsed], origin.strftime(fmt)

def reconstruct_dates(days_since, earliest_date_str):
    """Turn day offsets back into ``YYMMDD`` date strings.

    Args:
        days_since: Iterable of day offsets (Python or NumPy numbers) to add
            to the reference date.
        earliest_date_str: Reference date in ``'%y%m%d'`` format.

    Returns:
        A list of ``'%y%m%d'`` strings, one per offset, in input order.
    """
    earliest_date = datetime.strptime(earliest_date_str, '%y%m%d')

    original_dates = []
    for days in days_since:
        # timedelta() only accepts Python int/float; NumPy integer scalars
        # (what iterating an ndarray yields) raise TypeError, so unwrap any
        # NumPy scalar to its Python equivalent first.
        if isinstance(days, np.generic):
            days = days.item()
        moment = earliest_date + timedelta(days=days)
        original_dates.append(moment.strftime('%y%m%d'))
    return original_dates

def birth_number_split(birth_numbers):
    """Split six-character birth numbers into year, month, day and gender.

    Each birth number is read as three two-character integer fields. When the
    month field is 50 or more, 50 is subtracted from it and the gender flag is
    1; otherwise the month is kept and the flag is 0.
    NOTE(review): presumably the month offset marks female clients in the
    source data -- confirm against the dataset documentation.

    Args:
        birth_numbers: Iterable of birth-number strings. It is consumed in a
            single pass, so one-shot iterators are supported.

    Returns:
        A ``(years, months, days, genders)`` tuple of equally long lists.
    """
    years, months, days, genders = [], [], [], []
    for bn in birth_numbers:
        month = int(bn[2:4])
        offset_month = month >= 50

        years.append(int(bn[:2]))
        months.append(month - 50 if offset_month else month)
        days.append(int(bn[4:6]))
        genders.append(1 if offset_month else 0)
    return years, months, days, genders

def table_label_encode(df, discrete_cols):
    """Label-encode the given columns on a copy of ``df``.

    Args:
        df: Input DataFrame; it is copied, not modified.
        discrete_cols: Names of the columns to encode.

    Returns:
        A ``(encoded_df, label_encoders)`` tuple, where ``label_encoders``
        maps each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    encoded = df.copy()
    fitted = {}
    for column in discrete_cols:
        encoder = LabelEncoder()
        codes = encoder.fit_transform(encoded[column])
        encoded[column] = codes
        fitted[column] = encoder
    return encoded, fitted

def table_label_decode(df, label_encoders):
    """Undo label encoding on a copy of ``df``.

    Args:
        df: DataFrame with encoded columns; it is copied, not modified.
        label_encoders: Mapping of column name to an encoder object whose
            ``inverse_transform`` is applied to that column.

    Returns:
        A new DataFrame with every mapped column replaced by the result of
        its encoder's ``inverse_transform``.
    """
    decoded = df.copy()
    for column, encoder in label_encoders.items():
        restored = encoder.inverse_transform(decoded[column])
        decoded[column] = restored
    return decoded

def get_domain(df, id_cols, discrete_cols):
    """Describe each non-id column by its distinct-value count and type.

    Args:
        df: DataFrame to describe.
        id_cols: Column names to leave out, unless they are also listed in
            ``discrete_cols``.
        discrete_cols: Column names reported with type ``'discrete'``; every
            other reported column has type ``'continuous'``.

    Returns:
        A dict mapping column name to ``{'size': <distinct count>,
        'type': <'discrete' or 'continuous'>}``, in ``df.columns`` order.
    """
    domain = {}
    for col in df.columns:
        # Membership in discrete_cols takes precedence over id_cols.
        if col in discrete_cols:
            kind = 'discrete'
        elif col in id_cols:
            continue
        else:
            kind = 'continuous'
        distinct_count = len(df[col].unique())
        domain[col] = {'size': distinct_count, 'type': kind}
    return domain

def encode_and_save(df, discrete_cols, keys, save_dir, table_name):
    """Label-encode a table and write it, its encoders and its domain to disk.

    Three files are written into ``save_dir``:

    * ``{table_name}.csv`` -- the encoded table, all values cast to ``str``,
      without the index.
    * ``{table_name}_label_encoders.pkl`` -- pickled dict of fitted encoders.
    * ``{table_name}_domain.json`` -- the domain description computed from
      the encoded table, with ``keys`` passed as the id columns.

    Args:
        df: Table to encode; it is not modified.
        discrete_cols: Names of the columns to label-encode.
        keys: Column names passed to ``get_domain`` as id columns.
        save_dir: Output directory; created if it does not exist yet.
        table_name: Base name used for the three output files.
    """
    # An empty save_dir means "current directory" for os.path.join, but
    # os.makedirs('') raises, so only create the directory when one is named.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    df_encoded, df_label_encoders = table_label_encode(df, discrete_cols)
    df_encoded = df_encoded.astype('str')
    df_encoded.to_csv(os.path.join(save_dir, f'{table_name}.csv'), index=False)

    with open(os.path.join(save_dir, f'{table_name}_label_encoders.pkl'), 'wb') as f:
        pickle.dump(df_label_encoders, f)

    # The domain is computed on the stringified, encoded table.
    df_domain = get_domain(df_encoded, keys, discrete_cols)
    with open(os.path.join(save_dir, f'{table_name}_domain.json'), 'w') as f:
        json.dump(df_domain, f)
