# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

import sklearn.datasets as ds

from data.split import Splitter
from data.defaults import NUM_FOLDS, SPLIT_SEED, VAL_FRACTION

# Number of distinct target classes.
num_classes = 7

# Categorical feature name -> cardinality.  The raw frame stores each of these
# as `cardinality` indicator columns named '<name>_<i>'; load() collapses every
# group into one categorical column.  Insertion order is the order in which the
# collapsed columns are appended to the frame.
categoricals = {
    'Wilderness_Area': 4,
    'Soil_Type': 40,
}

# Numeric features that load() casts to uint8.
discrete_numericals = [
    'Slope',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
]

# Numeric features whose dtype load() leaves untouched.
continuous_numericals = [
    'Elevation',
    'Aspect',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]

def load():
    """Fetch the covertype dataset and return it as one tidy DataFrame.

    The frame is built from ``sklearn.datasets.fetch_covtype(as_frame=True)``:

    * a ``'Target'`` column is inserted at position 0, holding the original
      target minus one, stored as a pandas categorical;
    * for every entry in ``categoricals`` the indicator columns
      ``'<name>_0' .. '<name>_<cardinality-1>'`` are replaced by a single
      categorical column ``'<name>'`` holding the position of the row-wise
      maximum (the active indicator, assuming a one-hot encoding);
    * the columns listed in ``discrete_numericals`` are cast to ``uint8``.

    Returns:
        pandas.DataFrame: a new frame; the object returned by scikit-learn is
        copied first and never modified.
    """
    data = ds.fetch_covtype(as_frame=True)
    yX = data.data.copy()
    # Shift the labels down by one (zero-based class ids, assuming the source
    # labels start at 1).
    yX.insert(0, 'Target', (data.target - 1).astype('category'))

    for col, cardinality in categoricals.items():
        one_hot_cols = [f'{col}_{i}' for i in range(cardinality)]
        # Convert to a plain ndarray before taking the argmax: calling
        # np.argmax on a DataFrame goes through NumPy's fallback wrapping of
        # non-array objects, whose result depends on the installed
        # NumPy/pandas versions.
        yX[col] = yX[one_hot_cols].to_numpy().argmax(axis=1)
        yX[col] = yX[col].astype('category')
        yX.drop(one_hot_cols, axis=1, inplace=True)

    return yX.astype({k: np.uint8 for k in discrete_numericals})


def load_fold(
    fold=0,
    missing_as_value=True,
    fixed_point=False,
    num_folds=NUM_FOLDS,
    val_fraction=VAL_FRACTION,
    seed=SPLIT_SEED,
):
    """Return the (train, validation, test) frames for one fold.

    The full dataset is obtained from ``load()`` and partitioned with the
    project's ``Splitter``: a seeded shuffle over all rows, then ``cv`` for the
    requested fold, then a ``(1 - val_fraction, val_fraction)`` split of the
    remaining splitter into train and validation indices.

    Args:
        fold: fold identifier forwarded to ``Splitter.cv``.
        missing_as_value: accepted but not used by this function.
        fixed_point: accepted but not used by this function.
        num_folds: number of folds forwarded to ``Splitter.cv``.
        val_fraction: share passed as the second weight of ``Splitter.split``.
        seed: seed forwarded to ``Splitter.from_shuffle``.

    Returns:
        tuple: ``(train_df, val_df, test_df)``, each selected positionally
        (``DataFrame.iloc``) from the loaded frame.
    """
    frame = load()

    shuffled = Splitter.from_shuffle(len(frame), seed=seed)
    # With return_splitter=True, cv() hands back a splitter for the rows outside
    # the held-out fold plus an object exposing the held-out rows via `.index`
    # (as used below) -- NOTE(review): confirm against Splitter's implementation.
    remainder, held_out = shuffled.cv(num_folds, fold, return_splitter=True)
    train_idx, val_idx = remainder.split((1 - val_fraction, val_fraction))

    train_df = frame.iloc[train_idx]
    val_df = frame.iloc[val_idx]
    test_df = frame.iloc[held_out.index]
    return train_df, val_df, test_df
    
