import pandas as pd
import random
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold

CLASSIFICATION = 'classification'


def _split_into_folds(items, n_folds):
    """Partition ``items`` into ``n_folds`` contiguous, near-equal slices.

    The first ``len(items) % n_folds`` slices receive one extra element, so
    every item lands in exactly one fold. When the length is a multiple of
    ``n_folds`` all slices have the same size.
    """
    base_size, remainder = divmod(len(items), n_folds)
    folds = []
    start = 0
    for fold_idx in range(n_folds):
        stop = start + base_size + (1 if fold_idx < remainder else 0)
        folds.append(items[start:stop])
        start = stop
    return folds


def load_benchmark(filename, cv=5, test_fold=0, seed=42, problem=CLASSIFICATION, stratified=True):
    """Load a benchmark CSV and return one task-level train/test split.

    The CSV is read with its first column as the index and must provide the
    columns ``model_id``, ``column_type_id``, ``tid``, ``ranking`` and
    ``test_scores`` (``fit_times`` is optional). Rows without a ``ranking``
    are dropped. Model ids are re-labelled as ``0..n-1`` in a seed-dependent
    shuffled order and the column types ``'num'``/``'cat'``/``'cat_num'`` are
    encoded as ``0``/``1``/``2``.

    Tasks (``tid``) are split into ``cv`` folds and fold ``test_fold`` is held
    out. Two per-model features, ``rank_mean`` and ``test_score_mean``, are
    computed from the training tasks only and attached to every row.

    Args:
        filename: Path (or buffer) accepted by ``pandas.read_csv``.
        cv: Number of folds.
        test_fold: Index of the fold used as the test set.
        seed: Seed for ``random``, ``numpy.random`` and the stratified splitter.
        problem: When equal to ``CLASSIFICATION`` every model must appear in
            the training tasks (a missing model raises ``KeyError``). For any
            other value a missing model gets ``rank_mean = 0.0`` and
            ``test_score_mean`` equal to the lowest training mean.
        stratified: If true, split with ``StratifiedKFold`` using the column
            type as the stratification label. Otherwise shuffle the tasks of
            each column type and cut them into ``cv`` contiguous, near-equal
            folds so that every task is held out in exactly one fold.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The ``y`` frames hold the
        target columns (``ranking``, ``test_scores`` and, if present,
        ``fit_times``); the ``X`` frames hold everything else. Rows are
        shuffled.

    Raises:
        IndexError: If ``test_fold`` is not a valid fold index.
        KeyError: If ``problem`` is ``CLASSIFICATION`` and a model has no row
            in the training tasks.
    """
    random.seed(seed)
    np.random.seed(seed)

    grid_cls_tasks_df = pd.read_csv(filename, index_col=0)

    # Re-label model ids as consecutive integers in a shuffled order.
    list_model_ids = grid_cls_tasks_df.model_id.unique().copy()
    random.shuffle(list_model_ids)
    dic_model_id = {model_id: position for position, model_id in enumerate(list_model_ids)}
    grid_cls_tasks_df['model_id'] = grid_cls_tasks_df.model_id.replace(dic_model_id).astype(int)
    grid_cls_tasks_df['column_type_id'] = grid_cls_tasks_df.column_type_id.replace({
        'num': 0,
        'cat': 1,
        'cat_num': 2,
    })

    # Copy so the feature columns added below are written to an independent frame.
    grid_cls_tasks_df = grid_cls_tasks_df[grid_cls_tasks_df['ranking'].notna()].copy()

    if stratified:
        kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        task_ctype_pairs = grid_cls_tasks_df[['tid', 'column_type_id']].value_counts().index
        task_ids = task_ctype_pairs.get_level_values(0).values
        ctypes = task_ctype_pairs.get_level_values(1).values

        splits = list(kf.split(task_ids, ctypes))
        train_index, test_index = splits[test_fold]
        train_tasks = [task_ids[i] for i in train_index]
        test_tasks = [task_ids[i] for i in test_index]
    else:
        test_tasks = []
        train_tasks = []
        for ctype in grid_cls_tasks_df.column_type_id.unique():
            ctype_all_tasks = grid_cls_tasks_df[grid_cls_tasks_df.column_type_id == ctype].tid.unique()
            random.shuffle(ctype_all_tasks)
            ctype_test_tasks = _split_into_folds(ctype_all_tasks, cv)[test_fold]
            held_out = set(ctype_test_tasks)
            test_tasks.extend(ctype_test_tasks)
            train_tasks.extend(t for t in ctype_all_tasks if t not in held_out)

    in_train = grid_cls_tasks_df.tid.isin(train_tasks)
    in_test = grid_cls_tasks_df.tid.isin(test_tasks)

    # Per-model aggregates are computed on training tasks only to avoid
    # leaking test information into the features.
    Xy = grid_cls_tasks_df[in_train].copy()
    max_ranking_lenght = Xy['tid'].value_counts().max()
    Xy['rank_count'] = max_ranking_lenght - Xy['ranking']
    rank_means = Xy.pivot(index='model_id', columns='tid', values='rank_count').mean(axis=1)
    test_means = Xy.pivot(index='model_id', columns='tid', values='test_scores').mean(axis=1)

    grid_cls_tasks_df['rank_mean'] = grid_cls_tasks_df.model_id.apply(
        lambda m: rank_means.loc[m] if (m in rank_means.index or problem == CLASSIFICATION) else 0.0)
    grid_cls_tasks_df['test_score_mean'] = grid_cls_tasks_df.model_id.apply(
        lambda m: test_means.loc[m] if (m in test_means.index or problem == CLASSIFICATION) else test_means.min())

    # Shuffle rows; train is sampled before test so the global NumPy RNG is
    # consumed in a fixed order.
    X_train = grid_cls_tasks_df[in_train].sample(frac=1)
    X_test = grid_cls_tasks_df[in_test].sample(frac=1)

    target = ['ranking', 'test_scores']
    if 'fit_times' in X_train.columns:
        target.append('fit_times')

    y_train = X_train[target].copy()
    y_test = X_test[target].copy()

    X_train = X_train.drop(target, axis=1)
    X_test = X_test.drop(target, axis=1)

    return X_train, y_train, X_test, y_test


from metrics import calculate_ndcg, calculate_mrr, calculate_score, calculate_ttb, to_one_row, prom_dicts

from sklearn.model_selection import train_test_split

def execute_model(model, model_name, data_splits, cv, val_set=False,
                  fit_params=None, sequential=False, calc_ttb=False, ks=None):
    """Fit ``model`` on each fold and collect per-task ranking metrics.

    For every fold the model is fitted on the training split (optionally with
    a task-level validation split), predictions are produced for the test
    split and the metric helpers are evaluated once per test task.

    Args:
        model: Object exposing ``fit(X_train, y_train, X_val, y_val,
            fit_params)`` and ``predict(X)``; when ``sequential`` is true
            ``predict(X, y)`` is called instead.
        model_name: Not used by this function.
        data_splits: Indexable by fold; each entry provides
            ``X_train, y_train, X_test, y_test`` at positions 0-3.
        cv: Number of folds to iterate over.
        val_set: If true, hold out 10% of the training tasks (by ``tid``) as
            a validation set passed to ``model.fit``; otherwise the
            validation arguments are ``None``.
        fit_params: Extra parameters forwarded to ``model.fit``. Defaults to
            an empty dict.
        sequential: Whether ``model.predict`` also receives ``y_test``.
        calc_ttb: If true, also compute ``calculate_ttb`` (requires a
            ``fit_times`` column in ``y_test``).
        ks: Cut-offs forwarded to the metric helpers. Defaults to
            ``[1, 10, 100]``.

    Returns:
        ``(test_ndcgs, test_mrrs, test_scores, test_ttbs)``: dicts keyed by
        ``tid`` whose values are single-column DataFrames (``'ndcg'``,
        ``'mrr'``, ``'score'``, ``'ttb'``). ``test_ttbs`` stays empty unless
        ``calc_ttb`` is true.

    Raises:
        ValueError: If a task id appears in the test split of more than one
            fold.
    """
    # Fresh objects per call instead of shared mutable defaults.
    fit_params = {} if fit_params is None else fit_params
    ks = [1, 10, 100] if ks is None else ks

    test_ndcgs = {}
    test_mrrs = {}
    test_scores = {}
    test_ttbs = {}

    for fold in range(cv):
        split = data_splits[fold]
        X_train, y_train, X_test, y_test = split[0], split[1], split[2], split[3]

        X_train_split, y_train_split, X_val_split, y_val_split = X_train, y_train, None, None

        if val_set:
            # Split by task id so all rows of a task stay on the same side.
            unique_tids = X_train['tid'].unique()
            train_tids, val_tids = train_test_split(unique_tids, test_size=0.1, random_state=42)

            train_mask = X_train['tid'].isin(train_tids)
            val_mask = X_train['tid'].isin(val_tids)

            X_train_split = X_train[train_mask]
            X_val_split = X_train[val_mask]
            y_train_split = y_train[train_mask]
            y_val_split = y_train[val_mask]

        model.fit(X_train_split, y_train_split, X_val_split, y_val_split, fit_params)

        Xy_test = pd.concat([X_test, y_test], axis=1)
        if sequential:
            Xy_test['ranking_pred'] = model.predict(X_test, y_test)
        else:
            Xy_test['ranking_pred'] = model.predict(X_test)

        for tid in Xy_test['tid'].unique():

            if tid in test_ndcgs:
                raise ValueError('Task id repetead between folds.')

            test_ndcgs[tid] = pd.DataFrame({'ndcg': calculate_ndcg(
                Xy_test,
                Xy_test['ranking'],
                Xy_test['ranking_pred'],
                tid,
                ks=ks,
            )})

            test_mrrs[tid] = pd.DataFrame({'mrr': calculate_mrr(
                Xy_test,
                Xy_test['ranking'],
                Xy_test['ranking_pred'],
                tid,
                ks=ks,
            )})

            test_scores[tid] = pd.DataFrame({'score': calculate_score(
                Xy_test,
                Xy_test['ranking'],
                Xy_test['ranking_pred'],
                Xy_test['test_scores'],
                tid,
                ks=ks,
            )})

            if calc_ttb:
                test_ttbs[tid] = pd.DataFrame({'ttb': calculate_ttb(
                    Xy_test,
                    Xy_test['ranking'],
                    Xy_test['ranking_pred'],
                    Xy_test['fit_times'],
                    tid,
                    ks=ks,
                )})

    return test_ndcgs, test_mrrs, test_scores, test_ttbs


def seed_sequence(n):
    """Return the seed for position ``n`` of the sequence.

    Position 1 maps to 0; every other ``n`` maps to ``42 * 2 ** (n - 2)``,
    i.e. 42, 84, 168, ... for n = 2, 3, 4, ...
    """
    is_first = n == 1
    return 0 if is_first else 42 * 2 ** (n - 2)