import pandas as pd
import numpy as np
import os

# Destination of the Excel report; its parent directory is created up front so
# the writer at the bottom of the script never fails on a missing folder.
save_path = 'reports/report.xlsx'
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Directory whose sub-folders (one per experiment) are scanned for metrics.
result_folder = 'results'
# result_folder = '../backup_results_mlp'

# Reference score per dataset file name. The (currently disabled) "promotion"
# computation compares a row's 'mse' against these values.
tats_results = dict([
    ('Agriculture.csv', 0.109),
    ('Climate.csv', 1.028),
    ('Economy.csv', 0.008),
    ('Energy.csv', 0.265),
    ('Environment.csv', 0.267),
    ('Health.csv', 1.315),
    ('Security.csv', 112.054),
    ('SocialGood.csv', 0.987),
    ('Traffic.csv', 0.187),
])

# tats_results = {
#     'Agriculture.csv': 0.1116763,
#     'Climate.csv': 0.966027498,
#     'Economy.csv': 0.008428393,
#     'Energy.csv': 0.265845831,
#     'Environment.csv': 0.266818998,
#     'Health.csv': 1.307822749,
#     'Security.csv': 109.0977516,
#     'SocialGood.csv': 0.917961374,
#     'Traffic.csv': 0.191867627
# }

def read_result(folder=None):
    """Collect the metrics of every experiment folder into one DataFrame.

    Each non-hidden sub-directory of ``folder`` is treated as one experiment.
    Its hyper-parameters are decoded from the ``'_'``-separated folder name:

    * tokens 0-6:   pipeline, n_period, alpha, window_size, hop_size,
                    channel_independence, learning_rate
    * tokens 10-13: data, seq_len, pred_len, seed

    The scores are read from ``<experiment>/metrics.npy``.

    Args:
        folder: Directory to scan. Defaults to the module-level
            ``result_folder`` when omitted.

    Returns:
        A DataFrame with one row per readable experiment. All values decoded
        from the folder name are kept as strings; ``mape``, ``mse`` and
        ``mae`` are the values loaded from ``metrics.npy``. Rows follow the
        sorted folder names so repeated runs produce the same report.

    Experiments whose name has fewer than 16 tokens, or whose ``metrics.npy``
    is missing or unreadable, are skipped.
    """
    if folder is None:
        folder = result_folder

    head_fields = ('pipeline', 'n_period', 'alpha', 'window_size', 'hop_size',
                   'channel_independence', 'learning_rate')
    tail_start = 10
    tail_fields = ('data', 'seq_len', 'pred_len', 'seed')
    # The naming scheme carries two more tokens after `seed` (positions 14 and
    # 15); they are not reported, but shorter names do not follow the scheme.
    min_tokens = 16

    columns = [
        'pipeline', 'data', 'seed', 'seq_len', 'pred_len', 'n_period',
        'channel_independence', 'alpha', 'window_size', 'hop_size',
        'learning_rate', 'mape', 'mse', 'mae',
    ]

    records = []
    # os.listdir() order is arbitrary; sort for a reproducible row order.
    for exp_folder in sorted(os.listdir(folder)):
        exp_folder_path = os.path.join(folder, exp_folder)
        if exp_folder.startswith('.') or not os.path.isdir(exp_folder_path):
            continue

        tokens = exp_folder.strip().split('_')
        print('tokens: ', tokens[tail_start:])
        if len(tokens) < min_tokens:
            print('skipped (unexpected folder name): ', exp_folder)
            continue

        record = dict(zip(head_fields, tokens))
        record.update(zip(tail_fields, tokens[tail_start:]))

        metric_file = os.path.join(exp_folder_path, 'metrics.npy')
        try:
            metric = np.load(metric_file)
        except (OSError, ValueError, EOFError):
            # Missing, unreadable or truncated metrics file: nothing to
            # report for this experiment.
            continue

        record['mae'] = metric[0]
        record['mse'] = metric[1]
        # NOTE(review): index 4 is reported as MAPE; confirm against the code
        # that writes metrics.npy that this is the intended position.
        record['mape'] = metric[4]
        records.append(record)

    return pd.DataFrame(records, columns=columns)

def do_average(df, op_cols, target_cols=('mape', 'mse', 'mae'), n=None):
    """Average the metric columns over the columns listed in ``op_cols``.

    Rows are grouped by every column that is neither in ``op_cols`` nor in
    ``target_cols``; each group is collapsed to one row holding the mean of
    each target column. Groups appear in order of first occurrence.

    Args:
        df: Input frame; must contain all of ``op_cols`` and ``target_cols``.
        op_cols: Columns to average over (dropped from the result).
        target_cols: Metric columns to average.
        n: Optional minimum group size. A group with fewer than ``n`` rows
            gets the placeholder value 999 in every target column instead of
            a mean.

    Returns:
        A DataFrame with the columns of ``df`` minus ``op_cols``, in their
        original order. For an empty input the result is empty but still
        carries those columns.

    Raises:
        ValueError: If a column named in ``op_cols`` or ``target_cols`` is
            not present in ``df``.
    """
    incomplete_placeholder = 999

    op_cols = list(op_cols)
    target_cols = list(target_cols)
    columns = list(df.columns)

    missing = [c for c in op_cols + target_cols if c not in columns]
    if missing:
        raise ValueError(f'columns not found in df: {missing}')

    dropped = set(op_cols)
    out_cols = [c for c in columns if c not in dropped]
    key_cols = [c for c in out_cols if c not in target_cols]

    # key tuple -> {target column -> list of observed values}
    groups = {}
    for _, row in df.iterrows():
        key = tuple(row[c] for c in key_cols)
        if key not in groups:
            groups[key] = {c: [] for c in target_cols}
        for c in target_cols:
            groups[key][c].append(row[c])

    result = {c: [] for c in out_cols}
    for key, samples_by_col in groups.items():
        for c, value in zip(key_cols, key):
            result[c].append(value)
        for c in target_cols:
            samples = samples_by_col[c]
            if n is not None and len(samples) < n:
                result[c].append(incomplete_placeholder)
            else:
                result[c].append(np.mean(np.array(samples)))

    return pd.DataFrame(result, columns=out_cols)

def get_best_results_by_pipeline(df, target_cols=('mse', 'mae')):
    """Average the metric columns of ``df`` per pipeline.

    Despite the name, no selection happens here: every row of a pipeline
    contributes to the mean.

    Args:
        df: Frame with a ``pipeline`` column and the ``target_cols``. A
            ``model_name`` column is optional.
        target_cols: Metric columns to average.

    Returns:
        One row per pipeline, in order of first appearance, with the columns
        ``pipeline``, ``model_name`` (only when ``df`` has it; the first value
        seen for that pipeline) and one mean per target column.
    """
    target_cols = list(target_cols)
    has_model_name = 'model_name' in df.columns
    out_cols = ['pipeline'] + (['model_name'] if has_model_name else []) + target_cols

    if df.empty:
        return pd.DataFrame(columns=out_cols)

    rows = []
    # dict.fromkeys keeps first-appearance order; a set would make the row
    # order depend on string hashing and change between runs.
    for pipeline in dict.fromkeys(df['pipeline'].tolist()):
        sub_df = df[df['pipeline'] == pipeline]
        if sub_df.empty:
            continue
        row = {'pipeline': pipeline}
        if has_model_name:
            row['model_name'] = sub_df['model_name'].tolist()[0]
        for col in target_cols:
            row[col] = sub_df[col].mean()
        rows.append(row)
    return pd.DataFrame.from_records(rows)

def get_improvement_by_pipeline(df, target_cols=('mse', 'mae')):
    """Average the metric columns and the ``promotion`` column per pipeline.

    Args:
        df: Frame with the columns ``pipeline``, ``promotion`` and the
            ``target_cols``. A ``model_name`` column is optional.
        target_cols: Metric columns to average alongside ``promotion``.

    Returns:
        One row per pipeline, in order of first appearance, with the columns
        ``pipeline``, ``model_name`` (only when ``df`` has it; the first value
        seen for that pipeline), one mean per target column and the mean
        ``promotion``.

    Raises:
        KeyError: If a non-empty ``df`` has no ``promotion`` column.
    """
    target_cols = list(target_cols)
    has_model_name = 'model_name' in df.columns
    out_cols = (['pipeline'] + (['model_name'] if has_model_name else [])
                + target_cols + ['promotion'])

    if df.empty:
        return pd.DataFrame(columns=out_cols)
    if 'promotion' not in df.columns:
        raise KeyError("df has no 'promotion' column to average")

    rows = []
    # dict.fromkeys keeps first-appearance order; a set would make the row
    # order depend on string hashing and change between runs.
    for pipeline in dict.fromkeys(df['pipeline'].tolist()):
        sub_df = df[df['pipeline'] == pipeline]
        if sub_df.empty:
            continue
        row = {'pipeline': pipeline}
        if has_model_name:
            row['model_name'] = sub_df['model_name'].tolist()[0]
        for col in target_cols:
            row[col] = sub_df[col].mean()
        row['promotion'] = sub_df['promotion'].mean()
        rows.append(row)

    return pd.DataFrame.from_records(rows)

def get_best_results_by_data(df, target_cols=('mse', 'mae')):
    """Pick, for every (pipeline, data) pair, the row with the lowest ``mse``.

    Args:
        df: Frame with at least the columns ``pipeline``, ``data`` and
            ``mse``.
        target_cols: Accepted for signature compatibility; the ranking always
            uses ``mse``.

    Returns:
        A DataFrame holding the complete winning row of each pair that occurs
        in ``df``, ordered by first appearance of the pipeline and then of
        the dataset. An empty input yields an empty frame with the same
        columns.
    """
    if df.empty:
        return pd.DataFrame(columns=df.columns)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # report layout does not depend on string hashing.
    pipelines = list(dict.fromkeys(df['pipeline'].tolist()))
    datasets = list(dict.fromkeys(df['data'].tolist()))

    rows = []
    for pipeline in pipelines:
        pipeline_df = df[df['pipeline'] == pipeline]
        for data in datasets:
            sub_df = pipeline_df[pipeline_df['data'] == data]
            if sub_df.empty:
                continue
            # argmin() is positional, hence iloc.
            best_pos = sub_df['mse'].argmin()
            rows.append(sub_df.iloc[best_pos].to_dict())
            # NOTE: no 'promotion' column (relative gain over tats_results) is
            # added here; get_improvement_by_pipeline needs one.
    return pd.DataFrame.from_records(rows)


def get_best_results_by_pred_len(df, target_cols=('mse', 'mae'),
                                 pred_lens=(96, 192, 336, 720)):
    """Pick the lowest-``mse`` row for every (pipeline, data, pred_len).

    Args:
        df: Frame with at least the columns ``pipeline``, ``data``,
            ``pred_len`` and ``mse``.
        target_cols: Accepted for signature compatibility; the ranking always
            uses ``mse``.
        pred_lens: Prediction lengths to report. Rows whose ``pred_len`` is
            not listed are ignored.

    Returns:
        A DataFrame holding the complete winning row of each combination that
        occurs in ``df``, ordered by first appearance of the pipeline, then
        of the dataset, then by the order of ``pred_lens``. An empty input
        yields an empty frame with the same columns.
    """
    if df.empty:
        return pd.DataFrame(columns=df.columns)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # report layout does not depend on string hashing.
    pipelines = list(dict.fromkeys(df['pipeline'].tolist()))
    datasets = list(dict.fromkeys(df['data'].tolist()))
    wanted = [str(p) for p in pred_lens]

    rows = []
    for pipeline in pipelines:
        pipeline_df = df[df['pipeline'] == pipeline]
        for data in datasets:
            subdata_df = pipeline_df[pipeline_df['data'] == data]
            if subdata_df.empty:
                continue
            # Compare as text so the match works whether pred_len was parsed
            # from a folder name (str) or stored as an integer.
            pred_len_text = subdata_df['pred_len'].astype(str)
            for pred_len in wanted:
                sub_df = subdata_df[pred_len_text == pred_len]
                if sub_df.empty:
                    continue
                # argmin() is positional, hence iloc.
                best_pos = sub_df['mse'].argmin()
                rows.append(sub_df.iloc[best_pos].to_dict())
    return pd.DataFrame.from_records(rows)

def do_average_by_data(df, target_cols=('mse', 'mae')):
    """Average the metric columns for every (pipeline, data) pair.

    Args:
        df: Frame with the columns ``pipeline``, ``data`` and the
            ``target_cols``. It may be completely empty (no rows and/or no
            columns).
        target_cols: Metric columns to average.

    Returns:
        One row per pair that occurs in ``df`` with the columns ``pipeline``,
        ``data`` and one mean per target column, ordered by first appearance
        of the pipeline and then of the dataset. An empty input yields an
        empty frame with those columns.
    """
    target_cols = list(target_cols)
    if df.empty:
        # An upstream selection that matched nothing hands over a frame
        # without columns; looking up df['data'] on it would raise.
        return pd.DataFrame(columns=['pipeline', 'data'] + target_cols)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # report layout does not depend on string hashing.
    pipelines = list(dict.fromkeys(df['pipeline'].tolist()))
    datasets = list(dict.fromkeys(df['data'].tolist()))

    rows = []
    for pipeline in pipelines:
        pipeline_df = df[df['pipeline'] == pipeline]
        for data in datasets:
            sub_df = pipeline_df[pipeline_df['data'] == data]
            if sub_df.empty:
                continue
            row = {'pipeline': pipeline, 'data': data}
            for col in target_cols:
                row[col] = sub_df[col].mean()
            rows.append(row)
    return pd.DataFrame.from_records(rows)

# --- Build the report tables -------------------------------------------------
# df1: one row per experiment folder.
df1 = read_result()
# df2 = do_average(df1, op_cols=['seed'])
# df3: metrics averaged over pred_len; groups with fewer than 4 rows get the
# placeholder value instead of a mean.
df3 = do_average(df1, op_cols=['pred_len'], n=4)
# df4: lowest-mse averaged row per (pipeline, data).
df4 = get_best_results_by_data(df3)
# df5 = get_improvement_by_pipeline(df4)

# df5: lowest-mse experiment per (pipeline, data, pred_len).
df5 = get_best_results_by_pred_len(df1)
# df6: those winners averaged per (pipeline, data).
df6 = do_average_by_data(df5)

# --- Write one sheet per table, in this order --------------------------------
_report_sheets = (
    ('Experiments', df1),
    # ('Average_by_seed', df2),
    ('Average', df3),
    ('Best_by_data', df4),
    # ('Best_by_pipeline', df5),
    ('Best_by_data_all', df5),
    ('Average_by_data_all', df6),
)

with pd.ExcelWriter(save_path) as writer:
    for _sheet_name, _frame in _report_sheets:
        _frame.to_excel(writer, index=False, sheet_name=_sheet_name)
