import pandas as pd
import numpy as np
from scipy.stats import rankdata
import copy
import pandas as pd
from scipy.stats import wilcoxon
from sklearn.preprocessing import StandardScaler


def calculate_avg_rank_dict(score_dict, fold_tasks=None):
    """Rank each ``<approach>_Rank`` variant against its ``<approach>_Score`` twin.

    Parameters
    ----------
    score_dict : dict
        Maps an approach key to a dict ``{task: DataFrame}``. Only the first
        column of each frame is read, at row positions 0, 1 and 2.
    fold_tasks : container, optional
        When given, only tasks contained in it are processed.

    Returns
    -------
    dict
        ``{approach_key: {task: DataFrame}}`` for every approach that has both
        a ``_Rank`` and a ``_Score`` entry. Each frame has index ``[0, 1, 2]``
        and a single ``'rank'`` column: the larger of the two values at that
        row gets rank 1, the smaller rank 2, and tied values both get rank 1.
    """
    paired_ranks = {}

    # The approach base name is everything before the last underscore of a key.
    base_names = {key.rsplit('_', 1)[0] for key in score_dict.keys() if '_' in key}

    for base in base_names:
        rank_key = f'{base}_Rank'
        score_key = f'{base}_Score'

        # Only approaches that provide both variants can be compared.
        if rank_key not in score_dict or score_key not in score_dict:
            continue

        rank_frames = {}
        score_frames = {}
        paired_ranks[rank_key] = rank_frames
        paired_ranks[score_key] = score_frames

        for task_num in score_dict[rank_key]:
            if fold_tasks is not None and task_num not in fold_tasks:
                continue

            rank_column = score_dict[rank_key][task_num].iloc[:, 0].values
            score_column = score_dict[score_key][task_num].iloc[:, 0].values

            rank_frame = pd.DataFrame(index=[0, 1, 2])
            score_frame = pd.DataFrame(index=[0, 1, 2])
            for row in (0, 1, 2):
                # Negating the values makes the larger metric receive rank 1;
                # method='min' gives tied values the same (lowest) rank.
                rank_place, score_place = rankdata(
                    [-rank_column[row], -score_column[row]], method='min'
                )
                rank_frame.loc[row, 'rank'] = rank_place
                score_frame.loc[row, 'rank'] = score_place

            rank_frames[task_num] = rank_frame
            score_frames[task_num] = score_frame

    return paired_ranks


def scale_task_score_values(task_dict, task_test_scores, func, params=None):
    """Rescale each task's frame with a scaler fitted on that task's reference scores.

    Parameters
    ----------
    task_dict : dict
        Maps a task key to a DataFrame of numeric values. The integer task id
        is the part of the key before the first underscore (a key without an
        underscore is used as the id itself).
    task_test_scores : dict
        Maps the integer task id to an array exposing ``reshape``; it is
        reshaped to a single column and used to fit the scaler.
    func : callable
        Scaler factory; ``func(**params)`` must return an object whose
        ``fit(X)`` returns the fitted scaler and which offers ``transform(X)``
        for a single-column 2-D input.
    params : dict, optional
        Keyword arguments forwarded to ``func``. Defaults to no arguments.

    Returns
    -------
    dict
        ``{task key: DataFrame}`` with the same index and columns as the
        input frames and every value replaced by its scaled counterpart.
    """
    params = {} if params is None else params
    scaled_task_dict = {}
    for task_num, df in task_dict.items():
        # Resolve the id on every iteration: the previous implementation only
        # assigned it for keys containing '_', so other keys silently reused
        # the id of the preceding task (or failed on the first iteration).
        task_id = int(str(task_num).split('_')[0])
        scaler = func(**params).fit(task_test_scores[task_id].reshape(-1, 1))

        if df.size == 0:
            # Nothing to transform; scalers typically reject empty input.
            scaled_task_dict[task_num] = df.copy()
            continue

        # One transform call on all cells as a single column replaces the
        # deprecated cell-by-cell DataFrame.applymap.
        flat = df.to_numpy(dtype=float).reshape(-1, 1)
        scaled = np.asarray(scaler.transform(flat)).reshape(df.shape)
        scaled_task_dict[task_num] = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    return scaled_task_dict

def scale_score_results(metric_dict, task_test_scores, func, params=None):
    """Apply ``scale_task_score_values`` to every approach of ``metric_dict``.

    Parameters
    ----------
    metric_dict : dict
        Maps an approach name to a ``{task key: DataFrame}`` dict. It is
        deep-copied first, so the caller's object is not handed to the scaler.
    task_test_scores : dict
        Reference scores per task, forwarded unchanged.
    func : callable
        Scaler factory, forwarded unchanged.
    params : dict, optional
        Keyword arguments for ``func``. Defaults to no arguments.

    Returns
    -------
    dict
        ``{approach: scaled task dict}`` with the same approach keys.
    """
    # A None sentinel avoids sharing one mutable default dict between calls;
    # it is normalised here so the callee always receives a real dict.
    params = {} if params is None else params
    copied = copy.deepcopy(metric_dict)
    return {
        approach: scale_task_score_values(tasks, task_test_scores, func, params)
        for approach, tasks in copied.items()
    }

def scale_reg_results(amlb_reg_scores_results, reg_file):
    """Scale regression score results against reference scores read from a CSV.

    The file at ``reg_file`` is read with its first column as the index and
    must provide ``tid`` and ``test_scores`` columns. Each reference score is
    transformed, divided by the max-min range of its ``tid`` group, and the
    per-``tid`` arrays are then used as the fitting data for a
    ``StandardScaler`` applied to ``amlb_reg_scores_results`` through
    ``scale_score_results``.

    Returns whatever ``scale_score_results`` returns.
    """
    reference = pd.read_csv(reg_file, index_col=0)

    def _transform_score(value):
        # NOTE(review): ``-value ** 2`` parses as ``-(value ** 2)``, so the
        # whole expression equals ``value ** 2``. Confirm that a plain square
        # (rather than a negated one) is what was intended.
        return -1 * (-value ** 2)

    reference['test_scores'] = reference.test_scores.apply(_transform_score)

    # Max-min spread of the transformed scores within each task.
    scores_per_task = reference.groupby(['tid'])['test_scores']
    span_by_task = (scores_per_task.max() - scores_per_task.min()).to_dict()

    def _divide_by_span(row):
        return row.test_scores / span_by_task[row.tid]

    reference['test_scores'] = reference.apply(_divide_by_span, axis=1)

    reg_task_test_scores = dict(
        reference.groupby('tid')['test_scores'].apply(lambda x: np.array(x)).to_dict()
    )
    return scale_score_results(amlb_reg_scores_results, reg_task_test_scores, StandardScaler)

def aggregate_metrics_to_dataframe(ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict, func, fold_tasks=None):
    """Aggregate per-task metric values into one approach-by-metric table.

    Parameters
    ----------
    ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict : dict
        Each maps an approach name to ``{task: DataFrame}``. The cell at row
        positions 0, 1 and 2 of the first column supplies the value for the
        ``@1``, ``@10`` and ``@100`` column respectively.
    func : callable
        Receives the list of values collected over the selected tasks and
        returns the aggregated value.
    fold_tasks : container, optional
        When given, only tasks contained in it are aggregated.

    Returns
    -------
    pandas.DataFrame
        Built from a dict keyed by ``'<METRIC>@<cutoff>'`` (columns) whose
        inner dicts are keyed by approach name (rows).
    """
    cutoffs = ('1', '10', '100')
    families = (
        ('NDCG', ndcg_dict),
        ('MRR', mrr_dict),
        ('SCORE', score_dict),
        ('TTB', ttb_dict),
        ('AVG_RANK', avg_rank_dict),
    )

    # One (initially empty) column per metric family and cut-off.
    table = {f'{family}@{cutoff}': {} for family, _ in families for cutoff in cutoffs}

    for family, per_approach in families:
        for approach, tasks in per_approach.items():
            selected = [
                frame
                for task_num, frame in tasks.items()
                if fold_tasks is None or task_num in fold_tasks
            ]
            # Gather all three value lists first, then aggregate each one.
            value_lists = [
                [frame.iloc[row, 0] for frame in selected]
                for row in range(len(cutoffs))
            ]
            for cutoff, values in zip(cutoffs, value_lists):
                table[f'{family}@{cutoff}'][approach] = func(values)

    return pd.DataFrame(table)


from scipy.stats import wilcoxon
import pandas as pd

def calculate_wilcoxon(ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict, score_less=False, ttb=True):
    """Compute per-approach one-sided Wilcoxon p-values for ``_Rank`` versus ``_Score``.

    Parameters
    ----------
    ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict : dict
        Each maps an approach key to ``{task: DataFrame}``. The first column
        at row positions 0, 1 and 2 feeds the ``@1``, ``@10`` and ``@100``
        tests. The paired samples run over the tasks of the ``_Rank`` entry.
    score_less : bool, default False
        When True the SCORE family is tested with ``alternative='less'``
        instead of ``'greater'``.
    ttb : bool, default True
        When False the TTB family is skipped and its columns stay empty.

    Returns
    -------
    pandas.DataFrame
        Columns are ``'<METRIC>@<cutoff>'``, rows are approach base names and
        the cells hold the p-values. NDCG and MRR use
        ``alternative='greater'``; TTB and AVG_RANK use ``'less'``.
    """
    cutoffs = ('1', '10', '100')
    families = ('NDCG', 'MRR', 'SCORE', 'TTB', 'AVG_RANK')
    p_value_table = {f'{family}@{cutoff}': {} for family in families for cutoff in cutoffs}

    def fill_family(metric_dict, family, lower_is_better=False):
        """Store the three p-values of every complete Rank/Score pair."""
        side = 'less' if lower_is_better else 'greater'
        base_names = {key.rsplit('_', 1)[0] for key in metric_dict.keys() if '_' in key}

        for base in base_names:
            rank_key = f'{base}_Rank'
            score_key = f'{base}_Score'
            if rank_key not in metric_dict or score_key not in metric_dict:
                continue

            rank_columns = []
            score_columns = []
            for task_num in metric_dict[rank_key]:
                rank_columns.append(metric_dict[rank_key][task_num].iloc[:, 0].values)
                score_columns.append(metric_dict[score_key][task_num].iloc[:, 0].values)

            # One paired sample per row position; slicing (instead of
            # indexing) keeps a too-short column as an empty contribution.
            samples = [
                (
                    pd.concat([pd.Series(column[row:row + 1]) for column in rank_columns]),
                    pd.concat([pd.Series(column[row:row + 1]) for column in score_columns]),
                )
                for row in range(len(cutoffs))
            ]

            # Run all tests before storing anything for this approach.
            p_values = [
                wilcoxon(rank_sample, score_sample, alternative=side)[1]
                for rank_sample, score_sample in samples
            ]
            for cutoff, p_value in zip(cutoffs, p_values):
                p_value_table[f'{family}@{cutoff}'][base] = p_value

    fill_family(ndcg_dict, 'NDCG')
    fill_family(mrr_dict, 'MRR')
    fill_family(score_dict, 'SCORE', lower_is_better=score_less)
    if ttb:
        fill_family(ttb_dict, 'TTB', lower_is_better=True)  # TTB: less is better
    fill_family(avg_rank_dict, 'AVG_RANK', lower_is_better=True)  # AVG_RANK: less is better

    return pd.DataFrame(p_value_table)


def get_difference_data(data):
    """Return, per approach, the ``_Rank`` row minus the ``_Score`` row.

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by string labels of the form ``'<approach>_Rank'`` and
        ``'<approach>_Score'``. Labels without an underscore are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per approach that has both a ``_Rank`` and a ``_Score`` row,
        in order of first appearance in ``data.index``, with the columns of
        ``data``.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # row order of the result is reproducible (iterating a set of strings is
    # not stable between interpreter runs).
    approaches = dict.fromkeys(
        idx.rsplit('_', 1)[0] for idx in data.index if '_' in idx
    )

    difference_data = {}
    for approach in approaches:
        rank_label = f'{approach}_Rank'
        score_label = f'{approach}_Score'
        # A label that merely contains an underscore need not belong to a
        # complete Rank/Score pair; skip it instead of raising KeyError.
        if rank_label not in data.index or score_label not in data.index:
            continue
        difference_data[approach] = data.loc[rank_label] - data.loc[score_label]

    return pd.DataFrame(difference_data).T


def get_percentage_improvement_data(data):
    """Return, per approach, the relative change of ``_Rank`` over ``_Score``.

    The value is ``(rank - score) / score``, i.e. a fraction (it is not
    multiplied by 100).

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by string labels of the form ``'<approach>_Rank'`` and
        ``'<approach>_Score'``. Labels without an underscore are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per approach that has both a ``_Rank`` and a ``_Score`` row,
        in order of first appearance in ``data.index``. Cells whose
        ``_Score`` value is zero are NaN.
    """
    # First-appearance order keeps the result reproducible between runs.
    approaches = dict.fromkeys(
        idx.rsplit('_', 1)[0] for idx in data.index if '_' in idx
    )

    improvement_data = {}
    for approach in approaches:
        rank_label = f'{approach}_Rank'
        score_label = f'{approach}_Score'
        # Skip labels that do not belong to a complete Rank/Score pair.
        if rank_label not in data.index or score_label not in data.index:
            continue
        rank_row = data.loc[rank_label]
        score_row = data.loc[score_label]
        # Mask zero denominators so the ratio is NaN rather than +/-inf.
        denominator = score_row.where(score_row != 0)
        improvement_data[approach] = (rank_row - score_row) / denominator

    return pd.DataFrame(improvement_data).T


def get_formatted_results(df):
    """Return a copy of ``df`` re-indexed by ``(approach, score_or_rank)``.

    Each index label is split at its last underscore: the part before it
    becomes the ``approach`` level and the part after it the
    ``score_or_rank`` level. A label without an underscore is used whole as
    the approach, with an empty second level. The copy is sorted by approach
    ascending and by the second level descending; ``df`` itself is untouched.
    """
    approach_level = []
    variant_level = []
    for label in df.index:
        head, separator, tail = label.rpartition('_')
        if separator:
            approach_level.append(head)
            variant_level.append(tail)
        else:
            # No underscore: rpartition puts the whole label in ``tail``.
            approach_level.append(label)
            variant_level.append('')

    result = df.copy()
    result.index = pd.MultiIndex.from_arrays(
        [approach_level, variant_level],
        names=['approach', 'score_or_rank'],
    )
    return result.sort_index(ascending=[True, False])


def transformar_notacion_cientifica(df):
    """Format every column whose name contains ``"SCORE"`` in scientific notation.

    Each cell of the matching columns is replaced by the string
    ``f"{x:.2e}"``. The frame is modified in place and also returned; other
    columns are left as they are.
    """
    columnas_score = [col for col in df.columns if "SCORE" in col]
    if not columnas_score:
        # Nothing to format.
        return df

    # Column-wise Series.map gives the same element-wise result as
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    df[columnas_score] = df[columnas_score].apply(
        lambda columna: columna.map(lambda x: f"{x:.2e}")
    )
    return df


def calculate_wilcoxon_complete(ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict, score_less=False, ttb=True):
    """Compute one pooled one-sided Wilcoxon p-value per metric and cut-off.

    Unlike a per-approach test, the ``_Rank`` / ``_Score`` pairs of all
    approaches are pooled into a single paired sample per row position.

    Parameters
    ----------
    ndcg_dict, mrr_dict, score_dict, ttb_dict, avg_rank_dict : dict
        Each maps an approach key to ``{task: DataFrame}``. The first column
        at row positions 0, 1, 2 and 3 feeds the ``@1``, ``@5``, ``@10`` and
        ``@100`` tests, so every frame needs at least four rows.
    score_less : bool, default False
        When True the SCORE family is tested with ``alternative='less'``
        instead of ``'greater'``.
    ttb : bool, default True
        When False the TTB family is skipped and its columns stay empty.

    Returns
    -------
    pandas.DataFrame
        Columns are ``'<METRIC>@<cutoff>'`` and the p-values sit in a single
        row labelled ``'Bonferroni'``. The values stored are the raw p-values
        returned by ``wilcoxon``; no correction is applied here. A cut-off
        with no usable pair yields NaN.

    Side effects
    ------------
    Prints one line per metric and cut-off with the number of pairs used.
    """
    cutoffs = ('1', '5', '10', '100')
    families = ('NDCG', 'MRR', 'SCORE', 'TTB', 'AVG_RANK')
    wilcoxon_results = {f'{family}@{cutoff}': {} for family in families for cutoff in cutoffs}

    def paired_sample(rank_values, score_values, row):
        """Return the rank/score arrays at ``row``, dropping incomplete pairs."""
        # A pair is dropped when EITHER side is missing. Filtering the two
        # sides independently would shift or unbalance the pairs, which
        # breaks a paired test.
        pairs = [
            (r[row], s[row])
            for r, s in zip(rank_values, score_values)
            if pd.notna(r[row]) and pd.notna(s[row])
        ]
        rank_sample = np.array([pair[0] for pair in pairs], dtype=float)
        score_sample = np.array([pair[1] for pair in pairs], dtype=float)
        return rank_sample, score_sample

    # Function to calculate Wilcoxon for a metric
    def calculate_wilcoxon_metric(metric_dict, metric_name, score_less=False):
        approaches = set(name.rsplit('_', 1)[0] for name in metric_dict.keys() if '_' in name)

        # Pool the per-task columns of every complete Rank/Score pair.
        rank_values = []
        score_values = []
        for approach in approaches:
            rank_key = f'{approach}_Rank'
            score_key = f'{approach}_Score'
            if rank_key in metric_dict and score_key in metric_dict:
                for task_num in metric_dict[rank_key]:
                    rank_values.append(metric_dict[rank_key][task_num].iloc[:, 0].values)
                    score_values.append(metric_dict[score_key][task_num].iloc[:, 0].values)

        alternative = 'less' if score_less else 'greater'
        samples = [paired_sample(rank_values, score_values, row) for row in range(len(cutoffs))]

        # An empty sample cannot be tested; report NaN for every cut-off
        # (previously only @5 was guarded this way).
        p_values = [
            wilcoxon(rank_sample, score_sample, alternative=alternative)[1]
            if len(rank_sample) > 0 else np.nan
            for rank_sample, score_sample in samples
        ]

        for cutoff, (rank_sample, _) in zip(cutoffs, samples):
            print(metric_name + '@' + cutoff, '\t', 'all test', '\t', 'Bonferroni adjustment', len(rank_sample))

        for cutoff, p_value in zip(cutoffs, p_values):
            wilcoxon_results[f'{metric_name}@{cutoff}']['Bonferroni'] = p_value

    # Calculate Wilcoxon for NDCG, MRR, SCORE, TTB and AVG_RANK
    calculate_wilcoxon_metric(ndcg_dict, 'NDCG')
    calculate_wilcoxon_metric(mrr_dict, 'MRR')
    calculate_wilcoxon_metric(score_dict, 'SCORE', score_less=score_less)
    if ttb:
        calculate_wilcoxon_metric(ttb_dict, 'TTB', score_less=True)  # TTB: less is better
    calculate_wilcoxon_metric(avg_rank_dict, 'AVG_RANK', score_less=True)  # AVG_RANK: less is better

    # Create the DataFrame from the Wilcoxon results dictionary
    return pd.DataFrame(wilcoxon_results)