


import pandas as pd
import re


def natsorted(seq):
    """Return the items of *seq* as a list in natural (human) sort order.

    Each item is converted with ``str`` and split into alternating text and
    digit runs. Digit runs compare numerically and text runs compare
    case-insensitively, so ``"a2"`` sorts before ``"a10"``.
    """
    def _key(item):
        # re.split with one capturing group always yields
        # [text, digits, text, digits, ..., text], so the odd positions are
        # exactly the runs matched by the digit pattern.  Classifying by
        # position (rather than str.isdigit) avoids calling int() on text
        # such as "²", which isdigit() accepts but int() rejects.
        tokens = re.split(r"(\d+)", str(item))
        return [
            int(tok) if idx % 2 else tok.lower()
            for idx, tok in enumerate(tokens)
        ]

    return sorted(seq, key=_key)

from pathlib import Path
import os




# Directory holding this script; falls back to the current working directory
# when the module has no __file__.
if "__file__" in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path.cwd()

# Climb from BASE_DIR until a directory containing "process_tabular" is found.
# If no ancestor has one, the search stops at the filesystem root.
ROOT_DIR = BASE_DIR
while not (ROOT_DIR / "process_tabular").is_dir():
    if ROOT_DIR.parent == ROOT_DIR:
        break
    ROOT_DIR = ROOT_DIR.parent

# Inputs are read from RESULTS_DIR; ranking outputs are written to RANK_DIR,
# which is created on import if it does not exist yet.
RANK_DIR = ROOT_DIR / "final_results" / "ranking" / "nlp_cv"
RESULTS_DIR = ROOT_DIR / "final_results" / "results"
RANK_DIR.mkdir(parents=True, exist_ok=True)

print("📂 Base directory:", BASE_DIR)
print("📂 Results directory:", RESULTS_DIR)
print("📂 Ranking directory:", RANK_DIR)




# Names of the ten aggregated ("big") NLP/CV benchmark datasets.
# NOTE(review): the string entries of this literal were lost (only the
# separating commas remained, which is a syntax error).  The ten names below
# are reconstructed from the values map_to_big() returns.  In the code shown
# here the list is only used for membership tests, so the order should not
# matter -- confirm against the upstream results files.
BIG_DATASETS = [
    "CIFAR10",
    "FashionMNIST",
    "SVHN",
    "MNIST-C",
    "MVTec-AD",
    "20news",
    "agnews",
    "amazon",
    "imdb",
    "yelp",
]


def map_to_big(ds: str):
    """Map a dataset name to the name of its aggregated dataset family.

    Names already listed in ``BIG_DATASETS`` are returned unchanged.
    Numbered splits such as ``CIFAR10_3`` and names prefixed with
    ``MNIST-C_`` or ``MVTec-AD_`` map to their family name.  ``amazon``,
    ``imdb`` and ``yelp`` map to themselves.

    Args:
        ds: Dataset name; it is converted with ``str`` before matching.

    Returns:
        The family name, or ``None`` when no rule applies.
    """
    name = str(ds)
    if name in BIG_DATASETS:
        return name

    # Families whose members are "<family>_<number>".
    numbered_splits = (
        (r"^CIFAR10_[0-9]+$", "CIFAR10"),
        (r"^FashionMNIST_[0-9]+$", "FashionMNIST"),
        (r"^SVHN_[0-9]+$", "SVHN"),
        (r"^20news_[0-9]+$", "20news"),
        (r"^agnews_[0-9]+$", "agnews"),
    )
    for pattern, family in numbered_splits:
        if re.match(pattern, name):
            return family

    # Families whose members carry an arbitrary suffix after the prefix.
    prefixed = (
        ("MNIST-C_", "MNIST-C"),
        ("MVTec-AD_", "MVTec-AD"),
    )
    for prefix, family in prefixed:
        if name.startswith(prefix):
            return family

    return name if name in ("amazon", "imdb", "yelp") else None


def aggregate_baseline(df: pd.DataFrame) -> pd.DataFrame:
    """Average the numeric columns per (dataset family, model) pair.

    Each ``Dataset`` value is mapped through ``map_to_big``; values with no
    mapping keep their original name.  Rows are then grouped by the mapped
    dataset name and ``Model``, and every numeric column is averaged.

    Args:
        df: Baseline results table.

    Returns:
        The aggregated table, with the grouped dataset name stored in the
        ``Dataset`` column.  When *df* lacks a ``Dataset`` or ``Model``
        column it is returned unchanged.
    """
    has_keys = all(col in df.columns for col in ("Dataset", "Model"))
    if not has_keys:
        return df

    work = df.copy()
    family = work["Dataset"].map(map_to_big)
    # Unmapped datasets (None from map_to_big) fall back to their own name.
    work["Dataset_Agg"] = family.fillna(work["Dataset"])

    value_cols = list(work.select_dtypes(include="number").columns)
    grouped = work.groupby(["Dataset_Agg", "Model"], as_index=False)
    return grouped[value_cols].mean().rename(columns={"Dataset_Agg": "Dataset"})





def process_files(input_file_ours, input_file_baseline,
                  output_file_rank, output_file_avg_ranks):
    """Rank all models per dataset and write the ranking tables to CSV.

    Both inputs are read positionally (their first line is skipped) and
    relabelled with a shared column schema.  Baseline rows are collapsed with
    ``aggregate_baseline`` and concatenated with our rows.  Dense ranks
    (1 = highest score) are computed per dataset for AUCROC and AUCPR, and
    then averaged per model.  When a model named ``DVM-AD`` is present, its
    average ranks and its position among all models are printed.

    Args:
        input_file_ours: Path of the CSV holding our results.
        input_file_baseline: Path of the CSV holding the baseline results.
        output_file_rank: Destination CSV for the per-dataset ranking table.
        output_file_avg_ranks: Destination CSV for per-model average ranks.
    """
    # The first line of each file is discarded; columns are named below.
    df_dvmad = pd.read_csv(input_file_ours, skiprows=1, header=None)
    df_baseline = pd.read_csv(input_file_baseline, skiprows=1, header=None)

    # Drop the trailing column when our file is wider than the baseline file.
    if df_dvmad.shape[1] > df_baseline.shape[1]:
        df_dvmad = df_dvmad.iloc[:, :-1]

    # NOTE(review): the first and seventh entries of this list were missing
    # (leaving a syntax error).  'Dataset' is required by the groupby calls
    # below; 'F1' is a best guess for the metric between MCC and Precision.
    # Columns are assigned by position, so that name only affects the label
    # written to the output CSV -- confirm against the input files.
    new_header = [
        'Dataset', 'Model', 'AUCROC', 'AUCPR', 'Accuracy', 'MCC',
        'F1', 'Precision', 'Recall', 'Time Train', 'Time Test'
    ]
    df_dvmad.columns = new_header[:df_dvmad.shape[1]]
    df_baseline.columns = new_header[:df_baseline.shape[1]]

    # Collapse baseline sub-datasets into their dataset families.
    df_baseline = aggregate_baseline(df_baseline)

    # Align both tables on the same columns, in schema order.
    all_cols = [c for c in new_header if c in df_dvmad.columns or c in df_baseline.columns]
    df_dvmad = df_dvmad.reindex(columns=all_cols)
    df_baseline = df_baseline.reindex(columns=all_cols)

    merged_df = pd.concat([df_dvmad, df_baseline], ignore_index=True)

    # A missing score is treated as 0 for ranking purposes.
    merged_df['AUCROC'] = merged_df['AUCROC'].fillna(0)
    merged_df['AUCPR'] = merged_df['AUCPR'].fillna(0)

    # Dense rank within each dataset; the highest score gets rank 1.
    merged_df['Rank_AUCROC'] = (
        merged_df.groupby('Dataset')['AUCROC']
        .rank(ascending=False, method='dense')
    )
    merged_df['Rank_AUCPR'] = (
        merged_df.groupby('Dataset')['AUCPR']
        .rank(ascending=False, method='dense')
    )

    # An ordered categorical makes sort_values follow natural dataset order.
    merged_df['Dataset'] = pd.Categorical(
        merged_df['Dataset'],
        categories=natsorted(merged_df['Dataset'].unique()),
        ordered=True
    )

    merged_df = merged_df.sort_values(
        ['Dataset', 'Rank_AUCROC']
    ).reset_index(drop=True)

    merged_df.to_csv(output_file_rank, index=False)

    avg_ranks = merged_df.groupby('Model').agg(
        Avg_Rank_AUCROC=('Rank_AUCROC', 'mean'),
        Avg_Rank_AUCPR=('Rank_AUCPR', 'mean')
    ).reset_index()

    # Leaderboards sorted by each average rank; after reset_index the row
    # position plus one is the model's overall place.  The sort keys were
    # missing from the original calls and are restored from the variable
    # names (sort_values requires a `by` argument).
    avg_ranks_auc = avg_ranks.sort_values(
        'Avg_Rank_AUCROC'
    ).reset_index(drop=True)

    avg_ranks_aupr = avg_ranks.sort_values(
        'Avg_Rank_AUCPR'
    ).reset_index(drop=True)

    if 'DVM-AD' in avg_ranks['Model'].values:
        d = avg_ranks[avg_ranks['Model'] == 'DVM-AD']

        auc_rank = avg_ranks_auc[
            avg_ranks_auc['Model'] == 'DVM-AD'
        ].index[0] + 1

        aupr_rank = avg_ranks_aupr[
            avg_ranks_aupr['Model'] == 'DVM-AD'
        ].index[0] + 1

        # Path(...) lets callers pass either a str or a Path for the input.
        print(f"📊 {Path(input_file_ours).name}")
        print(f"   DVM-AD Avg_Rank_AUCROC: {d['Avg_Rank_AUCROC'].values[0]:.3f} "
              f"(Rank {auc_rank})")
        print(f"   DVM-AD Avg_Rank_AUCPR:  {d['Avg_Rank_AUCPR'].values[0]:.3f} "
              f"(Rank {aupr_rank})\n")

    avg_ranks.to_csv(output_file_avg_ranks, index=False)





# (our results file, baseline results file) pairs, looked up in RESULTS_DIR.
file_pairs = [
    ('dvmad_result_nlp_cv_10_datasets.csv', 'Baseline_result_nlp_cv_10_datasets.csv'),
    ('dvmad_result_tabular_47_datasets.csv', 'Baseline_result_tabular_47_datasets.csv'),
]

for ours_file, baseline_file in file_pairs:
    ours_path, baseline_path = RESULTS_DIR / ours_file, RESULTS_DIR / baseline_file

    # Report the first missing input (ours is checked first) and skip the pair.
    missing = next(
        (path for path in (ours_path, baseline_path) if not path.exists()),
        None,
    )
    if missing is not None:
        print(f"❌ Missing file: {missing}")
        continue

    # Output names are derived from the stem of our results file.
    rank_file = RANK_DIR / f"{ours_path.stem}_rank.csv"
    avg_rank_file = RANK_DIR / f"{ours_path.stem}_avg_ranks.csv"

    process_files(
        input_file_ours=ours_path,
        input_file_baseline=baseline_path,
        output_file_rank=rank_file,
        output_file_avg_ranks=avg_rank_file,
    )

print("🎉 DONE: All ranking files generated successfully.")
