


import re
import pandas as pd
from pathlib import Path

# Directory that contains this script. When ``__file__`` is not defined in the
# module globals, the current working directory is used instead.
if "__file__" in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path.cwd()

# Climb from BASE_DIR towards the filesystem root and stop at the first
# directory that contains a "process_tabular" folder. If no ancestor has one,
# the climb ends at the topmost directory (the one that is its own parent).
ROOT_DIR = BASE_DIR
while not (ROOT_DIR == ROOT_DIR.parent or (ROOT_DIR / "process_tabular").is_dir()):
    ROOT_DIR = ROOT_DIR.parent

# Input and output locations, all relative to the discovered root.
RAW_DIR = ROOT_DIR.joinpath("process_nlp_cv", "raw_data")
PROC_DIR = ROOT_DIR.joinpath("process_nlp_cv", "processed_results")
INPUT_CSV = RAW_DIR.joinpath("dvmad_result_realworld_data.csv")
OUT_BEST = PROC_DIR.joinpath("dvmad_result_nlp_cv_10_datasets.csv")

# Column whose per-group mean decides which scaler row is kept as "best".
BEST_METRIC = "AUCROC"


# Dataset families used as the ordered categories when sorting the aggregated
# results. These are exactly the ten labels that map_to_big() can return.
# NOTE(review): the original element values were missing from this list (only
# bare commas remained, which is a syntax error). The names are restored from
# map_to_big(); their relative order here is an assumption -- confirm against
# the intended presentation order.
BIG_ORDER = [
    "CIFAR10",
    "FashionMNIST",
    "SVHN",
    "20news",
    "agnews",
    "MNIST-C",
    "MVTec-AD",
    "amazon",
    "imdb",
    "yelp",
]


def map_to_big(ds: str):
    """Return the dataset family that *ds* belongs to, or ``None``.

    The argument is converted with ``str()`` first, so non-string values
    are accepted. Three kinds of names are recognised:

    * ``<family>_<digits>`` for CIFAR10, FashionMNIST, SVHN, 20news and
      agnews;
    * anything starting with ``MNIST-C_`` or ``MVTec-AD_``;
    * the exact names ``amazon``, ``imdb`` and ``yelp``, returned as-is.

    Any other name yields ``None``.
    """
    name = str(ds)

    # Families whose members carry a purely numeric suffix.
    numbered_families = (
        (r"^CIFAR10_[0-9]+$", "CIFAR10"),
        (r"^FashionMNIST_[0-9]+$", "FashionMNIST"),
        (r"^SVHN_[0-9]+$", "SVHN"),
        (r"^20news_[0-9]+$", "20news"),
        (r"^agnews_[0-9]+$", "agnews"),
    )
    for pattern, family in numbered_families:
        if re.match(pattern, name):
            return family

    # Families identified by prefix alone; the suffix is unconstrained.
    prefixed_families = (
        ("MNIST-C_", "MNIST-C"),
        ("MVTec-AD_", "MVTec-AD"),
    )
    for prefix, family in prefixed_families:
        if name.startswith(prefix):
            return family

    # Stand-alone datasets are their own family.
    return name if name in ("amazon", "imdb", "yelp") else None


def main() -> None:
    """Aggregate results per dataset family and save the best row of each.

    Reads ``INPUT_CSV``, maps every dataset name to its family with
    ``map_to_big`` (rows that map to ``None`` are dropped), averages all
    numeric columns per (family, scaler) pair, keeps for each family the
    scaler row with the highest mean ``BEST_METRIC``, and writes those rows
    to ``OUT_BEST`` with a constant ``Model`` column in place of the scaler
    column. The output directory is created if it does not exist.

    Raises:
        ValueError: If ``BEST_METRIC`` is not among the numeric columns of
            the input CSV.
    """
    df = pd.read_csv(INPUT_CSV)

    # Prefer the named columns; otherwise fall back to the first and second
    # columns by position.
    dataset_col = "Dataset" if "Dataset" in df.columns else df.columns[0]
    scaler_col = "Scaler" if "Scaler" in df.columns else df.columns[1]

    # A missing scaler becomes the literal string "None" so that those rows
    # form their own group instead of being keyed by NaN.
    df[scaler_col] = df[scaler_col].fillna("None")

    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    if BEST_METRIC not in numeric_cols:
        raise ValueError(
            f"BEST_METRIC '{BEST_METRIC}' not in numeric columns: {numeric_cols}"
        )

    # Attach the family label and discard rows that belong to no known family.
    df["_BIG"] = df[dataset_col].map(map_to_big)
    df = df[df["_BIG"].notna()].copy()

    # Mean of every numeric column for each (family, scaler) combination; the
    # family label then takes over the dataset column's name.
    by_scaler = (
        df.groupby(["_BIG", scaler_col], dropna=False)[numeric_cols]
        .mean(numeric_only=True)
        .reset_index()
        .rename(columns={"_BIG": dataset_col})
    )

    by_scaler = by_scaler[[dataset_col, scaler_col, *numeric_cols]]
    # An ordered categorical makes the sorts below follow BIG_ORDER rather
    # than alphabetical order. Labels absent from BIG_ORDER become NaN here.
    by_scaler[dataset_col] = pd.Categorical(
        by_scaler[dataset_col], categories=BIG_ORDER, ordered=True
    )
    by_scaler = by_scaler.sort_values([dataset_col, scaler_col]).reset_index(drop=True)

    # Within each family, sort by the metric descending and keep the top row.
    # observed=True is passed explicitly because the default for categorical
    # groupers differs between pandas versions; head() returns the same rows
    # either way.
    best = (
        by_scaler.sort_values([dataset_col, BEST_METRIC], ascending=[True, False])
        .groupby(dataset_col, sort=False, observed=True)
        .head(1)
        .reset_index(drop=True)
    )

    # Replace the scaler column with a fixed model name for the output table.
    best_out = best.drop(columns=[scaler_col])
    best_out.insert(1, "Model", "DVM-AD")
    best_out = best_out[[dataset_col, "Model", *numeric_cols]]

    # Create the output directory first so the write does not fail merely
    # because the directory is missing.
    OUT_BEST.parent.mkdir(parents=True, exist_ok=True)
    best_out.to_csv(OUT_BEST, index=False)

    print(f"✅ Saved: {OUT_BEST} (rows: {len(best_out)})")
    print("Best metric:", BEST_METRIC)


# Run the aggregation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
