


import re
import os
import pandas as pd
from pathlib import Path

# Directory containing this script; falls back to the current working
# directory when ``__file__`` is not defined (e.g. interactive sessions).
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Project root: the nearest directory at or above BASE_DIR that contains a
# "process_tabular" sub-directory.
ROOT_DIR = BASE_DIR
while not (ROOT_DIR / "process_tabular").is_dir():
    if ROOT_DIR == ROOT_DIR.parent:
        # Reached the filesystem root without finding the marker directory.
        # Fall back to BASE_DIR so that paths derived from ROOT_DIR are not
        # anchored at (and directories not created in) the filesystem root.
        ROOT_DIR = BASE_DIR
        break
    ROOT_DIR = ROOT_DIR.parent


# Input and output folders, all resolved relative to the project root.
RAW_DIR = ROOT_DIR.joinpath("process_nlp_cv", "raw_data")
NLP_PROC_DIR = ROOT_DIR.joinpath("process_nlp_cv", "processed_results")
TAB_PROC_DIR = ROOT_DIR.joinpath("process_tabular", "processed_results")

# Make sure both output folders exist before anything is written to them.
NLP_PROC_DIR.mkdir(parents=True, exist_ok=True)
TAB_PROC_DIR.mkdir(parents=True, exist_ok=True)

# Combined baseline results (input) and the two files produced by the split.
INPUT_CSV = RAW_DIR.joinpath("Baseline_result_combined_57_datasets.csv")
OUT_47 = TAB_PROC_DIR.joinpath("Baseline_result_tabular_47_datasets.csv")
OUT_REST = NLP_PROC_DIR.joinpath("Baseline_result_nlp_cv_10_datasets.csv")


# Whitespace-separated ``.npz`` file names (47 entries) identifying the
# datasets whose rows main() writes to OUT_47; every other dataset goes to
# OUT_REST. Only the tokens matter: the column layout is ignored by
# extract_47_list(), which pulls out each ``*.npz`` token.
# NOTE(review): the layout looks like pasted directory-listing output — confirm
# the source if the list ever needs regenerating.
raw_47 = r"""
10_cover.npz   15_Hepatitis.npz    1_ALOI.npz           24_mnist.npz       29_Pima.npz        33_skin.npz      38_thyroid.npz    42_WBC.npz   47_yeast.npz            8_celeba.npz
11_donors.npz  16_http.npz         20_letter.npz        25_musk.npz        2_annthyroid.npz   34_smtp.npz      39_vertebral.npz  43_WDBC.npz  4_breastw.npz           9_census.npz
12_fault.npz   17_InternetAds.npz  21_Lymphography.npz  26_optdigits.npz   30_satellite.npz   35_SpamBase.npz  3_backdoor.npz    44_Wilt.npz  5_campaign.npz
13_fraud.npz   18_Ionosphere.npz   22_magic.gamma.npz   27_PageBlocks.npz  31_satimage-2.npz  36_speech.npz    40_vowels.npz     45_wine.npz  6_cardio.npz
14_glass.npz   19_landsat.npz      23_mammography.npz   28_pendigits.npz   32_shuttle.npz     37_Stamps.npz    41_Waveform.npz   46_WPBC.npz  7_Cardiotocography.npz
"""

def normalize_npz_name(token: str) -> str:
    """Return *token* without surrounding whitespace or a trailing ``.npz``.

    Only a suffix at the very end is removed, so ``"a.npz.bak"`` is returned
    unchanged and ``"a.npz.npz"`` becomes ``"a.npz"``.
    """
    return re.sub(r"\.npz$", "", token.strip())

def extract_47_list(raw: str):
    """Return the distinct dataset names found in *raw*, in first-seen order.

    Every run of non-whitespace characters ending in ``.npz`` is treated as a
    file name. Each one is passed through ``normalize_npz_name`` and repeated
    names are dropped, keeping the first occurrence.
    """
    npz_tokens = re.findall(r"[^\s]+\.npz", raw)
    # Dict keys are unique and keep insertion order, so this de-duplicates
    # while preserving the order in which names first appear.
    unique_names = dict.fromkeys(normalize_npz_name(tok) for tok in npz_tokens)
    return list(unique_names)

def main():
    """Split the combined baseline CSV into a 47-dataset file and a remainder.

    The split only runs when the environment variable ``ALLOW_57`` equals
    ``"1"``; otherwise a notice is printed and the function returns.

    When enabled, ``INPUT_CSV`` is read, rows whose dataset name appears in
    the list parsed from ``raw_47`` are written to ``OUT_47``, and all other
    rows are written to ``OUT_REST``. A summary is printed, followed by a
    warning listing any of the 47 names that do not occur in the CSV.
    """
    if os.environ.get("ALLOW_57") != "1":
        print("⚠️ 57-dataset baseline split disabled. Set ALLOW_57=1 to enable.")
        return

    df = pd.read_csv(INPUT_CSV)

    # Prefer an explicit "Dataset" column; otherwise use the first column.
    dataset_col = "Dataset" if "Dataset" in df.columns else df.columns[0]

    # Known alias: "la10_cover" is treated as "10_cover".
    aliases = {"la10_cover": "10_cover"}
    list_47_fixed = [aliases.get(n, n) for n in extract_47_list(raw_47)]
    set_47 = set(list_47_fixed)

    # Compute the membership mask once and reuse it for both halves.
    in_47 = df[dataset_col].isin(set_47)
    df_47 = df[in_47].copy()
    df_rest = df[~in_47].copy()

    df_47.to_csv(OUT_47, index=False)
    df_rest.to_csv(OUT_REST, index=False)

    print("✅ Done!")
    print(f"- 47 file:   {OUT_47}   | rows = {len(df_47)} | unique datasets = {df_47[dataset_col].nunique()}")
    print(f"- Rest file: {OUT_REST} | rows = {len(df_rest)} | unique datasets = {df_rest[dataset_col].nunique()}")

    # Build the set of names present in the CSV once, not once per candidate.
    present = set(df[dataset_col].unique())
    missing = [n for n in list_47_fixed if n not in present]
    if missing:
        print("\n⚠️ Datasets in your 47-list NOT found in CSV Dataset column:")
        print(missing)

# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
