


from pathlib import Path
from typing import Optional
import os
import pandas as pd

# Directory holding this file; falls back to the current working directory
# when ``__file__`` is not defined (e.g. in an interactive session).
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Project root: the closest directory, starting at BASE_DIR and walking up,
# that contains a "process_tabular" folder. When no ancestor has one we stay
# at BASE_DIR instead of drifting up to the filesystem root, where the
# ``mkdir`` below would fail or create stray directories.
ROOT_DIR = next(
    (
        candidate
        for candidate in (BASE_DIR, *BASE_DIR.parents)
        if (candidate / "process_tabular").is_dir()
    ),
    BASE_DIR,
)

# Input folders and the output folder, all relative to the project root.
TAB_PROC_DIR = ROOT_DIR / "process_tabular" / "processed_results"
NLP_PROC_DIR = ROOT_DIR / "process_nlp_cv" / "processed_results"
OUT_DIR = ROOT_DIR / "final_results" / "results"
# NOTE: the output folder is created at import time (kept for compatibility).
OUT_DIR.mkdir(parents=True, exist_ok=True)

# DVM-AD result files. DVM_47_FALLBACK is the alternative path that main()
# uses when DVM_47 does not exist.
DVM_47 = TAB_PROC_DIR / "dvmad_result_tabular_47_datasets.csv"
DVM_47_FALLBACK = TAB_PROC_DIR / "dvmad_result_realworld_data.csv"
DVM_10 = NLP_PROC_DIR / "dvmad_result_nlp_cv_10_datasets.csv"

# Baseline result files.
BASELINE_47 = TAB_PROC_DIR / "Baseline_result_tabular_47_datasets.csv"
BASELINE_10 = NLP_PROC_DIR / "Baseline_result_nlp_cv_10_datasets.csv"

# Destination files for the combined results.
OUT_DVM_57 = OUT_DIR / "dvmad_result_combined_57_datasets.csv"
OUT_BASELINE_57 = OUT_DIR / "Baseline_result_combined_57_datasets.csv"


def ensure_model_col_second(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Return a copy of *df* whose second column is ``"Model"`` set to *model_name*.

    Any existing ``"Model"`` column is discarded and recreated at position 1
    with every row equal to *model_name*. If the second column is named
    ``"scaler"`` (case-insensitive), it is replaced by the ``"Model"`` column
    as well. The relative order of all other columns is preserved, and the
    input frame is never modified.

    Args:
        df: Frame that must contain a ``"Dataset"`` column.
        model_name: Value written to every row of the ``"Model"`` column.

    Returns:
        A new DataFrame with ``"Model"`` as its second column.

    Raises:
        ValueError: If *df* has no ``"Dataset"`` column.
    """
    if "Dataset" not in df.columns:
        # Message text: "Column 'Dataset' not found in file."
        raise ValueError("Không thấy cột 'Dataset' trong file.")

    labels = list(df.columns)
    stale = []

    # A "scaler" column in second position gives way to the Model column.
    # The isinstance guard keeps non-string labels from breaking ``.lower()``.
    if len(labels) >= 2 and isinstance(labels[1], str) and labels[1].lower() == "scaler":
        stale.append(labels[1])

    # Drop any pre-existing Model column so the insert below never produces
    # duplicate "Model" labels, even when a "scaler" column is also present.
    if "Model" in df.columns:
        stale.append("Model")

    # ``drop`` returns a new frame (also for an empty list), so the caller's
    # DataFrame is left untouched.
    result = df.drop(columns=stale)
    result.insert(1, "Model", model_name)
    return result


def concat_files(file_47: Path, file_10: Path, output_file: Path, force_model: Optional[str] = None):
    """Stack two CSV files row-wise and write the combined table to *output_file*.

    The output columns are the columns of *file_47* followed by any columns
    that appear only in *file_10*; cells missing from either file are left
    empty. Rows of *file_47* come first.

    Args:
        file_47: First CSV file to read.
        file_10: Second CSV file to read.
        output_file: Destination CSV path; parent folders are created as needed.
        force_model: When given, both tables are passed through
            ``ensure_model_col_second`` with this name, and rows repeating an
            earlier ``"Dataset"`` value are dropped (first occurrence kept).

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    for source in (file_47, file_10):
        if not source.exists():
            raise FileNotFoundError(f"Missing file: {source}")

    df47 = pd.read_csv(file_47)
    df10 = pd.read_csv(file_10)

    if force_model is not None:
        df47 = ensure_model_col_second(df47, force_model)
        df10 = ensure_model_col_second(df10, force_model)

    # Ordered union of column labels: file_47's columns, then file_10's extras.
    all_cols = list(dict.fromkeys([*df47.columns, *df10.columns]))
    out = pd.concat(
        [df47.reindex(columns=all_cols), df10.reindex(columns=all_cols)],
        ignore_index=True,
    )

    # "Dataset" is optional when no model is forced, so check once and reuse.
    has_dataset = "Dataset" in out.columns

    if force_model is not None and has_dataset:
        out = out.drop_duplicates(subset=["Dataset"], keep="first").reset_index(drop=True)

    output_file.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(output_file, index=False)

    print(f"✅ Saved: {output_file}")
    print("Rows:", len(out))
    # Guarded so a table without a "Dataset" column does not raise KeyError
    # after the file has already been written.
    if has_dataset:
        print("Unique Dataset:", out["Dataset"].nunique())


def main():
    """Build both combined result files, provided ``ALLOW_57`` is set to ``"1"``.

    Without that environment variable the function prints a notice and
    returns without reading or writing any file. Otherwise it merges the
    DVM-AD pair (forcing the model name ``"DVM-AD"``) and then the baseline
    pair.
    """
    enabled = os.environ.get("ALLOW_57") == "1"
    if not enabled:
        print("⚠️ 57-dataset concat disabled. Set ALLOW_57=1 to enable.")
        return

    # Use the primary DVM file when it exists, the fallback file otherwise.
    tabular_source = DVM_47_FALLBACK
    if DVM_47.exists():
        tabular_source = DVM_47

    concat_files(tabular_source, DVM_10, OUT_DVM_57, force_model="DVM-AD")
    concat_files(BASELINE_47, BASELINE_10, OUT_BASELINE_57)


# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
