from pathlib import Path
import json
import pandas as pd

# Processed tabular data lives under the directory two levels above this
# file's own folder; classification and regression sets are kept apart.
DATA_PATH = Path(__file__).parents[2] / "data/tabular/processed"
CLF_PATH = DATA_PATH / "classification"
REG_PATH = DATA_PATH / "regression"

wdbs_clf = {
    "47746_SurnameDetails": "LanguageOfOrigin",
    "73376_HISTORICAL_FIGURES": "PROFESSION",
    "42562_Geographer_Profiles": "Languages",
    "30417_ArtworksCatalog": "ArtworkType",
    "36100_MagicNarrativeMotifs": "CulturalOrigin",
    "87283_ParishChurchDetails": "Country",
    "70942_NotableTreesInformation": "TreeSpecies",
    "29832_Rafael_Individuals": "Nationality",
    "02053_StriatumScientificArticles": "JournalName",
    "07136_researcher_profile": "affiliated_institution",
    "07310_DecommissionedTransportStations": "Country",
    "97297_MusicAlbumsPublishedInUs": "MusicGenre",
    "92415_island_details": "country_name",
    "67195_SUB_POST_OFFICE_DETAILS": "ADMINISTRATIVE_TERRITORY",
    "09510_CreativeCommonsAuthors": "Gender",
    "66643_KindergartenLocations": "Country",
    "64477_NobleIndividuals": "Role",
    "56474_Sculpture_Instances": "Material_Used",
    "90741_MUSEUM_DETAILS": "COUNTRY",
    "00473_HistoricBuildings": "CountryName",
    "97229_PhilosopherProfiles": "Languages",
    "07900_ArtistCopyrightRepresentation": "ArtistOccupation",
    "63797_SpringLocations": "CountryName",
    "65102_defender_profiles": "citizenship_country",
    "15542_FORWARD_PLAYERS": "SPORTS_TEAM",
    "70780_StateSchoolDetails": "Country",
}

# WikiDBs regression tables, as (table name, target column) pairs.
# The table name is the parquet file stem without the "wdbs_" prefix.
# Order is significant: it is the order in which entries are emitted.
wkdbs_reg = dict(
    [
        ("90930_RegisteredShips", "GrossTonnage"),
        ("19664_MunicipalDistrictCapitals", "PopulationCount"),
        ("66610_geopolitical_regions", "land_area"),
        ("89039_Business_Entity_Locations", "Population_Count"),
        ("53353_research_articles", "publication_date"),
        ("14012_ResearchArticleCitations", "PublicationDate"),
        ("14976_DrawingsCatalog", "ArtworkHeightCm"),
        ("88197_artworks_inventory", "artwork_width_cm"),
        ("03977_Eclipsing_Binary_Star_Instances", "Apparent_Magnitude"),
        ("62826_HISTORICAL_FIGURES", "BIRTH_DATE"),
        ("46159_DissolvedMunicipalityRecords", "DissolutionDate"),
        ("28324_ukrainian_village_instances", "elevation_meters"),
        ("94062_POET_PROFILES", "DEATH_DATE"),
        ("82939_Territorial_Entities", "Population_Count"),
        ("89439_WwiPersonnelProfiles", "BirthDate"),
        ("28146_Twinned_Cities", "Population"),
    ]
)


def _load_json(path):
    """Read and parse the JSON file at ``path`` (decoded as UTF-8)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _classification_task(parquet_path, target):
    """Label a classification dataset by its number of distinct target values.

    Returns "b-clf" for exactly two distinct values, "m-clf" for more than
    two, and None when there are fewer than two.
    """
    # NOTE: Series.unique() keeps missing values, so a NaN/None in the target
    # column is counted as a class of its own here.
    num_classes = len(pd.read_parquet(parquet_path)[target].unique())
    if num_classes == 2:
        return "b-clf"
    if num_classes > 2:
        return "m-clf"
    return None


def _collect_configs(entries, *, prefix, source, clf_label, reg_label, target_key):
    """Build the output entries for one benchmark source.

    Args:
        entries: Mapping of raw dataset name to a per-dataset config mapping
            holding a "task" value and the target column under ``target_key``.
        prefix: String prepended to each raw name to form the dataset name
            (and therefore the parquet file name).
        source: Value stored under the "source" key of every entry.
        clf_label: Raw task value that marks a classification dataset.
        reg_label: Raw task value that marks a regression dataset.
        target_key: Key of the target column name inside each config.

    Returns:
        Dict mapping the prefixed dataset name to its config dict. Datasets
        with an unknown task, a missing parquet file, or fewer than two
        target classes are reported on stdout and left out.
    """
    collected = {}
    for raw_name, config in entries.items():
        dataset_name = prefix + raw_name
        file_name = f"{dataset_name}.parquet"
        raw_task = config["task"]

        # Classification and regression files live in separate directories.
        if raw_task == clf_label:
            directory = CLF_PATH
        elif raw_task == reg_label:
            directory = REG_PATH
        else:
            print(f"Warning: {dataset_name} has unknown task {raw_task}.")
            continue

        parquet_path = directory / file_name
        if not parquet_path.exists():
            print(f"Warning: {file_name} does not exist. Skipping.")
            continue

        target = config[target_key]
        if raw_task == clf_label:
            # Binary vs. multi-class is decided from the data itself.
            task = _classification_task(parquet_path, target)
            if task is None:
                print(f"Warning: {dataset_name} has less than 2 classes.")
                continue
        else:
            task = "reg"

        collected[dataset_name] = {
            "dataset_name": dataset_name,
            "file": file_name,
            "source": source,
            "task": task,
            "target": target,
        }
    return collected


def main():
    """Generate ``data_configs.json`` describing every available dataset.

    Reads the TextTabBench and CARTE configuration JSON files located next to
    this script, combines them with the WikiDBs tables listed in ``wdbs_clf``
    and ``wkdbs_reg``, and writes one entry per dataset whose parquet file
    exists to ``DATA_PATH / "data_configs.json"``. Each entry holds the
    dataset name, file name, source, task ("b-clf", "m-clf" or "reg") and
    target column. Skipped datasets are reported as warnings on stdout.
    """
    config_dir = Path(__file__).parent

    # Load every input up front so a missing config file fails before any
    # dataset is processed.
    data_configs = _load_json(config_dir / "ttb_data_config.json")
    extra_configs = _load_json(config_dir / "ttb_extra_config.json")
    ttb_configs = {**data_configs, **extra_configs}
    carte_configs = _load_json(config_dir / "carte_config.json")

    all_configs = {}

    # TextTabBench
    all_configs.update(
        _collect_configs(
            ttb_configs,
            prefix="ttb_",
            source="TTB",
            clf_label="clf",
            reg_label="reg",
            target_key="target",
        )
    )

    # CARTE
    all_configs.update(
        _collect_configs(
            carte_configs,
            prefix="carte_",
            source="CARTE",
            clf_label="classification",
            reg_label="regression",
            target_key="target_name",
        )
    )

    # WikiDBs: the module-level tables only map name -> target, so wrap them
    # in the same {"task", "target"} shape the shared helper expects.
    for task_label, table in (("clf", wdbs_clf), ("reg", wkdbs_reg)):
        all_configs.update(
            _collect_configs(
                {
                    name: {"task": task_label, "target": target}
                    for name, target in table.items()
                },
                prefix="wdbs_",
                source="WikiDBs",
                clf_label="clf",
                reg_label="reg",
                target_key="target",
            )
        )

    # Save the configurations to a JSON file
    with open(DATA_PATH / "data_configs.json", "w", encoding="utf-8") as f:
        json.dump(all_configs, f, indent=4)


# Run the config generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
