from pathlib import Path
from tqdm import tqdm
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from dateutil import parser

from preprocess_ttb import _drop_empty_columns, _drop_single_value_columns


# Input/output locations, all resolved relative to this file: ``parents[2]``
# is the directory two levels above the one that contains this script.
_BASE_DIR = Path(__file__).parents[2]
# Raw WikiDBs databases, one sub-folder per database (read by load_table).
WIKIDB_DIR = _BASE_DIR / "data/tabular/raw/wikidbs"
# Destination of the cleaned task tables written by main().
PROCESSED_DIR = _BASE_DIR / "data/tabular/processed"
# Destination of the reduced tables that keep the ``wikidata_id`` column.
LINKED_DIR = _BASE_DIR / "data/tabular/tables_with_linked_entities"


# Select relevant tasks
#
# Classification tasks: maps "<database prefix>_<table name>" to the name of
# the column that main() uses as the classification target for that table.
# load_table() splits the key on its first underscore: the prefix selects the
# folder under WIKIDB_DIR whose name starts with it, and the remainder is the
# CSV file name. The prefix must therefore be written exactly as the folder
# name begins. main() processes the tables in the order listed here.
classification_targets = {
    "47746_SurnameDetails": "LanguageOfOrigin",
    "73376_HISTORICAL_FIGURES": "PROFESSION",
    "42562_Geographer_Profiles": "Languages",
    "30417_ArtworksCatalog": "ArtworkType",
    "36100_MagicNarrativeMotifs": "CulturalOrigin",
    "87283_ParishChurchDetails": "Country",
    "70942_NotableTreesInformation": "TreeSpecies",
    "29832_Rafael_Individuals": "Nationality",
    "02053_StriatumScientificArticles": "JournalName",
    "07136_researcher_profile": "affiliated_institution",
    "07310_DecommissionedTransportStations": "Country",
    "97297_MusicAlbumsPublishedInUs": "MusicGenre",
    "92415_island_details": "country_name",
    "67195_SUB_POST_OFFICE_DETAILS": "ADMINISTRATIVE_TERRITORY",
    "09510_CreativeCommonsAuthors": "Gender",
    "66643_KindergartenLocations": "Country",
    "64477_NobleIndividuals": "Role",
    "56474_Sculpture_Instances": "Material_Used",
    "90741_MUSEUM_DETAILS": "COUNTRY",
    "00473_HistoricBuildings": "CountryName",
    "97229_PhilosopherProfiles": "Languages",
    "07900_ArtistCopyrightRepresentation": "ArtistOccupation",
    "63797_SpringLocations": "CountryName",
    "65102_defender_profiles": "citizenship_country",
    "15542_FORWARD_PLAYERS": "SPORTS_TEAM",
    "70780_StateSchoolDetails": "Country",
}

# Regression tasks: maps "<database prefix>_<table name>" to the name of the
# column that main() uses as the regression target. Targets are cast to float
# and log10-transformed in main(); tables also listed in ``date_regressions``
# have their date strings converted to numbers first.
regression_targets = {
    "90930_RegisteredShips": "GrossTonnage",
    "19664_MunicipalDistrictCapitals": "PopulationCount",
    "66610_geopolitical_regions": "land_area",
    "89039_Business_Entity_Locations": "Population_Count",
    "53353_research_articles": "publication_date",
    "14012_ResearchArticleCitations": "PublicationDate",
    "14976_DrawingsCatalog": "ArtworkHeightCm",
    "88197_artworks_inventory": "artwork_width_cm",
    "03977_Eclipsing_Binary_Star_Instances": "Apparent_Magnitude",
    "62826_HISTORICAL_FIGURES": "BIRTH_DATE",
    "46159_DissolvedMunicipalityRecords": "DissolutionDate",
    "28324_ukrainian_village_instances": "elevation_meters",
    "94062_POET_PROFILES": "DEATH_DATE",
    "82939_Territorial_Entities": "Population_Count",
    "89439_WwiPersonnelProfiles": "BirthDate",
    "28146_Twinned_Cities": "Population",
}

# Regression tables whose target column holds date strings. For these, main()
# strips a leading "+", rewrites a "-00-00" month/day to "-01-01", converts
# the string with date_to_fractional_year() and replaces it by ``2025 - year``
# before the common log10 rescaling. Each entry is also a key of
# ``regression_targets``.
date_regressions = [
    "53353_research_articles",
    "14012_ResearchArticleCitations",
    "62826_HISTORICAL_FIGURES",
    "46159_DissolvedMunicipalityRecords",
    "94062_POET_PROFILES",
    "89439_WwiPersonnelProfiles",
]

# Columns removed from each table before cleaning (descriptions, images,
# external identifiers, ...). main() looks a table up with
# ``table_name in unnecessary_columns``, so every key must be spelled exactly
# like the corresponding key of ``classification_targets`` /
# ``regression_targets`` -- including the zero-padded five-digit prefix.
# A key without the leading zeros silently disables the column removal.
unnecessary_columns = {
    "90930_RegisteredShips": ["ShipDescription", "ImoNumber", "RadioCallSign"],
    "19664_MunicipalDistrictCapitals": [
        "CityDescription",
        "CommonsCategory",
        "PostalCodeRange",
        "TimeZone",
        "DialingCode",
        "FreebaseId",
        "OktmoId",
        "GeoNamesId",
        "OkatoId",
        "WhosOnFirstId",
        "CityImage",
        "GnsUniqueFeatureId",
    ],
    "66610_geopolitical_regions": [
        "region_description",
        "region_map_image",
        "iso_code",
        "time_zone",
        "commons_category",
        "geo_names_id",
        "open_street_map_relation_id",
        "geographical_shape",
        "i_naturalist_place_id",
        "fips_code",
        "whos_on_first_id",
        "gns_unique_feature_id",
        "region_image",
        "main_category",
        "entity_type",
        "music_brainz_area_id",
        "freebase_id",
    ],
    "89039_Business_Entity_Locations": [
        "Entity_Description",
        "Open_Street_Map_Relation_Id",
        "Commons_Category",
        "Viaf_Id",
        "Geo_Names_Id",
        "Freebase_Id",
        "Main_Category",
        "Locator_Map_Image",
        "Entity_Type",
        "Time_Zone",
        "Whos_On_First_Id",
        "Fact_Grid_Item_Id",
        "Entity_Image",
        "National_Library_Of_Israel_J9u_Id",
        "Postal_Code",
    ],
    "53353_research_articles": [
        "article_description",
        "article_type",
        "issue_number",
        "page_range",
        "pub_med_id",
        "digital_object_identifier",
    ],
    "14012_ResearchArticleCitations": [
        "ArticleDescription",
        "Pmcid",
        "PubMedId",
        "ArticleType",
        "DigitalObjectIdentifier",
        "PageRange",
        "JournalIssue",
        "ResearchGatePublicationId",
    ],
    "14976_DrawingsCatalog": [
        "ArtworkDescription",
        "ArtworkImage",
        "ArtworkTitle",
        "InventoryId",
    ],
    "88197_artworks_inventory": [
        "artwork_description",
        "inventory_id",
        "artwork_title",
        "mccp_id",
    ],
    "03977_Eclipsing_Binary_Star_Instances": [
        "Star_Description",
        "Catalog_Identifier",
        "Simbad_Id",
        "Epoch_Reference",
    ],
    "62826_HISTORICAL_FIGURES": [
        "VIAF_ID",
        "GND_ID",
        "ENTITY_TYPE",
        # "FIRST_NAME",
        # "SURNAME",
        "DEUTSCHE_BIOGRAPHIE_GND_ID",
        "GOOGLE_KNOWLEDGE_GRAPH_ID",
    ],
    "46159_DissolvedMunicipalityRecords": [
        "MunicipalityDescription",
        "EntityType",
        "GeoLodId",
        "NameInKana",
        "GoogleKnowledgeGraphId",
    ],
    "28324_ukrainian_village_instances": [
        "village_description",
        "koatuu_id",
        "village_type",
        "membership",
        "postal_code",
        "time_zone",
        "wikimedia_project_focus",
        "fact_grid_item_id",
        "google_knowledge_graph_id",
    ],
    "94062_POET_PROFILES": [
        "VIAF_ID",
        "INTERNATIONAL_STANDARD_NAME_IDENTIFIER",
        "ENTITY_TYPE",
    ],
    "82939_Territorial_Entities": [
        "Settlement_Description",
        "Open_Street_Map_Relation_Id",
        "Commons_Category",
        "Viaf_Id",
        "Geo_Names_Id",
        "Freebase_Id",
        "Main_Category",
        "Locator_Map",
        "Time_Zone",
        "Whos_On_First_Id",
        "Image_Url",
    ],
    # Not referenced by either targets dict in this file; kept for reference.
    "46912_AsteroidDiscoveryRecords": [
        "AsteroidDescription",
        "MinorPlanetCenterBodyId",
        "JplSmallBodyDatabaseSpkId",
    ],
    "89439_WwiPersonnelProfiles": ["Biography", "EntityType", "FreebaseId"],
    "28146_Twinned_Cities": [
        "City_Description",
        "Flag_Image",
        "Main_Category",
        "Birth_Category",
        "Open_Street_Map_Relation_Id",
        "Commons_Gallery",
        "Commons_Category",
        "Page_Banner",
        "Bibliotheque_Nationale_De_France_Id",
        "Freebase_Id",
        "Gnd_Id",
        "Library_Of_Congress_Authority_Id",
        "Music_Brainz_Area_Id",
        "Viaf_Id",
        "Death_Category",
        "Geo_Names_Id",
        "Associated_People_Category",
        "Time_Zone",
        "Postal_Code",
        "Source_Description",
        "Dialing_Code",
        "Locator_Map_Image",
        "Native_Label",
        "Quora_Topic_Id",
        "Gran_Enciclopedia_Catalana_Id_Former_Scheme",
        "Arch_Inform_Location_Id",
        "Unlocode",
        "Coat_Of_Arms_Image",
        "Nl_Cr_Aut_Id",
        "Encyclopdia_Britannica_Online_Id",
        "Whos_On_First_Id",
        "City_Image",
        "Maps_Category",
        "Fact_Grid_Item_Id",
        "National_Library_Of_Israel_J9u_Id",
        "Store_Norske_Leksikon_Id",
        "Den_Store_Danske_Id",
        "Museum_Digital_Place_Id",
        "Online_Pwn_Encyclopedia_Id",
        "Gran_Enciclopedia_Catalana_Id",
        "Dewey_Decimal_Classification",
    ],
    "47746_SurnameDetails": [
        "Description",
        "TypeOfName",
        "DistinctFrom",
        "GeopatronymeId",
        "SoundexCode",
        "ColognePhoneticsCode",
        "CaverphoneCode",
        "CommonsCategory",
        "WolframEntityCode",
        "GeneanetFamilyNameId",
    ],
    "73376_HISTORICAL_FIGURES": [
        "ENTITY_TYPE",
        "THE_PEERAGE_PERSON_ID",
        "GENICOM_PROFILE_ID",
        "WIKI_TREE_PERSON_ID",
    ],
    "42562_Geographer_Profiles": [
        "Gnd_Id",
        "Viaf_Id",
        "Id_Ref_Id",
        "Library_Of_Congress_Authority_Id",
        "Isni_Identifier",
        "Entity_Type",
    ],
    "30417_ArtworksCatalog": ["IiifManifestUrl", "ArtworkTitle", "InventoryNumber"],
    "36100_MagicNarrativeMotifs": [
        "DepictionType",
        "CatalogueCode",
        "SourceDescription",
    ],
    "87283_ParishChurchDetails": [
        "ChurchDescription",
        "CommonsCategory",
        "ChurchImage",
    ],
    "70942_NotableTreesInformation": [
        "TreeDescription",
        "CommonsCategory",
        "TreeImage",
    ],
    "29832_Rafael_Individuals": ["Entity_Type", "Gender", "Biography"],
    "02053_StriatumScientificArticles": [
        "ArticleDescription",
        "PubMedId",
        "ArticleType",
        "ArticleTitle",
        "PageRange",
        "Doi",
        "JournalIssue",
    ],
    "07136_researcher_profile": ["entity_type", "orcid_i_d"],
    "07310_DecommissionedTransportStations": [
        "StationDescription",
        "StationImage",
        "CommonsCategory",
    ],
    "97297_MusicAlbumsPublishedInUs": [
        "AlbumDescription",
        "MusicBrainzReleaseGroupId",
        "AlbumSeries",
        "DiscogsMasterId",
    ],
    "92415_island_details": ["island_description", "geo_names_id"],
    "67195_SUB_POST_OFFICE_DETAILS": ["DESCRIPTION", "COUNTRY", "OWNER"],
    "09510_CreativeCommonsAuthors": ["EntityType", "OrcidID", "ScopusAuthorId"],
    "66643_KindergartenLocations": ["Description", "Type", "ItalianSchoolId"],
    "64477_NobleIndividuals": [
        "Biography",
        "EntityType",
        "GenealogicsorgPersonId",
        "WikiTreePersonId",
        "ThePeeragePersonId",
    ],
    "56474_Sculpture_Instances": ["Description", "Image_File"],
    "90741_MUSEUM_DETAILS": ["MUSEUM_DESCRIPTION", "MUSEUM_IMAGE", "COMMONS_CATEGORY"],
    "00473_HistoricBuildings": ["BuildingDescription", "BritishListedBuildingsId"],
    "97229_PhilosopherProfiles": [
        "Biography",
        "EntityType",
        "ViafId",
        "InternationalStandardNameIdentifier",
        "GndId",
        "LibraryOfCongressAuthorityId",
        "IdRefId",
    ],
    "07900_ArtistCopyrightRepresentation": [
        "ArtistDescription",
        "EntityType",
        "DacsIdFormer",
        "CopyrightRepresentativeStatus",
        "CopyrightStatusAsCreator",
        "DacsId2022",
    ],
    "63797_SpringLocations": [
        "SpringDescription",
        "GeoNamesId",
        "GnisFeatureId",
        "SpringType",
    ],
    "65102_defender_profiles": [
        "player_description",
        "player_occupation",
        "entity_type",
        "transfermarkt_player_id",
        "football_databaseeu_person_id",
    ],
    "15542_FORWARD_PLAYERS": [
        "PLAYER_DESCRIPTION",
        "PLAYER_OCCUPATION",
        "ENTITY_TYPE",
        "TRANSFERMARKT_PLAYER_ID",
        "FOOTBALL_DATABASEEU_PERSON_ID",
    ],
    "70780_StateSchoolDetails": [
        "SchoolDescription",
        "NpsnIndonesianSchoolId",
        "SekolahKitaId",
    ],
}


def date_to_fractional_year(date_str):
    """Convert a date string into a year with a fractional part.

    The string is parsed with ``dateutil`` rather than ``pd.to_datetime``;
    the original author's note is that ``pd.to_datetime`` does not support
    the full range of dates needed here.

    The result is ``year + (day_of_year - 1) / days_in_year``, so January 1st
    maps to the integer year. A leading ``"-"`` on ``date_str`` makes the
    year negative.

    Returns ``None`` whenever anything goes wrong (unparsable string,
    non-string input, ...); no exception escapes this function.
    """
    try:
        moment = parser.parse(date_str)

        # The raw string carries the sign: negate the parsed year when the
        # input starts with "-".
        signed_year = -moment.year if date_str.startswith("-") else moment.year

        # Gregorian leap-year rule decides the length of February.
        is_leap = signed_year % 4 == 0 and (
            signed_year % 100 != 0 or signed_year % 400 == 0
        )
        month_lengths = [31, 29 if is_leap else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        # Whole days elapsed since January 1st of that year.
        elapsed_days = sum(month_lengths[: moment.month - 1]) + moment.day - 1
        return signed_year + elapsed_days / sum(month_lengths)
    except Exception:
        # Best effort: any failure is reported as a missing value.
        return None


def load_table(table_name):
    """Read one table from WIKIDB_DIR and attach its Wikidata item ids.

    ``table_name`` has the form ``<prefix>_<table>``. The part before the
    first underscore selects the first entry of ``WIKIDB_DIR`` (in
    ``os.listdir`` order) whose name starts with it; the remainder is the CSV
    file name without extension.

    Returns the DataFrame read from ``tables/<table>.csv`` with an extra
    ``wikidata_id`` column holding the first column of the same-named file
    under ``tables_with_item_ids``.

    Raises ``IndexError`` when no entry of ``WIKIDB_DIR`` starts with the
    prefix.
    """
    db_prefix, _, table_stem = table_name.partition("_")
    csv_name = table_stem + ".csv"

    matching_dirs = [
        entry for entry in os.listdir(WIKIDB_DIR) if entry.startswith(db_prefix)
    ]
    db_dir = WIKIDB_DIR / matching_dirs[0]

    df = pd.read_csv(db_dir / "tables" / csv_name)
    item_ids = pd.read_csv(db_dir / "tables_with_item_ids" / csv_name)
    # Assigned by index alignment; both frames come straight from read_csv.
    df["wikidata_id"] = item_ids.iloc[:, 0]
    return df


def ttb_preprocess(df, target):
    """Run the basic cleaning steps on ``df`` and return the cleaned frame.

    Steps, in order:
      1. ``_drop_empty_columns`` with a threshold of 0.5 (presumably drops
         columns whose missing ratio exceeds it -- see ``preprocess_ttb``).
      2. ``_drop_single_value_columns`` (presumably drops constant columns).
      3. Drop duplicated rows.
      4. Keep only rows whose ``target`` value is not missing.
      5. Drop columns whose name starts with ``Unnamed``.

    Prints the shape before and after cleaning.
    """
    max_missing_ratio = 0.5
    shape_before = df.shape

    # Steps 1-2: helpers imported from preprocess_ttb.
    cleaned = _drop_empty_columns(df, threshold=max_missing_ratio)
    cleaned = _drop_single_value_columns(cleaned)

    # Step 3: exact duplicate rows.
    cleaned = cleaned.drop_duplicates()

    # Step 4: rows without a target cannot be used.
    has_target = cleaned[target].notna()
    cleaned = cleaned[has_target]

    # Step 5: columns pandas labelled "Unnamed..." when reading the CSV.
    is_unnamed = cleaned.columns.str.contains("^Unnamed")
    cleaned = cleaned.loc[:, ~is_unnamed]

    print(
        f"Dataframe shape before/after basic cleaning: {shape_before} / {cleaned.shape}"
    )
    return cleaned


# Thresholds used by main(); values are the ones previously written inline.
_MIN_CLASS_COUNT = 105  # classes with fewer rows than this are discarded
_MAX_CLASSES = 10  # upper bound on the number of classes kept per task
_MIN_TABLE_ROWS = 1050  # a table is saved only if it has MORE rows than this
_REFERENCE_YEAR = 2025  # date targets are turned into "years before" this


def _load_and_clean(table_name, target):
    """Load a table, drop its curated unnecessary columns and clean it."""
    table = load_table(table_name)

    # Remove unnecessary columns (raises KeyError if a listed column is absent)
    if table_name in unnecessary_columns:
        table = table.drop(columns=unnecessary_columns[table_name])

    # Follow preprocessing steps from TTB
    return ttb_preprocess(table, target)


def _save_task_table(table, target, table_name, task):
    """Write the linked and processed parquet files of one task table.

    Nothing is written (a message is printed instead) unless the table has
    more than ``_MIN_TABLE_ROWS`` rows.

    Args:
        table: cleaned table; must still contain the ``wikidata_id`` column.
        target: name of the target column.
        table_name: key of the table, used in the output file name.
        task: output sub-directory, ``"classification"`` or ``"regression"``.
    """
    if len(table) <= _MIN_TABLE_ROWS:
        print(f"Table {table_name} has insufficient rows for saving.")
        return

    # to_parquet does not create missing directories, so make sure both
    # output folders exist before writing.
    linked_dir = LINKED_DIR / task
    processed_dir = PROCESSED_DIR / task
    linked_dir.mkdir(parents=True, exist_ok=True)
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Linked table: first column, target and last column. load_table appends
    # ``wikidata_id`` last, so position -1 is expected to be that column
    # (assuming cleaning did not drop or reorder it).
    target_idx = table.columns.get_loc(target)
    linked_table = table.iloc[:, [0, target_idx, -1]]
    linked_table.to_parquet(linked_dir / f"wdbs_{table_name}.parquet", index=False)

    # Processed table: everything except the wikidata_id column.
    processed_table = table.drop(columns=["wikidata_id"])
    processed_table.to_parquet(
        processed_dir / f"wdbs_{table_name}.parquet", index=False
    )


def main():
    """Build the classification and regression datasets from WikiDBs tables.

    For every entry of ``classification_targets`` the table is loaded and
    cleaned, rare classes are removed (see below), the target is label-encoded
    and the result is saved. For every entry of ``regression_targets`` the
    table is loaded and cleaned, date targets are converted to numbers, the
    target is log10-transformed, rows with a non-finite target are removed and
    the result is saved.

    Output goes to ``LINKED_DIR/<task>/`` and ``PROCESSED_DIR/<task>/``; the
    directories are created when missing. Progress is reported with ``print``.
    """
    ## Process classification tasks
    print("Processing classification tasks")
    for table_name, target in classification_targets.items():
        print(f"    {table_name}")

        table = _load_and_clean(table_name, target)

        # Keep the classes that have at least _MIN_CLASS_COUNT occurrences
        counts = table[target].value_counts()
        classes_to_remove = counts[counts < _MIN_CLASS_COUNT].index
        # Keep at most _MAX_CLASSES classes. value_counts() sorts by
        # decreasing frequency, so this keeps the most populated ones.
        if len(counts) - len(classes_to_remove) > _MAX_CLASSES:
            classes_to_remove = counts.index[_MAX_CLASSES:]
        table = table[~table[target].isin(classes_to_remove)].reset_index(drop=True)

        print(f"        Number of rows: {len(table)}")
        print(f"        Number of columns: {len(table.columns)-1}")
        print(f"        Number of classes: {len(table[target].unique())}")
        print(f"        Most populated class: {table[target].value_counts().max()}")
        print(f"        Least populated class: {table[target].value_counts().min()}")

        # Encode classification targets as integer labels
        le = LabelEncoder()
        table[target] = le.fit_transform(table[target])

        _save_task_table(table, target, table_name, "classification")

    ## Process regression tasks
    print("Processing regression tasks")
    for table_name, target in regression_targets.items():
        print(f"    {table_name}")

        table = _load_and_clean(table_name, target)

        # Process date targets
        if table_name in date_regressions:
            dates = table[target].str.removeprefix("+")
            # A zero month/day cannot be parsed; map it to January 1st.
            dates = dates.str.replace(r"(-?\d{1,4})-00-00", r"\1-01-01", regex=True)
            years = dates.apply(date_to_fractional_year)
            # Express the date as a number of years before the reference year
            # so that the log transform below is defined for past dates.
            table[target] = _REFERENCE_YEAR - years

        # Rescale targets
        table[target] = np.log10(table[target].astype(float))

        # Remove rows with non-finite targets (unparsable dates, and values
        # for which log10 is undefined or infinite)
        table = table[np.isfinite(table[target])].reset_index(drop=True)

        print(f"        Number of rows: {len(table)}")
        print(f"        Number of columns: {len(table.columns)-1}")

        _save_task_table(table, target, table_name, "regression")


# Run the whole pipeline only when this file is executed as a script, so that
# importing the module has no side effects beyond the definitions above.
if __name__ == "__main__":
    main()
