from pathlib import Path
import pandas as pd
import numpy as np

from SEPAL.dataloader import DataLoader


if __name__ == "__main__":
    ## Load original data
    dir_path = Path(__file__).parent
    # Columns read from the raw CSV, mapped to the short names used below.
    # The "2023-06-30" column is the value that becomes the target.
    column_names = {
        "RegionName": "City",
        "StateName": "Code",
        "CountyName": "County",
        "2023-06-30": "target",
    }
    use_cols = list(column_names)
    df = pd.read_csv(
        dir_path / "City_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv",
        usecols=use_cols,
    )
    # read_csv ignores the order of `usecols` and returns the columns in file
    # order, so rename by name (not by position) and then fix the order.
    df = df.rename(columns=column_names)[list(column_names.values())]


    ## Format city names to match Yago nomenclature
    # The merge key is "Code"; the "State" column read just below is expected
    # to come from state_codes.csv.
    state_codes = pd.read_csv(dir_path / "state_codes.csv")
    df = df.merge(state_codes, on="Code")
    # Candidate entity name: "City,_State" with spaces turned into underscores
    df["col_to_embed"] = (df["City"] + ",_" + df["State"]).str.replace(" ", "_")


    ## Deal with unmatched entities
    # Get Yago4 entities
    yago_dir = Path(__file__).absolute().parents[2] / "knowledge_graphs"
    yago4_dl = DataLoader(yago_dir / "yago4_with_full_ontology")
    entity_list = list(yago4_dl.entity_to_idx.keys())

    """
    Note:

    The directly matched entities have various types, contained in this list:
        ['City', 'Corporation', 'AdministrativeArea',
        'Census-designated_place', 'Borough_(Pennsylvania)',
        'New_England_town', 'County_seat', 'Village_(United_States)',
        'Human_settlement', 'Village', 'Town', 'Town_(New_Jersey)',
        'Independent_city_(United_States)', 'Township_(New_Jersey)',
        'Borough_(New_Jersey)', 'Place', 'Gay_village', 'Hamlet_(place)',
        'Consolidated_city-county', 'College_town', 'Border_town',
        'Municipality', 'Administrative_divisions_of_New_York_(state)',
        'Home_Rule_Municipality_(Pennsylvania)', 'Island', 'Ghost_town',
        'Suburb', 'City_(New_Jersey)', 'Thing', 'Unincorporated_area',
        'Resort_town', 'Planned_community', 'Neighbourhood',
        'Township_(Pennsylvania)', 'Charter_township',
        'Geographical_feature', 'Borough', 'Historic_district',
        'TouristAttraction', 'Concentration_camp', 'Micronation',
        'Unorganized_territory', 'Barrier_island', 'Civil_township', 'Bay',
        'Electoral_district', 'Township_(United_States)', 'SkiResort',
        'Show_cave']

    Some entities have several types, e.g.
        Suitland-Silver_Hill,_Maryland	Thing
        Suitland-Silver_Hill,_Maryland	Census-designated_place
    """
    # Deal with special cases
    # `mask` flags the rows whose candidate name is absent from the entity list
    is_matched = df["col_to_embed"].isin(entity_list)
    mask = ~is_matched
    n0 = mask.sum()
    print("Initial number of unmatched entities: ", n0)

    # Hand-curated renames for names that have no direct match. Each key is a
    # "City,_State" name as built from the input data; each value is the name
    # that is looked up in the Yago4 entity list instead.
    exact_matches = { # 226 matches
        "New_York,_New_York": "New_York_City",
        "North_Bellmore,_New_York": "North_Bellmore_Q3825900",
        "Bellmore,_New_York": "Bellmore_Q17399831",
        "Waterloo,_New_York": "Waterloo,_New_York_(town)",
        "Lake_Mohegan,_New_York": "Mohegan_Lake,_New_York",
        "Bay_Wood,_New_York": "Baywood,_New_York",
        "Wheatly_Heights,_New_York": "Wheatley_Heights,_New_York",
        "Northville,_New_York": "Northville,_Fulton_County,_New_York",
        "Saint_Albans,_Vermont": "St._Albans_(city),_Vermont",
        "Slatyfork,_West_Virginia": "Slaty_Fork,_West_Virginia",
        "Alma,_West_Virginia": "Centerville,_Tyler_County,_West_Virginia",
        "Upper_Falls,_West_Virginia": "Tornado,_West_Virginia",
        "Renick,_West_Virginia": "Falling_Spring,_West_Virginia",
        "Westport,_Maine": "Westport_Island,_Maine",
        "Enosburg,_Vermont": "Enosburgh,_Vermont",
        "Mineral_Wells,_West_Virginia": "Mineralwells,_West_Virginia",
        "Mac_Arthur,_West_Virginia": "MacArthur,_West_Virginia",
        "Silver_Gate,_Montana": "Silver_Gate_Q9077387",
        "Minot_AFB,_North_Dakota": "Minot_Air_Force_Base",
        "Reiles_Acres,_North_Dakota": "Reile's_Acres,_North_Dakota",
        "Cuddebackville,_New_York": "Cuddebackville_Q71299603",
        "Canaedea,_New_York": "Caneadea,_New_York",
        "Preston_Hollow,_New_York": "Preston-Potter_Hollow,_New_York",
        "Dalton,_New_York": "Dalton_Q34654902",
        "Pottersville,_New_York": "Pottersville_Q49362128",
        "Diberville,_Mississippi": "D'Iberville,_Mississippi",
        "Robinsonville,_Mississippi": "Tunica_Resorts,_Mississippi",
        "Vernon_Rockville,_Connecticut": "Vernon,_Connecticut",
        "Bliss,_New_York": "Bliss_Q34644184",
        "Goldcreek,_Montana": "Gold_Creek_(Montana)",
        "Marriott-Slaterville_City,_Utah": "Marriott-Slaterville,_Utah",
        "Helena_-_West_Helena,_Arkansas": "Helena–West_Helena,_Arkansas",
        "Parkers-Iron_Springs,_Arkansas": "Landmark,_Arkansas",
        "Kingston,_Arkansas": "Kingston,_Madison_County,_Arkansas",
        "Jewell,_Iowa": "Jewell_Junction,_Iowa",
        "Groveoak,_Alabama": "Grove_Oak,_Alabama",
        "Sutton,_Alaska": "Sutton-Alpine,_Alaska",
        "Barrow,_Alaska": "Utqiagvik,_Alaska",
        "Kachemak_City,_Alaska": "Kachemak,_Alaska",
        "Katskill_Bay,_New_York": "Kattskill_Bay,_New_York",
        "Lake_Purdy,_Alabama": "Brook_Highland,_Alabama",
        "Laceys_Spring,_Alabama": "Lacey's_Spring,_Alabama",
        "Jacksons_Gap,_Alabama": "Jackson's_Gap,_Alabama",
        "La_Canada_Flintridge,_California": "La_Cañada_Flintridge,_California",
        "Highland_Lake,_New_Jersey": "Highland_Lakes,_New_Jersey",
        "View_Park-Windsor_Hills,_California": "View_Park–Windsor_Hills,_California",
        "San_Miguel,_California": "San_Miguel,_San_Luis_Obispo_County,_California",
        "Pinon_Hills,_California": "Piñon_Hills,_California",
        "White_Water,_California": "Whitewater,_California",
        "West_Compton,_California": "West_Rancho_Dominguez,_California",
        "Springfield,_New_Jersey": "Springfield_Township,_Union_County,_New_Jersey",
        "Mi_Wuk_Village,_California": "Mi-Wuk_Village,_California",
        "The_Sea_Ranch,_California": "Sea_Ranch,_California",
        "Jacumba,_California": "Jacumba_Hot_Springs,_California",
        "Desoto,_Texas": "DeSoto,_Texas",
        "Mt_Holly_Township,_New_Jersey": "Mount_Holly,_New_Jersey",
        "Lebanon_Borough,_New_Jersey": "Lebanon,_New_Jersey",
        "Township_of_Washington,_New_Jersey": "Washington_Township,_Bergen_County,_New_Jersey",
        "Fairfield,_New_Jersey": "Fairfield_Township,_Essex_County,_New_Jersey",
        "Penitas,_Texas": "Peñitas,_Texas",
        "Anderson_Mill,_Texas": "Anderson_Mill,_Austin,_Texas",
        "Washington,_Texas": "Washington-on-the-Brazos,_Texas",
        "Leesville,_South_Carolina": "Batesburg-Leesville,_South_Carolina",
        "Sullivans_Island,_South_Carolina": "Sullivan's_Island,_South_Carolina",
        "Longcreek,_South_Carolina": "Long_Creek,_South_Carolina",
        "Mt_Laurel_Township,_New_Jersey": "Mount_Laurel,_New_Jersey",
        "Saint_Paul,_Texas": "St._Paul,_Collin_County,_Texas",
        "Little_River_Academy,_Texas": "Little_River-Academy,_Texas",
        "Morgans_Point_Resort,_Texas": "Morgan's_Point_Resort,_Texas",
        "Riomedina,_Texas": "Rio_Medina,_Texas",
        "Deridder,_Louisiana": "DeRidder,_Louisiana",
        "Dequincy,_Louisiana": "DeQuincy,_Louisiana",
        "Lincoln,_Kansas": "Lincoln_Center,_Kansas",
        "D_Hanis,_Texas": "D'Hanis,_Texas",
        "Morgans_Point,_Texas": "Morgan's_Point,_Texas",
        "Dekalb,_Illinois": "DeKalb,_Illinois",
        "Hoolehua,_Hawaii": "Hoʻolehua,_Hawaii",
        "Papaaloa,_Hawaii": "Pāpaʻaloa,_Hawaii",
        "Kaaawa,_Hawaii": "Kaʻaʻawa,_Hawaii",
        "Ninole,_Hawaii": "Nīnole,_Hawaii",
        "Ookala,_Hawaii": "ʻŌʻōkala,_Hawaii",
        "Forestview,_Illinois": "Forest_View,_Illinois",
        "Cave_in_Rock,_Illinois": "Cave-In-Rock,_Illinois",
        "Newtown_Square,_Pennsylvania": "Newtown_Township,_Delaware_County,_Pennsylvania",
        "Ewa_Beach,_Hawaii": "ʻEwa_Beach,_Hawaii",
        "Waimanalo,_Hawaii": "Waimānalo,_Hawaii",
        "Hauula,_Hawaii": "Hauʻula,_Hawaii",
        "Ocean_View,_Hawaii": "Hawaiian_Ocean_View,_Hawaii",
        "Laupahoehoe,_Hawaii": "Laupāhoehoe,_Hawaii",
        "Abington,_Pennsylvania": "Abington_Township,_Montgomery_County,_Pennsylvania",
        "Etters,_Pennsylvania": "Goldsboro,_Pennsylvania",
        "Susquehanna,_Pennsylvania": "Susquehanna_Depot,_Pennsylvania",
        "Sugarloaf,_Pennsylvania": "Sugarloaf_Township,_Luzerne_County,_Pennsylvania",
        "Howardsville,_Virginia": "Howardsville,_Albemarle_County,_Virginia",
        "Swan_River,_Minnesota": "Swan_River,_Itasca_County,_Minnesota",
        "McKinley,_Minnesota": "McKinley,_St._Louis_County,_Minnesota",
        "Pulaski,_Pennsylvania": "Pulaski_Township,_Lawrence_County,_Pennsylvania",
        "Mapleton_Depot,_Pennsylvania": "Mapleton,_Pennsylvania",
        "Bradfordwoods,_Pennsylvania": "Bradford_Woods,_Pennsylvania",
        "Gladehill,_Virginia": "Glade_Hill,_Virginia",
        "Dewitt,_Virginia": "DeWitt,_Virginia",
        "Green_Bay,_Virginia": "Green_Bay,_Prince_Edward_County,_Virginia",
        "Gumspring,_Virginia": "Gum_Spring,_Virginia",
        "West_Finley,_Pennsylvania": "West_Finley_Township,_Washington_County,_Pennsylvania",
        "Zieglersville,_Pennsylvania": "Zieglerville,_Pennsylvania",
        "Portage_des_Sioux,_Missouri": "Portage_Des_Sioux,_Missouri",
        "Arlington,_Virginia": "Arlington_County,_Virginia",
        "Spotsylvania,_Virginia": "Spotsylvania_Courthouse,_Virginia",
        "Dewey-Humboldt,_Arizona": "Dewey–Humboldt,_Arizona",
        "Lake_Worth,_Florida": "Lake_Worth_Beach,_Florida",
        "Land_O_Lakes,_Florida": "Land_O'_Lakes,_Florida",
        "Bois_D_Arc,_Missouri": "Bois_D'Arc,_Missouri",
        "Velda_Village,_Missouri": "Velda_Village_Hills,_Missouri",
        "Merriam_Woods_Village,_Missouri": "Merriam_Woods,_Missouri",
        "South_West_City,_Missouri": "Southwest_City,_Missouri",
        "Cedarcreek,_Missouri": "Cedar_Creek,_Missouri",
        "Deland,_Florida": "DeLand,_Florida",
        "Fountainbleau,_Florida": "Fontainebleau,_Florida",
        "Hallandale,_Florida": "Hallandale_Beach,_Florida",
        "Greater_Carrollwood,_Florida": "Carrollwood_(CDP),_Florida",
        "Saint_Louis,_Missouri": "St._Louis",
        "Lees_Summit,_Missouri": "Lee's_Summit,_Missouri",
        "Sainte_Genevieve,_Missouri": "Ste._Genevieve,_Missouri",
        "Riverview,_Missouri": "Riverview,_St._Louis_County,_Missouri",
        "Defuniak_Springs,_Florida": "DeFuniak_Springs,_Florida",
        "Greater_Northdale,_Florida": "Northdale,_Florida",
        "Debary,_Florida": "DeBary,_Florida",
        "Ponte_Vedra,_Florida": "Ponte_Vedra_Beach,_Florida",
        "Doctor_Phillips,_Florida": "Dr._Phillips,_Florida",
        "Land_O_Lakes,_Wisconsin": "Land_O'_Lakes,_Wisconsin",
        "High_Bridge,_Wisconsin": "Highbridge,_Wisconsin",
        "Okauchee,_Wisconsin": "Okauchee_Lake,_Wisconsin",
        "Pryor,_Oklahoma": "Pryor_Creek,_Oklahoma",
        "Breckinridge,_Oklahoma": "Breckenridge,_Oklahoma",
        "Westwood_Lake,_Florida": "Westwood_Lakes,_Florida",
        "Westgate-Belvedere_Homes,_Florida": "Westgate,_Florida",
        "Asbury_Lake,_Florida": "Lake_Asbury,_Florida",
        "Fontana,_Wisconsin": "Fontana-on-Geneva_Lake,_Wisconsin",
        "Sheldon,_Wisconsin": "Sheldon,_Rusk_County,_Wisconsin",
        "Wilson,_Wisconsin": "Wilson,_St._Croix_County,_Wisconsin",
        "Sewalls_Point,_Florida": "Sewall's_Point,_Florida",
        "Manattee_Road,_Florida": "Manatee_Road,_Florida",
        "Saint_Lucie,_Florida": "St._Lucie_Village,_Florida",
        "Willow_Spring,_North_Carolina": "Willow_Springs,_North_Carolina",
        "Midland,_Georgia": "Midland,_Columbus,_Georgia",
        "Marble_Hill,_Georgia": "Marblehill,_Georgia",
        "Cherrylog,_Georgia": "Cherry_Log,_Georgia",
        "Salem,_Wisconsin": "Salem_(community),_Kenosha_County,_Wisconsin",
        "Longview,_North_Carolina": "Long_View,_North_Carolina",
        "Union_Grove,_North_Carolina": "Union_Grove_Township,_Iredell_County,_North_Carolina",
        "Cajahs_Mountain,_North_Carolina": "Cajah's_Mountain,_North_Carolina",
        "Saint_Simons_Island,_Georgia": "St._Simons,_Georgia",
        "Wilsons_Mills,_North_Carolina": "Wilson's_Mills,_North_Carolina",
        "Boomer,_North_Carolina": "Boomer_Township,_Wilkes_County,_North_Carolina",
        "Swanquarter,_North_Carolina": "Swan_Quarter,_North_Carolina",
        "Shiloh,_North_Carolina": "Shiloh_Township_Q6031630",
        "Tall_Timbers,_Maryland": "Tall_Timbers,_St._Mary's_County,_Maryland",
        "Tilghman,_Maryland": "Tilghman_Island,_Maryland",
        "O''_Neill,_Nebraska": "O'Neill,_Nebraska",
        "Stacy,_North_Carolina": "Stacy_Township_Q6031827",
        "Sealevel,_North_Carolina": "Sea_Level,_North_Carolina",
        "Jaars,_North_Carolina": "JAARS,_North_Carolina",
        "Belcamp,_Maryland": "Riverside,_Harford_County,_Maryland",
        "Cascade,_Maryland": "Highfield-Cascade,_Maryland",
        "Springville,_Indiana": "Springville,_Lawrence_County,_Indiana",
        "Laotto,_Indiana": "LaOtto,_Indiana",
        "Westpoint,_Indiana": "West_Point,_Indiana",
        "Pines,_Indiana": "Town_of_Pines,_Indiana",
        "The_Village_of_Indian_Hill,_Ohio": "Indian_Hill,_Ohio",
        "Pena_Blanca,_New_Mexico": "Peña_Blanca,_New_Mexico",
        "Linthicum_Heights,_Maryland": "Linthicum,_Maryland",
        "Riverdale,_Maryland": "Riverdale_Park,_Maryland",
        "Woodlawn,_Maryland": "Woodlawn,_Prince_George's_County,_Maryland",
        "Rootstown,_Ohio": "Rootstown_Township,_Portage_County,_Ohio",
        "Newbury,_Ohio": "Newbury_Center,_Ohio",
        "Pleasant_Run_Farm,_Ohio": "Pleasant_Run_Farms,_Ohio",
        "Deerfield,_Ohio": "Deerfield_Township,_Portage_County,_Ohio",
        "Wayne_Lakes_Park,_Ohio": "Wayne_Lakes,_Ohio",
        "Espanola,_New_Mexico": "Española,_New_Mexico",
        "Penasco,_New_Mexico": "Peñasco,_New_Mexico",
        "Abiquiu,_New_Mexico": "Abiquiú,_New_Mexico",
        "Cerrillos,_New_Mexico": "Los_Cerrillos,_New_Mexico",
        "Hagerhill,_Kentucky": "Hager_Hill,_Kentucky",
        "Flatgap,_Kentucky": "Flat_Gap,_Kentucky",
        "Demossville,_Kentucky": "DeMossville,_Kentucky",
        "Yeaddiss,_Kentucky": "Yeaddis,_Kentucky",
        "Mammoth_Cave,_Kentucky": "Mammoth_Cave_National_Park",
        "Regina,_Kentucky": "Marrowbone,_Pike_County,_Kentucky",
        "Shelby_Gap,_Kentucky": "Shelby_Gap_Railroad_Station,_Kentucky",
        "Canon_City,_Colorado": "Cañon_City,_Colorado",
        "Coal_Creek,_Colorado": "Coal_Creek,_Boulder_County,_Colorado",
        "Mt_Chase_Plantation,_Maine": "Mount_Chase,_Maine",
        "Saint_Marys,_Colorado": "St._Mary's,_Colorado",
        "Seatac,_Washington": "SeaTac,_Washington",
        "Dupont,_Washington": "DuPont,_Washington",
        "Garden_Home-Whitford,_Oregon": "Garden_Home–Whitford,_Oregon",
        "Mount_Angel,_Oregon": "Mt._Angel,_Oregon",
        "Lafollette,_Tennessee": "LaFollette,_Tennessee",
        "Thompsons_Station,_Tennessee": "Thompson's_Station,_Tennessee",
        "Lake_City,_Tennessee": "Rocky_Top,_Tennessee",
        "Grandview,_Tennessee": "Grandview,_Rhea_County,_Tennessee",
        "Washington,_District_of_Columbia": "Washington,_D.C.",
        "Dukedom,_Tennessee": "Dukedom,_Kentucky_and_Tennessee",
        "Beaver_Cove_Plantation,_Maine": "Beaver_Cove,_Maine",
        "Dewitt,_Michigan": "DeWitt,_Michigan",
        "Sault_Sainte_Marie,_Michigan": "Sault_Ste._Marie,_Michigan",
        "Gunplain,_Michigan": "Gun_Plain_Township,_Michigan",
        "Watertown,_Michigan": "Watertown_Charter_Township,_Clinton_County,_Michigan",
        "Plainfield,_Michigan": "Plainfield_Township,_Kent_County,_Michigan",
        "L''_Anse,_Michigan": "L'Anse,_Michigan",
        "Canyon_Dam,_California": "Canyondam,_California",
        "Lincoln_University,_Pennsylvania": "Lincoln_University,_Pennsylvania_(CDP)",
        "James_Creek,_Pennsylvania": "Marklesburg,_Pennsylvania",
        "Chatham,_New_Jersey": "Chatham_Borough,_New_Jersey",
        "Labelle,_Florida": "LaBelle,_Florida",
        "Silverlake,_Washington": "Silver_Lake,_Washington",
        "Duck_River,_Tennessee": "Shady_Grove,_Hickman_County,_Tennessee",
        "Clinton,_Michigan": "Clinton_Township,_Macomb_County,_Michigan",
        "Orchard_Lake,_Michigan": "Orchard_Lake_Village,_Michigan",
        "Whitehall,_Maryland": "White_Hall,_Baltimore_County,_Maryland",
        "Clinton,_Wisconsin": "Clinton_(town),_Rock_County,_Wisconsin",
        "Long_Lake,_Wisconsin": "Long_Lake,_Florence_County,_Wisconsin",
        "Cross_Lake,_Minnesota": "Crosslake,_Minnesota",
        "Iron,_Minnesota": "Iron_Junction,_Minnesota",
        "Bass_River,_New_Jersey": "Bass_River_Township,_New_Jersey",
        "Dixie,_West_Virginia": "Dixie,_Nicholas_County,_West_Virginia",
        "Peru,_Iowa": "Old_Peru,_Iowa",
    }

    # Rename the still-unmatched names listed in the table; names without an
    # entry are left unchanged.
    unmatched_names = df.loc[mask, "col_to_embed"]
    df.loc[mask, "col_to_embed"] = unmatched_names.map(exact_matches).fillna(unmatched_names)
    is_matched = df["col_to_embed"].isin(entity_list)
    mask = ~is_matched
    n1 = mask.sum()
    print(n0 - n1, "additional matches")

    # Hand-curated approximate renames, applied to the names that are still
    # unmatched. Several input names may share the same replacement.
    # Each key appears only once: the literal used to repeat
    # "Skanee,_Michigan" and "Bailey,_Michigan" with space-separated values
    # that the later, underscore-separated entries silently overrode.
    approximate_matches = { # 122 extra matches
        "Fort_Jackson,_New_York": "Hopkinton,_New_York",
        "West_Henrietta,_New_York": "Henrietta,_New_York",
        "Setauket,_New_York": "Setauket-East_Setauket,_New_York",
        "Campbell_Hall,_New_York": "Hamptonburgh,_New_York",
        "Kirkville,_New_York": "Manlius,_New_York",
        "Port_Crane,_New_York": "Fenton,_New_York",
        "Harpursville,_New_York": "Colesville,_New_York",
        "Rock_Tavern,_New_York": "New_Windsor,_New_York",
        "Salisbury_Mills,_New_York": "Beaverdam_Lake–Salisbury_Mills,_New_York",
        "Winthrop,_New_York": "Brasher_Falls–Winthrop,_New_York",
        "Brasher_Falls,_New_York": "Brasher_Falls–Winthrop,_New_York",
        "Pattersonville,_New_York": "Pattersonville-Rotterdam_Junction,_New_York",
        "Rotterdam_Junction,_New_York": "Pattersonville-Rotterdam_Junction,_New_York",
        "Dewittville,_New_York": "Chautauqua,_New_York",
        "North_Lawrence,_New_York": "Lawrence,_St._Lawrence_County,_New_York",
        "Mooers_Forks,_New_York": "Mooers,_New_York",
        "Erieville,_New_York": "Nelson,_New_York",
        "Bonner,_Montana": "Bonner-West_Riverside,_Montana",
        "Whitesville,_New_York": "Independence,_New_York",
        "Athol,_New_York": "Thurman,_New_York",
        "Ellington,_Connecticut": "Ellington_Center_Historic_District",
        "Comstock,_New_York": "Fort_Ann,_New_York",
        "Alfred_Station,_New_York": "Alfred,_New_York",
        "Pomfret_Center,_Connecticut": "Pomfret,_Connecticut",
        "Lockwood,_New_York": "Barton,_New_York",
        "Swain,_New_York": "Grove,_New_York",
        "Merrill,_New_York": "Ellenburg,_New_York",
        "Huguenot,_New_York": "Deerpark,_New_York",
        "Godeffroy,_New_York": "Deerpark,_New_York",
        "Bouckville,_New_York": "Madison,_New_York",
        "Nicholville,_New_York": "Lawrence,_St._Lawrence_County,_New_York",
        "Greenhurst,_New_York": "Ellery,_New_York",
        "Henderson_Harbor,_New_York": "Henderson,_New_York",
        "Idyllwild,_California": "Idyllwild–Pine_Cove,_California",
        "Marietta,_South_Carolina": "Slater-Marietta,_South_Carolina",
        "Goodlow_Park,_Texas": "Goodlow,_Texas",
        "Lakeside,_Arizona": "Pinetop-Lakeside,_Arizona",
        "Pinetop,_Arizona": "Pinetop-Lakeside,_Arizona",
        "Heber,_Arizona": "Heber-Overgaard,_Arizona",
        "Overgaard,_Arizona": "Heber-Overgaard,_Arizona",
        "Issue,_Maryland": "Swan_Point,_Maryland",
        "Sparks_Glencoe,_Maryland": "Sparks,_Maryland",
        "Chippewa_Lake,_Michigan": "Chippewa_Township,_Mecosta_County,_Michigan",
        "Black_River,_Michigan": "Alcona_Township,_Michigan",
        "Goetzville,_Michigan": "Raber_Township,_Michigan",
        "Perronville,_Michigan": "Harris_Township,_Michigan",
        "Ruth,_Michigan": "Sherman_Township,_Huron_County,_Michigan",
        "Palms,_Michigan": "Minden_Township,_Michigan",
        "Filion,_Michigan": "Lincoln_Township,_Huron_County,_Michigan",
        "Eckerman,_Michigan": "Chippewa_Township,_Chippewa_County,_Michigan",
        "Somerset_Center,_Michigan": "Somerset_Township,_Michigan",
        "Port_Hadlock,_Washington": "Port_Hadlock-Irondale,_Washington",
        "Gulliver,_Michigan": "Doyle_Township,_Michigan",
        "Bannister,_Michigan": "Elba_Township,_Gratiot_County,_Michigan",
        "Greenbush,_Michigan": "Greenbush_Township,_Alcona_County,_Michigan",
        "Hawks,_Michigan": "Bismarck_Township,_Michigan",
        "Skanee,_Michigan": "Arvon_Township,_Michigan",
        "Saginaw,_Michigan": "Saginaw_County,_Michigan",
        "Washington,_Michigan": "Washington_Township,_Macomb_County,_Michigan",
        "Oakland,_Michigan": "Oakland_County,_Michigan",
        "Shingleton,_Michigan": "Munising_Township,_Michigan",
        "Decker,_Michigan": "Lamotte_Township,_Michigan",
        "Middleton,_Michigan": "Fulton_Township,_Michigan",
        "South_Branch,_Michigan": "Goodar_Township,_Michigan",
        "Superior,_Michigan": "Superior_Township,_Washtenaw_County,_Michigan",
        "Belmont,_Michigan": "Plainfield_Township,_Kent_County,_Michigan",
        "Smiths_Creek,_Michigan": "Kimball_Township,_Michigan",
        "West_Olive,_Michigan": "Olive_Township,_Ottawa_County,_Michigan",
        "Hamilton,_Michigan": "Heath_Township,_Michigan",
        "New_Hudson,_Michigan": "Lyon_Township,_Oakland_County,_Michigan",
        "North_Street,_Michigan": "Clyde_Township,_St._Clair_County,_Michigan",
        "Columbus,_Michigan": "Columbus_Township,_St._Clair_County,_Michigan",
        "Fair_Haven,_Michigan": "Ira_Township,_Michigan",
        "Casco,_Michigan": "Casco_Township,_St._Clair_County,_Michigan",
        "Grand_Junction,_Michigan": "Columbia_Township,_Van_Buren_County,_Michigan",
        "Avoca,_Michigan": "Kenockee_Township,_Michigan",
        "Shelbyville,_Michigan": "Wayland_Township,_Michigan",
        "Gowen,_Michigan": "Montcalm_Township,_Michigan",
        "Hale,_Michigan": "Plainfield_Township,_Iosco_County,_Michigan",
        "Remus,_Michigan": "Wheatland_Township,_Mecosta_County,_Michigan",
        "Goodells,_Michigan": "Wales_Township,_Michigan",
        "Allenton,_Michigan": "Berlin_Township,_St._Clair_County,_Michigan",
        "Vulcan,_Michigan": "Norway_Township,_Michigan",
        "Osseo,_Michigan": "Jefferson_Township,_Hillsdale_County,_Michigan",
        "Conklin,_Michigan": "Chester_Township,_Ottawa_County,_Michigan",
        "Rives_Junction,_Michigan": "Rives_Township,_Michigan",
        "Fenwick,_Michigan": "Fairplain_Township,_Michigan",
        "Alger,_Michigan": "Moffatt_Township,_Michigan",
        "Manitou_Beach,_Michigan": "Manitou_Beach–Devils_Lake,_Michigan",
        "Jones,_Michigan": "Newberg_Township,_Michigan",
        "Clarklake,_Michigan": "Columbia_Township,_Jackson_County,_Michigan",
        "Vestaburg,_Michigan": "Richland_Township,_Montcalm_County,_Michigan",
        "Kewadin,_Michigan": "Milton_Township,_Antrim_County,_Michigan",
        "Lachine,_Michigan": "Long_Rapids_Township,_Michigan",
        "Berrien_Center,_Michigan": "Berrien_Township,_Michigan",
        "Jeddo,_Michigan": "Grant_Township,_St._Clair_County,_Michigan",
        "Rodney,_Michigan": "Colfax_Township,_Mecosta_County,_Michigan",
        "Irons,_Michigan": "Eden_Township,_Lake_County,_Michigan",
        "Sawyer,_Michigan": "Shorewood–Tower_Hills–Harbert,_Michigan",
        "Wetmore,_Michigan": "Munising_Township,_Michigan",
        "Elwell,_Michigan": "Seville_Township,_Michigan",
        "Riverdale,_Michigan": "Seville_Township,_Michigan",
        "Wallace,_Michigan": "Mellen_Township,_Michigan",
        "National_City,_Michigan": "Sherman_Township,_Iosco_County,_Michigan",
        "Bailey,_Michigan": "Casnovia_Township,_Michigan",
        "Trufant,_Michigan": "Maple_Valley_Township,_Montcalm_County,_Michigan",
        "Buskirk,_New_York": "Hoosick,_New_York",
        "Champion,_Pennsylvania": "Seven_Springs,_Pennsylvania",
        "Crescent_Lake,_Oregon": "Crescent_Lake_Junction,_Oregon",
        "Brimley,_Michigan": "Superior_Township,_Chippewa_County,_Michigan",
        "Blanchard,_Michigan": "Rolland_Township,_Michigan",
        "Atlantic_Mine,_Michigan": "Adams_Township,_Houghton_County,_Michigan",
        "Hope,_Michigan": "Hope_Township,_Midland_County,_Michigan",
        "Glennie,_Michigan": "Curtis_Township,_Michigan",
        "Coral,_Michigan": "Maple_Valley_Township,_Montcalm_County,_Michigan",
        "Six_Lakes,_Michigan": "Belvidere_Township,_Michigan",
        "Sears,_Michigan": "Orient_Township,_Michigan",
        "Spruce,_Michigan": "Caledonia_Township,_Alcona_County,_Michigan",
        "Trout_Creek,_Michigan": "Interior_Township,_Michigan",
        "Harbert,_Michigan": "Shorewood–Tower_Hills–Harbert,_Michigan",
        "Barbeau,_Michigan": "Bruce_Township,_Chippewa_County,_Michigan",
        "Hessel,_Michigan": "Clark_Township,_Michigan",
    }

    # Rename the still-unmatched names listed in the table; names without an
    # entry are left unchanged.
    unmatched_names = df.loc[mask, "col_to_embed"]
    df.loc[mask, "col_to_embed"] = unmatched_names.map(approximate_matches).fillna(unmatched_names)
    mask = ~df["col_to_embed"].isin(entity_list)
    n0, n1 = n1, mask.sum()
    print(n0 - n1, "additional matches")



    # String-level morphological variations (--> 191 extra matches)
    # Substring substitutions, applied one after the other in this exact
    # order to the names that are still unmatched.
    morphological_fixes = [
        ("Saint_", "St._"),
        ("Lagrange", "LaGrange"),
        ("Lafayette", "LaFayette"),
        ("_Alaska", "_Anchorage"),
        ("St_", "St._"),
        ("O_", "O'"),
        ("Mt_", "Mount_"),
        ("Mc_", "Mc"),
        ("Le_", "Le"),
        ("La_", "La"),
        ("De_", "De"),
        ("boro", "borough"),
    ]
    unmatched_names = df.loc[mask, "col_to_embed"]
    for old, new in morphological_fixes:
        unmatched_names = unmatched_names.str.replace(old, new)
    df.loc[mask, "col_to_embed"] = unmatched_names

    is_matched = df["col_to_embed"].isin(entity_list)
    mask = ~is_matched
    n0, n1 = n1, mask.sum()
    print(n0 - n1, "additional matches")

    
    # Fallback name formats, tried in order on the rows that are still
    # unmatched. Every format is rebuilt from the City / County / State
    # columns, so a failed attempt leaves nothing behind: the next format
    # simply overwrites it. The counts in the comments are the numbers of
    # extra matches recorded for each step.
    city, county, state = df["City"], df["County"], df["State"]

    def city_state(separator):
        """Return City + separator + State with spaces turned into underscores."""
        return (city + separator + state).str.replace(" ", "_")

    default_names = city_state(",_")
    state_part = state.str.replace(" ", "_")

    fallback_formats = [
        # Add counties (--> 289 extra matches)
        (city + ",_" + county + ",_" + state).str.replace(" ", "_"),
        # Add '_(city)' (--> 10 extra matches)
        city_state("_(city),_"),
        # Add '_(town)' (--> 27 extra matches)
        city_state("_(town),_"),
        # Format City_(State) (--> 5 extra matches)
        (city + "_(" + state + ")").str.replace(" ", "_"),
        # Add '_(CDP)' (--> 12 extra matches)
        city_state("_(CDP),_"),
        # Change " " for "-" in city names (--> 14 extra matches)
        city.str.replace(" ", "-") + ",_" + state_part,
        # Change "-" for "_" in city names (--> 3 extra matches)
        # NOTE(review): spaces in the city name are left untouched by this
        # step - confirm whether that is intended.
        city.str.replace("-", "_") + ",_" + state_part,
        # Add '_(village)' (--> 5 extra matches)
        city_state("_(village),_"),
        # Add '_City' (--> 14 extra matches)
        city_state("_City,_"),
        # Add '_Village' (--> 8 extra matches)
        city_state("_Village,_"),
        # Add '_Charter_Township' (--> 8 extra matches)
        city_state("_Charter_Township,_"),
        # Add '_Township' (--> 68 extra matches)
        city_state("_Township,_"),
        # Remove "_Township" (--> 12 extra matches)
        default_names.str.replace("_Township", ""),
        # Remove "Village_of_" (--> 5 extra matches)
        default_names.str.replace("Village_of_", ""),
        # Keep only the city names (--> 49 extra matches)
        city.str.replace(" ", "_"),
    ]

    for candidate_names in fallback_formats:
        # Assignment aligns on the index, so only the unmatched rows change
        df.loc[mask, "col_to_embed"] = candidate_names
        mask = ~df["col_to_embed"].isin(entity_list)
        n0, n1 = n1, mask.sum()
        print(n0 - n1, "additional matches")
    print("Final number of unmatched entities: ", n1)


    # Restore original format for non-matches
    df.loc[mask, "col_to_embed"] = default_names


    # Correct mistakes
    # Rows whose current name is one of these bare names are reset to the
    # default "City,_State" format.
    mistakes = ["Smyrna", "Mears"]
    mask = df["col_to_embed"].isin(mistakes)
    default_names = (df["City"] + ",_" + df["State"]).str.replace(" ", "_")
    df.loc[mask, "col_to_embed"] = default_names


    """
    Note:

    The matched entities have the following types:
        ['City', 'Corporation', 'AdministrativeArea',
        'Census-designated_place', 'Borough_(Pennsylvania)',
        'New_England_town', 'Village_(United_States)', 'County_seat',
        'Human_settlement', 'Village', 'Township_(New_Jersey)', 'Town',
        'Town_(New_Jersey)', 'County_town',
        'Independent_city_(United_States)', 'Borough_(New_Jersey)',
        'Place', 'Gay_village', 'Hamlet_(place)',
        'Consolidated_city-county', 'College_town', 'Border_town',
        'Municipality', 'Township_(Pennsylvania)', 'Charter_township',
        'Administrative_divisions_of_New_York_(state)',
        'Township_(United_States)', 'Island', 'Air_base',
        'Home_Rule_Municipality_(Pennsylvania)', 'TouristAttraction',
        'Region', 'Ghost_town', 'Suburb', 'City_(New_Jersey)', 'Peninsula',
        'Thing', 'Unincorporated_area', 'River_island', 'Resort_town',
        'Planned_community', 'Neighbourhood', 'Geographical_feature',
        'Borough', 'Capital_city', 'Historic_district',
        'Concentration_camp', 'Micronation', 'Unorganized_territory',
        'Barrier_island', 'Civil_township',
        'Political_divisions_of_the_United_States', 'Country',
        'Sovereign_state', 'Bay', 'Electoral_district', 'Civil_parish',
        'City-state', 'SkiResort', 'Comune', 'Ranchos_of_California',
        'Show_cave', 'Biosphere_reserve', 'Park']
    """


    # Account for specificities of Yago3 and Yago4
    # The names resolved so far are used unchanged for Yago4
    df["yago4_col_to_embed"] = df["col_to_embed"]

    # Names ending in a Q-number suffix and the names used for Yago3 instead
    yago4to3 = dict([
        ("North_Bellmore_Q3825900", "North_Bellmore,_New_York"),
        ("Bellmore_Q17399831", "Bellmore,_New_York"),
        ("Silver_Gate_Q9077387", "Silver_Gate,_Montana"),
        ("Cuddebackville_Q71299603", "Cuddebackville,_New_York"),
        ("Dalton_Q34654902", "Dalton,_New_York"),
        ("Pottersville_Q49362128", "Pottersville,_New_York"),
        ("Bliss_Q34644184", "Bliss,_New_York"),
        ("Shiloh_Township_Q6031630", "Shiloh,_North_Carolina"),
        ("Stacy_Township_Q6031827", "Stacy,_North_Carolina"),
    ])

    # Names without an entry in the table are kept as they are
    yago3_names = df["col_to_embed"].map(yago4to3)
    df["yago3_col_to_embed"] = yago3_names.fillna(df["col_to_embed"])
    # Note: For Yago3, manual processing decreases the number of unmatched entities from 9311 to 1724
    
    ## Final preprocessing steps
    # Add a column with raw entity names: City + State abbreviation
    df["raw_entities"] = df["City"] + ", " + df["Code"]

    # Keep only relevant columns. The explicit copy makes the result an
    # independent frame, so the assignment to "target" below does not trigger
    # pandas' SettingWithCopyWarning.
    df = df[["raw_entities", "yago3_col_to_embed", "yago4_col_to_embed", "target"]].copy()

    # Replace target with its base-10 logarithm
    df["target"] = np.log10(df["target"])

    # Remove rows whose target is missing
    df = df.dropna(subset=["target"])

    ## Save dataframe
    df.to_parquet(dir_path / "target_log.parquet", index=False)
