from pathlib import Path
import pandas as pd
import numpy as np

from SEPAL.dataloader import DataLoader


if __name__ == "__main__":
    ## Load original data
    # Only the columns needed to locate each accident are read from the CSV.
    dir_path = Path(__file__).parent
    use_cols = ["ID", "City", "State", "County"]
    df = pd.read_csv(dir_path / "US_Accidents_March23.csv", usecols=use_cols)
    # The raw "State" column is renamed to "Code" because it is the merge key
    # used below against state_codes.csv.
    df = df.rename(columns={"State": "Code"})

    ## Format city names to match Yago nomenclature
    # Get state names
    # NOTE(review): state_codes.csv is expected to provide "Code" and "State"
    # columns ("State" is read below as the state name) -- confirm.
    state_codes = pd.read_csv(dir_path / "state_codes.csv")
    df = df.merge(state_codes, on="Code")

    # Remove accidents that did not occur in a city.
    # The explicit copy makes the column assignments below operate on an
    # independent frame instead of a filtered slice (avoids pandas'
    # SettingWithCopyWarning).
    df = df[df["City"].notna()].copy()

    # Format County and City names.
    # regex=False: every pattern in this block is a literal string, so the
    # result must not depend on the pandas-version-specific default of `regex`.
    df["City"] = (
        df["City"]
        .str.replace(" Twp", " Township", regex=False)
        .str.replace(" township", " Township", regex=False)
    )
    df["County"] = df["County"] + " County"

    # Build col_to_embed: "<City>,_<State>" with spaces turned into underscores
    df["col_to_embed"] = (df["City"] + ",_" + df["State"]).str.replace(" ", "_", regex=False)

    ## Group accidents by cities and make target
    # After the count, "ID" holds the number of accident IDs per group; it is
    # renamed to "target" below.
    # NOTE(review): groupby drops rows whose key is NaN by default, so rows
    # with a missing County would be excluded here -- confirm this is intended.
    df = df.groupby(["col_to_embed", "City", "Code", "State", "County"], as_index=False).count()
    df["raw_entities"] = df["City"] + ", " + df["Code"]
    df = df.rename(columns={"ID": "target"})
    df = df[["City", "County", "State", "raw_entities", "col_to_embed", "target"]]



    ## Deal with unmatched entities
    # Get Yago4 entities.
    # The knowledge graphs are looked up in a "knowledge_graphs" folder located
    # two directories above the folder that contains this script.
    yago_dir = Path(__file__).absolute().parents[2] / "knowledge_graphs"
    yago4_dl = DataLoader(yago_dir / "yago4_with_full_ontology")
    # Every entity name known to the loader; used for all membership checks
    entity_list = [*yago4_dl.entity_to_idx.keys()]

    yago4_types = pd.read_parquet(yago_dir / "yago4" / "yagoTypes.parquet")

    # mask is True for rows whose candidate name is not a Yago4 entity
    mask = ~(df["col_to_embed"].isin(entity_list))
    n0 = mask.sum()
    print("Initial number of unmatched entities: ", n0)



    # String-level morphological variations (--> 387 extra matches)
    # Substring rewrites applied, in this exact order, to the names that are
    # still unmatched. Order matters: "Saint_" is rewritten before "St_".
    morphological_rewrites = [
        ("_Del_", "_del_"),
        ("_De_", "_de_"),
        ("_Du_", "_du_"),
        ("O_", "O'"),
        ("boro", "borough"),
        ("Mt_", "Mount_"),
        ("Mc_", "Mc"),
        ("_In_", "_in_"),
        ("_The_", "_the_"),
        ("_On_", "-on-"),
        ("_Of_", "_of_"),
        ("Saint_", "St._"),
        ("St_", "St._"),
        ("_la_", "_La_"),
    ]
    rewritten_names = df.loc[mask, "col_to_embed"]
    for pattern, replacement in morphological_rewrites:
        rewritten_names = rewritten_names.str.replace(pattern, replacement)
    df.loc[mask, "col_to_embed"] = rewritten_names

    # Recompute the unmatched rows and report how many were just resolved
    mask = ~(df["col_to_embed"].isin(entity_list))
    n1 = mask.sum()
    print(n0 - n1, "additional matches")


    # Capitalize after Mc (--> 27 extra matches)
    def capitalize_after_subword(s, subword):
        """Upper-case the character that follows the first ``subword`` in ``s``.

        Args:
            s: The string to transform.
            subword: The substring to look for; only its first occurrence
                is considered.

        Returns:
            ``s`` with the character right after the first occurrence of
            ``subword`` upper-cased. ``s`` is returned unchanged when
            ``subword`` does not occur in it or when nothing follows it.
        """
        start = s.find(subword)
        if start == -1:
            return s
        x = start + len(subword)
        # Slicing instead of indexing s[x]: when subword is a suffix of s there
        # is no following character and s[x] would raise IndexError.
        return s[:x] + s[x:x + 1].upper() + s[x + 1:]
    # Apply the "Mc" capitalization to the still-unmatched names only
    unmatched_names = df.loc[mask, "col_to_embed"]
    df.loc[mask, "col_to_embed"] = unmatched_names.apply(
        lambda name: capitalize_after_subword(name, "Mc")
    )
    # Recompute the unmatched rows and report how many were just resolved
    mask = ~(df["col_to_embed"].isin(entity_list))
    n0 = n1
    n1 = mask.sum()
    print(n0 - n1, "additional matches")


    # Change state for city (--> 102 extra matches)
    # Substring rewrites applied in this exact order to the unmatched names.
    # Order matters: "_Maryland" is rewritten before "_District_of_Columbia"
    # becomes "_Maryland", so the latter result is not rewritten again.
    state_to_city_rewrites = [
        ("_Tennessee", "_Chattanooga"),
        ("_Arizona", "_Phoenix"),
        ("_Texas", "_Houston"),
        ("_New_York", "_Queens"),
        ("_California", "_Los_Angeles"),
        ("_Massachusetts", "_Boston"),
        ("_Maryland", "_Baltimore"),
        ("_District_of_Columbia", "_Maryland"),
        ("_Kentucky", "_Louisville"),
    ]
    rewritten_names = df.loc[mask, "col_to_embed"]
    for pattern, replacement in state_to_city_rewrites:
        rewritten_names = rewritten_names.str.replace(pattern, replacement)
    df.loc[mask, "col_to_embed"] = rewritten_names

    # Recompute the unmatched rows and report how many were just resolved
    mask = ~(df["col_to_embed"].isin(entity_list))
    n0 = n1
    n1 = mask.sum()
    print(n0 - n1, "additional matches")


    # Fallback naming schemes for non-matches.
    # Each candidate below is a full-length Series of alternative names built
    # from the City / County / State columns. They are tried one after the
    # other: at every step only the rows that are still unmatched receive the
    # candidate name, then the unmatched mask is recomputed.
    city_col = df["City"]
    county_col = df["County"]
    state_col = df["State"]
    fallback_candidates = [
        # Add counties (--> 252 extra matches)
        city_col + ",_" + county_col + ",_" + state_col,
        # Add '_(city)' (--> 17 extra matches)
        city_col + "_(city),_" + state_col,
        # Add '_(town)' (--> 71 extra matches)
        city_col + "_(town),_" + state_col,
        # Format City_(State) (--> 16 extra matches)
        city_col + "_(" + state_col + ")",
        # Add '_(CDP)' (--> 15 extra matches)
        city_col + "_(CDP),_" + state_col,
        # Use "-" instead of "_" between the words of the city name
        # (--> 23 extra matches)
        city_col.str.replace(" ", "-") + ",_" + state_col.str.replace(" ", "_"),
        # Add '_(village)' (--> 5 extra matches)
        city_col + "_(village),_" + state_col,
        # Add '_City' (--> 22 extra matches)
        city_col + "_City,_" + state_col,
        # Add '_Village' (--> 8 extra matches)
        city_col + "_Village,_" + state_col,
        # Add '_Charter_Township' (--> 2 extra matches)
        city_col + "_Charter_Township,_" + state_col,
        # Add '_Township' and counties (--> 71 extra matches)
        city_col + "_Township,_" + county_col + ",_" + state_col,
        # Add '_Township' (--> 101 extra matches)
        city_col + "_Township,_" + state_col,
    ]
    for candidate in fallback_candidates:
        # Remaining spaces become underscores (the hyphenated candidate has
        # none left, so this leaves it untouched).
        df.loc[mask, "col_to_embed"] = candidate.str.replace(" ", "_")
        mask = ~(df["col_to_embed"].isin(entity_list))
        n0 = n1
        n1 = mask.sum()
        print(n0 - n1, "additional matches")

    # Restore original format for non-matches
    plain_names = city_col + ",_" + state_col
    df.loc[mask, "col_to_embed"] = plain_names.str.replace(" ", "_")
    

    # Deal with special cases
    # Hand-curated lookup table: each key is a name in the plain
    # "<City>,_<State>" form (spaces replaced by underscores), each value is the
    # entity name to use instead of it.
    # NOTE(review): values ending in "_Q<digits>" (or made of that suffix only,
    # e.g. "_Q5810807") look like Wikidata-style identifiers -- presumably how
    # the knowledge graph names otherwise ambiguous entities; confirm.
    exact_matches = { # 419 matches
        "Alum_Bank,_Pennsylvania": "Pleasantville,_Bedford_County,_Pennsylvania",
        "Abiquiu,_New_Mexico": "Abiquiú,_New_Mexico",
        "York_New_Salem,_Pennsylvania": "New_Salem,_Pennsylvania",
        'New_York,_New_York': 'New_York_City',
        'North_Bellmore,_New_York': 'North_Bellmore_Q3825900',
        'Bellmore,_New_York': 'Bellmore_Q17399831',
        'Waterloo,_New_York': 'Waterloo,_New_York_(town)',
        'Northville,_New_York': 'Northville,_Fulton_County,_New_York',
        'Saint_Albans,_Vermont': 'St._Albans_(city),_Vermont',
        'Slatyfork,_West_Virginia': 'Slaty_Fork,_West_Virginia',
        'Renick,_West_Virginia': 'Falling_Spring,_West_Virginia',
        'Mineral_Wells,_West_Virginia': 'Mineralwells,_West_Virginia',
        'Silver_Gate,_Montana': 'Silver_Gate_Q9077387',
        'Minot_AFB,_North_Dakota': 'Minot_Air_Force_Base',
        'Cuddebackville,_New_York': 'Cuddebackville_Q71299603',
        'Preston_Hollow,_New_York': 'Preston-Potter_Hollow,_New_York',
        'Dalton,_New_York': 'Dalton_Q34654902',
        'Diberville,_Mississippi': "D'Iberville,_Mississippi",
        'Robinsonville,_Mississippi': 'Tunica_Resorts,_Mississippi',
        'Vernon_Rockville,_Connecticut': 'Vernon,_Connecticut',
        'Bliss,_New_York': 'Bliss_Q34644184',
        'Jewell,_Iowa': 'Jewell_Junction,_Iowa',
        'Laceys_Spring,_Alabama': "Lacey's_Spring,_Alabama",
        'Jacksons_Gap,_Alabama': "Jackson's_Gap,_Alabama",
        'La_Canada_Flintridge,_California': 'La_Cañada_Flintridge,_California',
        'San_Miguel,_California': 'San_Miguel,_San_Luis_Obispo_County,_California',
        'Pinon_Hills,_California': 'Piñon_Hills,_California',
        'Mi_Wuk_Village,_California': 'Mi-Wuk_Village,_California',
        'The_Sea_Ranch,_California': 'Sea_Ranch,_California',
        'Jacumba,_California': 'Jacumba_Hot_Springs,_California',
        'Desoto,_Texas': 'DeSoto,_Texas',
        'Township_of_Washington,_New_Jersey': 'Washington_Township,_Bergen_County,_New_Jersey',
        'Penitas,_Texas': 'Peñitas,_Texas',
        'Leesville,_South_Carolina': 'Batesburg-Leesville,_South_Carolina',
        'Sullivans_Island,_South_Carolina': "Sullivan's_Island,_South_Carolina",
        'Longcreek,_South_Carolina': 'Long_Creek,_South_Carolina',
        'Deridder,_Louisiana': 'DeRidder,_Louisiana',
        'Dekalb,_Illinois': 'DeKalb,_Illinois',
        'Newtown_Square,_Pennsylvania': 'Newtown_Township,_Delaware_County,_Pennsylvania',
        'Etters,_Pennsylvania': 'Goldsboro,_Pennsylvania',
        'Susquehanna,_Pennsylvania': 'Susquehanna_Depot,_Pennsylvania',
        'Howardsville,_Virginia': 'Howardsville,_Albemarle_County,_Virginia',
        'Pulaski,_Pennsylvania': 'Pulaski_Township,_Lawrence_County,_Pennsylvania',
        'Mapleton_Depot,_Pennsylvania': 'Mapleton,_Pennsylvania',
        'Bradfordwoods,_Pennsylvania': 'Bradford_Woods,_Pennsylvania',
        'Dewitt,_Virginia': 'DeWitt,_Virginia',
        'Green_Bay,_Virginia': 'Green_Bay,_Prince_Edward_County,_Virginia',
        'West_Finley,_Pennsylvania': 'West_Finley_Township,_Washington_County,_Pennsylvania',
        'Portage_des_Sioux,_Missouri': 'Portage_Des_Sioux,_Missouri',
        'Arlington,_Virginia': 'Arlington_County,_Virginia',
        'Spotsylvania,_Virginia': 'Spotsylvania_Courthouse,_Virginia',
        'Lake_Worth,_Florida': 'Lake_Worth_Beach,_Florida',
        'Land_O_Lakes,_Florida': "Land_O'_Lakes,_Florida",
        'Bois_D_Arc,_Missouri': "Bois_D'Arc,_Missouri",
        'Deland,_Florida': 'DeLand,_Florida',
        'Hallandale,_Florida': 'Hallandale_Beach,_Florida',
        'Saint_Louis,_Missouri': 'St._Louis',
        'Lees_Summit,_Missouri': "Lee's_Summit,_Missouri",
        'Sainte_Genevieve,_Missouri': 'Ste._Genevieve,_Missouri',
        'Defuniak_Springs,_Florida': 'DeFuniak_Springs,_Florida',
        'Debary,_Florida': 'DeBary,_Florida',
        'Ponte_Vedra,_Florida': 'Ponte_Vedra_Beach,_Florida',
        'Okauchee,_Wisconsin': 'Okauchee_Lake,_Wisconsin',
        'Pryor,_Oklahoma': 'Pryor_Creek,_Oklahoma',
        'Fontana,_Wisconsin': 'Fontana-on-Geneva_Lake,_Wisconsin',
        'Willow_Spring,_North_Carolina': 'Willow_Springs,_North_Carolina',
        'Midland,_Georgia': 'Midland,_Columbus,_Georgia',
        'Marble_Hill,_Georgia': 'Marblehill,_Georgia',
        'Salem,_Wisconsin': 'Salem_(community),_Kenosha_County,_Wisconsin',
        'Union_Grove,_North_Carolina': 'Union_Grove_Township,_Iredell_County,_North_Carolina',
        'Boomer,_North_Carolina': 'Boomer_Township,_Wilkes_County,_North_Carolina',
        'Swanquarter,_North_Carolina': 'Swan_Quarter,_North_Carolina',
        'Tall_Timbers,_Maryland': "Tall_Timbers,_St._Mary's_County,_Maryland",
        'Tilghman,_Maryland': 'Tilghman_Island,_Maryland',
        'Stacy,_North_Carolina': 'Stacy_Township_Q6031827',
        'Belcamp,_Maryland': 'Riverside,_Harford_County,_Maryland',
        'Springville,_Indiana': 'Springville,_Lawrence_County,_Indiana',
        'Linthicum_Heights,_Maryland': 'Linthicum,_Maryland',
        'Riverdale,_Maryland': 'Riverdale_Park,_Maryland',
        'Espanola,_New_Mexico': 'Española,_New_Mexico',
        'Penasco,_New_Mexico': 'Peñasco,_New_Mexico',
        'Cerrillos,_New_Mexico': 'Los_Cerrillos,_New_Mexico',
        'Hagerhill,_Kentucky': 'Hager_Hill,_Kentucky',
        'Canon_City,_Colorado': 'Cañon_City,_Colorado',
        'Seatac,_Washington': 'SeaTac,_Washington',
        'Dupont,_Washington': 'DuPont,_Washington',
        'Mount_Angel,_Oregon': 'Mt._Angel,_Oregon',
        'Thompsons_Station,_Tennessee': "Thompson's_Station,_Tennessee",
        'Lake_City,_Tennessee': 'Rocky_Top,_Tennessee',
        'Grandview,_Tennessee': 'Grandview,_Rhea_County,_Tennessee',
        'Washington,_District_of_Columbia': 'Washington,_D.C.',
        'Dewitt,_Michigan': 'DeWitt,_Michigan',
        'Sault_Sainte_Marie,_Michigan': 'Sault_Ste._Marie,_Michigan',
        'Watertown,_Michigan': 'Watertown_Charter_Township,_Clinton_County,_Michigan',
        "Alviso,_California": "Alviso,_San_Jose",
        "Arleta,_California": "Arleta,_Los_Angeles",
        "Arverne,_New_York": "Arverne,_Queens",
        "Avon_By_the_Sea,_New_Jersey": "Avon-by-the-Sea,_New_Jersey",
        "BWI_Airport,_Maryland": "Baltimore–Washington_International_Airport",
        "Barksdale_AFB,_Louisiana": "Barksdale_Air_Force_Base",
        "Batesburg,_South_Carolina": "Batesburg_Q34676391",
        "Battleboro,_North_Carolina": "Battleboro_Q26300877",
        "Beale_AFB,_California": "Beale_Air_Force_Base",
        "Bear_Mountain,_New_York": "Bear_Mountain_(Hudson_Highlands)",
        "Beechgrove,_Tennessee": "Beechgrove,_Coffee_County,_Tennessee",
        "Benezett,_Pennsylvania": "Benezette,_Pennsylvania",
        "Benton,_Pennsylvania": "Benton,_Columbia_County,_Pennsylvania",
        "Bird_In_Hand,_Pennsylvania": "Bird-in-Hand,_Pennsylvania",
        "Birds_Lndg,_California": "Birds_Landing,_California",
        "Biscoe,_Arkansas": "Fredonia_(Biscoe),_Arkansas",
        "Black_Hawk,_South_Dakota": "Blackhawk,_South_Dakota",
        "Bolling_Afb,_District_of_Columbia": "Bolling_Air_Force_Base",
        "Brinkhaven,_Ohio": "Gann,_Ohio",
        "Bristol_town,_Wisconsin": "Bristol_(town),_Kenosha_County,_Wisconsin",
        "Broad_Channel,_New_York": "Broad_Channel,_Queens",
        "Bronx,_New_York": "Bronx_County_Q855974",
        "Brookline,_Pennsylvania": "Brookline_(Pittsburgh)",
        "Brooklyn,_Maryland": "Brooklyn,_Baltimore",
        "Brooklyn,_New_York": "Kings_County_Q11980692",
        "Brookpark,_Ohio": "Brook_Park,_Ohio",
        "Brownstown_Twp,_Michigan": "Brownstown_Charter_Township,_Michigan",
        "Brownstown_Township,_Michigan": "Brownstown_Charter_Township,_Michigan",
        "Buffalo_Valley,_Tennessee": "Buffalo_Valley,_Putnam_County,_Tennessee",
        "Buford,_Wyoming": "PhinDeli_Town_Buford,_Wyoming",
        "Camp_Lejeune,_North_Carolina": "Marine_Corps_Base_Camp_Lejeune",
        "Camp_Pendleton,_California": "Marine_Corps_Base_Camp_Pendleton",
        "Camp_Pendleton_Marine_Corps_Base,_California": "Marine_Corps_Base_Camp_Pendleton",
        "Canyon_Country,_California": "Canyon_Country,_Santa_Clarita,_California",
        "Capistrano_Beach,_California": "Capistrano_Beach,_Dana_Point,_California",
        "Cardiff_By_The_Sea,_California": "Cardiff-by-the-Sea,_Encinitas,_California",
        "Cardiff_By_the_Sea,_California": "Cardiff-by-the-Sea,_Encinitas,_California",
        "Cardiff_by_the_Sea,_California": "Cardiff-by-the-Sea,_Encinitas,_California",
        "Carmel,_California": "Carmel-by-the-Sea,_California",
        "Cave-in-Rock,_Illinois": "Cave-In-Rock,_Illinois",
        "Cedar_Bluff,_Mississippi": "Cedarbluff,_Mississippi",
        "Cedar_Valley,_Utah": "Cedar_Valley_Q49786875",
        "Charleston_AFB,_South_Carolina": "Charleston_Air_Force_Base",
        "Charleston_Afb,_South_Carolina": "Charleston_Air_Force_Base",
        "City_Of_Spokane_Valley,_Washington": "Spokane_Valley,_Washington",
        "Coalton,_West_Virginia": "Womelsdorf_(Coalton),_West_Virginia",
        "Codyville_Plt,_Maine": "Codyville,_Maine",
        "Coeur_D_Alene,_Idaho": "Coeur_d'Alene,_Idaho",
        "Colfax,_Missouri": "Colfax_Township_Q9037809",
        "Columbia_Station,_Ohio": "Columbia_Township,_Lorain_County,_Ohio",
        "Concord,_Virginia": "Concord,_Campbell_County,_Virginia",
        "Cooke_City,_Montana": "Cooke_City_Q17400774",
        "Corona_Del_Mar,_California": "Corona_del_Mar,_Newport_Beach",
        "Corona_del_Mar,_California": "Corona_del_Mar,_Newport_Beach",
        "Cowlesville,_New_York": "Cowlesville_Q43077948",
        "Crossroads,_Texas": "Cross_Roads,_Texas",
        "Croton,_Ohio": "Hartford,_Ohio",
        "Dania,_Florida": "Dania_Beach,_Florida",
        "Dayville,_Connecticut": "Dayville_(CDP),_Connecticut",
        "De_Berry,_Texas": "DeBerry,_Texas",
        "De_Kalb_Junction,_New_York": "DeKalb_Junction,_New_York",
        "De_Lancey,_Pennsylvania": "Adrian_Mines,_Pennsylvania",
        "De_Leon_Springs,_Florida": "DeLeon_Springs,_Florida",
        "De_Mossville,_Kentucky": "DeMossville,_Kentucky",
        "De_Ruyter,_New_York": "DeRuyter,_New_York",
        "De_Witt,_Arkansas": "DeWitt,_Arkansas",
        "Deforest,_Wisconsin": "DeForest,_Wisconsin",
        "Demotte,_Indiana": "DeMotte,_Indiana",
        "Desmet,_Idaho": "De_Smet,_Idaho",
        "Dillonvale,_Ohio": "Dillonvale,_Jefferson_County,_Ohio",
        "Dover_AFB,_Delaware": "Dover_Air_Force_Base",
        "Drybranch,_West_Virginia": "Dry_Branch,_West_Virginia",
        "Du_Bois,_Pennsylvania": "DuBois,_Pennsylvania",
        "Dubois,_Pennsylvania": "DuBois,_Pennsylvania",
        "Duluth_city,_Minnesota": "Duluth,_Minnesota",
        "East_Elmhurst,_New_York": "East_Elmhurst,_Queens",
        "East_Liberty,_Pennsylvania": "East_Liberty_(Pittsburgh)",
        "East_Mc_Keesport,_Pennsylvania": "East_McKeesport,_Pennsylvania",
        "Eglin_AFB,_Florida": "Eglin_Air_Force_Base",
        "Elgin,_South_Carolina": "Elgin,_Kershaw_County,_South_Carolina",
        "Elkhorn,_Nebraska": "Elkhorn,_Omaha,_Nebraska",
        "Elsinore_Valley,_California": "Elsinore_Valley",
        "Emerald_Hills,_California": "Emerald_Lake_Hills,_California",
        "Etna,_Ohio": "Etna,_Licking_County,_Ohio",
        "Fairfield,_Virginia": "Fairfield,_Rockbridge_County,_Virginia",
        "Fawn_Grove,_Maryland": "Fawn_Grove,_Pennsylvania",
        "Feeding_Hills,_Massachusetts": "Feeding_Hills,_Agawam,_Massachusetts",
        "Foothill_Ranch,_California": "Foothill_Ranch,_Lake_Forest,_California",
        "Fort_A_P_Hill,_Virginia": "Fort_A.P._Hill",
        "Fort_George_G_Meade,_Maryland": "Fort_Meade,_Maryland",
        "Fort_Irwin,_California": "Fort_Irwin_National_Training_Center",
        "Fort_Mc_Coy,_Florida": "Fort_McCoy,_Florida",
        "Fort_Ripley_city,_Minnesota": "Fort_Ripley,_Minnesota",
        "Fredericksburg,_Pennsylvania": "Fredericksburg,_Lebanon_County,_Pennsylvania",
        "Ft_Mitchell,_Kentucky": "Fort_Mitchell,_Kentucky",
        "Ft_Wright,_Kentucky": "Fort_Wright,_Kentucky",
        "Garland_City,_Arkansas": "Garland,_Arkansas",
        "Geff,_Illinois": "Jeffersonville,_Illinois",
        "Gig_Harbor_Peninsula,_Washington": "Gig_Harbor,_Washington",
        "Gilmanton_Iron_Works,_New_Hampshire": "Gilmanton_Ironworks,_New_Hampshire",
        "Glenwood,_West_Virginia": "Glenwood,_Mason_County,_West_Virginia",
        "Grand_Rapids_city,_Michigan": "Grand_Rapids,_Michigan",
        "Granite_Canon,_Wyoming": "Granite,_Wyoming",
        "Green_Castle,_Missouri": "Greencastle,_Missouri",
        "Greens,_New_Hampshire": "Green's_Grant,_New_Hampshire",
        "Grosse_Tete,_Louisiana": "Grosse_Tête,_Louisiana",
        "Guild,_Tennessee": "Haletown,_Tennessee",
        "Half_Way,_Missouri": "Halfway,_Missouri",
        "Hammond,_Oregon": "Hammond_Q12891427",
        "Hanscom_Afb,_Massachusetts": "Hanscom_Air_Force_Base",
        "Haydenville,_Massachusetts": "Haydenville,_Massachusetts",  # NOTE(review): identity entry (key == value)
        "Hazle_Township,_Pennsylvania": "Hazle_Township,_Luzerne_County,_Pennsylvania",
        "Hazlet_Township,_New_Jersey": "Hazlet,_New_Jersey",
        "Hemlock,_New_York": "Hemlock_Q5894303",
        "Henry,_Virginia": "Henry,_Franklin_County,_Virginia",
        "Hill_AFB,_Utah": "Hill_Air_Force_Base",
        "Hollis_Center,_Maine": "Hollis_Center_Q33445559",
        "Howey_In_The_Hills,_Florida": "Howey-in-the-Hills,_Florida",
        "Independence,_West_Virginia": "Independence,_Preston_County,_West_Virginia",
        "Indian_Orchard,_Massachusetts": "Indian_Orchard,_Springfield,_Massachusetts",
        "Isleta,_New_Mexico": "Isleta_Village_Proper,_New_Mexico",
        "Jbsa_Ft_Sam_Houston,_Texas": "Fort_Sam_Houston",
        "Jbsa_Lackland,_Texas": "Lackland_Air_Force_Base",
        "Joint_Base_Lewis_Mcchord,_Washington": "Joint_Base_Lewis–McChord",
        "Kaibeto,_Arizona": "Kaibito,_Arizona",
        "Keatchie,_Louisiana": "Keachi,_Louisiana",
        "Keene_Valley,_New_York": "Keene_Valley_Q34686201",
        "Kenosha,_Illinois": "Kenosha,_Wisconsin",
        "Kings_Canyon_National_Pk,_California": "Kings_Canyon_National_Park",
        "Knapp,_Wisconsin": "Knapp,_Dunn_County,_Wisconsin",
        "LA_Place,_Louisiana": "LaPlace,_Louisiana",
        "La_Place,_Louisiana": "LaPlace,_Louisiana",
        "La_Fayette,_Georgia": "LaFayette,_Georgia",
        "La_Fayette,_New_York": "LaFayette,_New_York",
        "La_Follette,_Tennessee": "LaFollette,_Tennessee",
        "La_Jose,_Pennsylvania": "Newburg,_Clearfield_County,_Pennsylvania",
        "La_Salle,_Colorado": "LaSalle,_Colorado",
        "La_Salle,_Illinois": "LaSalle,_Illinois",
        "Lafayette,_Alabama": "LaFayette,_Alabama",
        "Lagrange,_Georgia": "LaGrange,_Georgia",
        "Lagrange,_Indiana": "LaGrange,_Indiana",
        "Lagrange,_Ohio": "LaGrange,_Ohio",
        "Lake_Butler,_Florida": "Lake_Butler,_Union_County,_Florida",
        "Lamoure,_North_Dakota": "LaMoure,_North_Dakota",
        "Lanse,_Michigan": "L'Anse,_Michigan",
        "Laplace,_Louisiana": "LaPlace,_Louisiana",
        "Latonia,_Kentucky": "Latonia,_Covington",
        "Lauderdale_By_The_Sea,_Florida": "Lauderdale-by-the-Sea,_Florida",
        "Lawrence,_New_York": "Lawrence,_Nassau_County,_New_York",
        "Le_Roy,_West_Virginia": "LeRoy,_West_Virginia",
        "Lee,_Iowa": "Lee_Township_Q9041241",
        "Leroy,_Michigan": "LeRoy,_Michigan",
        "Liberty,_Pennsylvania": "Liberty,_Tioga_County,_Pennsylvania",
        "Liberty_Townshp,_Ohio": "Liberty_Township,_Butler_County,_Ohio",
        "Locust_Grove,_Virginia": "Locust_Grove,_Orange_County,_Virginia",
        "Londonderry,_Ohio": "Londonderry,_Ross_County,_Ohio",
        "Long_Branch,_Texas": "Long_Branch,_Panola_County,_Texas",
        "Los_Ranchos,_New_Mexico": "Los_Ranchos_de_Albuquerque,_New_Mexico",
        "Louisville_Airport,_Kentucky": "Louisville_International_Airport",
        "MI_Wuk_Village,_California": "Mi-Wuk_Village,_California",
        "Madeline_Plains,_California": "Madeline_Plains_Q49596861",
        "Malmstrom_AFB,_Montana": "Malmstrom_Air_Force_Base",
        "Manhattan,_New_York": "New_York_County_Q500416",
        "Marine_On_Saint_Croix,_Minnesota": "Marine_on_St._Croix,_Minnesota",
        "Marshall,_Wisconsin": "Marshall,_Dane_County,_Wisconsin",
        "McDermitt,_Oregon": "McDermitt,_Nevada_and_Oregon",
        "McRae-Helena,_Georgia": "McRae–Helena,_Georgia",
        "Mc_Rae_Helena,_Georgia": "McRae–Helena,_Georgia",
        "Meramec,_Missouri": "Meramec_Township,_St._Louis_County,_Missouri",
        "Meridian,_California": "Meridian,_Sutter_County,_California",
        "Middle_Brook,_Missouri": "Middlebrook,_Missouri",
        "Middle_Keys,_Florida": "Middle_Keys_Q3312642",
        "Milldale,_Connecticut": "Milldale_(Southington)",
        "Monroeton,_Pennsylvania": "Monroe,_Pennsylvania",
        "Montpelier,_Virginia": "Montpelier,_Hanover_County,_Virginia",
        "Mount_Washington,_Pennsylvania": "Mount_Washington,_Pittsburgh_(neighborhood)",
        "Mountain_Brk,_Alabama": "Mountain_Brook,_Alabama",
        "Mt_Aukum,_California": "Aukum,_California",
        "Mt_Hamilton,_California": "Mount_Hamilton_(California)",
        "Mt_Lemmon,_Arizona": "Mount_Lemmon",
        "Mt_Wilson,_California": "Mount_Wilson_(California)",
        "New_Matamoras,_Ohio": "Matamoras,_Ohio",
        "New_Raymer,_Colorado": "Raymer,_Colorado",
        "New_Salem_Borough,_Pennsylvania": "New_Salem,_Pennsylvania",
        "Newburg,_Pennsylvania": "Newburg,_Cumberland_County,_Pennsylvania",
        "Newhall,_California": "Newhall,_Santa_Clarita,_California",
        "Newport,_Virginia": "Newport,_Giles_County,_Virginia",
        "Newport_Coast,_California": "Newport_Coast,_Newport_Beach",
        "Newton_Center,_Massachusetts": "Newton_Centre,_Massachusetts",
        "Mc_Leansboro,_Illinois": "McLeansboro,_Illinois",
        "Mcclellan,_California": "McClellan_Air_Force_Base",
        "North_Grosvenordale,_Connecticut": "North_Grosvenordale",
        "North_Versailles,_Pennsylvania": "North_Versailles_Township,_Allegheny_County,_Pennsylvania",
        "Obrien,_Florida": "O'Brien,_Florida",
        "Obrien,_Oregon": "O'Brien,_Oregon",
        "Odonnell,_Texas": "O'Donnell,_Texas",
        "Offutt_AFB,_Nebraska": "Offutt_Air_Force_Base",
        "Olathe_city,_Kansas": "Olathe,_Kansas",
        "Old_Fort,_Tennessee": "Oldfort,_Tennessee",
        "Oneals,_California": "O'Neals,_California",
        "Oneill,_Nebraska": "O'Neill,_Nebraska",
        "Opa-Locka,_Florida": "Opa-locka,_Florida",
        "Opa_Locka,_Florida": "Opa-locka,_Florida",
        "Panama_City_Beaches,_Florida": "Panama_City_Beach,_Florida",
        "Park_Hall,_Maryland": "Park_Hall_Estates,_Maryland",
        "Parsippany,_New_Jersey": "Parsippany-Troy_Hills,_New_Jersey",
        "Patrick_AFB,_Florida": "Patrick_Air_Force_Base",
        "Pekin,_Indiana": "Old_Pekin,_Indiana",
        "Petersburg,_New_York": "Petersburgh,_New_York",
        "Petrified_Forest_Natl_Pk,_Arizona": "Petrified_Forest_National_Park",
        "Philipsburg,_Pennsylvania": "Philipsburg,_Centre_County,_Pennsylvania",
        "Pine_Grove,_Pennsylvania": "Pine_Grove,_Schuylkill_County,_Pennsylvania",
        "Pine_Mountain,_Georgia": "Pine_Mountain,_Harris_County,_Georgia",
        "Pinkhams,_New_Hampshire": "Pinkham's_Grant,_New_Hampshire",
        "Pleasant_Prairie,_Illinois": "Pleasant_Prairie,_Wisconsin",
        "Point_Mugu_Nawc,_California": "Naval_Air_Station_Point_Mugu",
        "Point_Pleasant_Boro,_New_Jersey": "Point_Pleasant,_New_Jersey",
        "Pointblank,_Texas": "Point_Blank,_Texas",
        "Pompeys_Pillar,_Montana": "Pompey's_Pillar,_Montana",
        "Pompton_Plains,_New_Jersey": "Pompton_Plains_Q49604502",
        "Portland_East,_Oregon": "East_Portland,_Oregon",
        "Princeton,_Oregon": "New_Princeton,_Oregon",
        "Proctor,_Arkansas": "Proctor_Township_Q9043747",
        "Queens,_New_York": "Queens_County_Q5142559",
        "Reagan,_Tennessee": "Reagan,_Henderson_County,_Tennessee",
        "Redlake,_Minnesota": "Red_Lake,_Minnesota",
        "Richland,_Arkansas": "Richland_Township_Q9043926",
        "Richland_VIII,_Nebraska": "_Q5810807",
        "Rockhill_Furnace,_Pennsylvania": "Rockhill,_Pennsylvania",
        "Rough_And_Ready,_California": "Rough_and_Ready,_California",
        "Royal_Oaks,_California": "Royal_Oaks_Park",
        "Ryland_Hght,_Kentucky": "Ryland_Heights,_Kentucky",
        "S_Coffeyville,_Oklahoma": "South_Coffeyville,_Oklahoma",
        "Saginaw_city,_Michigan": "Saginaw_City,_Michigan",
        "Saint_Thomas,_Pennsylvania": "St._Thomas_Township,_Franklin_County,_Pennsylvania",
        "San_Juan_Pueblo,_New_Mexico": "Ohkay_Owingeh,_New_Mexico",
        "San_Ysidro,_California": "San_Ysidro,_San_Diego",
        "Sandy_River_Plt,_Maine": "Sandy_River_Plantation,_Maine",
        "Santo_Domingo_Pueblo,_New_Mexico": "Kewa_Pueblo,_New_Mexico",
        "Shaw_AFB,_South_Carolina": "Shaw_Air_Force_Base",
        "Shaw_Afb,_South_Carolina": "Shaw_Air_Force_Base",
        "Sheffield_Village,_Ohio": "Sheffield,_Ohio",
        "Sheppard_AFB,_Texas": "Sheppard_Air_Force_Base",
        "Sheppard_Afb,_Texas": "Sheppard_Air_Force_Base",
        "Silverpeak,_Nevada": "Silver_Peak,_Nevada",
        "Sinnamahoning,_Pennsylvania": "Sinnemahoning,_Pennsylvania",
        "Six_Mile_Run,_Pennsylvania": "Coaldale,_Bedford_County,_Pennsylvania",
        "Smithmill,_Pennsylvania": "Janesville,_Pennsylvania",
        "Smokerun,_Pennsylvania": "Smoke_Run,_Pennsylvania",
        "St._Marys,_North_Carolina": "St._Mary's_Township,_Wake_County,_North_Carolina",
        "St_Benedict,_Pennsylvania": "Saint_Benedict,_Pennsylvania",
        "St_Clair,_Missouri": "Saint_Clair,_Missouri",
        "St_Georges,_Delaware": "Saint_Georges,_Delaware",
        "St_Helena,_South_Carolina": "Saint_Helena_Island_(South_Carolina)",
        "St_Louis,_Missouri": "St._Louis",
        "St_Meinrad,_Indiana": "Saint_Meinrad,_Indiana",
        "St_Michaels,_Maryland": "Saint_Michaels,_Maryland",
        "State_University,_Arkansas": "Arkansas_State_University",
        "Ste_Genevieve,_Missouri": "Ste._Genevieve,_Missouri",
        "Stennis_Space_Center,_Mississippi": "John_C._Stennis_Space_Center",
        "Storrs_Mansfield,_Connecticut": "Storrs,_Connecticut",
        "Summit_Argo,_Illinois": "Summit,_Illinois",
        "Sun_City,_California": "Sun_City,_Menifee,_California",
        "Superior_city,_Wisconsin": "Superior,_Wisconsin",
        "Thetford_Center,_Vermont": "Thetford_Center_Historic_District",
        "Thompson,_Utah": "Thompson_Springs,_Utah",
        "Thurman,_Ohio": "Centerville,_Gallia_County,_Ohio",
        "Township_12_Paw_Creek,_North_Carolina": "_Q6026563",
        "Township_1_Charlotte,_North_Carolina": "_Q5671103",
        "Township_6_Clear_Creek,_North_Carolina": "_Q6026772",
        "Township_9,_Arkansas": "Township_9_Q6026845",
        "Township_Of_Washington,_New_Jersey": "Washington_Township,_Bergen_County,_New_Jersey",
        "Travis_AFB,_California": "Travis_Air_Force_Base",
        "Travis_Afb,_California": "Travis_Air_Force_Base",
        "Tuskegee_Institute,_Alabama": "Tuskegee_University",
        "United_States_Air_Force_Academy,_Colorado": "Air_Force_Academy,_Colorado",
        "Upper_Keys,_Florida": "Upper_Keys_Q3552092",
        "Usaf_Academy,_Colorado": "Air_Force_Academy,_Colorado",
        "Vale,_North_Carolina": "Vale,_Lincoln_County,_North_Carolina",
        "Vashon_Island,_Washington": "Vashon_Island_Q12834566",
        "Vestavia,_Alabama": "Vestavia_Hills,_Alabama",
        "Village_Of_Indian_Springs,_Ohio": "Fairfield_Township,_Butler_County,_Ohio",
        "Village_Of_Lakewood,_Illinois": "Lakewood,_Illinois",
        "Wakeeney,_Kansas": "WaKeeney,_Kansas",
        "Walnut_Hills,_Ohio": "Walnut_Hills,_Cincinnati",
        "Washington,_Maryland": "Washington,_D.C.",
        "Waverly,_New_York": "Waverly,_Tioga_County,_New_York",
        "Wentworths_Location,_New_Hampshire": "Wentworth's_Location,_New_Hampshire",
        "West_Granby,_Connecticut": "West_Granby_Historic_District",
        "West_Harrison,_New_York": "West_Harrison_Q55806942",
        "West_Hatfield,_Massachusetts": "West_Hatfield_Historic_District",
        "West_des_Moines,_Iowa": "West_Des_Moines,_Iowa",
        "White_Hall,_Maryland": "White_Hall,_Baltimore_County,_Maryland",
        "Whiteford,_Pennsylvania": "Whiteford,_Maryland",
        "Williamson,_North_Carolina": "_Q6032613",
        "Willowbrook,_Illinois": "Willowbrook,_DuPage_County,_Illinois",
        "Woodbury,_New_York": "Woodbury,_Nassau_County,_New_York",
        "Ysleta_Del_Sur_Pueblo,_Texas": "Ysleta_del_Sur_Pueblo",
        "Clinton,_Michigan": "Clinton,_Lenawee_County,_Michigan",
        "Clinton,_Wisconsin": "Clinton_(town),_Rock_County,_Wisconsin",
        "Cordell,_Oklahoma": "New_Cordell,_Oklahoma",
        "District_9,_Maryland": "Maryland_Legislative_District_9",
        "District_13,_Maryland": "Maryland_Legislative_District_13",
    }
    # Substitute the hand-curated exact matches on the rows selected by `mask`;
    # names without an entry in `exact_matches` are left untouched.
    unmatched_names = df.loc[mask, "col_to_embed"]
    df.loc[mask, "col_to_embed"] = unmatched_names.map(exact_matches).fillna(unmatched_names)

    # Recompute which rows are still absent from `entity_list` and report how
    # many names this step resolved.
    mask = ~df["col_to_embed"].isin(entity_list)
    n0 = n1
    n1 = mask.sum()
    print(n0 - n1, "additional matches")


    # Hand-curated mapping from an unmatched "City,_State" name to a related
    # entity name (enclosing township, merged place, parent city, ...).
    # Each key appears exactly once: a duplicated key in a dict literal is
    # silently overwritten by its last occurrence.
    approximate_matches = {  # 284 extra matches
        "Afton,_Michigan": "Ellis_Township,_Michigan",
        "Woodstock_Valley,_Connecticut": "Woodstock,_Connecticut",
        "Albertson,_North_Carolina": "Albertson_Township_Q6026937",
        "West_Henrietta,_New_York": "Henrietta,_New_York",
        "Setauket,_New_York": "Setauket-East_Setauket,_New_York",
        "Campbell_Hall,_New_York": "Hamptonburgh,_New_York",
        "Kirkville,_New_York": "Manlius,_New_York",
        "Port_Crane,_New_York": "Fenton,_New_York",
        "Harpursville,_New_York": "Colesville,_New_York",
        "Rock_Tavern,_New_York": "New_Windsor,_New_York",
        "Salisbury_Mills,_New_York": "Beaverdam_Lake–Salisbury_Mills,_New_York",
        "Winthrop,_New_York": "Brasher_Falls–Winthrop,_New_York",
        "Pattersonville,_New_York": "Pattersonville-Rotterdam_Junction,_New_York",
        "Rotterdam_Junction,_New_York": "Pattersonville-Rotterdam_Junction,_New_York",
        "North_Lawrence,_New_York": "Lawrence,_St._Lawrence_County,_New_York",
        "Mooers_Forks,_New_York": "Mooers,_New_York",
        "Bonner,_Montana": "Bonner-West_Riverside,_Montana",
        "Athol,_New_York": "Thurman,_New_York",
        "Ellington,_Connecticut": "Ellington_Center_Historic_District",
        "Comstock,_New_York": "Fort_Ann,_New_York",
        "Alfred_Station,_New_York": "Alfred,_New_York",
        "Pomfret_Center,_Connecticut": "Pomfret,_Connecticut",
        "Huguenot,_New_York": "Deerpark,_New_York",
        "Godeffroy,_New_York": "Deerpark,_New_York",
        "Idyllwild,_California": "Idyllwild–Pine_Cove,_California",
        "Marietta,_South_Carolina": "Slater-Marietta,_South_Carolina",
        "Lakeside,_Arizona": "Pinetop-Lakeside,_Arizona",
        "Pinetop,_Arizona": "Pinetop-Lakeside,_Arizona",
        "Heber,_Arizona": "Heber-Overgaard,_Arizona",
        "Overgaard,_Arizona": "Heber-Overgaard,_Arizona",
        "Sparks_Glencoe,_Maryland": "Sparks,_Maryland",
        "Eckerman,_Michigan": "Chippewa_Township,_Chippewa_County,_Michigan",
        "Port_Hadlock,_Washington": "Port_Hadlock-Irondale,_Washington",
        "Gulliver,_Michigan": "Doyle_Township,_Michigan",
        "Bailey,_Michigan": "Casnovia_Township,_Michigan",
        "Belmont,_Michigan": "Plainfield_Township,_Kent_County,_Michigan",
        "Smiths_Creek,_Michigan": "Kimball_Township,_Michigan",
        "West_Olive,_Michigan": "Olive_Township,_Ottawa_County,_Michigan",
        "Hamilton,_Michigan": "Heath_Township,_Michigan",
        "New_Hudson,_Michigan": "Lyon_Township,_Oakland_County,_Michigan",
        "North_Street,_Michigan": "Clyde_Township,_St._Clair_County,_Michigan",
        "Columbus,_Michigan": "Columbus_Township,_St._Clair_County,_Michigan",
        "Grand_Junction,_Michigan": "Columbia_Township,_Van_Buren_County,_Michigan",
        "Shelbyville,_Michigan": "Wayland_Township,_Michigan",
        "Gowen,_Michigan": "Montcalm_Township,_Michigan",
        "Hale,_Michigan": "Plainfield_Township,_Iosco_County,_Michigan",
        "Remus,_Michigan": "Wheatland_Township,_Mecosta_County,_Michigan",
        "Goodells,_Michigan": "Wales_Township,_Michigan",
        "Conklin,_Michigan": "Chester_Township,_Ottawa_County,_Michigan",
        "Rives_Junction,_Michigan": "Rives_Township,_Michigan",
        "Fenwick,_Michigan": "Fairplain_Township,_Michigan",
        "Alger,_Michigan": "Moffatt_Township,_Michigan",
        "Manitou_Beach,_Michigan": "Manitou_Beach–Devils_Lake,_Michigan",
        "Jones,_Michigan": "Newberg_Township,_Michigan",
        "Clarklake,_Michigan": "Columbia_Township,_Jackson_County,_Michigan",
        "Vestaburg,_Michigan": "Richland_Township,_Montcalm_County,_Michigan",
        "Kewadin,_Michigan": "Milton_Township,_Antrim_County,_Michigan",
        "Lachine,_Michigan": "Long_Rapids_Township,_Michigan",
        "Rodney,_Michigan": "Colfax_Township,_Mecosta_County,_Michigan",
        "Irons,_Michigan": "Eden_Township,_Lake_County,_Michigan",
        "Sawyer,_Michigan": "Shorewood–Tower_Hills–Harbert,_Michigan",
        "Elwell,_Michigan": "Seville_Township,_Michigan",
        "Wallace,_Michigan": "Mellen_Township,_Michigan",
        "American_Fork-Pleasant_Grove,_Utah": "Pleasant_Grove,_Utah",
        "Amston,_Connecticut": "Hebron,_Connecticut",
        "Anaheim-Santa_Ana-Garden_Grove,_California": "Anaheim,_California",
        "Arcadia_West,_Florida": "Arcadia,_Florida",
        "Ashley_Falls,_Massachusetts": "Sheffield,_Massachusetts",
        "Atlanta-Decatur,_Georgia": "Atlanta",
        "Attleboro_Falls,_Massachusetts": "North_Attleborough,_Massachusetts",
        "Avalon-Mulat,_Florida": "Avalon,_Florida",
        "Baldwin_Place,_New_York": "Somers,_New_York",
        "Bayside_Hills,_New_York": "Bayside,_Queens",
        "Beach_Lake,_Pennsylvania": "Berlin_Township,_Wayne_County,_Pennsylvania",
        "Beaverton-Hillsboro,_Oregon": "Hillsboro,_Oregon",
        "Belvedere-Tiburon,_California": "Tiburon,_California",
        "Belvedere_Tiburon,_California": "Tiburon,_California",
        "Big_Oak_Flat,_California": "Groveland-Big_Oak_Flat,_California",
        "Black_Forest-Peyton,_Colorado": "Black_Forest,_Colorado",
        "Blairsden-Graeagle,_California": "Graeagle,_California",
        "Blairsden_Graeagle,_California": "Graeagle,_California",
        "Bridgewater_Corners,_Vermont": "Bridgewater,_Vermont",
        "Bruceville,_Texas": "Bruceville-Eddy,_Texas",
        "Bryant_Pond,_Maine": "Woodstock,_Maine",
        "Camby,_Indiana": "Decatur_Township,_Marion_County,_Indiana",
        "Camden-Wyoming,_Delaware": "Camden,_Delaware",
        "Camden_Wyoming,_Delaware": "Camden,_Delaware",
        "Camp_Lejeune,_North_Carolina": "Camp_Lejeune,_North_Carolina",
        "Cataumet,_Massachusetts": "Bourne,_Massachusetts",
        "Cedar_Park-Liberty_Hill,_Texas": "Cedar_Park,_Texas",
        "Center_Tuftonboro,_New_Hampshire": "Tuftonboro,_New_Hampshire",
        "Chadds_Ford,_Pennsylvania": "Chadds_Ford_Township,_Delaware_County,_Pennsylvania",
        "Channing,_Michigan": "Sagola_Township,_Michigan",
        "Cherry_Valley,_Massachusetts": "Leicester,_Massachusetts",
        "Claryville,_New_York": "Neversink,_New_York",
        "Clinton_Corners,_New_York": "Clinton,_Dutchess_County,_New_York",
        "Coburn_Gore,_Maine": "North_Franklin,_Maine",
        "Cocoa-Rockledge,_Florida": "Rockledge,_Florida",
        "Cocoa_Beach-Cape_Canaveral,_Florida": "Cocoa_Beach,_Florida",
        "Cogan_Station,_Pennsylvania": "Lycoming_Township,_Lycoming_County,_Pennsylvania",
        "Conewango_Valley,_New_York": "Conewango,_New_York",
        "Conservation,_Florida": "Broward_County,_Florida",
        "Coral_Springs-Margate,_Florida": "Coral_Springs,_Florida",
        "Croton_Falls,_New_York": "North_Salem,_New_York",
        "Davisburg,_Michigan": "Springfield_Township,_Oakland_County,_Michigan",
        "Delano-McFarland,_California": "Delano,_California",
        "Dewey,_Arizona": "Dewey–Humboldt,_Arizona",
        "Dorchester_Center,_Massachusetts": "Dorchester,_Boston",
        "Duke_Center,_Pennsylvania": "Otto_Township,_McKean_County,_Pennsylvania",
        "East_Berne,_New_York": "Berne,_New_York",
        "East_Canaan,_Connecticut": "North_Canaan,_Connecticut",
        "East_Dummerston,_Vermont": "Dummerston,_Vermont",
        "East_Hartland,_Connecticut": "Hartland,_Connecticut",
        "East_Killingly,_Connecticut": "Killingly,_Connecticut",
        "East_San_Gabriel_Valley,_California": "San_Gabriel,_California",
        "East_Setauket,_New_York": "Setauket-East_Setauket,_New_York",
        "East_Springfield,_Pennsylvania": "Springfield_Township,_Erie_County,_Pennsylvania",
        "East_Tehama,_California": "Tehama,_California",
        "East_Thetford,_Vermont": "Thetford,_Vermont",
        "East_Walpole,_Massachusetts": "Walpole,_Massachusetts",
        "East_Wareham,_Massachusetts": "Wareham,_Massachusetts",
        "East_Weymouth,_Massachusetts": "Weymouth,_Massachusetts",
        "Eddy,_Texas": "Bruceville-Eddy,_Texas",
        "Edgewood,_New_York": "Hunter,_New_York",
        "Elizabethport,_New_Jersey": "Elizabeth,_New_Jersey",
        "Federal_Way-Auburn,_Washington": "Federal_Way,_Washington",
        "Fleming,_Pennsylvania": "Unionville,_Centre_County,_Pennsylvania",
        "Fryburg,_Pennsylvania": "Washington_Township,_Clarion_County,_Pennsylvania",
        "Glades,_Florida": "Belle_Glade,_Florida",
        "Glen_Wild,_New_York": "Thompson,_New_York",
        "Gracewood,_Georgia": "Augusta,_Georgia",
        "Grant,_Florida": "Grant-Valkaria,_Florida",
        "Haydenville,_Massachusetts": "Williamsburg,_Massachusetts",
        "Herron,_Michigan": "Wilson_Township,_Alpena_County,_Michigan",
        "Hotevilla,_Arizona": "Hotevilla-Bacavi,_Arizona",
        "Humboldt,_Arizona": "Dewey–Humboldt,_Arizona",
        "Indialantic-Melbourne_Beach,_Florida": "Melbourne_Beach,_Florida",
        "Jacksonville_East,_Florida": "Jacksonville,_Florida",
        "Jacksonville_West,_Florida": "Jacksonville,_Florida",
        "Jefferson,_Massachusetts": "Holden,_Massachusetts",
        "La_Crescenta,_California": "La_Crescenta-Montrose,_California",
        "Lagrangeville,_New_York": "LaGrange,_New_York",
        "Lakeport,_Michigan": "Burtchville_Township,_Michigan",
        "Lakeside-Marblehead,_Ohio": "Danbury_Township,_Ottawa_County,_Ohio",
        "Lakeside_Marblehead,_Ohio": "Danbury_Township,_Ottawa_County,_Ohio",
        "Lambert,_Montana": "Fox_Lake,_Montana",
        "Lanesville,_New_York": "Hunter,_New_York",
        "Leo,_Indiana": "Leo-Cedarville,_Indiana",
        "Louisville_Central,_Kentucky": "Downtown_Louisville",
        "Lower_Keys,_Florida": "Cudjoe_Key,_Florida",
        "Lyndora,_Pennsylvania": "Homeacre-Lyndora,_Pennsylvania",
        "Malden_Bridge,_New_York": "Chatham_(town),_New_York",
        "Martville,_New_York": "Sterling,_New_York",
        "McCloud-Medicine_Lake,_California": "McCloud,_California",
        "Middlebury_Center,_Pennsylvania": "Middlebury_Township,_Tioga_County,_Pennsylvania",
        "Mill_Run,_Pennsylvania": "Springfield_Township,_Fayette_County,_Pennsylvania",
        "Milmont_Park,_Pennsylvania": "Ridley_Township,_Delaware_County,_Pennsylvania",
        "Minturn-Red_Cliff,_Colorado": "Minturn,_Colorado",
        "Montrose,_California": "La_Crescenta-Montrose,_California",
        "Mt_Carmel,_Utah": "Mount_Carmel_Junction,_Utah",
        "Needham_Heights,_Massachusetts": "Needham,_Massachusetts",
        "New_Preston_Marble_Dale,_Connecticut": "New_Preston,_Connecticut",
        "New_Russia,_New_York": "Elizabethtown,_New_York",
        "North_Antelope_Valley,_California": "Palmdale,_California",
        "North_Baldwin,_New_York": "Baldwin,_Nassau_County,_New_York",
        "North_Dartmouth,_Massachusetts": "Dartmouth,_Massachusetts",
        "North_Dighton,_Massachusetts": "Dighton,_Massachusetts",
        "North_Dinwiddie,_Virginia": "Dinwiddie,_Virginia",
        "North_Easton,_Massachusetts": "Easton,_Massachusetts",
        "North_El_Dorado,_California": "El_Dorado_Hills,_California",
        "North_Ferrisburgh,_Vermont": "Ferrisburgh,_Vermont",
        "North_Franklin,_Connecticut": "Franklin,_Connecticut",
        "North_Grafton,_Massachusetts": "Grafton,_Massachusetts",
        "North_Oxford,_Massachusetts": "Oxford,_Massachusetts",
        "North_Scituate,_Rhode_Island": "Smithville_–_North_Scituate,_Rhode_Island",
        "North_Venice,_Florida": "Venice,_Florida",
        "North_Weymouth,_Massachusetts": "Weymouth,_Massachusetts",
        "North_Whidbey_Island,_Washington": "Whidbey_Island",
        "North_Windham,_Connecticut": "Windham,_Connecticut",
        "Northeast_Dallas,_Texas": "Dallas",
        "Northeast_Jefferson,_Colorado": "Jefferson,_Colorado",
        "Northeast_Travis,_Texas": "Travis,_Texas",
        "Northwest_Clackamas,_Oregon": "Clackamas,_Oregon",
        "Northwest_Travis,_Texas": "Travis,_Texas",
        "Oak_Bay-Port_Ludlow,_Washington": "Port_Ludlow,_Washington",
        "Oakdale,_Connecticut": "Montville,_Connecticut",
        "Oakland_Gardens,_New_York": "Oakland_Gardens,_Queens,_New_York",
        "Oceanside-Escondido,_California": "Oceanside,_California",
        "Oklahoma_City_Northwest,_Oklahoma": "Oklahoma_City",
        "Oklahoma_City_Southwest,_Oklahoma": "Oklahoma_City",
        "Otis_Orchards,_Washington": "Otis_Orchards-East_Farms,_Washington",
        "Palm_River-Gibsonton,_Florida": "Gibsonton,_Florida",
        "Penllyn,_Pennsylvania": "Lower_Gwynedd_Township,_Montgomery_County,_Pennsylvania",
        "Portland_West,_Oregon": "Portland,_Oregon",
        "Poughquag,_New_York": "Beekman,_New_York",
        "Primos,_Pennsylvania": "Upper_Darby_Township,_Delaware_County,_Pennsylvania",
        "Provo-Orem,_Utah": "Provo,_Utah",
        "Purdys,_New_York": "North_Salem,_New_York",
        "Randolph_Center,_Vermont": "Randolph,_Vermont",
        "Reddick-McIntosh,_Florida": "Reddick,_Florida",
        "Roswell-Alpharetta,_Georgia": "Roswell,_Georgia",
        "Royal_Palm_Beach-West_Jupiter,_Florida": "Royal_Palm_Beach,_Florida",
        "Russian_River-Sonoma_Coast,_California": "Sonoma,_California",
        "San_Antonio_Central,_Texas": "San_Antonio",
        "San_Antonio_North,_Texas": "San_Antonio",
        "Schodack_Landing,_New_York": "Schodack,_New_York",
        "Seattle_East,_Washington": "Seattle",
        "South_Chatham,_Massachusetts": "Chatham,_Massachusetts",
        "North_Chesterfield,_Virginia": "Chesterfield,_Virginia",
        "South_Chesterfield,_Virginia": "Chesterfield,_Virginia",
        "South_Dartmouth,_Massachusetts": "Dartmouth,_Massachusetts",
        "South_Easton,_Massachusetts": "Easton,_Massachusetts",
        "South_El_Dorado,_California": "El_Dorado_Hills,_California",
        "South_Glastonbury,_Connecticut": "South_Glastonbury_Historic_District",
        "South_Grafton,_Massachusetts": "Grafton,_Massachusetts",
        "South_Jefferson,_Kentucky": "Jeffersontown,_Kentucky",
        "South_Prince_George,_Virginia": "Prince_George,_Virginia",
        "North_Prince_George,_Virginia": "Prince_George,_Virginia",
        "South_Richmond_Hill,_New_York": "Richmond_Hill,_Queens",
        "South_Setauket,_New_York": "Setauket-East_Setauket,_New_York",
        "South_Weymouth,_Massachusetts": "Weymouth,_Massachusetts",
        "South_Yakima,_Washington": "Yakima,_Washington",
        "Southeast_Jefferson,_Kentucky": "Jeffersontown,_Kentucky",
        "Southeast_Marin,_California": "Marin_City,_California",
        "Southwest_Dallas,_Texas": "Dallas",
        "Springville-Johnsondale,_California": "Springville,_California",
        "Springville-Mapleton,_Utah": "Springville,_Utah",
        "Stanfordville,_New_York": "Stanford,_New_York",
        "Stormville,_New_York": "East_Fishkill,_New_York",
        "Storrs_Center,_Connecticut": "Storrs,_Connecticut",
        "Sunland,_California": "Sunland-Tujunga,_Los_Angeles",
        "Trenary,_Michigan": "Mathias_Township,_Michigan",
        "Trout_Run,_Pennsylvania": "Lewis_Township,_Lycoming_County,_Pennsylvania",
        "Tujunga,_California": "Sunland-Tujunga,_Los_Angeles",
        "Twain_Harte-Tuolumne_City,_California": "Twain_Harte,_California",
        "Unionville,_Connecticut": "Farmington,_Connecticut",
        "Upper_Jay,_New_York": "Jay,_New_York",
        "Victorville-Hesperia,_California": "Victorville,_California",
        "View_Park,_California": "View_Park–Windsor_Hills,_California",
        "Walla_Walla-College_Place,_Washington": "Walla_Walla,_Washington",
        "Washington_Depot,_Connecticut": "Washington,_Connecticut",
        "Waterville,_Pennsylvania": "Cummings_Township,_Lycoming_County,_Pennsylvania",
        "Wellesley_Hills,_Massachusetts": "Wellesley,_Massachusetts",
        "West_Baldwin,_Maine": "Baldwin,_Maine",
        "West_Collingswood,_New_Jersey": "Haddon_Township,_New_Jersey",
        "West_Collingswood_Heights,_New_Jersey": "Haddon_Township,_New_Jersey",
        "West_Columbia-Cayce,_South_Carolina": "West_Columbia,_South_Carolina",
        "West_Coxsackie,_New_York": "Coxsackie_(village),_New_York",
        "West_Enfield,_Maine": "Enfield,_Maine",
        "West_Santa_Cruz,_California": "Santa_Cruz,_California",
        "West_Springfield,_Pennsylvania": "Springfield_Township,_Erie_County,_Pennsylvania",
        "West_Suffield,_Connecticut": "Suffield,_Connecticut",
        "West_Townsend,_Massachusetts": "Townsend,_Massachusetts",
        "Westdale,_New_York": "Camden,_New_York",
        "Willow_Creek-Hoopa_Valley,_California": "Willow_Creek,_California",
        "Wimauma-Riverview,_Florida": "Riverview,_Florida",
        "Windsor_Hills,_California": "View_Park–Windsor_Hills,_California",
        "Windsor_Mill,_Maryland": "Milford_Mill,_Maryland",
        "Winter_Garden-Ocoee,_Florida": "Ocoee,_Florida",
    }
    # Note: we do not put the approximate matches in the final file
    #df.loc[mask, 'col_to_embed'] = df.loc[mask, 'col_to_embed'].map(approximate_matches).fillna(df.loc[mask, 'col_to_embed'])
    #mask = ~df["col_to_embed"].isin(entity_list)
    #n0, n1 = n1, mask.sum()
    #print(n0 - n1, "additional matches")

    
    # Fall back to the bare city name for rows that are still unmatched
    # (--> 240 extra matches)
    city_only = df["City"].str.replace(" ", "_")
    df.loc[mask, "col_to_embed"] = city_only
    mask = ~df["col_to_embed"].isin(entity_list)
    n0 = n1
    n1 = mask.sum()
    print(n0 - n1, "additional matches")

    # Rows that remain unmatched get their original "City,_State" name back
    city_and_state = df["City"] + ",_" + df["State"]
    df.loc[mask, "col_to_embed"] = city_and_state.str.replace(" ", "_")

    print("Final number of unmatched entities: ", n1)


    # Correct mistakes
    # Names currently in `col_to_embed` that are treated as wrong matches
    # (presumably produced by the city-only fallback -- TODO confirm).
    mistakes = [
        "Ballston_Lake",
        "Bermuda",
        "Blanchard",
        "Briones",
        "Buskirk",
        "Cobalt",
        "Cranberry",
        "Crescent_Lake_(Oregon)",
        "Croatan",
        "Cross_River_(New_York)",
        "District_13",
        "Dundee",
        "Evergreen",
        "Gunpowder",
        "Iron",
        "Jamaica",
        "James_Creek",
        "Labelle",
        "Lovejoy",
        "McClellan",
        "Mears",
        "Orchard_Lake_(Michigan)",
        "Poland",
        "Sears",
        "Silverlake_(Washington)",
        "Slater",
        "Sunset",
        "Valencia",
    ]
    # Replacement name for the subset of mistakes that has one.
    corrections = {
        "Bermuda": "Bermuda_Hundred,_Virginia",
        "Briones": "Briones_Hills",
        "Croatan": "Croatan_Township_Q6028170",
        "Evergreen": "Evergreen,_Tatums_Township,_Columbus_County,_North_Carolina",
        "Iron": "Iron_Junction,_Minnesota",
        "Jamaica": "Jamaica,_Queens",
        "Labelle": "LaBelle,_Florida",
        "Lovejoy": "Brooklyn,_Illinois",
        "McClellan": "McClellan_Park,_California",
        "Orchard_Lake_(Michigan)": "Orchard_Lake_Village,_Michigan",
        "Poland": "Poland,_Herkimer_County,_New_York",
        "Silverlake_(Washington)": "Silver_Lake,_Washington",
        "Sunset": "Sunset,_Montague_County,_Texas",
        "Valencia": "Valencia,_Santa_Clarita,_California",
    }
    mask = df["col_to_embed"].isin(mistakes)
    # Mistakes without a correction fall back to the original "City,_State"
    # name (fillna aligns on the row index).
    original_names = (df["City"] + ",_" + df["State"]).str.replace(" ", "_")
    corrected_names = df.loc[mask, "col_to_embed"].map(corrections)
    df.loc[mask, "col_to_embed"] = corrected_names.fillna(original_names)


    ## Yago4to3
    # Both columns start out as the matched name.
    matched_names = df["col_to_embed"]
    df["yago4_col_to_embed"] = matched_names
    df["yago3_col_to_embed"] = matched_names
    # Names matching "_Q[0-9]{6,10}" are replaced by the plain "City,_State"
    # name in the Yago3 column only.
    has_q_suffix = matched_names.str.contains("_Q[0-9]{6,10}", regex=True)
    mask = has_q_suffix.astype("boolean")
    city_and_state = df["City"] + ",_" + df["State"]
    df.loc[mask, "yago3_col_to_embed"] = city_and_state.str.replace(" ", "_")


    ## Groupby col_to_embed and sum target
    # Select "target" explicitly before summing: the first positional
    # parameter of GroupBy.sum() is `numeric_only`, so `.sum("target")` only
    # dropped the string columns because a non-empty string is truthy.
    group_keys = ["yago3_col_to_embed", "yago4_col_to_embed", "raw_entities"]
    df = df.groupby(group_keys, as_index=False)["target"].sum()


    ## Transform target to log(target)
    df["target"] = np.log10(df["target"])


    ## Save dataframe
    df = df[["raw_entities", "yago3_col_to_embed", "yago4_col_to_embed", "target"]]
    df.to_parquet(dir_path / "target_log.parquet", index=False)