from pathlib import Path
import pandas as pd
import numpy as np

from SEPAL.dataloader import DataLoader


if __name__ == "__main__":
    ## Load original data
    # Only the columns needed to build entity names and the target are read.
    dir_path = Path(__file__).parent
    use_cols = [
        "title",
        "release_date",
        "revenue",
        "original_title",
        "original_language",
        "production_countries",
    ]
    df = pd.read_csv(dir_path / "movies_metadata.csv", usecols=use_cols)
    # The year is taken as the first four characters of the release date string.
    df["year"] = df["release_date"].str[:4]
    df["title"] = df["title"].astype(str)

    ## Filter out movies with no revenue (zero or missing)
    has_revenue = df["revenue"].notna() & (df["revenue"] != 0)
    df = df[has_revenue]

    ## Remove duplicates (fully identical rows; the first occurrence is kept
    ## and the original index labels are preserved)
    df = df.drop_duplicates(keep="first")

    ## Format movie names to match Yago nomenclature: "<Title>_(<year>_film)"
    df["col_to_embed"] = (
        df["title"].str.replace(" ", "_", regex=False) + "_(" + df["year"] + "_film)"
    )

    ## Deal with unmatched entities
    # Get Yago4 entities
    yago_dir = Path(__file__).absolute().parents[2] / "knowledge_graphs"
    yago4_dl = DataLoader(yago_dir / "yago4_with_full_ontology")
    entity_list = list(yago4_dl.entity_to_idx.keys())

    # A row is "unmatched" when its candidate name is not a known Yago4 entity.
    mask = ~df["col_to_embed"].isin(entity_list)
    n0 = mask.sum()
    print("Initial number of unmatched entities: ", n0)

    # Deal with homonymous specific cases (--> 7 extra matches)
    # Maps the default "<Title>_(<year>_film)" name to the hand-picked entity name.
    homonym_fixes = {
        "Repentance_(1984_film)": "Repentance_(1987_film)",
        "Repentance_(2014_film)": "Repentance_(2013_film)",
        "Dracula_(1931_film)": "Dracula_(1931_English-language_film)",
        "Dracula_(1992_film)": "Bram_Stoker's_Dracula_(1992_film)",
        "The_Notebook_(2013_film)": "The_Notebook_(2013_Hungarian_film)",
        "Logan_(2010_film)": "Logan_Q6666921",
        "The_Priest_(1978_film)": "El_sacerdote",
    }
    for default_name, entity_name in homonym_fixes.items():
        df.loc[df["col_to_embed"] == default_name, "col_to_embed"] = entity_name
    mask = ~df["col_to_embed"].isin(entity_list)
    n1 = mask.sum()
    print(n0 - n1, "additional matches")


    # Deal with specific cases
    matches = { # 472 matches
        "Se7en": "Seven_(1995_film)",
        "Baasha": "Baashha",
        "Leon:_The_Professional": "Léon:_The_Professional",
        "New_Nightmare": "Wes_Craven's_New_Nightmare",
        "The_Naked_Gun_33⅓:_The_Final_Insult": "Naked_Gun_33⅓:_The_Final_Insult",
        "Highlander:_The_Final_Dimension": "Highlander_III:_The_Sorcerer",
        "Pokémon_4Ever:_Celebi_-_Voice_of_the_Forest": "Pokémon_4Ever",
        "Pregnant": "The_Pregnant",
        "The_Man_without_a_Face": "The_Man_Without_a_Face",
        "Sirens": "Sirens_(1994_film)",
        "Happy_Weekend": "Happy_Weekend_Q1584318",
        "A_Prince_(almost)_Charming": "Un_prince_(presque)_charmant_Q3548954",
        "Chasing_Trane": "Chasing_Trane:_The_John_Coltrane_Documentary",
        "DragonHeart": "Dragonheart",
        "Bloodsport_II": "Bloodsport_II:_The_Next_Kumite",
        "Dr._Strangelove_or:_How_I_Learned_to_Stop_Worrying_and_Love_the_Bomb": "Dr._Strangelove",
        "Emma": "Emma_(1996_theatrical_film)",
        "Police_Story_3:_Supercop": "Supercop",
        "Life_of_Brian": "Monty_Python's_Life_of_Brian",
        "My_Left_Foot:_The_Story_of_Christy_Brown": "My_Left_Foot",
        "Paris_is_Burning": "Paris_Is_Burning_(film)",
        "GoodFellas": "Goodfellas",
        "Better_Off_Dead...": "Better_Off_Dead_(film)",
        "Pump_up_the_Volume": "Pump_Up_the_Volume_(film)",
        "Alien³": "Alien_3",
        "Kama_Sutra_-_A_Tale_of_Love": "Kama_Sutra:_A_Tale_of_Love",
        "Mrs._Dalloway": "Mrs_Dalloway_(film)",
        "An_Alan_Smithee_Film:_Burn,_Hollywood,_Burn": "An_Alan_Smithee_Film:_Burn_Hollywood_Burn",
        "Six_Days_Seven_Nights": "Six_Days,_Seven_Nights",
        "The_X_Files": "The_X-Files",
        "A_Nightmare_on_Elm_Street_Part_2:_Freddy's_Revenge": "A_Nightmare_on_Elm_Street_2:_Freddy's_Revenge",
        "Jane_Austen's_Mafia!": "Mafia!",
        "Honey_I_Blew_Up_the_Kid": "Honey,_I_Blew_Up_the_Kid",
        "Halloween:_H20": "Halloween_H20:_20_Years_Later",
        "Ever_After:_A_Cinderella_Story": "Ever_After",
        "The_Neverending_Story_II:_The_Next_Chapter": "The_NeverEnding_Story_II:_The_Next_Chapter",
        "Why_Do_Fools_Fall_In_Love": "Why_Do_Fools_Fall_in_Love_(film)",
        "History_of_the_World:_Part_I": "History_of_the_World,_Part_I",
        "Leatherface:_Texas_Chainsaw_Massacre_III": "Leatherface:_The_Texas_Chainsaw_Massacre_III",
        "8MM": "8mm_(film)",
        "Pet_Sematary_II": "Pet_Sematary_Two",
        "The_Concorde..._Airport_'79": "The_Concorde_..._Airport_'79",
        "Jeanne_and_the_Perfect_Guy": "The_Perfect_Guy_(1998_film)",
        "SLC_Punk": "SLC_Punk!",
        "eXistenZ": "Existenz",
        "Hang_'em_High": "Hang_'Em_High",
        "Experience_Preferred...But_Not_Essential": "Experience_Preferred..._But_Not_Essential",
        "Victor/Victoria": "Victor%2FVictoria",
        "Face/Off": "Face%2FOff",
        "Contact": "Contact_(1997_American_film)",
        "Anywhere_But_Here": "Anywhere_but_Here_(film)",
        "Pokémon:_The_First_Movie:_Mewtwo_Strikes_Back": "Pokémon:_The_First_Movie",
        "The_Hand_that_Rocks_the_Cradle": "The_Hand_That_Rocks_the_Cradle_(film)",
        "Death_Wish_2": "Death_Wish_II",
        "Death_Wish_5:_The_Face_of_Death": "Death_Wish_V:_The_Face_of_Death",
        "Jennifer_Eight": "Jennifer_8",
        "Mission:_Impossible_II": "Mission:_Impossible_2",
        "Nine_1/2_Weeks": "9½_Weeks",
        "The_Adventures_of_Rocky_&_Bullwinkle": "The_Adventures_of_Rocky_and_Bullwinkle_(film)",
        "Everything_You_Always_Wanted_to_Know_About_Sex_*But_Were_Afraid_to_Ask": "Everything_You_Always_Wanted_to_Know_About_Sex*_(*But_Were_Afraid_to_Ask)_(film)",
        "Critters_2": "Critters_2:_The_Main_Course",
        "Elvira,_Mistress_of_the_Dark": "Elvira:_Mistress_of_the_Dark_(film)",
        "My_Stepmother_is_an_Alien": "My_Stepmother_Is_an_Alien",
        "Best_of_the_Best_2": "Best_of_the_Best_II",
        "Enemies:_A_Love_Story": "Enemies,_A_Love_Story",
        "Stepfather_II:_Make_Room_For_Daddy": "Stepfather_II",
        "La_Cage_aux_folles": "La_Cage_aux_Folles_(film)",
        "La_Cage_aux_folles_II": "La_Cage_aux_Folles_II",
        "John_Q": "John_Q.",
        "Sällskapsresan_-_eller_finns_det_svenskt_kaffe_på_grisfesten?": "Sällskapsresan",
        "National_Lampoon’s_Van_Wilder": "Van_Wilder",
        "Every_Which_Way_But_Loose": "Every_Which_Way_but_Loose_(film)",
        "xXx": "XXX_(2002_film)",
        "S1m0ne": "Simone_(2002_film)",
        "FearDotCom": "FeardotCom",
        "His_Secret_Life": "The_Ignorant_Fairies",
        "When_Time_Ran_Out...": "When_Time_Ran_Out",
        "Wholly_Moses": "Wholly_Moses!",
        "Adaptation.": "Adaptation_(film)",
        "About_Last_Night...": "About_Last_Night_(1986_film)",
        "Die_Hard:_With_a_Vengeance": "Die_Hard_with_a_Vengeance",
        "Bablo": "_Q4074469",
        "Glukhar_v_kino": "_Q1835010",
        "Pro_Lyuboff": "_Q4379490",
        "Antidur": "_Q4066999",
        "Mommies,_Happy_New_Year!": "_Q4402946",
        "That_still_Karloson!": "_Q4461441",
        "Solovey-Razboynik": "_Q3604731",
        "And_Here's_What's_Happening_to_Me": "_Q4425710",
        "MouseHunt": "Mouse_Hunt",
        "The_Castle": "The_Castle_(1997_Australian_film)",
        "The_Terrorist": "The_Terrorist_(1997_film)",
        "А_поутру_они_проснулись": "_Q4053892",
        "Hitler's_Kaput!": "Hitler_Goes_Kaput!",
        "Chelovek,_kotoryy_znal_vsyo": "The_Man_Who_Knew_Everything",
        "On_the_Hook!": "_Q4310974",
        "Apartment_18": "_Q4112945",
        "A_Cry_in_the_Dark": "Evil_Angels_(film)",
        "...And_Justice_for_All": "...And_Justice_for_All.",
        "Black_and_White": "Black_and_White_(1999_drama_film)",
        "Gossip": "Gossip_(2000_American_film)",
        "Porky's_3:_Revenge": "Porky's_Revenge!",
        "Gone_in_Sixty_Seconds": "Gone_in_60_Seconds_(2000_film)",
        "The_Kid": "Disney's_The_Kid",
        "Head_Over_Heels": "Head_over_Heels_(2001_film)",
        "The_Way_of_the_Dragon": "Way_of_the_Dragon",
        "Black_Rain": "Black_Rain_(1989_American_film)",
        "Every_Which_Way_but_Loose": "Every_Which_Way_but_Loose_(film)",
        "Come_Back_to_the_5_&_Dime,_Jimmy_Dean,_Jimmy_Dean": "Come_Back_to_the_Five_and_Dime,_Jimmy_Dean,_Jimmy_Dean_(film)",
        "Asoka": "Aśoka_(film)",
        "Sunless": "Sans_Soleil",
        "Amandla!_A_Revolution_in_Four-Part_Harmony": "Amandla!:_A_Revolution_in_Four-Part_Harmony",
        "Ringu": "Ring_(film)",
        "Ringu_2": "Ring_2",
        "The_Spanish_Apartment": "L'Auberge_Espagnole",
        "Pokémon_Heroes:_Latios_and_Latias": "Pokémon_Heroes",
        "The_Advocate": "The_Hour_of_the_Pig",
        "L'imbalsamatore": "The_Embalmer_(2002_film)",
        "Lara_Croft_Tomb_Raider:_The_Cradle_of_Life": "Lara_Croft:_Tomb_Raider_–_The_Cradle_of_Life",
        "Kickboxer_2:__The_Road_Back": "Kickboxer_2",
        "Remo_Williams:_The_Adventure_Begins...": "Remo_Williams:_The_Adventure_Begins",
        "Ay,_Carmela!": "¡Ay_Carmela!",
        "Cabin_Fever": "Cabin_Fever_(2002_film)",
        "Lagaan:_Once_Upon_a_Time_in_India": "Lagaan",
        "Kill_Bill:_Vol._1": "Kill_Bill:_Volume_1",
        "Kill_Bill:_Vol._2": "Kill_Bill:_Volume_2",
        "Vivre_Sa_Vie": "My_Life_to_Live",
        "Barbershop_2:__Back_in_Business": "Barbershop_2:_Back_in_Business",
        "Good_bye,_Lenin!": "Good_Bye,_Lenin!",
        "Those_Magnificent_Men_in_Their_Flying_Machines_or_How_I_Flew_from_London_to_Paris_in_25_hours_11_minutes": "Those_Magnificent_Men_in_their_Flying_Machines",
        "Noises_Off...": "Noises_Off_(film)",
        "Bon_Voyage,_Charlie_Brown_(and_Don't_Come_Back!)": "Bon_Voyage,_Charlie_Brown_(and_Don't_Come_Back!!)",
        "The_Legend_I": "Fong_Sai-yuk_(film)",
        "Premature_Burial": "The_Premature_Burial_(film)",
        "Gladiator_1992": "Gladiator_(1992_film)",
        "Batteries_not_Included": "Batteries_Not_Included",
        "Rapa_Nui": "Rapa-Nui_(film)",
        "Secret_Society": "Secret_Society_Q47090809",
        "AVP:_Alien_vs._Predator": "Alien_vs._Predator_(film)",
        "Yu-Gi-Oh!_The_Movie": "Yu-Gi-Oh!_The_Movie:_Pyramid_of_Light",
        "Orca:_The_Killer_Whale": "Orca_(film)",
        "Shall_We_Dance?": "Shall_We_Dance%3F_(2004_film)",
        "All_I_Want_For_Christmas": "All_I_Want_for_Christmas_(film)",
        "L'Âge_d'Or": "L'Age_d'Or",
        "The_Legend_II": "Fong_Sai-yuk_II",
        "Millions": "Millions_(2004_film)",
        "Ong-Bak:_The_Thai_Warrior": "Ong-Bak:_Muay_Thai_Warrior",
        "The_Visual_Bible:_The_Gospel_of_John": "The_Gospel_of_John_(2003_film)",
        "Bummer": "Bimmer_(film)",
        "Tae_Guk_Gi:_The_Brotherhood_of_War": "Taegukgi_(film)",
        "What_the_#$*!_Do_We_(K)now!?": "What_the_Bleep_Do_We_Know!%3F",
        "Daddy's_Dyin'..._Who's_Got_the_Will?": "Daddy's_Dyin':_Who's_Got_the_Will%3F",
        "The_Ballad_of_the_Sad_Cafe": "The_Ballad_of_the_Sad_Café_(film)",
        "A_Lot_Like_Love": "A_Lot_like_Love",
        "xXx:_State_of_the_Union": "XXX:_State_of_the_Union",
        "The_Adventures_of_Sharkboy_and_Lavagirl": "The_Adventures_of_Sharkboy_and_Lavagirl_in_3-D",
        "Herbie_Fully_Loaded": "Herbie:_Fully_Loaded",
        "The_Heart_is_Deceitful_Above_All_Things": "The_Heart_Is_Deceitful_Above_All_Things",
        "The_40_Year_Old_Virgin": "The_40-Year-Old_Virgin",
        "Red_Eye": "Red_Eye_(2005_American_film)",
        "Asterix_&_Obelix_Take_on_Caesar": "Asterix_and_Obelix_vs._Caesar",
        "Just_Like_Heaven": "Just_like_Heaven_(film)",
        "Everything_is_Illuminated": "Everything_Is_Illuminated_(film)",
        "Green_Street_Hooligans": "Green_Street_(film)",
        "Mirrormask": "MirrorMask",
        "The_Curse_of_the_Were-Rabbit": "Wallace_&_Gromit:_The_Curse_of_the_Were-Rabbit",
        "Goal!:_The_Dream_Begins": "Goal!_(film)",
        "Good_Night,_and_Good_Luck.": "Good_Night,_and_Good_Luck",
        "Rumor_Has_It...": "Rumor_Has_It_(film)",
        "Sympathy_for_Lady_Vengeance": "Lady_Vengeance",
        "District_B13": "District_13",
        "Ultimate_Avengers": "Ultimate_Avengers_Q1051702",
        "The_Child": "L'Enfant_(film)",
        "The_Protector": "Tom-Yum-Goong",
        "Stranger_Than_Fiction": "Stranger_than_Fiction_(2006_film)",
        "Mrs_Palfrey_at_The_Claremont": "Mrs._Palfrey_at_the_Claremont",
        "Black_Gold": "Black_Gold_(2011_Qatari_film)",
        "Bon_Cop_Bad_Cop": "Bon_Cop,_Bad_Cop",
        "Borat:_Cultural_Learnings_of_America_for_Make_Benefit_Glorious_Nation_of_Kazakhstan": "Borat",
        "9th_Company": "The_9th_Company",
        "Van_Wilder_2:_The_Rise_of_Taj": "Van_Wilder:_The_Rise_of_Taj",
        "Fur:_An_Imaginary_Portrait_of_Diane_Arbus": "Fur_(film)",
        "INLAND_EMPIRE": "Inland_Empire_(film)",
        "Romanzo_criminale": "Romanzo_Criminale",
        "Fantastic_4:_Rise_of_the_Silver_Surfer": "Fantastic_Four:_Rise_of_the_Silver_Surfer",
        "Watch_Out,_We're_Mad": "Watch_Out,_We're_Mad!",
        "Black_Sheep": "Black_Sheep_(2006_New_Zealand_film)",
        "I'm_Not_There.": "I'm_Not_There",
        "Aliens_vs_Predator:_Requiem": "Aliens_vs._Predator:_Requiem",
        "The_Water_Horse": "The_Water_Horse:_Legend_of_the_Deep",
        "Dragon_Wars:_D-War": "D-War",
        "[REC]": "Rec_(film)",
        "Dark_Angel": "I_Come_in_Peace",
        "Mongol:_The_Rise_of_Genghis_Khan": "Mongol_(film)",
        "The_Visitor": "The_Visitor_(2007_drama_film)",
        "The_Bread,_My_Sweet": "A_Wedding_for_Bella",
        "Bigger_Stronger_Faster*": "Bigger,_Stronger,_Faster*",
        "When_Did_You_Last_See_Your_Father?": "And_When_Did_You_Last_See_Your_Father%3F",
        "WALL·E": "WALL-E",
        "Jimmy_Carter_Man_from_Plains": "Man_from_Plains",
        "The_X_Files:_I_Want_to_Believe": "The_X-Files:_I_Want_to_Believe",
        "The_Mother_of_Tears": "Mother_of_Tears",
        "Nick_and_Norah's_Infinite_Playlist": "Nick_&_Norah's_Infinite_Playlist",
        "The_Tulse_Luper_Suitcases,_Part_1:_The_Moab_Story": "The_Tulse_Luper_Suitcases,_Part_1:_The_Moab_Story_Q4508839",
        "The_Express": "The_Express:_The_Ernie_Davis_Story",
        "RockNRolla": "RocknRolla",
        "Mesrine:_Killer_Instinct": "Mesrine_(2008_film)",
        "He's_Just_Not_That_Into_You": "He's_Just_Not_That_into_You_(film)",
        "Rock-A-Doodle": "Rock-a-Doodle",
        "Monsters_vs_Aliens": "Monsters_vs._Aliens",
        "Nothing_But_the_Truth": "Nothing_but_the_Truth_(2008_American_film)",
        "The_Taking_of_Pelham_1_2_3": "The_Taking_of_Pelham_123_(2009_film)",
        "(500)_Days_of_Summer": "500_Days_of_Summer",
        "The_Inhabited_Island": "Dark_Planet_(film)",
        "Pagafantas": "Friend_Zone_(film)",
        "Evangelion:_1.0:_You_Are_(Not)_Alone": "Evangelion:_1.0_You_Are_(Not)_Alone",
        "The_Bad_Lieutenant:_Port_of_Call_-_New_Orleans": "Bad_Lieutenant:_Port_of_Call_New_Orleans",
        "Nine": "Nine_(2009_live-action_film)",
        "The_Girl_Who_Kicked_the_Hornet's_Nest": "The_Girl_Who_Kicked_the_Hornets'_Nest_(film)",
        "More_Than_a_Game": "More_than_a_Game",
        "[REC]²": "Rec_2",
        "Frozen": "Frozen_(2010_American_film)",
        "Ca$h": "Cash_(2010_film)",
        "The_Back-Up_Plan": "The_Back-up_Plan",
        "How_About_You...": "How_About_You_(film)",
        "TEKKEN": "Tekken_(2009_film)",
        "Cats_&_Dogs_2_:_The_Revenge_of_Kitty_Galore": "Cats_&_Dogs:_The_Revenge_of_Kitty_Galore",
        "Life_As_We_Know_It": "Life_as_We_Know_It_(film)",
        "PEEPLI_[Live]": "Peepli_Live",
        "RED": "Red_(2010_film)",
        "TRON:_Legacy": "Tron:_Legacy",
        "The_Secret_World_of_Arrietty": "Arrietty",
        "Tucker_and_Dale_vs_Evil": "Tucker_&_Dale_vs._Evil",
        "Who_Is_Harry_Nilsson_(And_Why_Is_Everybody_Talkin'_About_Him?)": "Who_Is_Harry_Nilsson_(And_Why_Is_Everybody_Talkin'_About_Him)%3F",
        "Hum_Aapke_Hain_Koun": "Hum_Aapke_Hain_Koun..!",
        "It's_Alive": "It's_Alive_(2009_film)",
        "First_Love": "First_Love_(2004_drama_film)",
        "Happythankyoumoreplease": "Happy._Thank_You._More._Please.",
        "Super": "Super_(2010_American_film)",
        "Troll_Hunter": "Trollhunter",
        "Death_Note:_The_Last_Name": "Death_Note_2:_The_Last_Name",
        "The_Arbor": "The_Arbor_Q65840668",
        "Atlas_Shrugged_Part_I": "Atlas_Shrugged:_Part_I",
        "Welcome_to_the_South": "Benvenuti_al_Sud",
        "Stolen": "Stolen_(2009_American_film)",
        "The_Other_Woman": "The_Other_Woman_(2009_film)",
        "Yoo_Hoo_Mrs_Goldberg": "Yoo-Hoo,_Mrs._Goldberg",
        "Crazy,_Stupid,_Love.": "Crazy,_Stupid,_Love",
        "Glee:_The_Concert_Movie": "Glee:_The_3D_Concert_Movie",
        "A_Very_Harold_&_Kumar_Christmas": "A_Very_Harold_&_Kumar_3D_Christmas",
        "The_Greatest_Movie_Ever_Sold": "POM_Wonderful_Presents:_The_Greatest_Movie_Ever_Sold",
        "50/50": "50%2F50_(2011_film)",
        "Mulan:_Rise_of_a_Warrior": "Mulan_(2009_film)",
        "Hipsters": "Stilyagi_(film)",
        "Six_Degrees_of_Celebration": "Yolki",
        "Louis_C.K.:_Live_at_the_Beacon_Theater": "Live_at_the_Beacon_Theater",
        "Chapiteau-Show": "_Q56305361",
        "The_Hunter": "The_Hunter_(2011_Australian_film)",
        "L!fe_Happens": "Life_Happens",
        "Bernie": "Bernie_(2011_film)",
        "4.3.2.1": "4.3.2.1.",
        "For_Greater_Glory_-_The_True_Story_of_Cristiada": "For_Greater_Glory",
        "Hit_&_Run": "Hit_and_Run_(2012_film)",
        "Hoodwinked_Too!_Hood_VS._Evil": "Hoodwinked_Too!_Hood_vs._Evil",
        "[REC]³_Genesis": "Rec_3:_Genesis",
        "Turkish_Passion": "The_Turkish_Passion",
        "Silent_Hill:_Revelation_3D": "Silent_Hill:_Revelation",
        "The_Shaolin_Temple": "Shaolin_Temple_(1982_film)",
        "Shaolin_Temple_2:_Kids_from_Shaolin": "Kids_From_Shaolin",
        "Nursery_University": "Nursery_University",
        "Le_Chef": "The_Chef_(film)",
        "The_Pirate_Bay:_Away_From_Keyboard": "TPB_AFK",
        "A_Coffee_in_Berlin": "A_Coffee_In_Berlin",
        "Satan's_Blood": "_Q5837517",
        "I'm_So_Excited!": "I'm_So_Excited_(film)",
        "InAPPropriate_Comedy": "Inappropriate_Comedy",
        "Cycling_with_Molière": "Bicycling_with_Molière",
        "Redd_Inc.": "Redd_Inc._Q18340945",
        "The_Way_Way_Back": "The_Way,_Way_Back",
        "The_Breath": "Breath_(2009_film)",
        "Adore": "Adoration_(2013_film)",
        "What_If": "The_F_Word_(2013_film)",
        "Blue_Is_the_Warmest_Color": "Blue_Is_the_Warmest_Colour",
        "The_Double": "The_Double_(2013_film)",
        "Pieta": "Pietà_(film)",
        "One_Piece_Film_Strong_World": "One_Piece_Film:_Strong_World",
        "The_Nutcracker:_The_Untold_Story": "The_Nutcracker_in_3D",
        "Aquí_Entre_Nos": "_Q28664077",
        "Good_Ol’_Freda": "Good_Ol’_Freda_Q13107958",
        "Underdogs": "Underdogs_(2013_Argentine_film)",
        "The_Final_Cut": "The_Final_Cut_(1995_film)",
        "Nymphomaniac:_Vol._II": "Nymphomaniac_Part_Two_Q21468405",
        "Niko_2_-_Little_Brother,_Big_Trouble": "Little_Brother,_Big_Trouble:_A_Christmas_Adventure",
        "Joe": "Joe_(2013_film)",
        "Under_the_Skin": "Under_the_Skin_(2013_film)",
        "Blank_City": "Blank_City_Q5729942",
        "Heaven_is_for_Real": "Heaven_Is_for_Real_(film)",
        "Tracks": "Tracks_(2013_film)",
        "Bikini_Spring_Break": "Bikini_Spring_Break_Q4907409",
        "Human_Capital": "Human_Capital_(2013_film)",
        "The_Internet's_Own_Boy:_The_Story_of_Aaron_Swartz": "The_Internet's_Own_Boy",
        "The_Strange_Color_of_Your_Body's_Tears": "The_Strange_Colour_of_Your_Body's_Tears",
        "Step_Up_All_In": "Step_Up:_All_In",
        "Atlas_Shrugged_Part_III:_Who_is_John_Galt?": "Atlas_Shrugged_Part_III:_Who_Is_John_Galt%3F",
        "House_III:_The_Horror_Show": "The_Horror_Show",
        "The_Disappearance_of_Eleanor_Rigby:_Them": "The_Disappearance_of_Eleanor_Rigby",
        "Felix_The_Cat:_The_Movie": "Felix_the_Cat:_The_Movie",
        "Cowgirls_n'_Angels": "Cowgirls_'n_Angels",
        "Dead_Snow_2:_Red_vs._Dead": "Dead_Snow:_Red_vs._Dead",
        #"The_Inhabited_Island_2:_Rebellion": "Dark_Planet_(film)",
        "20th_Century_Boys_-_Chapter_1:_Beginning_of_the_End": "20th_Century_Boys_1:_Beginning_of_the_End_Q2814791",
        "The_Protector_2": "Tom_Yum_Goong_2",
        "A_Gorgeous_Girl_Like_Me": "Such_a_Gorgeous_Kid_Like_Me",
        "No_Retreat,_No_Surrender_2:_Raging_Thunder": "No_Retreat,_No_Surrender_2",
        "One_Bright_Shining_Moment": "One_Bright_Shining_Moment:_The_Forgotten_Summer_of_George_McGovern",
        "The_Woman_in_Black_2:_Angel_of_Death": "The_Woman_in_Black:_Angel_of_Death",
        "Pancho,_el_perro_millonario": "Millionaire_Dog_Q17478195",
        "Forbidden_Kingdom": "Viy_(2014_film)",
        "Nurse_3-D": "Nurse_3D",
        "Tad,_the_Lost_Explorer": "Tad,_The_Lost_Explorer",
        "Torrente_5:_Operación_Eurovegas": "Torrente_5:_Operación_Eurovegas_Q16641781",
        "El_robobo_de_la_jojoya": "_Q5826504",
        "Soccer_Days": "Football_Days",
        "Aquí_llega_Condemor,_el_pecador_de_la_pradera": "_Q623438",
        "El_asombroso_mundo_de_Borjamari_y_Pocholo": "El_asombroso_mundo_de_Borjamari_y_Pocholo_Q3310984",
        "FBI:_Frikis_buscan_incordiar": "_Q5853907",
        "[REC]⁴_Apocalypse": "Rec_4:_Apocalypse",
        "Sällskapsresan_II_-_Snowroller": "Sällskapsresan_2_–_Snowroller",
        "Soulless": "Dukhless",
        "Kidnapping_Mr._Heineken": "Kidnapping_Freddy_Heineken",
        "Insurgent": "The_Divergent_Series:_Insurgent",
        "La_vérité_si_je_mens_!": "La_Vérité_si_je_mens_!",
        "Erkan_&_Stefan_2": "Erkan_&_Stefan_gegen_die_Mächte_der_Finsternis_Q1354951",
        "Ryaba,_My_Chicken": "Assia_and_the_Hen_with_the_Golden_Eggs",
        "Hungry_Hearts": "Hungry_Hearts_(2014_film)",
        "Parts_Per_Billion": "Parts_per_Billion",
        "Table_For_Five": "Table_for_Five",
        "Tinker_Bell_and_the_Pirate_Fairy": "The_Pirate_Fairy",
        "Falling_From_Grace": "Falling_from_Grace_(film)",
        "Barbie_in_'A_Christmas_Carol'": "Barbie_in_'A_Christmas_Carol'_Q284162",
        "B/W": "_Q16716103",
        "Jimmy_P.": "Jimmy_P:_Psychotherapy_of_a_Plains_Indian",
        "Plus_one": "Plus_One_(2008_film)",
        "Allegiant": "The_Divergent_Series:_Allegiant",
        "He-Man_and_She-Ra:_The_Secret_of_the_Sword": "The_Secret_of_the_Sword",
        "Love_and_the_City": "Love_in_the_Big_City",
        "El_truco_del_manco": "_Q6428320",
        "La_peggior_settimana_della_mia_vita": "The_Worst_Week_of_My_Life_(film)",
        "Welcome_Mr._President!": "Welcome_Mr._President",
        "A_Whole_Life_Ahead": "Your_Whole_Life_Ahead_of_You",
        "Microbe_and_Gasoline": "Microbe_&_Gasoline",
        "Kaiji_2:_The_Ultimate_Gambler": "_Q3812483",
        "The_Witch": "The_Witch_(2015_film)",
        "The_Visit": "The_Visit_(2015_American_film)",
        "Always_-_Sunset_on_Third_Street": "Always_Sanchōme_no_Yūhi",
        "OPA!": "Opa!",
        "Hate_Crime": "Hate_Crime_(2005_film)",
        "¿Quién_mató_a_Bambi?": "Who_Killed_Bambi%3F_(2013_film)",
        "La_Vérité_si_je_Mens_!_3": "La_Vérité_si_je_mens_!_3_Q3213979",
        "Would_I_Lie_to_You?_2": "La_Vérité_si_je_mens_!_2_Q3213978",
        "Puella_Magi_Madoka_Magica_the_Movie_Part_III:_Rebellion": "Puella_Magi_Madoka_Magica:_Rebellion_Q17479150",
        "Rita's_Last_Fairy_Tale": "_Q4373874",
        "Hard": "Hard_Q3127330",
        "William_&_Kate": "William_&_Kate:_The_Movie",
        "Faster_than_Rabbits": "_Q15709915",
        "What_Men_Still_Talk_About": "_Q4328873",
        "The_Letters": "The_Letters_(2014_film)",
        "F.C._De_Kampioenen:_Kampioen_zijn_blijft_plezant": "F.C._De_Kampioenen:_Kampioen_zijn_blijft_plezant_Q3342853",
        "Crazy_About_Ya": "_Q3191414",
        "Hell_&_Back": "Hell_and_Back_(film)",
        "My_King": "Mon_Roi",
        "ABCD_(Any_Body_Can_Dance)": "ABCD:_Any_Body_Can_Dance",
        "Orleans": "Orlean_(film)",
        "Requiem_for_the_American_Dream": "Requiem_for_the_American_Dream_Q24273547",
        "Agnivarsha:_The_Fire_and_the_Rain": "Agni_Varsha",
        "Freddie_as_F.R.O.7.": "Freddie_as_F.R.O.7",
        "Eddie:_The_Sleepwalking_Cannibal": "Eddie:_The_Sleepwalking_Cannibal_Q3047233",
        "The_Life_of_Guskou_Budori": "The_Life_of_Budori_Gusuko",
        "Garv:_Pride_and_Honour": "Garv:_Pride_&_Honour",
        "Chaahat_Ek_Nasha...": "Chaahat_–_Ek_Nasha",
        "3_Braves": "3_Bahadur",
        "Dear_Guest,_When_Will_You_Leave?": "Atithi_Tum_Kab_Jaoge%3F",
        "Mr._Right": "Mr._Right_(2015_film)",
        "Elections_Day_2": "_Q21843265",
        "Too_Late": "Too_Late_(2015_film)",
        "Villan": "Villain_(2002_film)",
        "Mugavari": "Mugavaree",
        "7Aum_Arivu": "7aum_Arivu",
        "Sky_Of_Love": "Koizora_(film)",
        "Delhi_Dance": "_Q7850242",
        "Boris_-_Il_film": "Boris:_The_Film",
        "The_Life_and_Death_of_9413,_a_Hollywood_Extra": "The_Life_and_Death_of_9413:_a_Hollywood_Extra",
        "El_Estudiante": "El_estudiante",
        "La_caliente_niña_Julietta": "The_Hot_Girl_Juliet_Q9019115",
        "Charlie": "Charlie_(2015_Malayalam_film)",
        "The_Democratic_Terrorist": "_Q1198435",
        "Gabo:_The_Creation_of_Gabriel_Garcia_Marquez": "Gabo,_la_creación_de_Gabriel_García_Márquez",
        "Lift": "Lift_Q4264074",
        "Your_Name.": "Your_Name",
        "Miesten_välisiä_keskusteluja": "Miesten_välisiä_keskusteluja_Q11882474",
        "The_Work_and_the_Glory_II:_American_Zion": "The_Work_and_the_Glory:_American_Zion_(film)",
        "The_Book_of_Mormon_Movie,_Volume_1:_The_Journey": "The_Book_of_Mormon_Movie",
        "My_Life_as_a_Zucchini": "My_Life_as_a_Courgette",
        "Musudan": "_Q22973788",
        "Supersonic": "Oasis:_Supersonic",
        "Kabadayı": "For_Love_and_Honor",
        "Ivan_Tsarevich_&_the_Grey_Wolf_2": "Ivan_Tsarevich_and_the_Gray_Wolf_2",
        "Ivan_Tsarevich_&_the_Grey_Wolf_3": "Ivan_Tsarevich_and_the_Gray_Wolf_3",
        "Alesha_Popovich_and_Tugarin_the_Dragon": "Alesha_Poppovich_and_the_Snake's_Tugarin_Q2381788",
        "Nikitich_and_The_Dragon": "Dobrinya_and_the_Dragon",
        "Max_&_Leon": "La_folle_histoire_de_Max_et_Léon",
        "Rogue_One:_A_Star_Wars_Story": "Rogue_One",
        "Split": "Split_(2016_American_film)",
        "Tri_bogatyrya_i_Shamakhanskaya_tsaritsa": "_Q4462820",
        "The_Tobacconist_of_Vallecas": "_Q3208507",
        "Sing": "Sing_(2016_American_film)",
        "xXx:_Return_of_Xander_Cage": "XXX:_Return_of_Xander_Cage",
        "One_Piece_Film:_GOLD": "One_Piece_Film:_Gold",
        "Black_Snow": "Black_Snow_Q28032998",
        "Queen_of_Spades:_The_Dark_Rite": "_Q21148916",
        "Gantz:O": "Gantz:_O",
        "Fist_of_the_North_Star:_Legend_of_Raoh_-_Chapter_of_Death_in_Love": "_Q16642239",
        "Zombie_Fever": "Zombie_Holidays_3D",
        "Rescue_Under_Fire": "Rescue_Under_Fire_Q42915336",
        "Ultimate_Avengers_2": "Ultimate_Avengers_2_Q590098",
        "CHiPS": "CHiPs_(film)",
        "Sky._Plane._Girl.": "_Q4315527",
        "Spark:_A_Space_Tail": "Spark_(2016_film)",
        "The_Spacewalker": "The_Age_of_Pioneers",
        "Королёв": "_Q4233902",
        "Bizim_Aile": "_Q19473356",
        "Horoscope_for_Good_Luck": "_Q21638226",
        "Nasha_Russia:_Yaytsa_sudby": "Our_Russia._The_Balls_of_Fate",
        "The_Best_Movie_3-DE": "The_Best_Movie_3-De",
        "The_New_Year's_Rate_Plan": "_Q2120297",
        "A_Few_Less_Men": "A_Few_Less_Men_Q29907988",
        "The_Irony_of_Fate._The_Sequel": "The_Irony_of_Fate_2",
        "SuperManager,_or_Hack_of_the_Fate": "_Q1653410",
        "Petersburg:_Only_for_Love": "_Q27988104",
        "Boj_S_Tenyu_2:_Revansh": "Shadowboxing_2:_Revenge",
        "Dark_World:_Equilibrium": "Dark_World:_Equilibrium_Q15383029",
        "Lucky_Island": "_Q15270893",
        "Hooked_on_the_Game_2._The_Next_Level": "Hooked_2_Q4310959",
        "We_Are_from_the_Future_2": "_Q3624680",
        "Lovey-Dovey_2": "_Q4271793",
        "Love_and_the_City_2": "Love_in_the_Big_City_2",
        "Antikiller_2:_Antiterror": "Antikiller_2:_Antiterror_Q4067009",
        "Antikiller_D.K": "Antikiller_3_Q4067010",
        "The_Spy": "Spy_(2012_Russian_film)",
        "Ticket_to_Vegas": "A_ticket_to_Vegas_Q4086678",
        "Mechenosets": "The_Sword_Bearer",
        "Don't_Even_Think": "_Q4154027",
        "Chaos": "Chaos_(2005_Capitol_film)",
        "Doctor_Dolittle": "Dr._Dolittle_(1998_film)",
        "Butterfly": "Butterfly's_Tongue",
        "Crime_+_Punishment_in_Suburbia": "Crime_and_Punishment_in_Suburbia",
        "M*A*S*H": "MASH_(film)",
        "Salton_Sea": "The_Salton_Sea_(2002_film)",
        "Gloomy_Sunday": "Ein_Lied_von_Liebe_und_Tod",
        "Kops": "Kopps",
        "The_Rocket:_The_Legend_of_Rocket_Richard": "The_Rocket_(2005_film)",
        "Il_Divo": "Il_divo_(film)",
        "9": "9_(2009_animated_film)",
        "Picture_Me": "Picture_Me:_A_Model's_Diary_Q3382574",
        "The_Invincible_Iron_Man": "The_Invincible_Iron_Man_Q1782442",
        "Mud": "Mud_(2012_film)",
        "Gwendoline": "The_Perils_of_Gwendoline_in_the_Land_of_the_Yik-Yak",
        "Target": "_Q4298409",
        "Welcome_to_the_North": "Benvenuti_al_Nord",
        "The_Nightmare": "The_Nightmare_(2015_American_film)",
        "Demolition": "Demolition_(2015_film)",
        "Bodyguard": "Bodyguard_(2011_Hindi_film)",
        "Ed_Gein": "In_the_Light_of_the_Moon",
        "Easy_on_the_Eyes": "_Q20081557",
        "The_Battalion": "Battalion_(2015_film)",
    }
    def _underscored(titles):
        """Return ``titles`` with every space replaced by an underscore."""
        return titles.str.replace(" ", "_", regex=False)

    def _substitute(names, pairs):
        """Apply each literal ``(old, new)`` substitution to ``names``, in order."""
        for old, new in pairs:
            # regex=False: patterns such as "?" must be taken literally,
            # whatever default the installed pandas uses for ``regex``.
            names = names.str.replace(old, new, regex=False)
        return names

    # Literal substitutions shared by several steps below.
    special_characters = [("/", "%2F"), ("?", "%3F"), ("’", "'")]
    morphological_variants = [
        ("Twelve_", "12_"),
        ("_To_", "_to_"),
        ("_An_", "_an_"),
        ("_Till_", "_till_"),
        ("_With_", "_with_"),
        ("_Of_", "_of_"),
        ("_The_", "_the_"),
        ("_But_", "_but_"),
        ("_By_", "_by_"),
    ]
    simplified_punctuation = [(":_", "_"), (",_", "_"), ("_&_", "_and_")]

    # Ordered fallback strategies. Each one receives the rows that are still
    # unmatched and returns their new candidate names. Order matters: a step
    # only sees rows the previous steps failed to match, and steps reading
    # "col_to_embed" build on the candidate written by the step before them.
    # The counts in the comments are the ones recorded by the original author.
    fallback_steps = [
        # Hand-curated title -> entity table; titles absent from the table
        # keep their current candidate name
        lambda rows: _underscored(rows["title"]).map(matches).fillna(rows["col_to_embed"]),
        # Try "title_(film)" format for non matches (--> 1838 extra matches)
        lambda rows: _underscored(rows["title"]) + "_(film)",
        # Correct special characters (--> 4 extra matches)
        lambda rows: _substitute(rows["col_to_embed"], special_characters),
        # String-level morphological variations (--> 2 extra matches)
        lambda rows: _substitute(rows["col_to_embed"], [("_With_", "_with_")]),
        # Try "title" format for non matches (--> 3401 extra matches)
        lambda rows: _underscored(rows["title"]),
        # Correct special characters (--> 22 extra matches)
        lambda rows: _substitute(rows["col_to_embed"], special_characters),
        # String-level morphological variations (--> 17 extra matches)
        lambda rows: _substitute(rows["col_to_embed"], morphological_variants),
        # Remove punctuation for non matches (--> 17 extra matches)
        lambda rows: _substitute(rows["col_to_embed"], simplified_punctuation),
        # Modify punctuation: hyphen -> en dash (--> 9 extra matches)
        lambda rows: _substitute(_underscored(rows["title"]), [("-", "–")]),
        # Modify punctuation: colon -> "_–" (--> 3 extra matches)
        lambda rows: _substitute(_underscored(rows["title"]), [(":", "_–")]),
        # Try "original_title" format for non matches (--> 37 extra matches)
        lambda rows: _underscored(rows["original_title"]),
        # Try "original_title_(film)" format (--> 5 extra matches)
        lambda rows: _underscored(rows["original_title"]) + "_(film)",
    ]

    for propose_names in fallback_steps:
        # Only rows that are still unmatched are rewritten; the assignment is
        # aligned on the index of the returned Series.
        df.loc[mask, "col_to_embed"] = propose_names(df.loc[mask])
        mask = ~df["col_to_embed"].isin(entity_list)
        n0, n1 = n1, mask.sum()
        print(n0 - n1, "additional matches")

    print("Final number of unmatched entities: ", n1)


    # Restore original format for non-matches: rows still unmatched go back to
    # the default "<Title>_(<year>_film)" name (assignment is index-aligned).
    df.loc[mask, "col_to_embed"] = (
        df["title"].str.replace(" ", "_", regex=False) + "_(" + df["year"] + "_film)"
    )

    # Correct mistakes
    # NOTE(review): presumably these bare titles matched a Yago entity that is
    # not the film itself -- confirm against the knowledge graph.
    corrections = {
        "The_Doors": "The_Doors_(film)",
        "2010": "2010:_The_Year_We_Make_Contact",
    }
    mask = df["col_to_embed"].isin(list(corrections))
    # Rows flagged by their current name are corrected through their title;
    # a flagged row whose title is not in the table keeps its current name.
    df.loc[mask, "col_to_embed"] = (
        df.loc[mask, "title"]
        .str.replace(" ", "_", regex=False)
        .map(corrections)
        .fillna(df.loc[mask, "col_to_embed"])
    )
    df.loc[df["original_title"] == "Суперплохие", "col_to_embed"] = "Superbad_(2016_film)"

    # Check that each col_to_embed entry is unique.
    # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
    if not df["col_to_embed"].is_unique:
        duplicated_names = df.loc[
            df["col_to_embed"].duplicated(keep=False), "col_to_embed"
        ]
        raise ValueError(
            "Entries of col_to_embed should be unique but are not. "
            f"Duplicated values: {sorted(duplicated_names.astype(str).unique())}"
        )


    ## Final preprocessing steps
    # Regression target: base-10 logarithm of the revenue
    df["target"] = np.log10(df["revenue"])

    # Raw entity names: "<title>, <release date>"
    df["raw_entities"] = df["title"] + ", " + df["release_date"]

    # Account for specificities of Yago3 and Yago4
    # Yago4 uses the matched names unchanged. For Yago3, names carrying a
    # "_Q<6-10 digits>" suffix are replaced by the default
    # "<Title>_(<year>_film)" name (index-aligned assignment).
    default_names = df["title"].str.replace(" ", "_") + "_(" + df["year"] + "_film)"
    has_wikidata_id = (
        df["col_to_embed"].str.contains("_Q[0-9]{6,10}", regex=True).astype("boolean")
    )
    df["yago4_col_to_embed"] = df["col_to_embed"]
    df["yago3_col_to_embed"] = df["col_to_embed"]
    df.loc[has_wikidata_id, "yago3_col_to_embed"] = default_names

    # Keep only relevant columns
    output_columns = ["raw_entities", "yago3_col_to_embed", "yago4_col_to_embed", "target"]
    df = df[output_columns]

    ## Save dataframe
    df.to_parquet(dir_path / "target_log.parquet", index=False)