import pandas as pd
from tqdm import tqdm

from SEPAL import SEPAL_DIR
from SEPAL.utils import get_downstream_wikidb_files


def yago45_matching() -> None:
    """Add a ``yago4.5_col_to_embed`` column to every downstream target file.

    The two YAGO4.5-to-Wikidata link tables are concatenated, then each file
    returned by ``get_downstream_wikidb_files()`` is left-merged with the
    result on its ``wikidata_col_to_embed`` column. Rows without a match get
    a null in the new column. Every target file is overwritten in place.

    The function can be run repeatedly: an output column left by a previous
    run is discarded before the merge instead of producing a duplicate
    column name (which the parquet writer rejects).
    """
    print("Processing YAGO4.5 matching")
    output_col = "yago4.5_col_to_embed"
    # Get the list of target files
    target_files = get_downstream_wikidb_files()
    # Get the yago4.5 to wikidata mapping (two link files stacked together)
    yago45_to_wd = pd.concat(
        [
            pd.read_parquet(
                SEPAL_DIR
                / "datasets/knowledge_graphs/yago4.5/link_to_wikidata.parquet"
            ),
            pd.read_parquet(
                SEPAL_DIR
                / "datasets/knowledge_graphs/yago4.5/link_to_wikidata_BW.parquet"
            ),
        ]
    )
    # NOTE(review): if the stacked mapping holds several rows for the same
    # 'Wikidata_entity', the left merge below repeats the target row once per
    # mapping row -- confirm whether that is intended.
    # Map the downstream entities
    for target_path in tqdm(target_files, desc="Processing targets", unit="file"):
        target = pd.read_parquet(target_path)
        # Discard the result of an earlier run so the rename below cannot
        # create two columns with the same name.
        target = target.drop(columns=output_col, errors="ignore")
        target = (
            target.merge(
                yago45_to_wd,
                left_on="wikidata_col_to_embed",
                right_on="Wikidata_entity",
                how="left",
            )
            .rename(columns={"Yago_entity": output_col})
            # The right-hand key only duplicates 'wikidata_col_to_embed'.
            .drop(columns="Wikidata_entity")
        )
        # Save target
        target.to_parquet(target_path)


def yago4_matching() -> None:
    """Add a ``yago4_col_to_embed`` column to every downstream target file.

    Each file returned by ``get_downstream_wikidb_files()`` is left-merged
    with the YAGO4-to-Wikidata link table on its ``wikidata_col_to_embed``
    column. Rows without a match get a null in the new column. Every target
    file is overwritten in place.

    The function can be run repeatedly: an output column left by a previous
    run is discarded before the merge instead of producing a duplicate
    column name (which the parquet writer rejects).
    """
    print("Processing YAGO4 matching")
    output_col = "yago4_col_to_embed"
    # Get the list of target files
    target_files = get_downstream_wikidb_files()
    # Get the yago4 to wikidata mapping
    yago4_to_wd = pd.read_parquet(
        SEPAL_DIR / "datasets/knowledge_graphs/yago4/link_to_wikidata.parquet"
    )
    # NOTE(review): if the mapping holds several rows for the same
    # 'Wikidata_entity', the left merge below repeats the target row once per
    # mapping row -- confirm whether that is intended.
    # Map the downstream entities
    for target_path in tqdm(target_files, desc="Processing targets", unit="file"):
        target = pd.read_parquet(target_path)
        # Discard the result of an earlier run so the rename below cannot
        # create two columns with the same name.
        target = target.drop(columns=output_col, errors="ignore")
        target = (
            target.merge(
                yago4_to_wd,
                left_on="wikidata_col_to_embed",
                right_on="Wikidata_entity",
                how="left",
            )
            .rename(columns={"Yago_entity": output_col})
            # The right-hand key only duplicates 'wikidata_col_to_embed'.
            .drop(columns="Wikidata_entity")
        )
        # Save target
        target.to_parquet(target_path)


def freebase_matching() -> None:
    """Add a ``freebase_col_to_embed`` column to every downstream target file.

    Each file returned by ``get_downstream_wikidb_files()`` is left-merged
    with the YAGO4-to-Freebase link table, matching the target's
    ``yago4_col_to_embed`` column (written by ``yago4_matching``) against the
    table's ``Yago_entity`` column. Rows without a match get a null in the
    new column. Every target file is overwritten in place.

    The function can be run repeatedly: an output column left by a previous
    run is discarded before the merge instead of producing a duplicate
    column name (which the parquet writer rejects).
    """
    print("Processing Freebase matching")
    output_col = "freebase_col_to_embed"
    # Get the list of target files
    target_files = get_downstream_wikidb_files()
    # Get the Freebase to Yago4 mapping
    yago4_to_fb = pd.read_parquet(
        SEPAL_DIR / "datasets/knowledge_graphs/yago4/link_to_freebase.parquet"
    )
    # 'yago4_col_to_embed' comes from a left merge, so it can hold nulls, and
    # pandas matches null keys with each other. Remove null-keyed mapping rows
    # so unmatched targets stay unmatched (same precaution as yago3_matching).
    yago4_to_fb = yago4_to_fb.dropna(subset=["Yago_entity"])
    # Map the downstream entities
    for target_path in tqdm(target_files, desc="Processing targets", unit="file"):
        target = pd.read_parquet(target_path)
        # Discard the result of an earlier run so the rename below cannot
        # create two columns with the same name.
        target = target.drop(columns=output_col, errors="ignore")
        target = (
            target.merge(
                yago4_to_fb,
                left_on="yago4_col_to_embed",
                right_on="Yago_entity",
                how="left",
            )
            .rename(columns={"Freebase_entity": output_col})
            # The right-hand key only duplicates 'yago4_col_to_embed'.
            .drop(columns="Yago_entity")
        )
        # Save target
        target.to_parquet(target_path)


def yago3_matching() -> None:
    """Add a ``yago3_col_to_embed`` column to every downstream target file.

    A YAGO3-to-YAGO4 mapping is built by joining the YAGO3-to-DBpedia TSV
    with the YAGO4-to-DBpedia link table on the DBpedia entity name. Each
    file returned by ``get_downstream_wikidb_files()`` is then left-merged
    with that mapping on ``yago4_col_to_embed``, so the targets must already
    carry that column (it is written by ``yago4_matching``). Every target
    file is overwritten in place.

    The function can be run repeatedly: an output column left by a previous
    run is discarded before the merge, so no ``_x``/``_y`` suffixed copies
    of it accumulate.
    """
    print("Processing YAGO3 matching")
    output_col = "yago3_col_to_embed"
    # Get the list of target files
    target_files = get_downstream_wikidb_files()
    # Get the yago3 to dbpedia mapping
    raw_links = pd.read_csv(
        SEPAL_DIR / "datasets/knowledge_graphs/yago-3.0.2/yagoDBpediaInstances.tsv",
        sep="\t",
        header=0,
    )
    # Keep the 2nd and 4th columns only. The explicit copy makes the column
    # assignments below act on an independent frame rather than on a slice
    # of 'raw_links' (avoids pandas' SettingWithCopyWarning).
    yago3_to_dbpedia = raw_links.iloc[:, [1, 3]].copy()
    yago3_to_dbpedia.columns = ["Yago3_entity", "DBpedia_entity"]
    # Strip the angle brackets around the YAGO3 identifiers.
    yago3_to_dbpedia["Yago3_entity"] = (
        yago3_to_dbpedia["Yago3_entity"].str.removeprefix("<").str.removesuffix(">")
    )
    # Reduce the DBpedia resource URIs to the bare entity name.
    yago3_to_dbpedia["DBpedia_entity"] = (
        yago3_to_dbpedia["DBpedia_entity"]
        .str.removeprefix("<http://dbpedia.org/resource/")
        .str.removesuffix(">")
    )
    # Get the yago4 to dbpedia mapping
    yago4_to_dbpedia = pd.read_parquet(
        SEPAL_DIR / "datasets/knowledge_graphs/yago4/link_to_dbpedia.parquet"
    )
    # Get the yago3 to yago4 mapping
    yago3_to_yago4 = (
        yago3_to_dbpedia.merge(yago4_to_dbpedia, on="DBpedia_entity", how="left")
        .drop(columns="DBpedia_entity")
        .rename(
            columns={
                "Yago3_entity": output_col,
                "Yago_entity": "yago4_col_to_embed",
            }
        )
    )
    # Rows without a YAGO4 counterpart would otherwise be matched to targets
    # whose 'yago4_col_to_embed' is null. This filter does not depend on the
    # target, so it is applied once here rather than once per file.
    yago3_to_yago4 = yago3_to_yago4.dropna(subset=["yago4_col_to_embed"])
    # Map the downstream entities
    for target_path in tqdm(target_files, desc="Processing targets", unit="file"):
        target = pd.read_parquet(target_path)
        # Discard the result of an earlier run so the merge does not emit
        # suffixed duplicates of the output column.
        target = target.drop(columns=output_col, errors="ignore")
        target = target.merge(yago3_to_yago4, on="yago4_col_to_embed", how="left")
        # Save target
        target.to_parquet(target_path)


def main():
    """Run all matching steps, one after the other.

    The order matters: ``freebase_matching`` and ``yago3_matching`` merge on
    the ``yago4_col_to_embed`` column that ``yago4_matching`` writes, so they
    come after it.
    """
    steps = (
        yago45_matching,
        yago4_matching,
        freebase_matching,
        yago3_matching,
    )
    for run_step in steps:
        run_step()


# Run every matching step when this file is executed as a script.
if __name__ == "__main__":
    main()
