import copy
from sklearn.metrics.pairwise import cosine_similarity
from Helpers.helper import w2v_emb_col
import pickle
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import os

# NOTE(review): `curdir` is not referenced anywhere else in the visible source.
curdir = '.'
# Absolute path of the current working directory at import time. Every path
# below is appended to it, so the "/../" segments are resolved relative to the
# directory the script is launched from (not relative to this file).
source_path = os.path.abspath(os.curdir)
# Pickled dictionaries, both loaded with pickle.load by the functions below.
# skill_cluster_dict is looked up by skill name and yields an iterable of
# group names; the keys of cluster_skill_dict are used as group names.
skill_cluster_path = "/../feature_data/competency_grp_data/skill_cluster_dict_40.pkl"
cluster_skill_path = "/../feature_data/competency_grp_data/cluster_skill_dict_40.pkl"
# Saved gensim Word2Vec model (suffix only; must be prefixed with source_path).
embed___model = "/../feature_data/Word2vec_features/w2v_skills.models/w2v_model_embed_30.model"
# Folder the trained classifier is pickled to / loaded from.
model_file_path = source_path + "/../model/Word2vec_only/"
# Folder holding the train/test CSVs written by training() and read by testing().
test_path_folder = source_path + "/../dataset/Word2vec_only/"


def embedding_feature(model):
    """Build the skill x group frame that carries the word2vec embeddings.

    Every skill listed in the ``rhs`` column of ``check.csv`` is looked up in
    the saved Word2Vec model (``$`` in the skill name is replaced by a space
    first) and cross-joined with every group name found in the cluster->skill
    dictionary.

    Args:
        model: embedding backend to use. Only ``"word2vec"`` is supported.

    Returns:
        pandas.DataFrame with the columns ``skill_name``, ``label`` (the
        comma-joined groups of the skill), the embedding columns named by
        ``w2v_emb_col``, the constant join column ``key`` and ``group_name``.
        Rows containing any NaN (e.g. skills without a known group) are
        dropped and the index is reset.

    Raises:
        ValueError: if ``model`` is not ``"word2vec"``.
    """
    if model != "word2vec":
        # Previously this surfaced much later as an unbound-local error.
        raise ValueError("Unsupported embedding model: {!r}".format(model))

    w2v_model_path_30 = source_path + embed___model
    # NOTE(security): pickle can execute arbitrary code while loading; these
    # files must come from a trusted source.
    with open(source_path + skill_cluster_path, "rb") as handle:
        skill_cluster_dict = pickle.load(handle)
    rhs_embedding = pd.read_csv(source_path + "/../feature_data/check.csv")
    with open(source_path + cluster_skill_path, "rb") as handle:
        cluster_skill_dict = pickle.load(handle)
    grp_40_df = pd.DataFrame(list(cluster_skill_dict), columns=["group_name"])

    col_name = w2v_emb_col
    w2v_30 = Word2Vec.load(w2v_model_path_30)
    print(len(w2v_30.wv.vocab))

    labels = rhs_embedding["rhs"].values
    # The model and the cluster dictionary are keyed by the space-separated
    # spelling of the skill, while the CSV uses "$" as the separator.
    spaced_names = [label.replace("$", " ") for label in labels]

    # Vectors are read through `.wv`; indexing the model object itself is
    # deprecated in gensim.
    final_list = [list(w2v_30.wv[name]) for name in spaced_names]

    dependent = []
    for name in spaced_names:
        groups = skill_cluster_dict.get(name)
        dependent.append(",".join(groups) if groups else None)

    embedding_df = pd.DataFrame(final_list, columns=col_name)
    training_dataframe = pd.DataFrame({"skill_name": labels, "label": dependent})
    training_dataframe = pd.concat([training_dataframe, embedding_df], axis=1)

    # A shared constant key turns the merge into a cross join: one row per
    # (skill, group) combination.
    grp_40_df['key'] = 0
    training_dataframe['key'] = 0
    training_dataframe_new1 = pd.merge(training_dataframe, grp_40_df, on="key", how="outer")
    return training_dataframe_new1.dropna().reset_index(drop=True)


def process_df(df):
    """One-hot encode the ``label`` column and aggregate rows per skill.

    Args:
        df: DataFrame with a ``label`` column, a ``skill_name`` column and
            the embedding columns ``d1`` .. ``d30``.

    Returns:
        Tuple ``(train_df_1, cat)`` where ``train_df_1`` is the frame grouped
        by ``skill_name`` and ``d1`` .. ``d30`` with the remaining columns
        summed (the one-hot columns are named ``0 .. n_categories - 1``), and
        ``cat`` is the array of label categories found by the encoder.
    """
    dataframe = df.dropna()
    label = dataframe[["label"]]
    enc = OneHotEncoder(handle_unknown='ignore')
    y_train_new = enc.fit_transform(label).toarray()
    cat = enc.categories_[0]

    # Size the one-hot frame from the categories actually present instead of
    # assuming exactly 40 of them (which raised for any other count).
    y_train_df = pd.DataFrame(data=y_train_new, columns=np.arange(len(cat)))

    train_new_check = pd.concat([dataframe.reset_index(drop=True), y_train_df.reset_index(drop=True)], axis=1)
    label_drop_column = ['skill_name'] + ["d{}".format(i) for i in range(1, 31)]
    train_df_1 = train_new_check.groupby(label_drop_column).sum().reset_index()
    print("check")
    return train_df_1, cat


def get_tf_idf_feature(train_data, test_data, type):
    """Attach a ``tf_idf`` column to the train and test frames.

    Train skills get the tf-idf value stored in ``skill_tfidf_dict_40.pkl``
    (inner merge on ``skill_name``, so skills without a value are dropped).
    Each test skill gets the sum of the tf-idf values of its (up to) three
    most cosine-similar train skills divided by 3. Only strictly positive
    similarities are considered, so missing neighbours contribute 0.

    Fixes relative to the previous implementation:
      * the top-3 state is computed per test skill instead of leaking from
        one test skill to the next;
      * the three neighbours are selected by ranking the similarities, which
        replaces the inconsistent hand-rolled rank shifting;
      * duplicate ``skill_name`` rows in the train data no longer occupy
        several of the three neighbour slots;
      * the Word2Vec model path is prefixed with ``source_path``.

    Args:
        train_data: DataFrame with a ``skill_name`` column.
        test_data: DataFrame with a ``skill_name`` column.
        type: embedding source, ``"pytorch"`` (vectors parsed from column
            ``"1"`` of ``check.csv``) or ``"word2vec"`` (vectors read from
            the saved Word2Vec model). The name shadows the builtin but is
            kept for backward compatibility.

    Returns:
        Tuple ``(train_data_tf_idf, test_data_tf_idf)``.

    Raises:
        ValueError: if ``type`` is not one of the supported values.
        KeyError: if a train or test skill has no embedding.
    """
    if type not in ("pytorch", "word2vec"):
        raise ValueError("Unsupported embedding type: {!r}".format(type))

    TFIDF_path = "/../feature_data/TFIDF/"
    # NOTE(security): pickle can execute arbitrary code while loading; the
    # file must come from a trusted source.
    with open(source_path + TFIDF_path + "skill_tfidf_dict_40.pkl", "rb") as handle:
        skill_tf_idf = pickle.load(handle)
    skill_tf_idf_df = pd.DataFrame.from_dict(skill_tf_idf, orient='index', columns=['tf_idf'])
    skill_tf_idf_df["skill_name"] = skill_tf_idf_df.index.str.replace(" ", "$")

    train_data_tf_idf = pd.merge(train_data, skill_tf_idf_df, on="skill_name")

    # NOTE(review): this path is relative to the working directory, whereas
    # embedding_feature() reads source_path + "/../feature_data/check.csv".
    # Confirm which file is intended.
    rhs_embedding = pd.read_csv("check.csv")
    skill_names = rhs_embedding["rhs"].values
    if type == "pytorch":
        raw_vectors = rhs_embedding["1"].apply(lambda x: x.strip('[]')).values
        skill_emb_dict = {
            name: [float(token) for token in raw.split()]
            for name, raw in zip(skill_names, raw_vectors)
        }
    else:
        w2v_30 = Word2Vec.load(source_path + embed___model)
        skill_emb_dict = {name: w2v_30.wv[name.replace("$", " ")] for name in skill_names}

    # De-duplicate while keeping first-seen order.
    train_skill_list = list(dict.fromkeys(train_data_tf_idf["skill_name"].values))
    test_skill_list = list(dict.fromkeys(test_data["skill_name"].values))

    test_skill_tfidf = {}
    if train_skill_list and test_skill_list:
        train_tfidf = [skill_tf_idf.get(skill.replace("$", " "), 0) for skill in train_skill_list]
        # Direct indexing so a skill without an embedding fails with its name.
        train_matrix = np.array([skill_emb_dict[skill] for skill in train_skill_list])
        test_matrix = np.array([skill_emb_dict[skill] for skill in test_skill_list])
        # One vectorised call instead of one call per (test, train) pair.
        similarity = cosine_similarity(test_matrix, train_matrix)
        for test_skill, scores in zip(test_skill_list, similarity):
            # Stable sort keeps the earliest train skill first on ties.
            best = np.argsort(-scores, kind="stable")[:3]
            test_skill_tfidf[test_skill] = sum(
                train_tfidf[idx] for idx in best if scores[idx] > 0
            ) / 3
    else:
        # No train skill to compare against: every neighbour slot is empty.
        for test_skill in test_skill_list:
            test_skill_tfidf[test_skill] = 0.0

    test_tf_idf_df = pd.DataFrame.from_dict(test_skill_tfidf, orient='index',
                                            columns=['tf_idf'])
    test_tf_idf_df["skill_name"] = test_tf_idf_df.index
    test_data_tf_idf = pd.merge(test_data, test_tf_idf_df, on="skill_name")
    return train_data_tf_idf, test_data_tf_idf


def process_cluster_data(cluster_data):
    """Append one-hot columns for the ``cluster`` column.

    Args:
        cluster_data: DataFrame with a ``cluster`` column.

    Returns:
        ``cluster_data`` (index reset) with one extra column per distinct
        cluster value, named ``0 .. n_clusters - 1``.
    """
    enc = OneHotEncoder(handle_unknown='ignore')
    cluster_new = enc.fit_transform(cluster_data[['cluster']]).toarray()
    # Derive the column count from the encoded matrix; the previous fixed
    # range(100) raised unless there were exactly 100 distinct clusters.
    cluster_new_df = pd.DataFrame(data=cluster_new,
                                  columns=np.arange(cluster_new.shape[1]))

    cluster_new_df_check = pd.concat([cluster_data.reset_index(drop=True), cluster_new_df.reset_index(drop=True)],
                                     axis=1)
    return cluster_new_df_check


def top_features(model):
    """Load the top-1/top-2/top-3 similarity tables in long format.

    Each CSV is melted on ``skill_name`` (all other columns become
    ``variable``/``value`` pairs) and the three melted frames are
    inner-joined on ``skill_name`` and ``variable``.

    Args:
        model: feature family to load. Only ``"w2v"`` is supported.

    Returns:
        pandas.DataFrame with ``skill_name``, ``variable`` and one value
        column per table (pandas suffixes the clashing ``value`` names).

    Raises:
        ValueError: if ``model`` is not ``"w2v"``.
    """
    if model != "w2v":
        # Previously this fell through to an unbound-local error on `top1`.
        raise ValueError("Unsupported model for top features: {!r}".format(model))

    similarity_folder = "/../feature_data/Word2vec_features/similarity_based_feature/"
    top1 = pd.read_csv(source_path + similarity_folder + "w2v_top1_dist.csv")
    top2 = pd.read_csv(source_path + similarity_folder + "w2v_top2_dist.csv")
    top3 = pd.read_csv(source_path + similarity_folder + "w2v_top3_dist.csv")

    top1_tr = top1.melt(id_vars=['skill_name'])
    top2_tr = top2.melt(id_vars=['skill_name'])
    top3_tr = top3.melt(id_vars=['skill_name'])

    top1_top2 = pd.merge(top1_tr, top2_tr, on=["skill_name", 'variable'])
    top1_top2_top3 = pd.merge(top1_top2, top3_tr, on=["skill_name", 'variable'])
    return top1_top2_top3


def dist_features(model):
    """Load the skill-to-group similarity table in long format.

    Args:
        model: feature family to load. Only ``"w2v"`` is supported.

    Returns:
        pandas.DataFrame with the columns ``skill_name``, ``variable`` (the
        former column header) and ``value``.

    Raises:
        ValueError: if ``model`` is not ``"w2v"``.
    """
    if model != "w2v":
        # Previously this fell through to an unbound-local error at `return`.
        raise ValueError("Unsupported model for distance features: {!r}".format(model))

    similarity_folder = "/../feature_data/Word2vec_features/similarity_based_feature/"
    skill_group_similarity = pd.read_csv(source_path + similarity_folder + "skill_group_similarity_w2v.csv")
    # 'Unnamed: 0' is the name pandas gives a header-less first column when
    # reading; tolerate files that do not have it.
    skill_group_similarity = skill_group_similarity.drop(['Unnamed: 0'], axis=1, errors='ignore')
    return skill_group_similarity.melt(id_vars=['skill_name'])


def get_labels():
    """Return the positive (skill, group) pairs from the skill->cluster dict.

    Returns:
        pandas.DataFrame with one row per (skill, group) membership and the
        columns ``skill_name`` (spaces replaced by ``$``), ``group_name`` and
        a constant ``label`` of 1.
    """
    # NOTE(security): pickle can execute arbitrary code while loading; the
    # file must come from a trusted source.
    with open(source_path + skill_cluster_path, "rb") as handle:
        skill_cluster_dict = pickle.load(handle)

    data_tuples = [
        (skill.replace(" ", "$"), group)
        for skill, groups in skill_cluster_dict.items()
        for group in groups
    ]
    label_df = pd.DataFrame(data_tuples, columns=["skill_name", "group_name"])
    label_df["label"] = 1
    return label_df


def _group_size_frame(pickle_path, count_column):
    """Load a pickled {group: members} dict and return its sizes as a frame."""
    # NOTE(security): pickle can execute arbitrary code while loading; the
    # file must come from a trusted source.
    with open(pickle_path, "rb") as handle:
        groups = pickle.load(handle)
    sizes = {name: len(members) for name, members in groups.items()}
    frame = pd.DataFrame.from_dict(sizes, orient='index', columns=[count_column])
    frame["group_name"] = frame.index
    return frame


def group_wise_prim_sec():
    """Count the primary and secondary skills of every group.

    Returns:
        pandas.DataFrame with the columns ``prim_count``, ``group_name`` and
        ``sec_count``; only groups present in both pickles are kept (inner
        merge on ``group_name``).
    """
    competency_grp_path = "/../feature_data/competency_grp_data/"
    primary_skill_count_df = _group_size_frame(
        source_path + competency_grp_path + 'primary_skill.pkl', 'prim_count')
    secondary_skill_count_df = _group_size_frame(
        source_path + competency_grp_path + 'secondary_skill.pkl', 'sec_count')
    group_prim_sec = pd.merge(primary_skill_count_df, secondary_skill_count_df, on=["group_name"])
    return group_prim_sec


def accuracy_prediction_and_skill_classification(model, test_data, test_data_label, test, type):
    """Print a classification report for a series of probability cut-offs.

    ``model.predict_proba(test_data)`` is evaluated once. For every cut-off
    the scores are binarised (>= cut-off becomes 1, below becomes 0) and
    compared with ``test_data_label``. When ``type`` is ``"dnn"`` the whole
    prediction array is used; otherwise only its second column is used.
    Nothing is returned; the ``test`` argument is not used.
    """
    prediction = model.predict_proba(test_data)
    from sklearn.metrics import classification_report

    for cutoff in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
        scores = prediction if type == "dnn" else prediction[:, 1]
        # Work on a copy so every cut-off starts from the raw probabilities.
        pred_check = copy.deepcopy(scores)

        print("------cut off is--------", cutoff)
        # Every cut-off is below 1, so the entries set to 1 here are not
        # touched again by the second mask.
        pred_check[pred_check >= cutoff] = 1
        pred_check[pred_check < cutoff] = 0
        print(classification_report(test_data_label, pred_check, digits=4))

def training(final_df):
    """Train the XGBoost classifier and persist model, splits and importances.

    The tf-idf values are precomputed for the train and test skills to
    shorten data preparation; merging ``final_df`` with each of those CSVs
    on ``skill_name`` is what defines the train/test split.

    Side effects:
        * writes ``train_2_class.csv`` and ``test_2_class.csv`` into
          ``test_path_folder``;
        * pickles the fitted model to ``model_file_path`` as
          ``w2v_only_xgb.pkl``;
        * writes ``feature_imp_only_word2vec.csv`` (feature name / score).

    Args:
        final_df: feature frame carrying, among others, the columns
            ``skill_name``, ``key``, ``label_x``, ``variable``, ``label_y``,
            ``group_name`` and the target ``label_new``.
    """
    feature_importance_path = "/../feature_data/feature_importance/"
    TFIDF_path = "/../feature_data/TFIDF/"
    tf_idf_train_py = pd.read_csv(source_path + TFIDF_path + "train_data_tf_idf.csv")
    tf_idf_test_py = pd.read_csv(source_path + TFIDF_path + "test_data_tf_idf.csv")

    train = pd.merge(final_df, tf_idf_train_py, on="skill_name")
    test = pd.merge(final_df, tf_idf_test_py, on="skill_name")
    train.to_csv(test_path_folder + "train_2_class.csv", index=False)
    test.to_csv(test_path_folder + "test_2_class.csv", index=False)

    # Identifier and raw-label columns are not model inputs.
    train_new = train.drop(["skill_name", "key", "label_x", "variable", "label_y", "group_name"], axis=1)
    X_train = train_new.drop(columns=["label_new"])
    Y_train = train_new[["label_new"]]
    xgb_model = XGBClassifier(n_jobs=-1, max_depth=5, n_estimators=800)
    xgb_model.fit(X_train, Y_train)
    file_name = "w2v_only_xgb.pkl"

    # The context manager guarantees the pickle is flushed and closed.
    with open(model_file_path + file_name, "wb") as handle:
        pickle.dump(xgb_model, handle)

    data_tuples = list(zip(X_train.columns, xgb_model.feature_importances_))
    for col, score in data_tuples:
        print(col, score)
    label_df = pd.DataFrame(data_tuples, columns=["feature_name", "score"])
    label_df.to_csv(source_path + feature_importance_path + "feature_imp_only_word2vec.csv", index=False)
    print("-----------------" + "Training Completed Successfully" + "-----------------")


def prepare_data():
    """Assemble the full feature frame used for training.

    Joins, in this order: the word2vec embedding frame, the top-1/2/3
    similarity features, the skill-to-group distance feature (``dist_f``),
    the per-group primary/secondary skill counts and finally the known
    (skill, group) labels. ``label_new`` is 1 where a label matched and 0
    otherwise.

    Returns:
        pandas.DataFrame with all features plus ``label_new``.
    """
    # Embeddings.
    embeddings = embedding_feature("word2vec")

    # Top-1/2/3 similarity features; the group name lives in `variable`.
    top_sim = top_features("w2v")
    top_sim["skill_name"] = top_sim["skill_name"].str.replace(" ", "$")
    with_top = pd.merge(
        embeddings,
        top_sim,
        left_on=["skill_name", "group_name"],
        right_on=["skill_name", "variable"],
    )

    # Distance feature: copy `value` into `dist_f`, then drop `value`.
    group_dist = dist_features("w2v")
    group_dist["skill_name"] = group_dist["skill_name"].str.replace(" ", "$")
    group_dist["dist_f"] = group_dist["value"]
    group_dist.drop(["value"], inplace=True, axis=1)
    with_dist = pd.merge(with_top, group_dist, on=["skill_name", "variable"])

    # Primary / secondary skill counts per group.
    group_counts = group_wise_prim_sec()
    with_counts = pd.merge(with_dist, group_counts, on=["group_name"])

    # Labels: the left join keeps every (skill, group) row, so unmatched
    # rows get a missing `label_y` and therefore label_new == 0.
    known_pairs = get_labels()
    final_df = pd.merge(with_counts, known_pairs, on=["skill_name", "group_name"], how="left")
    final_df["label_new"] = np.where(final_df["label_y"] == 1, 1, 0)
    return final_df


def testing(test_path):
    """Run the saved XGBoost model on a prepared test CSV.

    Args:
        test_path: file name of the test CSV inside ``test_path_folder``. It
            must contain the columns written by ``training`` (identifier
            columns and ``label_new`` are dropped before predicting).

    Returns:
        The array returned by ``xgb_model.predict`` for the test rows.
        (Previously the predictions were computed and then discarded.)
    """
    test_data = pd.read_csv(test_path_folder + test_path)
    test_new = test_data.drop(["skill_name", "key", "label_x", "variable", "label_y", "group_name"], axis=1)
    X_test = test_new.drop(columns=["label_new"])

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # load model files produced by a trusted training run.
    with open(model_file_path + "w2v_only_xgb.pkl", "rb") as handle:
        xgb_model = pickle.load(handle)

    results = xgb_model.predict(X_test)
    print("-----------------" + "Testing Completed Successfully" + "-----------------")
    return results


if __name__ == "__main__":
    # Step 1: prepare the training data.
    data_df = prepare_data()

    # Step 2: train the model on the prepared data.
    training(data_df)

    # Step 3: load the model trained in the previous step and run it on the
    # test data. Replace "test_2_class.csv" with your own test file if needed.
    testing("test_2_class.csv")
