'''
- glove.py
- This file handles the loading and distance measuring of glove embeddings
'''

# External imports
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Internal imports


'''
----------load_embedding----------
- This function loads the word embeddings stored in a text file into memory
-----Inputs-----
- file_location - the location of the glove embedding file to use
-----Output-----
- embedding - a dictionary whose "embedding" key holds the word -> vector dictionary
'''
def load_embedding(file_location):
    """Load word embeddings from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by the
    components of its vector, all separated by whitespace.

    Args:
        file_location: Path of the embedding file; it is read as UTF-8.

    Returns:
        A dict with a single key, ``"embedding"``, whose value is a dict
        mapping each word to its vector as a ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    # Fetch the 'punkt' tokenizer data; presumably needed by word_tokenize,
    # which the rest of this module uses on text.
    nltk.download('punkt')
    embedding = {}
    # The context manager guarantees the file is closed, even if a line
    # fails to parse part-way through.
    with open(file_location, encoding="utf8") as embedding_file:
        for line in embedding_file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to load.
                continue
            embedding[values[0]] = np.asarray(values[1:], "float32")
    return {"embedding": embedding}


'''
----------get_avg_text_vector----------
- This function computes the average embedding vector of the tokens in a text
-----Inputs-----
- text - the text whose average embedding is to be retrieved
- embedding - the dictionary mapping each word to its embedding vector
-----Output-----
- sum_of_vector - the averaged embedding vector of the text
'''
def get_avg_text_vector(text, embedding):
    """Return the average embedding vector of the tokens in ``text``.

    The text is split with NLTK's ``word_tokenize``. Every token is looked
    up in ``embedding``; tokens that are not present contribute a vector
    filled with ``1e-10`` instead of being skipped, so they still count
    towards the average.

    Args:
        text: The text whose average embedding is wanted.
        embedding: Dict mapping each word to its embedding vector. The
            vector dimension is taken from the first entry.

    Returns:
        A NumPy array holding the element-wise mean of the token vectors.
        If the text yields no tokens, the result is all zeros.

    Raises:
        ValueError: If ``embedding`` is empty, since the vector dimension
            cannot be determined.
    """
    # Peek at one stored vector to learn the dimension. This is done once,
    # up front, rather than rebuilding the full key list for every token.
    sample_vector = next(iter(embedding.values()), None)
    if sample_vector is None:
        raise ValueError(
            "embedding is empty; cannot determine the vector dimension"
        )
    dimension = len(sample_vector)

    small_number = 1e-10
    # Shared stand-in for out-of-vocabulary tokens; np.add never mutates it.
    missing_vector = np.full((dimension, ), small_number)

    tokens = word_tokenize(text)
    sum_of_vector = np.zeros((dimension, ))
    for word in tokens:
        sum_of_vector = np.add(embedding.get(word, missing_vector), sum_of_vector)

    # Divide by at least 1 so that text with no tokens yields zeros rather
    # than a division by zero.
    return np.divide(sum_of_vector, max(len(tokens), 1))


'''
----------load_finetuned_embedding----------
- This function loads in the current feature vector set to work with
-----Inputs-----
- file_location - the location of the fasttext embedding file to use
-----Output-----
- embedding - the dictionary of embeddings
'''


'''
----------get_finetuned_avg_text_vector----------
- This function loads in the current feature vector set to work with
-----Inputs-----
- embedding - the name of the embedding to use
- schema - the currently-active schema
-----Output-----
- embedding - the dictionary of embeddings
'''