'''
- fasttext.py
- This file handles the loading and distance measuring of fasttext embeddings
'''

# External imports
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Internal imports



'''
----------load_embedding----------
- This function loads in the current feature vector set to work with
-----Inputs-----
- file_location - the location of the fasttext embedding file to use
-----Output-----
- embedding - a dictionary whose "embedding" key holds the dictionary mapping each word to its vector
'''
def load_embedding(file_location):
    '''
    Load a whitespace-separated text embedding file into memory.
    -----Inputs-----
    - file_location - path of the embedding file; each data line holds a word
      followed by the components of its vector
    -----Output-----
    - a dictionary whose "embedding" key holds a {word: numpy float vector}
      dictionary
    '''
    # Kept from the original: fetches the 'punkt' tokenizer data
    # (presumably needed by word_tokenize when texts are averaged later).
    nltk.download('punkt')
    embeddings = {}
    # The context manager closes the file even if a line fails to parse.
    with open(file_location, 'r', encoding='utf8', newline='\n') as file:
        for line_number, line in enumerate(file):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            if (line_number == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                # "<vocabulary size> <dimension>" header of the .vec format;
                # it is metadata, not a word vector.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype=float)
    return {"embedding": embeddings}


'''
----------get_avg_text_vector----------
- This function averages the embedding vectors of the words in a text
-----Inputs-----
- text - the text whose average embedding is to be retrieved
- embedding - the dictionary mapping each word to its embedding vector
-----Output-----
- vector - the average of the vectors of the words found in the embedding (all zeros if no word is found)
'''
def get_avg_text_vector(text, embedding):
    '''
    Average the embedding vectors of the tokens of a text.
    -----Inputs-----
    - text - the text to tokenize and embed
    - embedding - dictionary mapping each word to its vector
    -----Output-----
    - the mean of the vectors of the tokens present in the embedding; a zero
      vector when no token is present, and a zero-length vector when the
      embedding itself is empty
    '''
    tokens = word_tokenize(text)
    if not embedding:
        # No vectors at all, so the dimension is unknown.
        return np.zeros(0, dtype=float)

    # Read the dimension from the first entry without copying every key.
    first_word = next(iter(embedding))
    total = np.zeros(len(embedding[first_word]), dtype=float)

    matched = 0
    for token in tokens:
        if token in embedding:
            total = np.add(total, embedding[token])
            matched += 1

    # Dividing by 1 when nothing matched leaves the zero vector unchanged.
    return np.divide(total, max(matched, 1))


'''
----------load_finetuned_embedding----------
- This function loads in the current feature vector set to work with
-----Inputs-----
- file_location - the location of the fasttext embedding file to use
-----Output-----
- embedding - the dictionary of embeddings
'''


'''
----------get_finetuned_avg_text_vector----------
- This function loads in the current feature vector set to work with
-----Inputs-----
- embedding - the name of the embedding to use
- schema - the currently-active schema
-----Output-----
- embedding - the dictionary of embeddings
'''