import os
import shutil
import subprocess
from collections import defaultdict

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_glove_embeddings(file_path):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is split on whitespace: the first field is taken
    as the word and the remaining fields as the components of its vector.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array. If a word
        appears on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank / whitespace-only lines (e.g. a trailing empty line)
                # carry no entry; indexing values[0] on them would crash.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings
        
def build_embedding(tokenized_sentences, parameters, use_pretrained, DATA_DIR):
    """Return word embeddings, either pretrained or trained on the given corpus.

    With ``use_pretrained`` set, the vectors are read from
    ``{DATA_DIR}/glove.model`` and nothing else happens.

    Otherwise the sentences are written to a corpus file, the Stanford GloVe
    repository is cloned into ``./glove`` (unless that path already exists),
    the corpus is placed at ``glove/text8``, the hyper-parameters in
    ``glove/demo.sh`` are overwritten, and the project is built and run.
    The resulting ``glove/vectors.txt`` is loaded and the ``glove`` directory
    is removed afterwards. All paths are relative to the current working
    directory.

    Args:
        tokenized_sentences: Iterable of sentences, each an iterable of
            string tokens. Only read when training.
        parameters: Mapping providing ``vector_size``, ``max_iter`` and
            ``window_size``; each is read only if the matching assignment
            is present in ``demo.sh``.
        use_pretrained: If true, load the pretrained vectors instead of
            training.
        DATA_DIR: Directory containing ``glove.model``; only used when
            ``use_pretrained`` is true.

    Returns:
        The mapping produced by ``load_glove_embeddings``, or ``None`` when
        ``glove/demo.sh`` is missing after the clone step.

    Raises:
        subprocess.CalledProcessError: If ``git clone``, ``make`` or
            ``demo.sh`` exits with a non-zero status.
        OSError: If a file-system operation (writing, moving or removing
            the corpus, editing ``demo.sh``) fails.
    """
    if use_pretrained:
        model_path = f"{DATA_DIR}/glove.model"
        return load_glove_embeddings(model_path)

    output_file = "custom_glove_text8_sentences.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        # One sentence per line, tokens separated by single spaces.
        f.writelines(" ".join(sentence) + "\n" for sentence in tokenized_sentences)

    print(f"Text8-like file with sentences saved to {output_file}")

    # 1. Clone GloVe if not present
    if not os.path.exists("glove"):
        subprocess.run(["git", "clone", "https://github.com/stanfordnlp/glove"], check=True)
    else:
        print("GloVe repository already exists.")

    # 2. Move corpus to glove/text8, replacing any previous corpus there
    text8_path = os.path.join("glove", "text8")
    if os.path.exists(text8_path):
        print("glove/text8 already exists. Removing it.")
        _remove_path(text8_path)
    else:
        print("glove/text8 does not exist.")

    # shutil.move instead of shelling out to `mv`: portable, no extra process.
    shutil.move(output_file, text8_path)

    # 3. Edit demo.sh with parameters
    demo_path = os.path.join("glove", "demo.sh")
    if not os.path.exists(demo_path):
        print("demo.sh not found in glove directory.")
        # Callers receive None here; the glove directory is left in place.
        return None
    _patch_demo_script(demo_path, parameters)

    # 4. Build GloVe
    subprocess.run(["make"], cwd="glove", check=True)

    # 5. Run demo.sh
    subprocess.run(["bash", "demo.sh"], cwd="glove", check=True)

    # Load the vectors before the directory that holds them is deleted.
    wordvector = load_glove_embeddings("glove/vectors.txt")
    remove_cloned_glove()
    return wordvector


def _remove_path(path):
    """Delete ``path`` whether it is a directory tree or a single file/link."""
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


def _patch_demo_script(demo_path, parameters):
    """Overwrite the hyper-parameter assignments of ``demo.sh`` in place."""
    # Shell variable name -> key in ``parameters``.
    overrides = (
        ("VECTOR_SIZE", "vector_size"),
        ("MAX_ITER", "max_iter"),
        ("WINDOW_SIZE", "window_size"),
    )
    with open(demo_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        stripped = line.lstrip()
        for shell_name, key in overrides:
            # Only a line that *starts* with the assignment is replaced, so
            # comments, `export` lines or variables that merely contain the
            # name are left untouched.
            if stripped.startswith(f"{shell_name}="):
                lines[i] = f"{shell_name}={parameters[key]}\n"
                break
    with open(demo_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

def remove_cloned_glove():
    """Delete ``glove`` from the current working directory.

    Uses the standard library instead of shelling out to ``rm -rf`` so it
    also works where no ``rm`` binary is available. As with ``rm -rf``, a
    missing ``glove`` path is not an error.

    Raises:
        OSError: If the path exists but cannot be removed.
    """
    print("Removing glove directory")
    if os.path.isdir("glove") and not os.path.islink("glove"):
        shutil.rmtree("glove")
    elif os.path.lexists("glove"):
        # A plain file or a symlink named "glove".
        os.remove("glove")
    
def get(word_vectors, parameters, target_words):
    """Compute cosine similarities between every ordered pair of target words.

    Args:
        word_vectors: Mapping from word to its embedding vector. Vectors
            must be assignable to a row of length ``vector_size``.
        parameters: Mapping providing ``vector_size``, the embedding
            dimensionality.
        target_words: Sequence of words to compare. Words missing from
            ``word_vectors`` are represented by an all-zero vector.

    Returns:
        ``defaultdict(list)`` keyed by ``(word_a, word_b)`` for every pair
        of *distinct positions* in ``target_words``, holding the cosine
        similarity of the two vectors. For each ``word_a`` the entries are
        inserted in order of decreasing similarity. Empty when
        ``target_words`` is empty.
    """
    vector_size = parameters['vector_size']
    target_similarity = defaultdict(list)

    word_count = len(target_words)
    if word_count == 0:
        # Nothing to compare; avoids handing an empty matrix to sklearn.
        return target_similarity

    # Rows default to zero, which is the representation of unknown words.
    profile = np.zeros((word_count, vector_size))
    for i, word in enumerate(target_words):
        if word in word_vectors:
            profile[i, :] = word_vectors[word]

    similarity = cosine_similarity(profile)
    for i, word in enumerate(target_words):
        # Stable sort keeps tied entries in their original order.
        ranked = np.argsort(-similarity[i, :], kind="stable")
        for j in ranked:
            # Skip the word's own column by index. Assuming it is ranked
            # first is wrong on ties (e.g. an all-zero row), where it would
            # drop a real pair and emit the (word, word) self-pair instead.
            if j == i:
                continue
            target_similarity[(word, target_words[j])] = similarity[i, j]

    return target_similarity