import gensim, pickle, csv, os
import numpy as np


def build_word2vec(data_path, dim=300, negative=10, window=4, iter=10):
    """Train a skip-gram Word2Vec model and save its vectors and word index.

    Reads ``unsupervised.csv`` from *data_path*, skips the first (header)
    row, and treats the whitespace-split first column of every remaining
    row as one training sentence.  Two files are then written to
    *data_path*:

    * ``skipEmbeddings.npy`` -- float64 array of shape ``(vocab_size, dim)``.
    * ``word2ind.pkl`` -- pickled dict mapping each vocabulary word to its
      row in that array.

    Args:
        data_path: Directory holding ``unsupervised.csv``; also the output
            directory.
        dim: Dimensionality of the word vectors.
        negative: Passed through as Word2Vec's ``negative`` argument.
        window: Passed through as Word2Vec's ``window`` argument.
        iter: Passed through as Word2Vec's ``iter`` argument.  The name
            shadows the builtin but is kept because it is part of the
            public signature.

    Returns:
        None.
    """
    filename = os.path.join(data_path, "unsupervised.csv")
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields with embedded line breaks are parsed correctly.
    with open(filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  ## skip the header
        # csv.reader yields [] for blank lines; indexing row[0] on those
        # would raise IndexError, so they are skipped.
        sentences = [row[0].split() for row in csv_reader if row]

    # The corpus is fully in memory at this point, so training happens after
    # the file handle has been released.
    # NOTE(review): size=/iter= and wv.vocab look like the gensim 3.x API
    # (4.x appears to use vector_size=/epochs= and wv.key_to_index) --
    # confirm against the pinned gensim version.
    model = gensim.models.Word2Vec(sentences,
                                   min_count=2,
                                   sg=1,
                                   size=dim,
                                   iter=iter,
                                   workers=8,
                                   negative=negative,
                                   window=window)

    vocab = list(model.wv.vocab)
    word2ind = {word: i for i, word in enumerate(vocab)}
    embedding = np.zeros((len(vocab), dim))
    for i, word in enumerate(vocab):
        embedding[i, :] = model.wv[word]

    embedding_file = os.path.join(data_path, "skipEmbeddings.npy")
    ind_file = os.path.join(data_path, "word2ind.pkl")

    np.save(embedding_file, embedding)
    with open(ind_file, 'wb') as f:
        pickle.dump(word2ind, f)

def word2vec_document_embedding(filename, embedding, word2ind):
    """Embed each CSV row as the mean of the vectors of its known words.

    The first row of *filename* is skipped as a header.  For every other
    non-blank row, the first column is split on whitespace, each token is
    looked up in *word2ind*, and the matching rows of *embedding* are
    averaged.

    Args:
        filename: Path to a comma-delimited CSV whose first column holds
            the document text.
        embedding: 2-D array-like of shape ``(vocab_size, dim)``.
        word2ind: Mapping from word to its row index in *embedding*.

    Returns:
        Array of shape ``(n_documents, dim)``.  Tokens missing from
        *word2ind* are ignored; a document with no known tokens gets an
        all-zero vector; a file with no data rows yields shape ``(0, dim)``.
    """
    embedding = np.asarray(embedding)
    dim = embedding.shape[1]
    # np.mean keeps inexact dtypes and promotes everything else to float64;
    # mirror that so the fallback rows do not change the result dtype.
    if np.issubdtype(embedding.dtype, np.inexact):
        out_dtype = embedding.dtype
    else:
        out_dtype = np.float64
    # Averaging zero vectors would give NaN (mean of an empty slice), so
    # documents without any in-vocabulary token fall back to zeros instead.
    zero_vector = np.zeros(dim, dtype=out_dtype)

    doc_vectors = []
    # newline='' lets the csv module do its own newline handling.
    with open(filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  ## skip the header
        for row in csv_reader:
            if not row:
                # Blank line: csv.reader yields [], there is no document here.
                continue
            # Out-of-vocabulary tokens have no vector, so they are dropped
            # rather than raising KeyError.
            inds = [word2ind[word] for word in row[0].split() if word in word2ind]
            if inds:
                doc_vectors.append(np.mean(embedding[inds], axis=0))
            else:
                doc_vectors.append(zero_vector)

    if not doc_vectors:
        # np.vstack refuses an empty list; return an empty matrix instead.
        return np.zeros((0, dim), dtype=out_dtype)
    return np.vstack(doc_vectors)