import string, csv, os, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from scipy.io import savemat, loadmat
import pickle

def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def _build_bow(docs, vocab_size):
    """Return a CSR document-term count matrix for *docs*.

    Each element of *docs* is a 1-D array of integer word ids. Repeated
    (document, word) pairs are summed into a single count when the COO
    matrix is converted to CSR.
    """
    n_docs = len(docs)
    lengths = np.fromiter((len(doc) for doc in docs), dtype=np.intp, count=n_docs)
    rows = np.repeat(np.arange(n_docs), lengths)
    # np.concatenate rejects an empty sequence, so handle "no documents" explicitly.
    cols = np.concatenate(docs) if n_docs else np.zeros(0, dtype=np.intp)
    data = np.ones(rows.shape[0], dtype=int)
    return sparse.coo_matrix((data, (rows, cols)), shape=(n_docs, vocab_size)).tocsr()


def _split_bow(bow):
    """Split CSR matrix *bow* into per-document token-id and count lists."""
    # Row i of a CSR matrix occupies indices[indptr[i]:indptr[i + 1]]; slicing
    # the raw arrays avoids building a new sparse matrix for every document.
    bounds = list(zip(bow.indptr[:-1], bow.indptr[1:]))
    tokens = [list(bow.indices[start:stop]) for start, stop in bounds]
    counts = [list(bow.data[start:stop]) for start, stop in bounds]
    return tokens, counts


def make_data(alpha, data_path, n_docs=100000, n_train=5000, n_test=3000,
              topics_path='TopicsMatrix.pkl'):
    """Generate a synthetic single-topic-per-document corpus and save it.

    Every document is assigned one of ``K = 20`` topics uniformly at random,
    gets a Poisson(30) length (clamped to at least 4 words) and has its words
    drawn i.i.d. from that topic's distribution over ``V = 5000`` word ids.
    All sampling uses NumPy's global random state (``np.random``), so seed it
    beforehand if reproducible output is required.

    Files written to ``data_path``:
        * ``train_topics.pkl`` / ``test_topics.pkl`` -- sampled topic id of
          each train / test document.
        * ``topics.pkl`` -- the topic-word matrix that was used.
        * ``bow_{train,test,unsup,valid}_{tokens,counts}.mat`` -- per-document
          unique word ids (key ``'tokens'``) and their counts (key
          ``'counts'``).
        * ``vocab.pkl`` -- list of the word ids as strings.
        * ``id2word.pkl`` -- dict mapping each integer word id to its string.

    Args:
        alpha: Selects the topic matrix; entry ``int(alpha) - 1`` of the
            object loaded from ``topics_path`` is used.
        data_path: Output directory; created (including parents) if missing.
        n_docs: Size of the unlabeled pool. 10% of it (rounded down) is moved
            to the validation split, the rest forms the "unsup" split.
        n_train: Number of documents in the "train" split.
        n_test: Number of documents in the "test" split.
        topics_path: Pickle file holding the candidate topic matrices.

    Returns:
        Tuple ``(topics, id2word)``.

    NOTE(review): the original inline comments described the train split as
    "documents to test on" and the test split as "reference documents"; how
    downstream code uses them is not visible here -- confirm against callers.
    """
    os.makedirs(data_path, exist_ok=True)

    V = 5000  ## vocab size
    lamb = 30  ## average document length
    K = 20  ## number of topics
    vocab = [str(i) for i in range(V)]

    ## load the pre-generated topics
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # topics_path at a trusted file.
    with open(topics_path, 'rb') as f:
        topicmat = pickle.load(f)
    topics = topicmat[int(alpha) - 1]

    def generate_documents(num):
        """Sample *num* documents; return (topic ids, list of word-id arrays)."""
        ## generate document lengths
        doc_lengths = np.maximum(np.random.poisson(lam=lamb, size=num), 4)

        ## sample the topic for each document (equal probability)
        doc_topics = np.random.choice(K, size=num, replace=True)

        # Sampling from the integer V yields the word ids directly; it draws
        # the same indices as sampling from the string vocabulary would.
        documents = [
            np.random.choice(V, size=length, replace=True, p=topics[topic, :])
            for length, topic in zip(doc_lengths, doc_topics)
        ]
        return doc_topics, documents

    # Keep this call order: it fixes the sequence of draws from the global RNG.
    _, unsup_pool = generate_documents(n_docs)
    train_topics, train_docs = generate_documents(n_train)
    test_topics, test_docs = generate_documents(n_test)

    _dump_pickle(train_topics, os.path.join(data_path, 'train_topics.pkl'))
    _dump_pickle(test_topics, os.path.join(data_path, 'test_topics.pkl'))
    _dump_pickle(topics, os.path.join(data_path, 'topics.pkl'))

    id2word = {w: str(w) for w in range(V)}

    ## hold out 10% of the unlabeled pool for validation
    n_valid = int(0.1 * len(unsup_pool))
    valid_inds = np.random.choice(len(unsup_pool), n_valid, replace=False)
    valid_docs = [unsup_pool[i] for i in valid_inds]
    held_out = set(valid_inds.tolist())
    unsup_docs = [doc for i, doc in enumerate(unsup_pool) if i not in held_out]

    # Split bow into token/value pairs
    print('splitting bow into token/value pairs and saving to disk...')

    splits = (
        ('train', train_docs),
        ('test', test_docs),
        ('unsup', unsup_docs),
        ('valid', valid_docs),
    )
    for name, docs in splits:
        tokens, counts = _split_bow(_build_bow(docs, V))
        savemat(os.path.join(data_path, 'bow_%s_tokens' % name),
                {'tokens': tokens}, do_compression=True)
        savemat(os.path.join(data_path, 'bow_%s_counts' % name),
                {'counts': counts}, do_compression=True)

    _dump_pickle(vocab, os.path.join(data_path, 'vocab.pkl'))
    _dump_pickle(id2word, os.path.join(data_path, 'id2word.pkl'))

    return (topics, id2word)


