from time import time

import numpy as np
from corpus import fetch_itaki
from gensim import corpora, models, similarities
from sklearn import metrics

# Languages split into three groups. The criterion behind the grouping is not
# stated in this file; the groups are only concatenated into `languages` below.
cat0 = [
    "Telugu",
    "Romanian",
    "Sinhala",
    "Javanese",
]
cat1 = [
    "Arabic",
    "English",
    "Korean",
    "Malay",
    "Italian",
    "Hungarian",
    "Japanese",
    "Tamil",
    "Spanish",
    "Thai",
    "Chinese",
    "Vietnamese",
]
cat2 = [
    "German",
    "Indonesian",
    "Hindi",
    "Ukrainian",
    "Turkish",
    "Malayalam",
    "Punjabi",
    "Bulgarian",
    "French",
    "Portuguese",
]

########################################################################################################################
#                                                  Parameters
########################################################################################################################
# ----- Data sets -----------
# Full list of categories requested from the corpus loader; its length is also
# used as the number of LSI topics by run_svd_classifier.
languages = cat0 + cat1 + cat2
# Passed to the corpus loader under the same (misspelled) keyword name.
train_precentage = 0.7
doc_limit = 0
# ------ TF-IDF -------------
# NOTE(review): the TF-IDF and SGD settings below are not referenced by the
# code visible in this script -- confirm whether they are still needed.
ngram_range = (2, 8)  # alternative that was tried: (3, 3)
analyzer = 'char'
stop_words = None
# ------ SGD Classifier -----
max_iter = 50  # alternative that was tried: 1000

########################################################################################################################
#                                                  Implementation
########################################################################################################################
def _best_match(scores):
    """Return ``(position, score)`` of the largest entry of *scores*.

    Ties resolve to the first occurrence. An empty *scores* yields
    ``(0, 0.0)`` so the caller treats it as "no match".
    """
    if len(scores) == 0:
        return 0, 0.0
    position = int(np.argmax(scores))
    return position, scores[position]


def run_svd_classifier(train_dataset, test_dataset):
    """Classify test documents by nearest neighbour in an LSI space and print a report.

    Training documents are tokenized into lower-cased words, weighted with
    TF-IDF and projected into an LSI space with ``len(languages)`` topics.
    Each test document goes through the same bow -> tfidf -> lsi pipeline and
    receives the label of the most similar training document.

    Test documents whose best similarity is not strictly positive are not
    classified; they are left out of the accuracy and the report, and their
    number is printed.

    Args:
        train_dataset: object exposing ``data`` (iterable of text documents),
            ``target`` (label index per document) and ``target_names``.
        test_dataset: object exposing ``data`` and ``target``.

    Returns:
        None. Misclassified documents, accuracy and the classification report
        are printed to stdout.
    """
    lsa_dimensions = len(languages)
    train_tokens = tokenize_words(train_dataset.data)
    dictionary = corpora.Dictionary(train_tokens)
    corpus = [dictionary.doc2bow(tokens) for tokens in train_tokens]

    print("Applying TF-IDF")
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Project the TF-IDF corpus into a latent space with one topic per language.
    print("Create LSI")
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=lsa_dimensions,
                          power_iters=4)
    corpus_lsi = lsi[corpus_tfidf]  # lazy wrapper: bow -> tfidf -> lsi

    print("Create Index")
    sim_index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsa_dimensions)

    # Queries must go through the same transformations as the indexed corpus;
    # feeding raw bag-of-words counts to an LSI model trained on TF-IDF weights
    # would put queries and indexed documents on different scales.
    query_vectors = [
        lsi[tfidf[dictionary.doc2bow(tokens)]]
        for tokens in tokenize_words(test_dataset.data)
    ]
    sim_scores = sim_index[query_vectors]

    print("Similarity measures(-1..0..1)")
    predicted = []
    actual = []
    skipped = 0
    for doc_no, (sim_score, doc) in enumerate(zip(sim_scores, test_dataset.data)):
        best_pos, best_score = _best_match(sim_score)
        if not best_score > 0:
            # No training document is positively similar: nothing to predict.
            skipped += 1
            continue

        predicted_label = train_dataset.target[best_pos]
        actual_label = test_dataset.target[doc_no]
        predicted.append(predicted_label)
        actual.append(actual_label)

        # NOTE(review): label names are looked up in the module-level
        # `languages` list while the report below uses
        # train_dataset.target_names -- confirm both share the same ordering.
        predicted_lang = languages[predicted_label]
        actual_lang = languages[actual_label]
        if predicted_lang != actual_lang:
            print("Predicted: '" + predicted_lang + "' actual: '" + actual_lang
                  + "' Similarity score: " + str(best_score))
            print("DOC:" + doc)

    if skipped:
        print("Skipped %d test document(s) with no positive similarity to any training document" % skipped)
    if not predicted:
        # Avoid a NaN accuracy and an empty classification report.
        print("No test document could be classified; nothing to report")
        return

    print("Accuracy: %.4f" % np.mean(np.array(predicted) == np.array(actual)))
    label_ids = list(range(len(train_dataset.target_names)))
    print(metrics.classification_report(actual, predicted, labels=label_ids,
                                        target_names=train_dataset.target_names))


def tokenize_words(texts):
    """Split each text into lower-cased word tokens.

    Splitting is done on any run of whitespace (spaces, tabs, newlines), so
    repeated, leading or trailing whitespace never produces empty-string
    tokens.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one list of tokens per input text; an empty or
        whitespace-only text yields an empty token list.
    """
    return [text.lower().split() for text in texts]

def tokenize_chars(texts):
    """Break every text into the list of its individual characters.

    Returns one list per input text, in input order.
    """
    char_lists = []
    for text in texts:
        char_lists.append(list(text))
    return char_lists

# --- Stage 1: load the train/test split for all configured languages (timed) ---
print("########### Loading Corpora ################")
t0 = time()
dataset = fetch_itaki(
    subsets=['train', 'test'],
    categories=languages,
    skip_non_english=False,
    train_precentage=train_precentage,
    verbose=False,
)
svd_train_all, svd_test_all = dataset['train'], dataset['test']
duration = time() - t0
print("Loading Corpora: done in %fs" % duration)

# --- Stage 2: build the LSI index from the train split and score the test split (timed) ---
print("########### Classifier ################")
t0 = time()
run_svd_classifier(svd_train_all, svd_test_all)
duration = time() - t0
print("Run Classifier: done in %fs" % duration)
