import tensorflow as tf
import numpy as np
import pandas as pd
import scipy
import os
from word2gm_loader import *
from projectutil import find_list_ckpts
from ggplot import *
import time
import re
import pickle
from sklearn.metrics import f1_score
######################################
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
#%matplotlib inline
import matplotlib.cm as cm
#from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from ggplot import *
#import time #depricated
#from pandas.lib import Timestamp # use this instead of import time.
from pandas import Timestamp
from collections import defaultdict
import collections
## for tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#######****** This function is for SCWS from ALL DATA ******########
## Creates the vocabulary dictionaries:
    ## id2word and word2id
def vocab_dict(vocab_file):
    """Build the id <-> word lookup tables from a vocabulary file.

    Only the first whitespace-separated token of each non-blank line is
    used as the word. The token ``'UNK'`` is skipped. Ids are assigned in
    file order starting at 0; if a word occurs more than once, ``word2id``
    keeps the id of its last occurrence.

    Args:
        vocab_file: Path to the vocabulary file.

    Returns:
        Tuple ``(id2word, word2id)``: ``id2word`` is a list of words indexed
        by id, ``word2id`` maps each word to its id.
    """
    id2word = []
    # Text mode: the words must be ``str`` so they compare equal to 'UNK'
    # and to the ``str`` words other functions look up in ``word2id``.
    with open(vocab_file, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            word = fields[0]
            if word != 'UNK':
                id2word.append(word)
    word2id = {word: i for i, word in enumerate(id2word)}
    return id2word, word2id

### showing numbers of nearest neighbors(mentioned) of a given word.
def show_nearest_neighbors(id2emb, id2word, idx, num_nns):
    """Return the ``num_nns`` words whose embeddings are most cosine-similar to word ``idx``.

    The query word itself is part of the candidate set, so it normally
    appears first in the result.

    Args:
        id2emb: 2-D array-like of embeddings, one row per word id.
        id2word: Sequence mapping a word id to its word.
        idx: Row index of the query word in ``id2emb``.
        num_nns: Number of neighbours to return.

    Returns:
        List of words ordered from most to least similar.
    """
    emb = np.asarray(id2emb, dtype=float)
    query = emb[idx]
    # Cosine similarity needs the norm of EACH row. Dividing by the norm of
    # the whole matrix only rescales the dot products by a constant, which
    # ranks neighbours by raw dot product instead.
    denom = np.linalg.norm(emb, axis=1) * np.linalg.norm(query)
    # A zero-length vector has no defined direction; score it 0 rather than NaN.
    safe_denom = np.where(denom == 0, 1.0, denom)
    dist = np.where(denom == 0, 0.0, np.dot(emb, query) / safe_denom)
    # argsort is ascending, so reverse it to get the highest similarity first.
    highsim_idxs = dist.argsort()[::-1][:num_nns]
    return [id2word[i] for i in highsim_idxs]

# Dealing Huang data for pearson's correlations.
def process_huang(filename='ehuang_sim_wcontext/SCWS/ratings.txt', context_window=5, verbose=False):
  """Parse an SCWS-style ratings file into (word, context) pairs with a mean score.

  Each non-blank line must contain two target words wrapped in ``<b>..</b>``
  followed by a tab and whitespace-separated numeric ratings. Blank lines
  are skipped.

  Args:
    filename: Path of the ratings file, joined onto ``evaluation_data``.
    context_window: Number of tokens kept on each side of a target word.
    verbose: When true, print the parsed pieces of every line.

  Returns:
    List of tuples ``(word1, context1, word2, context2, ave_score)`` where
    each context is the list ``pre + post`` of tokens around that word and
    ``ave_score`` is the mean of the ratings on the line.

  Raises:
    ValueError: If a non-blank line does not match the expected layout.
  """
  dirname = 'evaluation_data'
  filepath = os.path.join(dirname, filename)
  # Compiled once instead of being re-parsed for every line.
  pattern = re.compile(r'(.*)<b>(.*)</b>(.*)<b>(.*)</b>(.*?)\t(.+)')
  result_list = []
  with open(filepath, 'r') as f:
    for line_num, line in enumerate(f):
      if not line.strip():
        continue
      ob = pattern.search(line)
      if ob is None:
        raise ValueError(
            'line {} of {} is not in the expected SCWS format'.format(line_num + 1, filepath))
      pre1 = ob.group(1).split()
      word1 = ob.group(2).strip()
      middle = ob.group(3).split()
      word2 = ob.group(4).strip()
      post2 = ob.group(5).split()
      scores = [float(score) for score in ob.group(6).split()]
      # The text between the two targets is the right context of word1 and
      # the left context of word2.
      pre1 = pre1[-context_window:]
      post1 = middle[:context_window]
      pre2 = middle[-context_window:]
      post2 = post2[:context_window]
      ave_score = np.mean(np.array(scores))
      if verbose:
        # Print the parsed values themselves, not their variable names.
        print (line)
        print ('---------')
        print ('word {} has context'.format(word1))
        print (pre1)
        print (post1)
        print ('.........')
        print ('word {} has context'.format(word2))
        print (pre2)
        print (post2)
        print ('scores = ', scores)
        print ('average score = ', ave_score)
      result_list.append((word1, pre1 + post1, word2, pre2 + post2, ave_score))
  return result_list

def topic_emb_dict():
    """Load and return the pickled topic-embedding dictionary of the SCWS all-data TDE run.

    NOTE(review): this module defines ``topic_emb_dict`` again further down;
    that later definition rebinds the name, so this version is not the one
    reachable after the module has been imported.
    """
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_SCWS_alldata/topic_emb.txt'
    with open(pickle_path, 'rb') as handle:
        # Deserialize straight from the open file object.
        return pickle.load(handle)

#MaxSimC
#AvgSimC

## please use this method for MaxSimC and AvgSimC

def _pairwise_topic_cosines(w1, w2, prob_wordvecs_dict, num_clusters):
    """Return the num_clusters x num_clusters grid of cosine similarities.

    Entry ``[c1, c2]`` is the cosine similarity between the vector of ``w1``
    under topic ``c1`` and the vector of ``w2`` under topic ``c2``.
    """
    metric_grid = np.zeros((num_clusters, num_clusters))
    # The vectors and norms of w2 do not depend on cl1, so look them up once.
    vecs2 = [prob_wordvecs_dict[cl][w2] for cl in range(num_clusters)]
    norms2 = [np.linalg.norm(vec) for vec in vecs2]
    for cl1 in range(num_clusters):
        we1 = prob_wordvecs_dict[cl1][w1]
        norm1 = np.linalg.norm(we1)
        for cl2 in range(num_clusters):
            metric_grid[cl1, cl2] = np.dot(we1, vecs2[cl2]) / (norm1 * norms2[cl2])
    return metric_grid


def maxSimC(w1, w2, prob_wordvecs_dict, num_clusters=20):
    """MaxSimC: maximum pairwise cosine similarity over all topic pairs.

    Args:
        w1: First word.
        w2: Second word.
        prob_wordvecs_dict: Mapping ``topic id -> {word: vector}`` holding an
            entry for both words under every topic in ``range(num_clusters)``.
        num_clusters: Number of topics to compare (default 20).

    Returns:
        ``1.0`` when the two words are identical, otherwise the largest
        cosine similarity found.

    Raises:
        KeyError: If a word has no vector under one of the topics.
    """
    if w1 == w2:
        return 1.0
    return np.max(_pairwise_topic_cosines(w1, w2, prob_wordvecs_dict, num_clusters))


def avgSimC(w1, w2, prob_wordvecs_dict, num_clusters=20):
    """AvgSimC: mean pairwise cosine similarity over all topic pairs.

    Args:
        w1: First word.
        w2: Second word.
        prob_wordvecs_dict: Mapping ``topic id -> {word: vector}`` holding an
            entry for both words under every topic in ``range(num_clusters)``.
        num_clusters: Number of topics to compare (default 20).

    Returns:
        ``1.0`` when the two words are identical, otherwise the mean of all
        pairwise cosine similarities.

    Raises:
        KeyError: If a word has no vector under one of the topics.
    """
    if w1 == w2:
        return 1.0
    return np.mean(_pairwise_topic_cosines(w1, w2, prob_wordvecs_dict, num_clusters))

def getModeldata(url):
    """Load an embeddings text file and split it into labels and vectors.

    The file is read with no header, splitting on single whitespace
    characters. Column 0 holds the label of each row and the remaining
    columns hold its embedding values.

    Args:
        url: Path or URL accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(id2topicword, topicword2id, id2emb, dtaset)``:
        ``id2topicword`` is the array of labels (column 0),
        ``topicword2id`` maps each label to its row index (the last row wins
        for duplicated labels), ``id2emb`` is the array of the remaining
        columns, and ``dtaset`` is the raw DataFrame.
    """
    # Raw string: '\s' is not a valid Python string escape.
    dtaset = pd.read_csv(url, delimiter=r'\s', header=None, engine='python')
    df_numpy = dtaset.to_numpy()
    id2topicword = df_numpy[:, 0]
    id2emb = df_numpy[:, 1:]
    topicword2id = {label: idx for idx, label in enumerate(id2topicword)}
    return id2topicword, topicword2id, id2emb, dtaset

def get_dataset(url):
    """Read a headerless, whitespace-separated file into a DataFrame.

    Args:
        url: Path or URL accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by splitting every line on single whitespace
        characters.
    """
    # Raw string: '\s' is not a valid Python string escape.
    return pd.read_csv(url, delimiter=r'\s', header=None, engine='python')

def defaultdict_to_regular(d):
    """Recursively turn nested ``defaultdict`` objects into plain ``dict`` objects.

    Anything that is not a ``defaultdict`` is returned unchanged, so the
    recursion stops at the first non-defaultdict value.
    """
    if not isinstance(d, defaultdict):
        return d
    plain = {}
    for key, value in d.items():
        plain[key] = defaultdict_to_regular(value)
    return plain

def topic_emb_dict():
    """Build ``{topic: {word: values}}`` from the 20News TDE vectors file and pickle it.

    Every row label has the form ``word#topic``. Punctuation is stripped
    from the word part before it is used as a key. The result is written to
    ``dict_topic_emb.txt`` in the same directory as the input.

    Returns:
        None. The dictionary is only written to disk.
    """
    url = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vectors_kk_dc_10000lines_s400_n8_w10_b0_oit15_init15_k20_count5_sample0.000100_input.txt"
    # Raw string: '\s' is not a valid Python string escape.
    dtaset = pd.read_csv(url, delimiter=r'\s', header=None, engine='python')
    topical_words_emb = defaultdict(lambda: defaultdict(list))
    for _, values in dtaset.iterrows():
        values = list(values)
        label_parts = values[0].split('#')
        word = re.sub(r'[^\w\s]', '', label_parts[0])  # remove punctuation
        # NOTE(review): ``+=`` concatenates, so two labels that collapse to the
        # same word inside one topic yield one longer list - confirm intended.
        topical_words_emb[int(label_parts[1])][word] += values[1:]
    topic_emb = defaultdict_to_regular(topical_words_emb)
    # ``with`` closes the file even if pickling fails.
    with open("evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/dict_topic_emb.txt", "wb") as f:
        pickle.dump(topic_emb, f)
    print('"dict_topic_emb.txt" file saved and closed')

def get_topic_emb():
    """Return the topic-embedding dictionary unpickled from ``dict_topic_emb.txt`` (20News TDE)."""
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/dict_topic_emb.txt'
    with open(pickle_path, 'rb') as handle:
        # Deserialize straight from the open file object.
        return pickle.load(handle)

######****************************** Topic embeddings for 20NewsGroupDatasets*******####################################

def topic_20News_emb_dict():
    """Build ``{topic: {word: values}}`` from the 20News vectors file and pickle it.

    Every row label has the form ``word#topic``; the word part is used as a
    key unchanged. The result is written to ``20_topic_emb.txt`` next to the
    input. To process the STE run instead of TDE, point both paths below at
    the ``STE/STE_20News`` directory.

    Returns:
        None. The dictionary is only written to disk.
    """
    ## for TDE
    url = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vectors_kk_dc_10000lines_s400_n8_w10_b0_oit15_init15_k20_count5_sample0.000100_input.txt"
    # Raw string: '\s' is not a valid Python string escape.
    dtaset = pd.read_csv(url, delimiter=r'\s', header=None, engine='python')
    topical_words_emb = defaultdict(lambda: defaultdict(list))
    for _, values in dtaset.iterrows():
        values = list(values)
        label_parts = values[0].split('#')
        # topical_words_emb[cluster][word]
        topical_words_emb[int(label_parts[1])][label_parts[0]] += values[1:]
    topic_emb = defaultdict_to_regular(topical_words_emb)
    ## for TDE; ``with`` closes the file even if pickling fails.
    with open("evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20_topic_emb.txt", "wb") as f:
        pickle.dump(topic_emb, f)
    # The message names the file that was actually written.
    print('"20_topic_emb.txt" file saved and closed')

def get_20Newstopic_emb():
    """Return the 20News topic-embedding dictionary unpickled from ``20_topic_emb.txt``.

    The TDE run is loaded; switch the path to ``STE/STE_20News`` for the STE run.
    """
    ## for TDE
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20_topic_emb.txt'
    with open(pickle_path, 'rb') as handle:
        # Deserialize straight from the open file object.
        return pickle.load(handle)

#######***************************** Bigram PMI and Top Bigram Block ***************####################################

######***********20News*************Probability of individual words under each topic****************####################

def create_20News_probability_topic_vectors(vocab_file, num_cluster):
    """Compute a per-topic weight for every word and pickle the result.

    Each word's values under a topic are summed into one scalar. For every
    topic that scalar is divided by the total over all vocabulary words, and
    ``{topic: {word: weight}}`` is written to
    ``20News_topic_centroid_prob_map.txt`` (TDE directory).

    Args:
        vocab_file: Vocabulary file passed to ``vocab_count_dict``.
        num_cluster: Number of topics; topics ``0 .. num_cluster - 1`` are used.

    Returns:
        None. The mapping is only written to disk.

    Raises:
        KeyError: If a vocabulary word has no entry under one of the topics.
    """
    topic_emb_20News = get_20Newstopic_emb()
    vocab_counts = vocab_count_dict(vocab_file)
    # Collapse each word's list of values into a single scalar. Only existing
    # keys are reassigned, so mutating while iterating is safe.
    for words in topic_emb_20News.values():
        for word, values in words.items():
            words[word] = sum(values)

    # Built before the loop so it exists even when num_cluster is 0.
    topic_word_prob = {}
    for index in range(0, num_cluster):
        print("Cluster-", index)
        print("Entering inner for loop...")
        # Normaliser: total over the vocabulary words for this topic.
        topic_total = 0
        for word in vocab_counts:
            topic_total += topic_emb_20News[index][word]
        print("Exiting inner for loop...")
        topic_word_prob[index] = {
            word: (value * 1.0) / topic_total
            for word, value in topic_emb_20News[index].items()
        }

    ## for TDE; ``with`` closes the file even if pickling fails.
    with open("evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20News_topic_centroid_prob_map.txt", "wb") as f:
        pickle.dump(topic_word_prob, f)
    print("'20News_topic_centroid_prob_map.txt' file wrote and closed")

def get_20Newstopic_centroid_prob_map():
    """Return the mapping unpickled from ``20News_topic_centroid_prob_map.txt``.

    The TDE run is loaded; switch the path to ``STE/STE_20News`` for the STE run.
    """
    ## for TDE
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20News_topic_centroid_prob_map.txt'
    with open(pickle_path, 'rb') as handle:
        # Deserialize straight from the open file object.
        return pickle.load(handle)

#######################***********************####################**************########################################

def get_sif_weights(vocab_file, a_weight=0.01):
    """Compute the weight ``a / (a + p(word))`` for every vocabulary word.

    ``p(word)`` is the word's count divided by the sum of all counts in the
    file. Blank lines are skipped.

    Args:
        vocab_file: Path to a file whose lines start with ``word count``.
        a_weight: Smoothing constant ``a`` (default 0.01).

    Returns:
        Dict mapping each word to its weight; empty if the file has no entries.
    """
    counts = {}
    total = 0
    with open(vocab_file, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            word, count = fields[:2]
            counts[word] = int(count)
            total = total + int(count)

    weight_dict = {}
    for word, count in counts.items():
        prob = count * 1.0 / total
        weight_dict[word] = a_weight * 1.0 / (a_weight * 1.0 + prob)
    return weight_dict

def vocab_count_dict(vocab_file):
    """Read a vocabulary file into a ``{word: count}`` dictionary.

    Args:
        vocab_file: Path to a file whose non-blank lines start with
            ``word count``.

    Returns:
        Dict mapping each word (``str``) to its integer count; a repeated
        word keeps the count of its last occurrence.
    """
    vocab_count = {}
    # Text mode so the keys are ``str``, matching ``get_sif_weights`` and the
    # ``str`` words used as keys elsewhere in this module.
    with open(vocab_file, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            word, count = fields[:2]
            vocab_count[word] = int(count)
    print("returning 'vocab_count' dict")
    return vocab_count

def get_doccofrequency(traindata):
    """Count document frequency and document co-frequency of words.

    Args:
        traindata: Iterable of documents, each a whitespace-separated string.

    Returns:
        Tuple ``(doc_freq, doc_cofreq)``: ``doc_freq[w]`` is the number of
        documents containing ``w``; ``doc_cofreq[w][v]`` is the number of
        documents containing both ``w`` and a different word ``v``. Every
        word in ``doc_freq`` has an entry (possibly empty) in ``doc_cofreq``.
    """
    print("Getting Doc-frequency and co-frequency")
    doc_cofreq = {}
    doc_freq = {}
    for item in traindata:
        # Each word is counted once per document.
        unique_item = set(item.split())
        for row in unique_item:
            # Explicit membership test instead of a bare ``except:`` that
            # would also hide unrelated errors.
            if row in doc_freq:
                doc_freq[row] += 1
            else:
                doc_freq[row] = 1
                doc_cofreq[row] = {}
            neighbours = doc_cofreq[row]
            for col in unique_item:
                if row != col:
                    neighbours[col] = neighbours.get(col, 0) + 1

    return (doc_freq, doc_cofreq)

def topic_index_wise_emb_val():
    """Group the rows of the WIKI TDE vectors file by topic and pickle the result.

    Every row label has the form ``word#topic``. The output
    ``{topic: [values_of_row, ...]}`` keeps the rows in file order and is
    written to ``topic_index_wise_emb_val.txt`` next to the input.

    Returns:
        None. The dictionary is only written to disk.
    """
    url = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_WIKI/vectors_kk_dc_10000lines_s400_n8_w10_b0_oit15_init15_k10_count5_sample0.000100_input.txt"
    # Raw string: '\s' is not a valid Python string escape.
    dtaset = pd.read_csv(url, delimiter=r'\s', header=None, engine='python')
    emb_topic = defaultdict(list)
    for _, values in dtaset.iterrows():
        values = list(values)
        topic = int(values[0].split('#')[1])
        emb_topic[topic].append(values[1:])

    # Store a plain dict so the pickle does not carry defaultdict behaviour.
    emb_topic = dict(emb_topic)
    # ``with`` closes the file even if pickling fails.
    with open("evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_WIKI/topic_index_wise_emb_val.txt", "wb") as f:
        pickle.dump(emb_topic, f)

def get_topic_indx_wise_emb_val():
    """Regenerate ``topic_index_wise_emb_val.txt`` and return its unpickled content.

    ``topic_index_wise_emb_val()`` is called first, so the pickle is rebuilt
    from the vectors file on every call before being read back.
    """
    topic_index_wise_emb_val()
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_WIKI/topic_index_wise_emb_val.txt'
    with open(pickle_path, 'rb') as handle:
        raw_bytes = handle.read()
    return pickle.loads(raw_bytes)

## This method prepares the per-topic word probabilities that are sorted to obtain the top bi-grams.
## topic_centroid_prob_map[index][word] = here are our topic-wise words

## topic_centroid_prob_map[index][word] = here are our topic-wise words
#def get_probability_topic_vectors(vocab_dict, topic_emb, num_cluster):
def create_probability_topic_vectors(vocab_file, num_cluster):
    """Compute a per-topic weight for every word and pickle the result.

    Each word's values under a topic are summed into one scalar. For every
    topic that scalar is divided by the total over all vocabulary words, and
    ``{topic: {word: weight}}`` is written to ``topic_centroid_prob_map.txt``.

    NOTE(review): the embeddings come from ``get_20Newstopic_emb()`` (which
    reads the TDE directory) and ``get_topic_centroid_prob_map`` also reads
    the TDE directory, but this function writes into ``STE/STE_20News``.
    Confirm that the three paths are switched together.

    Args:
        vocab_file: Vocabulary file passed to ``vocab_count_dict``.
        num_cluster: Number of topics; topics ``0 .. num_cluster - 1`` are used.

    Returns:
        None. The mapping is only written to disk.

    Raises:
        KeyError: If a vocabulary word has no entry under one of the topics.
    """
    topic_emb = get_20Newstopic_emb()
    vocab_counts = vocab_count_dict(vocab_file)
    # Collapse each word's list of values into a single scalar. Only existing
    # keys are reassigned, so mutating while iterating is safe.
    for words in topic_emb.values():
        for word, values in words.items():
            words[word] = sum(values)

    topic_centroid_prob_map = {}
    for index in range(0, num_cluster):
        print("Cluster-", index)
        print("Entering inner for loop...")
        # Normaliser: total over the vocabulary words for this topic.
        topic_total = 0
        for word in vocab_counts:
            topic_total += topic_emb[index][word]
        print("Exiting inner for loop...")
        topic_centroid_prob_map[index] = {
            word: (value * 1.0) / topic_total
            for word, value in topic_emb[index].items()
        }

    ## for STE; ``with`` closes the file even if pickling fails.
    with open("evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/topic_centroid_prob_map.txt", "wb") as f:
        pickle.dump(topic_centroid_prob_map, f)
    print("'topic_centroid_prob_map.txt' file wrote and closed")

def get_topic_centroid_prob_map():
    """Return the mapping unpickled from ``topic_centroid_prob_map.txt``.

    The TDE run is loaded; switch the path to ``STE/STE_20News`` for the STE run.
    """
    ## for TDE
    pickle_path = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/topic_centroid_prob_map.txt'
    with open(pickle_path, 'rb') as handle:
        # Deserialize straight from the open file object.
        return pickle.load(handle)



def get_pmi(doc_cofreq, doc_freq, num_clusters, num_topwords, topic_centroid_prob_map):
    """Score each topic by summing a PMI-style value over pairs of its top words.

    For every topic the ``num_topwords`` highest-valued words are taken from
    ``topic_centroid_prob_map[index]``. For every pair ``(i, j)`` with
    ``j < i`` the score is
    ``log((cofreq(i, j) + 1) / (doc_freq[i] * doc_freq[j]))``.
    Each pair and its score is appended to ``bigram.txt`` (TDE directory).

    This code snippet is adapted from https://github.com/dheeraj7596/SCDV

    Args:
        doc_cofreq: ``{word: {other_word: co-document count}}``.
        doc_freq: ``{word: document count}``.
        num_clusters: Number of topics; topics ``0 .. num_clusters - 1`` are scored.
        num_topwords: Number of top words taken per topic.
        topic_centroid_prob_map: ``{topic: {word: value}}``.

    Returns:
        Tuple ``(topic_coherence, mean_coherence, top10words)``:
        ``topic_coherence`` maps each topic to its summed score,
        ``mean_coherence`` is the average over topics (0.0 when
        ``num_clusters`` is 0), and ``top10words`` maps each topic to its
        list of ``(word, value)`` pairs.
    """
    # Local import: the visible module header does not import ``math``. With
    # a bare ``except`` around the call, the resulting NameError silently
    # turned every score into 0.
    import math

    print("Getting PMI...")
    topic_coherence = {}
    overall_coherence = 0
    top10words = {}
    ## for TDE; append mode keeps the results of earlier runs.
    with open('evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/bigram.txt', 'a') as f:
        for index in range(0, num_clusters):
            most_probs = collections.Counter(topic_centroid_prob_map[index])
            require_mostcommons = most_probs.most_common(num_topwords)
            top10words[index] = require_mostcommons
            # Start at 0 so a topic with fewer than two words still has a score.
            topic_coherence[index] = 0
            # Iterate over the words actually returned; a topic may hold
            # fewer than num_topwords words.
            for i in range(1, len(require_mostcommons)):
                word_i = require_mostcommons[i][0]
                for j in range(0, i):
                    word_j = require_mostcommons[j][0]
                    try:
                        temp = doc_cofreq[word_i][word_j]
                    except KeyError:
                        # The two words never share a document.
                        temp = 0
                    try:
                        # The denominator uses the frequency of BOTH words of
                        # the pair (it previously used word i twice).
                        a = math.log(((temp + 1) * 1.0) / (doc_freq[word_i] * doc_freq[word_j]))
                    except (KeyError, ZeroDivisionError):
                        # Unknown or zero document frequency: pair contributes nothing.
                        a = 0
                    topic_coherence[index] += a
                    # Every scored pair is recorded, not only the first one.
                    f.write(word_i + "," + word_j + ":" + str(a) + "\n")

            overall_coherence += topic_coherence[index]
    print("Returning with PMI scores")
    mean_coherence = overall_coherence / num_clusters if num_clusters else 0.0
    return (topic_coherence, mean_coherence, top10words)

######## ************* END ******** PMI and Top Bigram***###################

# helper function
def quantitative_scws():
  """Intended to score SCWS word pairs with the model and correlate them with human ratings.

  NOTE(review): as written this function cannot complete when
  ``process_huang()`` returns any rows. ``getModeldata`` is defined in this
  file with a required ``url`` parameter, so the argument-less call inside
  the loop raises ``TypeError`` outside of any ``try`` block. The helper
  ``wordsim_context`` is also not defined in the visible part of this file.

  Returns:
    ``(spr, df)``: the result of ``scipy.stats.spearmanr(model_scores,
    human_scores)`` and a DataFrame with the columns ``word1``, ``word2``,
    ``human scores`` and ``model scores``.
  """
  data_huang = process_huang()
  #ws1, cs1, ws2, cs2, scores = zip(*data_huang)
  model_scores = []
  human_scores = []
  df = pd.DataFrame()
  for i, (w1, c1, w2, c2, human_score) in enumerate(data_huang):
    human_scores.append(human_score)
    w1, w2 = [w1.lower(), w2.lower()]
    # NOTE(review): getModeldata(url) needs a path and returns a 4-tuple, not
    # a mapping; the str-keyed lookups on ``word_vec`` below therefore cannot
    # succeed as written - confirm the intended data source.
    word_vec = getModeldata() ### reads the embeddings file into a DataFrame and returns the arrays for embeddings, labels and ids.
    # Build the model label for the Huang words w1 and w2 by appending '#' and the topic number.
    for cl1 in range(10):
        word1 = "'"+w1+"',"+'#'+str(cl1)
        print('word1:', word1)
        try:
            if word_vec[word1]:
                for cl2 in range(10):
                    word2 ="'"+w2+"',"+'#'+str(cl2)
                    print('word2:', word2)
                    try:
                        if word_vec[word2]:
                            model_score = wordsim_context(word_vec[word1], word_vec[word2]) ## only compute wordsim for words present in both Huang and the model.
                            model_scores.append(model_score)
                    # NOTE(review): bare except hides every error raised above, not only missing labels.
                    except:
                        continue
        # NOTE(review): bare except hides every error raised above, not only missing labels.
        except:
            continue

# This function appends the desired words to the numpy array.
# AvgSimC is used for the average similarity, and MaxSimC for the maximum similarity measure.

            '''
            try:
                model_score = wordsim_context(word_vec[word1], word_vec[word2])
                model_scores.append(model_score)
            except:
                continue
             '''
  df['word1'], _, df['word2'], _, df['human scores'] = zip(*data_huang)
  # NOTE(review): model_scores gets one entry per matching (cl1, cl2) pair, so
  # its length need not equal the number of rows in df - verify before use.
  df['model scores'] = model_scores
  # compute the Spearman correlation
  spr = scipy.stats.spearmanr(model_scores, human_scores)
  return spr, df

###################***For text SCWS data loading***######################
def data_scws(vocab_file):
  """Score the SCWS word pairs covered by the model vocabulary with MaxSimC.

  The Huang (SCWS) pairs are loaded and lower-cased, pairs with a word
  missing from the vocabulary are discarded, and each remaining pair is
  scored with ``maxSimC`` on the embeddings from ``get_20Newstopic_emb()``
  (adjust that loader's path to choose TDE or STE).

  Args:
    vocab_file: Vocabulary file passed to ``vocab_dict``.

  Returns:
    Tuple ``(model_scores, scws_scores, df)``: the model similarity and the
    human score of every kept pair (same order), and the DataFrame of kept
    pairs with the columns ``word1``, ``word2``, ``human scores`` and
    ``model scores``. The DataFrame keeps the row labels of the original data.
  """
  id2word, word2id = vocab_dict(vocab_file)  # dictionaries created from vocab.txt
  data_huang = process_huang()  # word pairs from the Huang text file

  df = pd.DataFrame()
  df['word1'], _, df['word2'], _, df['human scores'] = zip(*data_huang)
  # Lower-case the words so they match the vocabulary entries.
  df['word1'] = df['word1'].str.lower()
  df['word2'] = df['word2'].str.lower()
  print('Original Dataframe', df)  ###***Original Data frame***###

  # Keep only the pairs whose two words both exist in the model vocabulary.
  # One membership mask replaces per-word try/except lookups followed by a
  # full DataFrame scan for every missing word.
  known_words = set(word2id)
  in_vocab = df['word1'].isin(known_words) & df['word2'].isin(known_words)
  df = df[in_vocab].copy()
  print('New dataframe', df)

  scws_scores = df['human scores'].tolist()

  ####**************** getting topic-wise word embeddings ****************######
  prob_wordvecs_dict = get_20Newstopic_emb()
  ## MaxSimC is used here; call avgSimC(wn1, wn2, prob_wordvecs_dict) for AvgSimC.
  model_scores = [
      maxSimC(wn1, wn2, prob_wordvecs_dict)
      for wn1, wn2 in zip(df['word1'], df['word2'])
  ]

  df['model scores'] = model_scores
  print('DataFrame with model scores:', df)

  # Rows whose similarity is NaN (e.g. a zero-norm vector) are only reported.
  nan_rows = df[df['model scores'].isna()]
  print('nan_rows', nan_rows)

  return model_scores, scws_scores, df

# returns a dataframe of results
def quantitative_scws_df():
  """Return a one-row DataFrame with the Spearman correlation (x100) on SCWS.

  The model scores and human scores come from ``data_scws`` run on the
  20News TDE vocabulary; switch ``vocab_file`` to the ``STE/STE_20News``
  directory for the STE model.
  """
  ## for TDE
  vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vocab.txt"

  predicted, human, _ = data_scws(vocab_file)
  predicted_arr = np.asarray(predicted)
  human_arr = np.asarray(human)

  # spearmanr returns the correlation coefficient first and the p-value
  # second; only the coefficient is reported.
  correlation = scipy.stats.spearmanr(predicted_arr, human_arr)
  coef = correlation[0]

  df_spr = pd.DataFrame()
  df_spr['spearman'] = [coef * 100]
  return df_spr

def create_probability_word_vectors(vocab_file, output_file=None):
    """Build one concatenated per-cluster vector for every vocabulary word and pickle it.

    Every key of ``vocab_count_dict(vocab_file)`` gets a zero float32 vector of
    length ``num_clusters * num_features`` (20 * 400).  For each cluster ``k``
    and word ``w`` in the mapping returned by ``get_20Newstopic_emb()``, the
    slice ``[k * num_features:(k + 1) * num_features]`` of ``w``'s vector is
    filled with that word's embedding for the cluster.

    Args:
        vocab_file: path of the vocabulary file passed to ``vocab_count_dict``.
        output_file: where to pickle the resulting dictionary.  Defaults to
            the STE 20News location; for TDE pass
            "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20News_prob_wordvecs.txt".

    Raises:
        KeyError: if the topic embeddings contain a word that is not a key of
            the vocabulary dictionary.
    """
    num_features = 400
    num_clusters = 20

    if output_file is None:
        ## for STE (default)
        output_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/20News_prob_wordvecs.txt"

    # Named vocab_counts so the module-level vocab_dict() function is not shadowed.
    vocab_counts = vocab_count_dict(vocab_file)
    topic_word_emb = get_20Newstopic_emb()

    prob_wordvecs = {}
    for word in vocab_counts.keys():
        prob_wordvecs[word] = np.zeros(num_clusters * num_features, dtype="float32")

    for cluster, word_embeddings in topic_word_emb.items():
        start = cluster * num_features
        stop = (cluster + 1) * num_features
        for word, embedding in word_embeddings.items():
            prob_wordvecs[word][start:stop] = embedding

    # Binary mode for pickle; `with` closes the file even if dumping fails.
    with open(output_file, "wb") as f:
        pickle.dump(prob_wordvecs, f)
    print("'20News_prob_wordvecs.txt' file wrote and closed")

    return

def get_probability_word_vectors(wordvecs_file=None):
    """Load the pickled ``prob_wordvecs`` dictionary from disk.

    Args:
        wordvecs_file: path of the pickle file to read.  Defaults to the TDE
            20News file; for STE pass
            'evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/20News_prob_wordvecs.txt'.

    Returns:
        The object stored in the file (the dictionary written by
        ``create_probability_word_vectors`` when that produced the file).
    """
    if wordvecs_file is None:
        ## for TDE (default)
        wordvecs_file = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/20News_prob_wordvecs.txt'

    # NOTE: unpickling can execute arbitrary code -- only load files that this
    # project generated itself.
    with open(wordvecs_file, 'rb') as handle:
        prob_wordvecs = pickle.load(handle)
    print("Returning with 'prob_wordvecs' dictionary")
    return prob_wordvecs

### for document vectors.
def create_cluster_vector_and_gwbowv(prob_wordvecs, wordlist, dimension, word_idf_dict, sif_weight_dict, num_centroids, train = False):
    """Build the L2-normalised document vector for one list of words.

    Each word's vector is weighted by its idf value and its SIF weight and the
    weighted vectors are summed; the sum is divided by its Euclidean norm
    unless that norm is zero.

    Args:
        prob_wordvecs: mapping word -> vector of length
            ``num_centroids * dimension``.
        wordlist: iterable of the document's words.
        dimension: size of one per-centroid embedding.
        word_idf_dict: mapping word -> idf weight.
        sif_weight_dict: mapping word -> SIF weight.
        num_centroids: number of centroids (clusters).
        train: when True, the minimum and maximum entries of the resulting
            vector are added to the module-level ``min_no`` / ``max_no``.
            Both globals must already exist, otherwise a NameError is raised.

    Returns:
        A float32 ``numpy`` array of length ``num_centroids * dimension``.
    """
    global min_no
    global max_no

    bag_of_centroids = np.zeros(num_centroids * dimension, dtype="float32")

    for word in wordlist:
        try:
            weighted_vector = prob_wordvecs[word] * word_idf_dict[word] * sif_weight_dict[word]
        except KeyError:
            # Word is missing from one of the three lookups: skip it.  Any
            # other error (e.g. a malformed vector) is allowed to surface.
            continue
        bag_of_centroids += weighted_vector

    # einsum('...i,...i') is the dot product of the vector with itself.
    norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
    if norm != 0:
        bag_of_centroids /= norm

    # To make feature vector sparse, make note of minimum and maximum values.
    if train:
        min_no += bag_of_centroids.min()
        max_no += bag_of_centroids.max()

    return bag_of_centroids

#######################***For topic wise bigram****######################

import nltk

def bigram_topical_words(num_of_clusters):
    """Rank corpus bigrams for every topic and save each topic's top 150.

    Bigram counts are collected over the "news" column of 'data/all_v2.tsv'.
    For each cluster ``index`` in ``range(num_of_clusters)`` a bigram
    ``(a, b)`` is scored as
    ``np.dot(topic_word_prob[index][a], topic_word_prob[index][b]) * count``
    where ``topic_word_prob`` comes from
    ``get_20Newstopic_centroid_prob_map()``.  Bigrams containing a word that
    has no entry for the cluster are skipped.  The 150 highest-scoring rows of
    each cluster are written to ``bigram<index>.txt`` in the TDE Bigrams
    directory.

    Args:
        num_of_clusters: number of topics/clusters to produce rankings for.
    """
    ### uncomment below line if want to use combined embedding and probability.
    #topic_word_emb = get_20Newstopic_emb() ## using combined embedding and probability of a word under each topic.

    ## using only topic word probability (without word embedding)
    topic_word_prob = get_20Newstopic_centroid_prob_map()

    counts = defaultdict(int)

    all_news = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")
    for review in all_news["news"]:
        # Get the wordlist in each news article.
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        for pair in nltk.bigrams(words):
            counts[pair] += 1

    for index in range(0, num_of_clusters):
        print(index)
        # Plain per-cluster lists: writing into locals() is not a supported
        # way to create variables inside a function.
        bigrams = []
        probabilities = []
        clusters = []

        # Iterating the dict directly works on both Python 2 and Python 3.
        for pair in counts:
            first, second = pair
            try:
                score = np.dot(topic_word_prob[index][first], topic_word_prob[index][second])
            except (KeyError, IndexError):
                # No probability stored for this cluster/word: skip the bigram.
                continue
            score = score * counts[pair]
            bigrams.append(" ".join((first, second)))  ## to make bigram as string
            probabilities.append(score)
            clusters.append(index)

        cluster_df = pd.DataFrame()
        cluster_df['Bigrams'] = bigrams
        cluster_df['Probability'] = probabilities
        cluster_df['Cluster'] = clusters

        # Top 150 by probability; nlargest returns them in descending order and
        # keep='last' prefers the later rows when probabilities tie.
        cluster_df = cluster_df.set_index(['Bigrams']).groupby('Cluster')['Probability'].nlargest(150, keep='last').reset_index()

        #for TDE
        np.savetxt(r'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/Bigrams/bigram'+str(index)+'.txt', cluster_df.values, fmt='%s %s %s')
        ## for STE
        #np.savetxt(r'evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/Bigrams/bigram' + str(index) + '.txt', cluster_df.values, fmt='%s %s %s')

    print("Data frame saved 'bigram.txt'")
    ## below goes methods to rank bigrams collections and dataframe..
    return

if __name__ == '__main__':
    # Script entry point.  Only the PMI section at the bottom is active; the
    # triple-quoted blocks are disabled experiments (SCWS correlation, 20News
    # document representation, topic-wise bigrams) that are enabled by removing
    # the surrounding quotes.
    # NOTE(review): `sess` is not used by any statement visible in this block.
    sess = tf.Session()
    ## creating 20_topic_emb.txt - for topic embeddings.
    ###*************************************************************
    ### creating topic word_embedding dictionary for 20NewsDatasets
    #topic_20News_emb_dict()
    ####*****************########**********************************=
    # print("Differetn length")
    ## word_val is the dictionary word#cl : emb_val
    ## creating dictionary word_val using getModeldata()
    #df_numpy, emb = getModeldata()
    #id2topicword, topicword2id, id2emb = getModeldata()
    #print(id2emb[int(topicword2id['party#5'])])
    #print(df_numpy)
    #print(emb)
    #load_vocab()
    #quantitative_scws_df()
    #calling vocabulary_dictionary
    #num_topics = 10
    #id2word, word2id, id2topicword, topicword2id = vocab_dict(num_topics)
    #print("id2word:", id2word[1])
    #print("word2id:", word2id['of'])
    #print("id2word:", id2topicword[0])
    #print("word2id:", topicword2id['the#0'])
    #vocab_file = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/NoAvg_commentOut_TDE_WIKI/vocab.txt'

    ########################This block is for topic centroid probability map#########
    #topic_emb_dict() ## write topic_emb_dict{}
    #get_topic_emb() ## read topic_emb{} dict
    #vocab_file = 'evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_WIKI/vocab.txt'
    #num_cluster = 10
    ## writing "topic_centroid_prob_map" dictionary in to topic_centroid_prob_map.txt file
    #create_probability_topic_vectors(vocab_file, num_cluster)
    ## reading back "topic_centroid_prob_map" dictionary.
    ###### Topic centroid Probability map is needed for PMI and top words in the topic.
    #topic_centroid_prob_map = get_topic_centroid_prob_map()
    #print(topic_centroid_prob_map[0])
    #############################**END**###########################################

    #data_scws()
    #process_huang()
    #data_scws()
    ##Loding Model data
    #url = 'evaluation_data/unfinish_ste/vectors_kk_dc_10000lines_s400_n8_w10_b0_oit15_init15_k10_count5_sample0.000100_input.txt'
    #id2topicword, topicword2id, id2emb, dtaset = getModeldata(url)

    '''
    ### for seving topicword2id dictionary.
    f1 = open("evaluation_data/unfinish_ste/topicword2id.txt", "wb")
    pickle.dump(topicword2id, f1)  ## topicword2id is a dictionary
    f1.close()
    ### for retriving saved topicword2id Dictionary data and value form the binary file and finally reconstruct the dictionary.
    topicword2id = {}
    with open('evaluation_data/unfinish_ste/topicword2id.txt', 'rb') as handle:
        data = handle.read()
    # reconstracting  the data as dictionary
        topicword2id = pickle.loads(data)
    '''

    #print("id2topicword", id2topicword)
    #print("topicword2id", topicword2id)
    #print("id2emb", id2emb)
    #print("dtaset.head(20)", dtaset.head(20))

    #### Topic Wise word embeddings dictionary -> topic_emb[topic]["word"] = [embeddings value]
    #### It does work Completely fine

    ######*********************************SCWS-Pearson***Correlations*************************************************#
    '''
    ### Please choose similarity Matrix maxSimC or avgSimc inside "data_scws()" method
    ### please choose vocab.txt file inside "quantitative_scws_df()" methods.

    scws_pearson_df = quantitative_scws_df()
    print(scws_pearson_df)
    '''
    ######*********************************-----------End SCWS----------------*****************************************#

    #######*********************************Document Representation 20News Dataset*************************************#
    '''
    ### first create probability word vectos which will call by 'get_probability_word_vectors()' below
    ### creating probability word vectors

    ## for TDE
    vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vocab.txt"

    ## for STE
    #vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/vocab.txt"

    ### **************Create_probability_word_vectors**************************
    ## creating tf_weight
    create_probability_word_vectors(vocab_file)

    ###************************************************************************###
    ## for TDE
    vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vocab.txt"

    ## for STE
    #vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/vocab.txt"

    sif_weight_dict = get_sif_weights(vocab_file)
    #print(sif_weight_dict)
    ###************************************************************************###

    prob_wordvecs = get_probability_word_vectors()

    ######################################for 20NewsDataset#######################################
    # Load train data.
    train = pd.read_csv('data/train_v2.tsv', header=0, delimiter="\t")
    # Load test data.
    test = pd.read_csv('data/test_v2.tsv', header=0, delimiter="\t")
    all = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")

    ################################***********for tfidf*********################################
    traindata = []
    for i in range(0, len(all["news"])):
       traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(all["news"][i], True)))

    tfv = TfidfVectorizer(strip_accents='unicode', dtype=np.float32)
    tfidfmatrix_traindata = tfv.fit_transform(traindata)
    featurenames = tfv.get_feature_names()
    idf = tfv._tfidf.idf_
    #print(idf)
    #print(featurenames)

    ## Creating a dictionary with word mapped to its idf value
    print("Creating word-idf dictionary for Training set...")
    word_idf_dict = {}
    for pair in zip(featurenames, idf):
        #print(pair)
        word_idf_dict[pair[0]] = pair[1]
        #print(word_idf_dict)
        #print(pair[1])

    ###########*************for Document vectors****##
    num_features = 400
    num_clusters = 20
    #print(prob_wordvecs)
    #print(prob_wordvecs.type())
    ### below block to making singel value by summing all featurs

            #prob_wordvecs[k][i] = l
            #print(prob_wordvecs[k][i])
    #prob_wordvecs = get_topic_emb() ### this is the dictionary of topic wise word embeddings Note: if  full num_features doesn't work we will use the summ version or have to simplify the dictionary.
    ### for this part we dont need to use 'word_centroid_map' and 'word_centroid_prob_map' because our embedidngs  already combined with probability.

    # gwbowv is a matrix which contains normalised document vectors.
    gwbowv = np.zeros((train["news"].size, num_clusters * (num_features)), dtype="float32")
    #gwbowv = np.zeros((train["news"].size, 1), dtype="float32")

    counter = 0
    min_no = 0
    max_no = 0
    cnt = 0
    for review in train["news"]:
        # Get the wordlist in each news article.
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        #print(words)
        #print(prob_wordvecs)
        s = time.time()
        gwbowv[counter] = create_cluster_vector_and_gwbowv(prob_wordvecs, words, num_features, word_idf_dict, sif_weight_dict, num_clusters, train=True)
        cnt += time.time() - s

        counter += 1
        if counter % 1000 == 0:
            print("Train News Covered : ", counter)

    gwbowv_name = "SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"

    #gwbowv_test = np.zeros((test["news"].size, num_clusters * (num_features)), dtype="float32")
    gwbowv_test = np.zeros((test["news"].size, num_clusters * (num_features)), dtype="float32")

    counter = 0
    for review in test["news"]:
        # Get the wordlist in each news article.
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        gwbowv_test[counter] = gwbowv_test[counter] = create_cluster_vector_and_gwbowv(prob_wordvecs, words, num_features, word_idf_dict, sif_weight_dict, num_clusters)
        counter += 1
        if counter % 1000 == 0:
            print("Test News Covered : ", counter)

    test_gwbowv_name = "TEST_SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"

    ### Using principle component Analysis(PCA). to remove common component from the weighted average vector to remvoe noise and redundancy from the document vectors which represent discriminating document.

    ## saving gwbowv train and test matrices
    np.save(gwbowv_name, gwbowv)
    np.save(test_gwbowv_name, gwbowv_test)

    print("##########Print***Train***document**vector##############")
    print("Train Document vector gwbowv:", gwbowv)
    print("##########Print***Test***document**vector##############")
    print("Test Document vector gwbowv:", gwbowv_test)

    ###************************************Fitting-For Classification-Score*************************
    print("Fitting a SVM classifier on labeled training data...")

    param_grid = [{'C': np.arange(0.1, 7, 0.2)}]
    scores = ['accuracy', 'recall_micro', 'f1_micro', 'precision_micro', 'recall_macro', 'f1_macro', 'precision_macro',
              'recall_weighted', 'f1_weighted', 'precision_weighted']  # , 'accuracy', 'recall', 'f1']
    for score in scores:
        strt = time.time()
        print("# Tuning hyper-parameters for", score, "\n")
        clf = GridSearchCV(LinearSVC(C=1), param_grid, cv=5, scoring='%s' % score)
        clf.fit(gwbowv, train["class"])
        print("Best parameters set found on development set:\n")
        print(clf.best_params_)
        print("Best value for ", score, ":\n")
        print(clf.best_score_)
        Y_true, Y_pred = test["class"], clf.predict(gwbowv_test)
        print("Report")
        print(classification_report(Y_true, Y_pred, digits=6))
        print("Accuracy: ", clf.score(gwbowv_test, test["class"]))
        print("Time taken:", time.time() - strt, "\n")
    endtime = time.time()

    print("********************************************************")
    '''
    #######*********************************END-Document Representation **************************************************#

    '''
    #########################################***Block For Bigram***#########################################################
    ## for TDE
    #vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vocab.txt"

    ## for STE
    vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/vocab.txt"
    #vocab_dict = vocab_count_dict(vocab_file)
    #print(vocab_dict)
    num_of_clusters = 20

    ## step: 1
    ## creating topic embeddings for TDE and STE by changing input file url in side the funciton.
    #topic_20News_emb_dict()

    ## step: 2
    ## writing "topic_centroid_prob_map" -> topic_word_prob dictionary in to 20News_topic_centroid_prob_map.txt file (only the probability of a word in a topic)
    #create_20News_probability_topic_vectors(vocab_file, num_of_clusters)
    ### just for checking if  "topic_word_prob" was created right  by  def create_20News_probability_topic_vectors(vocab_file, num_of_clusters) methods.
    ## *********don't need to run below line its for test only ***************
    #topic_word_prob = get_20Newstopic_centroid_prob_map()
    #print(topic_word_prob)

    ## step 3
    ### for getting Topic wise top bigram ranked by dot product of probability scores.
    bigram_topical_words(num_of_clusters) ## Using 20News_alldata ffor bigram
    ######################################****Bigram-process***########################################################
    '''

    #######**************************************Block is for PMI****************************************#########
    ## vocab_file url
    ## for TDE
    vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/vocab.txt"
    ## for STE
    #vocab_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/vocab.txt"

    num_of_clusters = 20
    num_topwords = 10

    ## Step-1 (needs to be run once to create the topic centroid values)
    ### Creating Topic Centroid Probability map
    #create_probability_topic_vectors(vocab_file, num_of_clusters)

    ## Step-2
    ### getting "topic_centroid_prob_map"
    topic_centroid_prob_map = get_topic_centroid_prob_map()

    ### printing "topic_centroid_prob_map" to test
    #print(topic_centroid_prob_map)

    # Named all_news so the builtin all() is not shadowed for the whole module.
    all_news = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")
    traindata = []
    for news in all_news["news"]:
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(news, True)))
    #print(traindata)
    ## step-3
    doc_freq, doc_cofreq = get_doccofrequency(traindata)

    ## step-4
    ## PMI of ranked 10 words.
    topic_pmi, overall_pmi, top10words_pmi = get_pmi(doc_cofreq, doc_freq, num_of_clusters, num_topwords, topic_centroid_prob_map)

    ## for TDE
    pmi_report_file = "evaluation_data/EXPERIMENTS_STE_TDE/TDE/TDE_20News/PMI/10words_pmi.txt"
    ## for STE
    #pmi_report_file = "evaluation_data/EXPERIMENTS_STE_TDE/STE/STE_20News/PMI/5words_pmi.txt"

    # `with` guarantees the report is flushed and closed even if a write fails.
    with open(pmi_report_file, "w") as outfile:
        outfile.write("Overall PMI : ")
        outfile.write(str(overall_pmi))
        outfile.write("\n")
        outfile.write("Topic PMI :")
        outfile.write(str(topic_pmi))
        outfile.write("\n")
        for i in range(num_of_clusters):
            outfile.write("##########################################################")
            # Newline so the first top word is not glued onto the separator.
            outfile.write("\n")
            for item in top10words_pmi[i]:
                outfile.write(str(item))
                outfile.write("\n")
            outfile.write(str(topic_pmi[i]))
            outfile.write("\n")
            outfile.write("**********************************************************")
            outfile.write("\n")

