from collections import Counter

import numpy as np
from scipy.spatial.distance import cosine

from distances import get_dist

def lines_to_ngrams(lines, n=3):
    """
    Convert every line into its list of word n-grams.

    Each line is cast to ``str``, stripped of '.' and newline characters and
    split on single spaces; empty tokens (from repeated spaces) are dropped.

    :param lines: iterable of lines (any objects convertible with ``str``)
    :param n: number of consecutive words per n-gram
    :return: list with one entry per line; each entry is the list of n-gram
             tuples of that line (empty when the line has fewer than n words)
    """

    def tokenize(line):
        # Remove the characters first, then split, so "a\nb" becomes one token.
        cleaned = str(line).replace('.', '').replace('\n', '')
        return [token for token in cleaned.split(' ') if token != '']

    result = []
    for line in lines:
        tokens = tokenize(line)
        window_count = len(tokens) - n + 1
        line_ngrams = [tuple(tokens[start:start + n]) for start in range(window_count)]
        result.append(line_ngrams)
    return result

def ngram_cosine_distance(ngram1, ngram2):
    """
    Calc cosine distance between two n-gram sequences in the n-gram space.

    Both sequences are turned into occurrence-count vectors over the union of
    their distinct n-grams, and the cosine distance between these two vectors
    is returned.

    :param ngram1: sequence of hashable n-grams (e.g. a list of tuples)
    :param ngram2: sequence of hashable n-grams (e.g. a list of tuples)
    :return: cosine distance; 1. when the sequences share no n-gram
             (including when either of them is empty)
    """
    # Count each sequence once instead of calling .count() for every element
    # of the n-gram space, which was quadratic in the sequence length.
    counts1 = Counter(ngram1)
    counts2 = Counter(ngram2)

    # acceleration step - if no intersection -> dist = 1.
    if counts1.keys().isdisjoint(counts2):
        return 1.

    n_space = list(counts1.keys() | counts2.keys())

    # vectorize: a Counter yields 0 for n-grams missing from that sequence
    vector1 = [counts1[n_gram] for n_gram in n_space]
    vector2 = [counts2[n_gram] for n_gram in n_space]
    return cosine(vector1, vector2)

def ngram_cosine_similarity_from_strings(str1, str2, n=3):
    """
    Calc the n-gram cosine similarity of two strings.

    The strings are split into n-grams with ``lines_to_ngrams`` and the
    similarity is 1 minus their ``ngram_cosine_distance``.

    :param str1: first string
    :param str2: second string
    :param n: n-gram size
    :return: 1 - cosine distance of the two n-gram sequences
    """
    first, second = lines_to_ngrams([str1, str2], n)
    distance = ngram_cosine_distance(first, second)
    return 1 - distance

def ngram_cosine_distance_from_strings(str1, str2, n=3):
    """
    Calc the n-gram cosine distance of two strings.

    The strings are split into n-grams with ``lines_to_ngrams`` and compared
    with ``ngram_cosine_distance``.

    :param str1: first string
    :param str2: second string
    :param n: n-gram size
    :return: cosine distance of the two n-gram sequences
    """
    first, second = lines_to_ngrams([str1, str2], n)
    return ngram_cosine_distance(first, second)

def normalized_unique_ngrams(ngram_lists):
    """
    Calc the fraction of distinct n-grams among all n-grams.

    :param ngram_lists: list of lists of ngrams
    :return: number of distinct n-grams divided by the total number of
             n-grams, a value in (0,1]; 0. when there are no n-grams at all
    """
    # Flatten the per-line lists into one pool of n-grams.
    pooled = []
    for ngram_list in ngram_lists:
        pooled.extend(ngram_list)

    total = len(pooled)
    if total > 0:
        return len(set(pooled)) / total
    return 0.

def unique_ngrams(response_set, n=3):
    """
    Calc the portion of unique n-grams in a set of responses.

    :param response_set: iterable of responses, split into n-grams with
                         ``lines_to_ngrams``
    :param n: n-gram size
    :return: result of ``normalized_unique_ngrams`` on the responses' n-grams
    """
    # Forward the caller's n; it used to be overridden by a hard-coded 3.
    return normalized_unique_ngrams(lines_to_ngrams(response_set, n=n))

def ngram_distance_matrix(resp_set, n=3, drop_duplicates=False):
    """
    Build the symmetric matrix of pairwise n-gram cosine distances.

    :param resp_set: iterable of responses, split into n-grams with
                     ``lines_to_ngrams``
    :param n: n-gram size
    :param drop_duplicates: if True (and there is more than one response),
                            the matrix is passed through
                            ``remove_duplicates_distances`` before returning
    :return: array of shape (len(resp_set), len(resp_set)) with zeros on the
             diagonal, or the reduced matrix when ``drop_duplicates`` is set
    """
    ngram_lists = lines_to_ngrams(resp_set, n=n)
    size = len(ngram_lists)

    distances = np.zeros(shape=(size, size))

    if size > 1:
        for i, first in enumerate(ngram_lists):
            # Only the upper triangle is computed; it is mirrored below.
            for j in range(i + 1, size):
                second = ngram_lists[j]
                if not first or not second:
                    # A response with fewer than n words has no n-grams.
                    # NOTE(review): such pairs get 0 (as before), although
                    # ngram_cosine_distance would give 1. - confirm intent.
                    dist = 0
                else:
                    # Compare the full n-gram lists of both responses, not
                    # only the words of their first n-gram.
                    dist = ngram_cosine_distance(first, second)
                distances[i, j] = dist
                distances[j, i] = dist

        if drop_duplicates:
            distances = remove_duplicates_distances(distances)
    return distances

def remove_duplicates(X):
    """
    Remove duplicate observations from a dataset.

    A message is printed when at least one duplicate row was dropped.

    Parameters
    ----------
    X : array_like, shape (`n_obs`, `n_vars`)
        A dataset whose rows are observations and columns are features.

    Returns
    -------
    X_unique : ndarray, shape (`n_unique`, `n_vars`)
        A dataset whose rows/observations are unique. The rows are returned
        in the sorted order produced by ``np.unique``, not in input order.
    """
    # Accept plain (nested) lists as promised by "array_like"; previously
    # only objects with a .shape attribute worked.
    X = np.asanyarray(X)
    X_unique = np.unique(X, axis=0)
    n_new = X_unique.shape[0]
    n = X.shape[0]
    if n_new != n:
        print(f"Out of the {n} observations in X, only {n_new} are unique.")

    return X_unique

def remove_duplicates_distances(D):
    """
    Drop duplicate rows, and the matching columns, from a matrix.

    The unique rows of `D` are taken with ``np.unique`` and then only the
    columns at the first-occurrence indices of those rows are kept. A message
    is printed when at least one row was dropped.

    Parameters
    ----------
    D : ndarray
        Presumably a square pairwise distance matrix - verify against caller.

    Returns
    -------
    ndarray
        The reduced matrix, with rows and columns in the order returned by
        ``np.unique``.
    """
    unique_rows, first_seen = np.unique(D, axis=0, return_index=True)
    # Keep the columns belonging to the surviving rows, in the same order.
    reduced = unique_rows[:, first_seen]

    n_before = D.shape[0]
    n_after = reduced.shape[0]
    if n_after != n_before:
        message = "Out of the " + str(round(n_before))
        message += " observations in X, only " + str(round(n_after))
        message += " are unique."
        print(message)

    return reduced



