from wordfreq import zipf_frequency
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at import time so that every
# helper in this module reuses the same `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Tokens that are discarded when text is preprocessed. Stored as a set so that
# membership checks are O(1).
filter_words = {
    # --- stop words ---
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
    'alone', 'along', 'already', 'also', 'although', 'am', 'among', 'amongst', 'an', 'and', 'another',
    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'around', 'as',
    'at', 'back', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
    'between', 'beyond', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'didn',
    "didn't", 'doesn', "doesn't", 'don', "don't", 'down', 'due', 'during', 'either', 'else', 'elsewhere',
    'empty', 'enough', 'even', 'ever', 'everyone', 'everything', 'everywhere', 'except', 'first', 'for',
    'former', 'formerly', 'from', 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'he', 'hence',
    'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
    'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'isn', "isn't", 'it', "it's",
    'its', 'itself', 'just', 'latter', 'latterly', 'least', 'll', 'may', 'me', 'meanwhile', 'mightn',
    "mightn't", 'mine', 'more', 'moreover', 'most', 'mostly', 'must', 'mustn', "mustn't", 'my', 'myself',
    'namely', 'needn', "needn't", 'neither', 'never', 'nevertheless', 'next', 'no', 'nobody', 'none',
    'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'on', 'once', 'one', 'only',
    'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'per',
    'please', 's', 'same', 'shan', "shan't", 'she', "she's", "should've", 'shouldn', "shouldn't", 'somehow',
    'something', 'sometime', 'somewhere', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
    'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
    'thereupon', 'these', 'they', 'this', 'those', 'through', 'throughout', 'thru', 'thus', 'to', 'too',
    'toward', 'towards', 'under', 'unless', 'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't",
    'we', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while',
    'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
    "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd", "you'll", "you're", "you've",
    'your', 'yours', 'yourself', 'yourselves',
    # --- punctuation and miscellaneous tokens ---
    '.', '-', 'a the', '/', '?', 'some', '"', ',', 'b', '&', '!',
    '@', '%', '^', '*', '(', ')', '+', '=', '<', '>', '|', ':', ";", '～', '·',
}


def preprocess_text(text):
    """
    Normalize a piece of text and strip the filtered tokens from it.

    The text is lower-cased, every '.' character is deleted, and the result
    is split on whitespace. Tokens found in `filter_words` are dropped and
    the survivors are re-joined with single spaces. Punctuation other than
    '.' stays attached to its word.

    Returns:
        the cleaned text as a single space-separated string.
    """
    tokens = text.lower().replace('.', '').split()
    kept = (token for token in tokens if token not in filter_words)
    return ' '.join(kept)


def get_sentence_length(sentence):
    """
    Count the words left in a sentence once it has been preprocessed
    (i.e. after the filtered/stop words have been removed).
    """
    return len(preprocess_text(sentence).split())


def is_noun(word):
    """
    Check if the word is a noun.

    The word is lower-cased and passed through the spaCy pipeline on its
    own, so it is tagged without any sentence context. Only the first token
    of the result is inspected.

    Returns:
        True if the first token's coarse POS tag is 'NOUN'; False otherwise,
        including when the input yields no tokens (e.g. an empty string).
    """
    doc = nlp(word.lower())
    # An empty input produces an empty Doc; indexing it would raise IndexError.
    if len(doc) == 0:
        return False
    return doc[0].pos_ == 'NOUN'


def is_verb(word):
    """
    Check if the word is a verb.

    The word is lower-cased and passed through the spaCy pipeline on its
    own, so it is tagged without any sentence context. Only the first token
    of the result is inspected.

    Returns:
        True if the first token's coarse POS tag is 'VERB'; False otherwise,
        including when the input yields no tokens (e.g. an empty string).
    """
    doc = nlp(word.lower())
    # An empty input produces an empty Doc; indexing it would raise IndexError.
    if len(doc) == 0:
        return False
    return doc[0].pos_ == 'VERB'


def is_adj(word):
    """
    Check if the word is an adjective.

    The word is lower-cased and passed through the spaCy pipeline on its
    own, so it is tagged without any sentence context. Only the first token
    of the result is inspected.

    Returns:
        True if the first token's coarse POS tag is 'ADJ'; False otherwise,
        including when the input yields no tokens (e.g. an empty string).
    """
    doc = nlp(word.lower())
    # An empty input produces an empty Doc; indexing it would raise IndexError.
    if len(doc) == 0:
        return False
    return doc[0].pos_ == 'ADJ'


def get_pos_counts(sentence):
    """
    Count the parts of speech found in a sentence.

    The sentence is preprocessed first (see `preprocess_text`), so the
    tagger only sees the tokens that survive filtering.

    Returns:
        dictionary of counts of each part of speech in the sentence, keyed
        by the coarse POS tag (`token.pos_`), in order of first appearance.
    """
    doc = nlp(preprocess_text(sentence))
    pos_counts = {}
    for token in doc:
        pos = token.pos_
        pos_counts[pos] = pos_counts.get(pos, 0) + 1
    return pos_counts


def get_word_zipf_freq(word, lang='en'):
    """
    Look up the Zipf frequency of a word (lower-cased before the lookup)
    in the given language via `wordfreq.zipf_frequency`.
    """
    return zipf_frequency(word.lower(), lang)


def get_sentence_inv_freq_sum(sentence):
    """
    Add up 1 / zipf_frequency over the words of the preprocessed sentence.
        Larger value means the sentence is more informative.

    Words whose Zipf frequency is 0 are reported on stdout and contribute
    nothing to the total.
    """
    total = 0
    for token in preprocess_text(sentence).split():
        freq = get_word_zipf_freq(token)
        if freq == 0:
            # No usable frequency for this word: report it and skip it.
            print("Invalid word for zipf:", token)
            continue
        # Accumulate left-to-right with plain addition (deliberately not sum()).
        total += 1 / freq
    return total


# def get_norm_word_embed(word):
#     """
#     Get the norm of the word embedding.
#     """
#     word = word.lower()

#     return word


if __name__ == "__main__":
    # Small demo: print every metric for a handful of sample captions.
    divider = "===================================="
    sentences = (
        "A man with a red helmet on a small moped on a dirt road. ",
        "Man riding a motor bike on a dirt road on the countryside.",
        "A man riding on the back of a motorcycle.",
        "A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ",
        "A man in a red shirt and a red hat is on a motorcycle on a hill side.",
    )
    # (label, metric function) pairs, printed in this order for each sentence.
    reports = (
        ("Length:", get_sentence_length),
        ("POS counts:", get_pos_counts),
        ("Sum of inversed Zipf frequency:", get_sentence_inv_freq_sum),
    )

    for sentence in sentences:
        print(divider)
        print("Sentence:", sentence)
        for label, metric in reports:
            print(label, metric(sentence))
  