import pickle

# Fraction of the count-sorted noun list that is kept further down the script.
keep_radio = 0.6

# The pickle is unpacked into two objects: one iterated as (noun, count) pairs
# and one iterated as (sentence, nouns) pairs later in the script.
# NOTE(review): pickle.load executes arbitrary code from the file; only run
# this against a pickle produced by a trusted pipeline.
with open('./cc3m/dict_sentences_nouns.pkl', 'rb') as f:
    dict_nouns_to_count, dict_sentence_to_nouns = pickle.load(f)

# Ascending by count, so the least frequent nouns come first.
list_nouns_count = sorted(dict_nouns_to_count.items(), key=lambda pair: pair[1])
print('number of list_nouns_count:{}'.format(len(list_nouns_count)))


# print(list_nouns_count[:1000])
# print(list_nouns_count[-1000:])
def contains_digit(s):
    """Return True if any character of ``s`` is a digit according to ``str.isdigit``.

    An empty ``s`` yields False.
    """
    for ch in s:
        if ch.isdigit():
            return True
    return False


# Size of the prefix to retain from the ascending-by-count noun list.
keep_number = int(len(list_nouns_count) * keep_radio)
print('keep number:{}'.format(keep_number))
list_nouns_count = list_nouns_count[:keep_number]

# From that prefix, drop nouns that contain a digit or that have a count of
# exactly 1; the remainder is the lookup set used to select sentences.
keep_words = {
    word
    for word, count in list_nouns_count
    if not (contains_digit(word) or count == 1)
}

# A sentence is selected when at least one of its nouns is in keep_words.
# When selected, ALL nouns of that sentence (not only the matching ones) are
# added to final_words.
final_words = set()
selected_sentences = []
for position, (sentence, sentence_nouns) in enumerate(dict_sentence_to_nouns.items()):
    # Progress output every 10000 sentences.
    if position % 10000 == 0:
        print('index:{}'.format(position))
    if any(noun in keep_words for noun in sentence_nouns):
        final_words.update(sentence_nouns)
        selected_sentences.append(sentence)

# Converted from a set, so the order of final_words is arbitrary.
final_words = list(final_words)
print('number of final words:{}'.format(len(final_words)))
print('final words:{}'.format(final_words[:50]))
print('number of selected_sentences:{}'.format(len(selected_sentences)))

# Persist both results together as a two-element list.
with open('./cc3m/final_words_selected_sentences2.pkl', 'wb') as f:
    pickle.dump([final_words, selected_sentences], f)
