import json

from nltk.tokenize import RegexpTokenizer

from df_iapf import DFITF


def drop_contained_titles(scored_titles):
    """Return the titles that are not contained in any other listed title.

    Args:
        scored_titles: Sequence of ``(title, score)`` pairs. Only the title
            is inspected; the score is ignored.

    Returns:
        A list of titles in their original order, keeping a title only when
        ``title in other`` is false for every entry at a different index.
    """
    candidates = [title for title, _score in scored_titles]
    # NOTE(review): presumably titles are strings, making ``in`` a substring
    # test; in that case exact duplicates at different indices contain each
    # other and are all dropped -- confirm this is intended.
    return [
        title
        for i, title in enumerate(candidates)
        if not any(
            i != j and title in other for j, other in enumerate(candidates)
        )
    ]


if __name__ == '__main__':
    # Use context managers so both file handles are closed deterministically
    # (and the output is fully flushed) instead of relying on garbage
    # collection.
    with open('../data/mimic3/mimic3_train.json', encoding='utf-8') as train_file:
        train_data = json.load(train_file)

    # Split each note's TEXT field into runs of word characters.
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_notes = [tokenizer.tokenize(sample['TEXT']) for sample in train_data]

    dfitf = DFITF(tokenized_notes, n=5)
    rough_titles = dfitf.calculate()
    # Keep only the first 500 (title, score) pairs.
    # NOTE(review): presumably calculate() returns them best-first -- confirm.
    top = rough_titles[:500]
    filtered_titles = drop_contained_titles(top)

    with open('filtered_titles.json', 'w', encoding='utf-8') as out_file:
        json.dump(filtered_titles, out_file, indent=4)
