import os
import json
import re
import random
from random import shuffle
from tqdm import tqdm, trange
from nltk.tokenize import sent_tokenize, word_tokenize

# Directory holding one "<pageid>.txt" file per Wikipedia page.
path = "./wikitext/"
MAX_WORD_NUM = 400

filename_list = os.listdir(path)

# Keep only entries named "<decimal digits>.txt". Stray files (editor
# backups, .DS_Store, READMEs, ...) would otherwise crash int() or yield an
# id whose "<id>.txt" file does not exist when the page is opened later.
pageid_list = [
    int(stem)
    for stem, ext in map(os.path.splitext, filename_list)
    if ext == ".txt" and stem.isdecimal()
]
# Randomize the order in place so that taking a prefix gives a random sample.
shuffle(pageid_list)


# a test example: 41888831.txt

# print(len(pageid_list))    # 1093162

# Number of pages (taken from the front of the shuffled id list) to export.
MAX_PAGES = 10000
OUTPUT_PATH = "./wiki_data/lm_train_10000.txt"

# Create the output directory up front so open() does not fail on a fresh
# checkout where ./wiki_data/ does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Output format: one sentence per line, with a blank line after each page.
# The context manager guarantees the corpus file is flushed and closed even
# if reading or tokenizing a page raises part-way through the run.
with open(OUTPUT_PATH, 'w', encoding="utf-8") as wf:
    # Slicing (rather than counting and breaking) processes the same first
    # MAX_PAGES ids and lets tqdm display the real total.
    for pageid in tqdm(pageid_list[:MAX_PAGES]):
        filename = path + str(pageid) + ".txt"
        with open(filename, 'r', encoding='utf-8') as rf:
            # Skip the first line of the file; it is a header line
            # (presumably the page title), not body text.
            next(rf, None)

            for line in rf:
                line = line.strip().replace('*', '')

                # Match headings on a space-free copy so "== References =="
                # and "==References==" are handled identically.
                temp_line = line.replace(" ", '')

                # Stop reading the page once the references or literature
                # section begins; nothing after it is exported.
                if "=References=" in temp_line or "=Literature=" in temp_line:
                    break

                # Skip other section headings and blank lines.
                if "==" in temp_line or temp_line == "":
                    continue

                sent_list = sent_tokenize(line)

                # Only lines that split into more than one sentence are
                # used; single-sentence lines are dropped entirely.
                if len(sent_list) > 1:
                    # Keep sentences with more than 5 tokens.
                    wf.writelines(
                        sent + '\n'
                        for sent in sent_list
                        if len(word_tokenize(sent)) > 5
                    )
        # A blank line separates consecutive pages in the output.
        wf.write('\n')
            
