from argparse import ArgumentParser
import os 
import glob
import json

import nltk
# One-time setup: run nltk.download('punkt') to fetch the tokenizer models used below.
from nltk.tokenize import word_tokenize, sent_tokenize


def split_text(data, word_limit):
    """Split every document in *data* and return all passages in one flat list.

    Args:
        data: Iterable of document strings.
        word_limit: Word budget handed to ``split_into_passages`` for each
            document.

    Returns:
        A single list holding the passages of every document, in input order.
    """
    return [
        passage
        for document in data
        for passage in split_into_passages(document, word_limit)
    ]


def split_into_passages_breakline(text, word_limit=500):
    """Group the lines of *text* into passages of at most *word_limit* words.

    The text is split on ``"\\n"`` and consecutive lines are packed greedily
    into a passage until adding the next line would push the passage over
    ``word_limit``. Lines inside a passage are re-joined with ``"\\n"``.
    A single line that is longer than ``word_limit`` is kept whole as its own
    passage (lines are never cut).

    Word counts come from ``word_tokenize`` applied to each line once; the
    size of a passage is the sum of the counts of its lines.

    Args:
        text: The document to split.
        word_limit: Maximum number of tokens per passage.

    Returns:
        A list of passage strings. Passages that contain only whitespace are
        never emitted, and an empty or whitespace-only *text* yields ``[]``.
    """
    # Nothing to split: avoid invoking the tokenizer at all.
    if not text or not text.strip():
        return []

    passages = []
    current_lines = []
    current_word_count = 0

    for line in text.split("\n"):
        # Tokenize each line exactly once and keep a running total instead of
        # re-tokenizing the whole accumulated passage on every iteration.
        line_word_count = len(word_tokenize(line))

        # Only flush when something has been collected; otherwise an oversized
        # first line would produce an empty leading passage.
        if current_lines and current_word_count + line_word_count > word_limit:
            passage = "\n".join(current_lines)
            if passage.strip():
                passages.append(passage)
            current_lines = []
            current_word_count = 0

        current_lines.append(line)
        current_word_count += line_word_count

    # Add the last passage if it's not empty. Joining the collected lines
    # (rather than prefixing each with "\n") avoids a spurious leading newline.
    passage = "\n".join(current_lines)
    if passage.strip():
        passages.append(passage)

    return passages





def split_into_passages(text, word_limit=500):
    """Group the sentences of *text* into passages of at most *word_limit* words.

    The text is split into sentences with ``sent_tokenize`` and consecutive
    sentences are packed greedily into a passage until adding the next
    sentence would push the passage over ``word_limit``. Sentences inside a
    passage are joined with a single space. A single sentence that is longer
    than ``word_limit`` is kept whole as its own passage (sentences are never
    cut).

    Word counts come from ``word_tokenize`` applied to each sentence once; the
    size of a passage is the sum of the counts of its sentences.

    Args:
        text: The document to split.
        word_limit: Maximum number of tokens per passage.

    Returns:
        A list of non-empty passage strings without leading whitespace. An
        empty or whitespace-only *text* yields ``[]``.
    """
    # Nothing to split: avoid invoking the tokenizers at all.
    if not text or not text.strip():
        return []

    passages = []
    current_sentences = []
    current_word_count = 0

    for sentence in sent_tokenize(text):
        # Tokenize each sentence exactly once and keep a running total instead
        # of re-tokenizing the whole accumulated passage on every iteration.
        sentence_word_count = len(word_tokenize(sentence))

        # Only flush when something has been collected; otherwise an oversized
        # first sentence would produce an empty leading passage.
        if current_sentences and current_word_count + sentence_word_count > word_limit:
            passages.append(" ".join(current_sentences))
            current_sentences = []
            current_word_count = 0

        current_sentences.append(sentence)
        current_word_count += sentence_word_count

    # Add the last passage if anything is left. Joining the collected
    # sentences (rather than prefixing each with " ") avoids a leading space.
    if current_sentences:
        passages.append(" ".join(current_sentences))

    return passages





def main():
    """Split one text file into passages and write them as JSON Lines.

    Command-line arguments (all positional):
        input_dir: Path of the text file to read. Despite its name, the path
            is opened directly as a single file.
        output_dir: Directory that receives the output file; it is created if
            missing. The output file has the same base name as the input.
        word_limit: Maximum number of words per passage.

    Each output line is a JSON object of the form ``{"text": <passage>}``.
    """
    parser = ArgumentParser(
        description="Split a text file into word-limited passages (JSON Lines)."
    )
    parser.add_argument('input_dir', help="path of the text file to split")
    parser.add_argument('output_dir', help="directory for the JSON Lines output")
    parser.add_argument('word_limit', type=int, help="maximum words per passage")

    args = parser.parse_args()
    input_name = os.path.basename(args.input_dir)
    print("Process {input_dir} is started.".format(input_dir=input_name))

    # makedirs(exist_ok=True) lets several runs share one output directory and
    # creates missing parents; os.mkdir raised if the directory already existed.
    os.makedirs(args.output_dir, exist_ok=True)

    list_document_path = [args.input_dir]
    for path in list_document_path:
        # Read with an explicit encoding so results do not depend on the locale.
        with open(path, encoding="utf-8") as f:
            documents = [f.read()]

        passages = split_text(documents, args.word_limit)
        new_path = os.path.join(args.output_dir, os.path.basename(path))
        with open(new_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps({"text": doc}) + "\n" for doc in passages)

    print("Process {input_dir} is done.".format(input_dir=input_name))


if __name__ == "__main__":
    main()
