from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import numpy as np
from source.controller.retriever.generate_passage_embeddings import main_modified
import os 
import pandas as pd 
import glob
import json
import datetime
# import nltk
# nltk.download('punkt')  # This is needed for tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
import subprocess

# Word limit forwarded to ./data/split.py for every input entry
# (presumably the maximum number of words per split chunk -- confirm against split.py).
word_limit = 100


def split_file(input_dir, output_dir, word_limit):
    """Start ``./data/split.py`` on one input path as a background process.

    The script path is relative, so it is resolved against the current
    working directory of the calling process.

    Args:
        input_dir: Path passed to the script as its first argument.
        output_dir: Path passed to the script as its second argument.
        word_limit: Word limit passed to the script as its third argument
            (converted with ``str`` for the command line).

    Returns:
        The ``subprocess.Popen`` handle of the started process. The call does
        not block; the caller is expected to ``wait()`` on the handle.
    """
    # Imported locally so the function does not depend on a file-level import.
    import sys

    # Run the child with the interpreter executing this script rather than
    # whatever "python" resolves to on PATH (which may be missing or may point
    # outside the active virtual environment).
    command = [sys.executable, './data/split.py', input_dir, output_dir, str(word_limit)]
    return subprocess.Popen(command)

# Directory whose direct children are split, and the directory that receives
# one output location per child.
INPUT_DIR = "data/ADE_med/data/"
OUTPUT_DIR = "data/ADE_med/data_split_100/"
# makedirs(exist_ok=True) keeps the script re-runnable (os.mkdir raises
# FileExistsError on a second run) and also creates missing parent directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Non-recursive listing of every direct child of INPUT_DIR; os.path.join
# avoids the doubled separator that INPUT_DIR + "/*" produced.
list_folder_path = glob.glob(os.path.join(INPUT_DIR, "*"), recursive=False)
print(list_folder_path)
# Start one child process per entry; they all run concurrently. The output
# location reuses the entry's own name; basename handles whichever path
# separator glob returns on the current platform.
processes = [
    split_file(folder, os.path.join(OUTPUT_DIR, os.path.basename(folder)), word_limit)
    for folder in list_folder_path
]

# Wait for every child and report non-zero exit codes instead of silently
# ignoring them; a failure in one entry does not abort the others.
for folder, process in zip(list_folder_path, processes):
    if process.wait() != 0:
        print("split failed for {} (exit code {})".format(folder, process.returncode))
    

