import csv
import json
import os
import random
import string

import nltk

# When True, each sample's 'turns' holds a hint header followed by the
# candidate options, and "+op" is appended to the output dataset name.
# When False, 'turns' is two empty strings and the name is left unchanged.
add_option = True


def preprocess(text):
    """Return *text* re-joined from its NLTK tokens, minus a trailing punctuation token.

    The input is split with ``nltk.word_tokenize``. If that yields no
    tokens an empty string is returned. Otherwise, when the final token
    occurs in ``string.punctuation`` it is dropped, and the remaining
    tokens are joined with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return ""

    # Only the very last token is inspected; punctuation elsewhere is kept.
    last_token = tokens[-1]
    if last_token in string.punctuation:
        tokens = tokens[:-1]

    return ' '.join(tokens)

# ---------------------------------------------------------------------------
# Convert every MMLU test CSV into a JSON dataset of short-answer samples.
#
# Each CSV row is read as: question, candidate options..., answer letter.
# Rows whose preprocessed correct answer has more than MAX_ANSWER_TOKENS
# tokens are dropped, and at most MAX_SAMPLES rows are kept per file.
# ---------------------------------------------------------------------------

# Directory holding the raw CSV files; one JSON dataset is written per file.
MMLU_TEST_DIR = "/scratch2/USER/Verbosity/raw_dataset/MMLU/data/test"
# Directory (relative to the working directory) receiving the JSON output.
OUTPUT_DIR = "dataset"
# Samples whose answer tokenizes to more tokens than this are skipped.
MAX_ANSWER_TOKENS = 3
# Larger datasets are randomly down-sampled to this many entries.
MAX_SAMPLES = 500

name_list = []

# Make sure the output directory exists so the first dump cannot fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sorted so the processing order (and the printed name list) is reproducible;
# os.listdir() returns entries in arbitrary order.
for name in sorted(os.listdir(MMLU_TEST_DIR)):
    # Ignore anything in the directory that is not a CSV file.
    if not name.endswith('.csv'):
        continue

    file_path = os.path.join(MMLU_TEST_DIR, name)
    dataset = []
    # newline='' is required by the csv module so quoted fields containing
    # line breaks are parsed correctly; the encoding is pinned so the result
    # does not depend on the machine's locale.
    with open(file_path, newline='', encoding='utf-8') as file:
        for line in csv.reader(file):
            # csv.reader yields an empty list for blank lines.
            if not line:
                continue
            options = line[1:-1]
            # The last column is the answer letter; 'A' maps to index 0.
            output = ord(line[-1]) - ord('A')
            answer = preprocess(options[output])
            if len(nltk.word_tokenize(answer)) > MAX_ANSWER_TOKENS:
                continue
            if add_option:
                turns = ["Here are some candidates as hints:"] + options
            else:
                turns = ["", ""]
            dataset.append({'raw': line, 'turns': turns, 'question': line[0], 'output': answer})

    base_name = os.path.splitext(name)[0]
    dataset_name = base_name
    if add_option:
        dataset_name += "+op"
    save_file_path = os.path.join(OUTPUT_DIR, f"mmlu_{dataset_name}.json")

    # Re-seed for every file so each file's sample is the same regardless of
    # which other files are present or the order they are processed in.
    random.seed(42)
    new_data = dataset
    if len(new_data) > MAX_SAMPLES:
        new_data = random.sample(new_data, MAX_SAMPLES)

    with open(save_file_path, 'w', encoding='utf-8') as file:
        json.dump(new_data, file)
    print(f"Dump {len(new_data)} samples to {save_file_path}")

    # NOTE(review): the recorded name never carries the "+op" suffix, even
    # when the saved file does - kept as in the original; confirm intended.
    name_list.append(f"mmlu_{base_name}")

print(' '.join(name_list))
print(name_list)