import json
import os
import glob
import tqdm
import argparse
import random

random.seed(42)

def parse_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    Returns:
        argparse.Namespace: with attributes ``data_dir``, ``save_dir``,
        ``bbh_no_cot``, ``bbh_max_num_examples_per_task`` and
        ``chat_formatting_function``.
    """
    cli = argparse.ArgumentParser()

    # The two directory options are plain optional strings (None when omitted).
    for path_flag in ("--data_dir", "--save_dir"):
        cli.add_argument(path_flag, type=str)

    cli.add_argument(
        "--bbh_no_cot",
        action="store_true",
        help="if specified, chain of thoughts will be removed from the prompts.",
    )
    cli.add_argument(
        "--bbh_max_num_examples_per_task",
        type=int,
        default=40,
        help="maximum number of examples to evaluate per task.",
    )
    cli.add_argument(
        "--chat_formatting_function",
        type=str,
        default="eval.templates.create_prompt_with_tulu_chat_format",
        help="The function to use to create the chat format. This function will be dynamically imported. Please see examples in `eval/templates.py`.",
    )

    return cli.parse_args()



def _strip_chain_of_thought(task_prompt, task_name):
    """Return ``task_prompt`` with every ``Q:`` exemplar reduced to question + final answer."""
    new_prompt_fields = []
    # Fields are the blocks of the prompt separated by blank lines.
    for prompt_field in task_prompt.split("\n\n"):
        if not prompt_field.startswith("Q:"):
            # Fields that are not Q/A exemplars are kept unchanged.
            new_prompt_fields.append(prompt_field)
            continue
        assert "So the answer is" in prompt_field, f"`So the answer is` not found in prompt field of {task_name}.txt."
        assert "\nA:" in prompt_field, "`\nA:` not found in prompt field."
        # Keep only what follows the last "So the answer is" and what
        # precedes the first "\nA:"; the reasoning in between is dropped.
        answer = prompt_field.split("So the answer is")[-1].strip()
        question = prompt_field.split("\nA:")[0].strip()
        new_prompt_fields.append(question + "\nA: " + answer)
    return "\n\n".join(new_prompt_fields)


def create_bbh_eval_data(args, use_chat_format=False):
    """Load the BBH task examples and their per-task prompts from disk.

    Task files are read from ``<data_dir>/bbh/bbh/*.json`` (the ``"examples"``
    entry of each file) and prompts from ``<data_dir>/bbh/cot-prompts/*.txt``.
    The task name is the file's base name up to the first ``.``.

    Args:
        args: Parsed command-line namespace. Reads ``data_dir``,
            ``bbh_max_num_examples_per_task`` and ``bbh_no_cot``.
        use_chat_format: Accepted so the two-argument call in this script's
            entry point is valid. It does not influence the loaded data.

    Returns:
        tuple: ``(all_tasks, all_prompts)`` where ``all_tasks`` maps task name
        to its (possibly sub-sampled) list of examples and ``all_prompts``
        maps task name to its prompt text.

    Raises:
        AssertionError: if a ``Q:`` prompt field lacks ``So the answer is`` or
            ``\\nA:`` while stripping chain of thought, or if the task files
            and prompt files do not cover the same set of task names.
    """
    bbh_data_dir = os.path.join(args.data_dir, "bbh")
    max_examples = args.bbh_max_num_examples_per_task

    all_tasks = {}
    task_files = glob.glob(os.path.join(bbh_data_dir, "bbh", "*.json"))
    for task_file in tqdm.tqdm(task_files, desc="Loading tasks"):
        task_name = os.path.basename(task_file).split(".")[0]
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(task_file, "r", encoding="utf-8") as f:
            examples = json.load(f)["examples"]
        if max_examples:
            # random.sample raises ValueError when asked for more items than
            # the population holds, so clamp to the number available.
            examples = random.sample(examples, min(max_examples, len(examples)))
        all_tasks[task_name] = examples

    all_prompts = {}
    cot_prompt_files = glob.glob(os.path.join(bbh_data_dir, "cot-prompts", "*.txt"))
    for cot_prompt_file in tqdm.tqdm(cot_prompt_files, desc="Loading prompts"):
        task_name = os.path.basename(cot_prompt_file).split(".")[0]
        with open(cot_prompt_file, "r", encoding="utf-8") as f:
            # The first two lines of each prompt file are skipped
            # (presumably a header -- confirm against the data files).
            task_prompt = "".join(f.readlines()[2:])
        if args.bbh_no_cot:
            task_prompt = _strip_chain_of_thought(task_prompt, task_name)
        all_prompts[task_name] = task_prompt

    assert set(all_tasks.keys()) == set(all_prompts.keys()), "task names in task data and task prompts are not the same."

    return all_tasks, all_prompts


if __name__ == "__main__":
    # Script entry point: parse the CLI options, announce the benchmark, then
    # build the BBH data once with chat formatting on and once with it off.
    args = parse_args()

    print("BBH")
    for use_chat_format in (True, False):
        create_bbh_eval_data(args, use_chat_format)