from datasets import load_dataset
import copy
import re


# Template for one converted dataset instance. Callers are expected to take a
# deep copy before filling it in, because several fields are mutable lists.
INTERMEDIATE_SCHEMA = {
    "task_type": "MCQ",
    "dataset": "",
    "original_dataset_metadata": "https://huggingface.co/datasets/google/boolq/",
    # Instance given to the LLM without any instruction.
    "dataset_input": "",
    # List of all possible answers for that instance.
    "candidate_answer_set": [],
    # List of all possible answer labels.
    "candidate_answer_label_space": [],
    "ground_truth_answer_label": "",
    "ground_truth_answer_text": "",
    # Task prompt. It should not define how to generate the answer.
    "dataset_instruction": "",
    # Final task instruction appended to the input and the original task
    # instruction.
    "final_suffix_task_instruction": "",
    # Final task instruction prepended to the input and the original task
    # instruction.
    "final_prefix_task_instruction": "",
    "task_instructions": [],
    "instruction_output": [],
    "instruction_following_errors_set": [],
    "reasoning_error_set": [],
}



def transform_boolq(test_instance: dict) -> dict:
    """Convert one BoolQ example into the intermediate MCQ schema.

    Args:
        test_instance: Mapping with ``passage``, ``question`` and ``answer``
            keys. ``answer`` is stringified before use, so a bool or the
            strings ``"True"`` / ``"False"`` are accepted. The mapping itself
            is left unmodified.

    Returns:
        A deep copy of ``INTERMEDIATE_SCHEMA`` filled in for this example.

    Raises:
        ValueError: If the stringified answer is neither ``"True"`` nor
            ``"False"``.
    """
    schema = copy.deepcopy(INTERMEDIATE_SCHEMA)

    # Adding meta data information
    schema["task_type"] = "MCQ"
    schema["dataset"] = "BoolQ"

    candidate_answer_list = ["True", "False"]
    # The label space could equally be 0/1 or True/False; "A"/"B" is used here,
    # positionally aligned with candidate_answer_list.
    candidate_label_list = ["A", "B"]

    # Convert the Boolean value to a string in a local variable so the
    # caller's instance is not mutated.
    answer_text = str(test_instance["answer"])
    if answer_text not in candidate_answer_list:
        # Previously any non-"True" value was silently labelled "B", producing
        # a ground truth text that was not part of the candidate answer set.
        raise ValueError(
            f"Unexpected BoolQ answer {test_instance['answer']!r}; "
            f"expected one of {candidate_answer_list}"
        )

    # Copying untransformed data fields
    schema["dataset_input"] = (
        f"Passage: {test_instance['passage']}\n"
        f"Question: {test_instance['question']}\n"
        "Options: "
    )
    schema["candidate_answer_set"] = candidate_answer_list
    schema["candidate_answer_label_space"] = candidate_label_list

    # Pick the label at the same position as the answer text.
    schema["ground_truth_answer_label"] = candidate_label_list[
        candidate_answer_list.index(answer_text)
    ]
    schema["ground_truth_answer_text"] = answer_text

    # The prefix instruction is the dataset instruction followed by the
    # answer-format directive; build it from one literal to keep them in sync.
    dataset_instruction = "Given a passage and a boolean question, and the possible answer candidates 'A' or 'B', "
    schema["dataset_instruction"] = dataset_instruction
    schema["final_prefix_task_instruction"] = (
        dataset_instruction
        + "answer the question by selecting the value associated with the option label corresponding to the correct answer.\n"
    )
    schema["final_suffix_task_instruction"] = "\n"

    schema["instruction_output"] = [schema["ground_truth_answer_text"]]

    schema["task_instructions"].append(schema["dataset_instruction"])

    # Sanity checks on the populated schema (internal invariants, not input
    # validation).
    assert schema["dataset_input"] != ""
    assert schema["ground_truth_answer_text"] != ""
    assert schema["ground_truth_answer_label"] != ""
    assert len(schema["candidate_answer_label_space"]) > 0
    assert len(schema["candidate_answer_set"]) > 0

    assert schema["dataset_instruction"] != ""
    assert len(schema["instruction_output"]) > 0
    return schema

class BoolQ:
    """BoolQ data loaded from the HuggingFace hub and converted to the schema.

    Attributes set by ``__init__``:
        dataset: The raw split returned by ``load_dataset("google/boolq", ...)``.
        intermediate_representation: Result of mapping ``transform_boolq`` over
            ``dataset`` with all original columns removed.
    """

    def __init__(self, split: str = "validation"):
        """Load one BoolQ split and convert every example.

        Args:
            split: Name of the single dataset split to load. Defaults to
                ``"validation"``, the split that was previously hard-coded.
        """
        super().__init__()
        self.dataset = load_dataset("google/boolq", split=split)
        self.intermediate_representation = self.dataset.map(
            transform_boolq,
            remove_columns=self.dataset.column_names,
            desc="Converting dataset to schema",
        )
