from datasets import load_dataset
import copy
import re


# Template for the intermediate representation of a single MCQ instance.
# transform_winogrande deep-copies this dict and fills in the per-instance
# fields, so the template itself is never mutated. Key order is preserved by
# that copy.
INTERMEDIATE_SCHEMA = {
    "task_type": "MCQ",
    "dataset": "",  # dataset name, filled in per instance
    "original_dataset_metadata": "https://huggingface.co/datasets/allenai/winogrande",  # URL of the source dataset page
    "dataset_input": "",  # instance given to the LLM without any instruction
    "candidate_answer_set": [],  # list of all possible answers for that instance
    "candidate_answer_label_space": [],  # list of all possible answer labels
    "ground_truth_answer_label": "",  # label of the correct answer
    "ground_truth_answer_text": "",  # text of the correct answer
    "dataset_instruction": "",  # task prompt; it should not define how to generate the answer
    "final_suffix_task_instruction": "",  # final task instruction appended to the input and dataset_instruction
    "final_prefix_task_instruction": "",  # final task instruction prepended to the input and dataset_instruction
    "task_instructions": [],  # transform_winogrande appends dataset_instruction here
    "instruction_output": [],  # transform_winogrande stores the ground-truth label here
    "instruction_following_errors_set": [],  # left empty by transform_winogrande
    "reasoning_error_set":[]  # left empty by transform_winogrande
}


def transform_winogrande(test_instance: dict):
    """Convert one raw Winogrande record into the intermediate MCQ schema.

    Args:
        test_instance: Record providing the keys ``sentence``, ``option1``,
            ``option2`` and ``answer``. ``answer`` must identify option 1 or
            option 2; it is normalised with ``str(...).strip()`` so both
            ``"1"`` and ``1`` are accepted.

    Returns:
        A deep copy of ``INTERMEDIATE_SCHEMA`` populated for this record. The
        two options are exposed under the labels ``"A"`` and ``"B"``.

    Raises:
        ValueError: If ``answer`` is neither 1 nor 2 (for example an empty
            string). Previously such records were silently labelled as
            option 2.
        AssertionError: If a required schema field ends up empty.
    """
    schema = copy.deepcopy(INTERMEDIATE_SCHEMA)

    # Metadata identifying the task and its source dataset.
    schema["task_type"] = "MCQ"
    schema["dataset"] = "Winogrande"

    options = [test_instance["option1"], test_instance["option2"]]
    labels = ["A", "B"]

    # Untransformed data fields.
    schema["dataset_input"] = f"Sentence: {test_instance['sentence']}\nOptions: "
    schema["candidate_answer_set"] = options
    # Option 1 is presented under label "A" and option 2 under label "B".
    schema["candidate_answer_label_space"] = labels

    # Validate the answer explicitly: falling through to "B" for anything that
    # is not "1" would record a wrong ground truth without any warning.
    answer = str(test_instance["answer"]).strip()
    if answer not in ("1", "2"):
        raise ValueError(
            f"Winogrande answer must be '1' or '2', got {test_instance['answer']!r}"
        )
    correct_index = int(answer) - 1
    schema["ground_truth_answer_text"] = options[correct_index]
    schema["ground_truth_answer_label"] = labels[correct_index]
    schema["instruction_output"] = [labels[correct_index]]

    # Instruction text associated with the instance.
    schema["dataset_instruction"] = "Given a sentence with a blank and 2 options namely 'A' and 'B', "
    schema["final_prefix_task_instruction"] = "Given a sentence with a blank and 2 options namely 'A' and 'B', complete the sentence by selecting the  values associated with the correct option labels corresponding to the blank.\n"
    schema["final_suffix_task_instruction"] = "\n"

    schema["task_instructions"].append(schema["dataset_instruction"])

    # Sanity checks on the populated schema.
    assert schema["dataset_input"] != ""
    assert schema["ground_truth_answer_text"] != ""
    assert schema["ground_truth_answer_label"] != ""
    assert len(schema["candidate_answer_label_space"]) > 0
    assert len(schema["candidate_answer_set"]) > 0

    assert schema["dataset_instruction"] != ""
    assert len(schema["instruction_output"]) > 0
    return schema

class Winogrande:
    """Winogrande validation data plus its intermediate-schema conversion.

    Attributes:
        dataset: The ``validation`` split of the ``winogrande_xl``
            configuration of ``allenai/winogrande`` as returned by
            ``load_dataset``.
        intermediate_representation: Result of mapping
            ``transform_winogrande`` over ``dataset`` with all of the
            original columns removed.
    """

    def __init__(self):
        super().__init__()

        validation_split = load_dataset(
            "allenai/winogrande",
            "winogrande_xl",
            split="validation",
        )
        self.dataset = validation_split

        # Drop every raw column so only the schema fields remain.
        raw_columns = validation_split.column_names
        self.intermediate_representation = validation_split.map(
            transform_winogrande,
            remove_columns=raw_columns,
            desc="Converting dataset to schema",
        )