import os
# from src.setting import *
from config.setting import *
# Set at import time, before the evaluator/prompt imports below are executed.
# NOTE(review): presumably this silences the HuggingFace tokenizers
# fork/parallelism warning -- confirm that a dependency actually reads it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import logging
import os
import colorlog
import json
from evaluator.evaluator import evaluate_answer
from evaluator.Accuracy import Accuracy
from utils import Result_Extractor

from evaluator.datasets.bigbenchhard_dataset import BigBenchHardDataset
from prompt.bigbenchhard_prompt_set import BigBenchHardPromptSet
from src.utils import get_gpt_response
from config.setting import EXECUTOR_PROMPT_FORMAT_FOR_BASELINE

def setup_logging(log_file_path):
    """Configure the root logger with a colored console handler and a log file.

    Both handlers are attached to the root logger, whose level is set to INFO.
    The log file is opened in mode "w", so it is truncated on every call.

    Args:
        log_file_path: Path of the log file to write. Its parent directory is
            created when it does not exist yet.
    """
    # logging.FileHandler opens the file immediately and raises
    # FileNotFoundError if the directory is missing, so create it up front.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    handler = colorlog.StreamHandler()
    handler.setFormatter(colorlog.ColoredFormatter(
        '%(log_color)s%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        log_colors={
            'DEBUG': 'cyan',
            'INFO': 'green',
            'WARNING': 'yellow',
            'ERROR': 'red',
            'CRITICAL': 'red,bg_white'
        }
    ))

    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel("INFO")

    # The file handler uses the same layout as the console, without color codes.
    file_handler = logging.FileHandler(log_file_path, mode="w")
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)


def main():
    """Run the baseline prompt over the "same" BigBenchHard test split.

    Each task is formatted into the baseline executor prompt and sent to the
    model; the extracted answer is compared with the task's correct answer and
    the running accuracy is logged after every task. The "unseen" split is
    loaded as well, but only its length is printed -- it is not evaluated here.
    """
    accuracy = Accuracy()
    setup_logging(os.path.join("./log", "baseline.log"))
    logger = logging.getLogger(__name__)
    logger.info("Progress Start!")

    dataset = BigBenchHardDataset()
    dataset_root_path = "./big_datasets/bigbenchhard"
    split_files = ("bigbenchhard_test_same.jsonl", "bigbenchhard_test_unseen.jsonl")
    same_tasks, unseen_tasks = [
        dataset.generate_tasks_by_file_path(os.path.join(dataset_root_path, file_name))
        for file_name in split_files
    ]
    for loaded_tasks in (same_tasks, unseen_tasks):
        print('length of test dataset is ', len(loaded_tasks))

    constraints = BigBenchHardPromptSet().get_constraint()

    for task in same_tasks:
        logger.info(f"Task {task.task_id} Description: {task.major_problem}")

        query_prompt = EXECUTOR_PROMPT_FORMAT_FOR_BASELINE.format(
            major_problem=task.major_problem,
            constraints=constraints,
        )
        logger.info(f"Query Prompt: {query_prompt}")

        raw_reply = get_gpt_response(
            query_prompt=query_prompt,
            system_prompt="You are a helpful assistant.",
        )
        response = Result_Extractor.extract_answer(raw_reply)
        logger.info(f"Response: {response}")

        result = evaluate_answer(task.correct_answer, response)
        accuracy.update(result)

        # Only the branch that applies is formatted, as in an if/else statement.
        verdict = (
            f"Task {task.task_id} is correct, response is {response}"
            if result
            else f"Task {task.task_id} is wrong, response is {response} with correct answer {task.correct_answer}"
        )
        logger.info(verdict)
        logger.info(f"Accuracy: {accuracy.get_accuracy()}")

        
        
# Run the baseline evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()