import lm_eval
from lm_eval import tasks
from lm_eval import utils as lm_eval_utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.models.huggingface import HFLM

# For every supported task name, the key that is looked up in that task's
# result entry to obtain its accuracy (see the `result.get(...)` call sites).
TASK_METRIC_MAP = dict(
    arc_challenge="acc_norm,none",
    arc_easy="acc_norm,none",
    hellaswag="acc_norm,none",
    piqa="acc_norm,none",
    winogrande="acc,none",
    openbookqa="acc_norm,none",
    boolq="acc,none",
)

def calculate_avg_accuracy(task_names: list, results: dict) -> float:
    """Average per-task accuracy, counting all MMLU subtasks as a single task.

    Every entry of ``results`` whose name does not contain ``'mmlu'``
    contributes its accuracy with equal weight. Entries whose name contains
    ``'mmlu'`` are first merged into one accuracy, weighted by the number of
    rows in each subtask's ``"test"`` split, and that merged value then counts
    as exactly one task in the final mean.

    Args:
        task_names: Names of the evaluated tasks.
        results: Maps a task name to that task's metric dict. The metric read
            for each task is the one named by ``TASK_METRIC_MAP[task]``.

    Returns:
        The mean accuracy over the tasks.

    Raises:
        KeyError: If a task in ``results`` has no entry in ``TASK_METRIC_MAP``,
            or an MMLU entry in ``results`` is not listed in ``task_names``.
        ZeroDivisionError: If ``task_names`` is empty.
    """
    n_tasks = len(task_names)
    acc_cumul = sum(
        result.get(TASK_METRIC_MAP[task])
        for task, result in results.items()
        if 'mmlu' not in task
    )

    # Size of each MMLU subtask's test split; used as the weight of that
    # subtask inside the merged MMLU accuracy.
    questions_per_mmlu_task = {
        task_name: lm_eval.tasks.get_task_dict([task_name])[task_name].dataset["test"].num_rows
        for task_name in task_names
        if 'mmlu' in task_name
    }

    if not questions_per_mmlu_task:
        return acc_cumul / n_tasks

    # Question-weighted accuracy across all MMLU subtasks.
    acc_mmlu = sum(
        result.get(TASK_METRIC_MAP[task]) * questions_per_mmlu_task[task]
        for task, result in results.items()
        if 'mmlu' in task
    )
    acc_mmlu_avg = acc_mmlu / sum(questions_per_mmlu_task.values())

    # The MMLU subtasks collapse into one virtual task, so the denominator is
    # the number of non-MMLU tasks plus one.
    n_effective_tasks = n_tasks - len(questions_per_mmlu_task) + 1
    return (acc_cumul + acc_mmlu_avg) / n_effective_tasks

    
def evaluate_zero_shot(model, tokenizer, batch_size=64, num_fewshot=0, task_names=None):
    """Run the lm_eval benchmark tasks on a model and report their accuracies.

    The model is wrapped in ``HFLM``, evaluated with
    ``lm_eval.simple_evaluate`` and the raw results, the rounded per-task
    metrics and the unrounded average are printed.

    Args:
        model: Passed to ``HFLM`` as ``pretrained``.
        tokenizer: Passed to ``HFLM`` as ``tokenizer``.
        batch_size: Forwarded to both ``HFLM`` and ``simple_evaluate``.
        num_fewshot: Forwarded to ``simple_evaluate``.
        task_names: Optional iterable of task names to evaluate. When ``None``
            the default seven-task suite is used.

    Returns:
        A dict mapping each task in the results to its metric rounded to four
        decimals, plus an ``'average'`` entry with the rounded mean.

    Raises:
        NotImplementedError: If a requested task has no entry in
            ``TASK_METRIC_MAP``.
    """
    hflm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=batch_size)

    if task_names is None:
        task_names = ["winogrande", "piqa", "openbookqa", "hellaswag", "boolq", "arc_easy", "arc_challenge"]
    else:
        # Materialise once so generators survive the validation pass below.
        task_names = list(task_names)

    print(task_names)

    # Fail before the (expensive) evaluation if a metric key is unknown.
    for task in task_names:
        if task not in TASK_METRIC_MAP:
            raise NotImplementedError(
                f"Please specify the metric to use for {task} in TASK_METRIC_MAP. Available info {TASK_METRIC_MAP}"
            )

    results = lm_eval.simple_evaluate(hflm, tasks=task_names, num_fewshot=num_fewshot, batch_size=batch_size)[
        'results'
    ]

    print(results)

    metric_vals = {task: round(result.get(TASK_METRIC_MAP[task]), 4) for task, result in results.items()}
    acc_avg = calculate_avg_accuracy(task_names, results)
    metric_vals['average'] = round(acc_avg, 4)

    print(metric_vals)
    print('acc_avg: ', acc_avg)

    # Hand the metrics back so callers can log or compare them, not just read stdout.
    return metric_vals

def evaluate_few_shot_mmlu(model, tokenizer, batch_size=64, num_fewshot=5):
    """Run the ``mmlu`` lm_eval task on a model and report its accuracy.

    The model is wrapped in ``HFLM``, evaluated with
    ``lm_eval.simple_evaluate`` and both the raw results and the
    ``"acc,none"`` value of the ``"mmlu"`` entry are printed.

    Args:
        model: Passed to ``HFLM`` as ``pretrained``.
        tokenizer: Passed to ``HFLM`` as ``tokenizer``.
        batch_size: Forwarded to both ``HFLM`` and ``simple_evaluate``.
        num_fewshot: Forwarded to ``simple_evaluate``.

    Returns:
        The value stored under ``results["mmlu"]["acc,none"]``.

    Raises:
        KeyError: If the results lack an ``"mmlu"`` entry or its
            ``"acc,none"`` metric.
    """
    hflm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=batch_size)

    task_names = ["mmlu"]

    results = lm_eval.simple_evaluate(hflm, tasks=task_names, num_fewshot=num_fewshot, batch_size=batch_size)[
        'results'
    ]

    print(results)
    mmlu_acc = results["mmlu"]["acc,none"]
    print('mmlu acc,none: ', mmlu_acc)

    # Return the score so callers can use it programmatically, not just read stdout.
    return mmlu_acc


