from experiment import HardLevelExperiment
from eval import eval_file
from utils.utils import extract_hard_level_file


def extract_hard_level(model_name, dataset_list, method_list, level='Level 5'):
    """
    Extract the problems of one difficulty level from each results file.

    For every (method, dataset) pair this builds the path
    ``results/{model_name}_{dataset}_{method}.jsonl`` and hands it, together
    with ``level``, to ``extract_hard_level_file``.

    Args:
        model_name: model identifier used as the results file-name prefix.
        dataset_list: dataset names to iterate over.
        method_list: method names to iterate over.
        level: difficulty label forwarded to ``extract_hard_level_file``
            (defaults to ``'Level 5'``).
    """
    for method in method_list:
        for dataset in dataset_list:
            results_path = f'results/{model_name}_{dataset}_{method}.jsonl'
            extract_hard_level_file(results_path, level)
            # Report the level that was actually requested; the message used
            # to say "level 5" regardless of the ``level`` argument.
            print(f"Extracted {level} problems from {results_path}")


if __name__ == '__main__':
    # Run a HardLevelExperiment for every dataset at each listed difficulty
    # level, evaluate each stored result file, and print the accuracy
    # accumulated so far after each dataset.

    dataset_list = [
        'algebra',
        'counting & probability',
        'geometry',
        'number theory',
        'intermediate algebra',
        'precalculus',
        'prealgebra',
    ]
    # For a partial run, narrow dataset_list to a subset, e.g.
    # dataset_list = ['counting & probability', 'geometry']

    # Optional extraction step used with earlier runs:
    # method_list = ['cot', 'pal', 'codenl', 'nlcode']
    # model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
    # extract_hard_level(model_name, dataset_list, method_list)

    # Alternative model: model_name = 'gpt-4o-mini'
    model_name = 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
    verbose = False
    is_train = False

    # Other methods tried previously: 'metamath', 'pal'.
    method = 'majorvote'
    hard_levels = ['Level 4', 'Level 3', 'Level 2', 'Level 1']

    # Both path components are loop-invariant, so compute them once.
    # NOTE(review): a model_name containing '/' nests the output under a
    # sub-directory of new_results/ -- confirm that directory exists or is
    # created by HardLevelExperiment.
    results_dir = "new_results/gpt" if 'gpt' in model_name else "new_results"
    suffix = "_greedy_train.jsonl" if is_train else "_greedy.jsonl"

    scores = []
    for dataset in dataset_list:
        for hard_level in hard_levels:
            print(f"Running experiment on {dataset} with hard level {hard_level} for method {method}")
            store_path = f"{results_dir}/{model_name}_{dataset}_{hard_level}_{method}{suffix}"
            exp = HardLevelExperiment(model_name, dataset, method, store_path, hard_level=hard_level, verbose=verbose)
            exp.run()
            scores += eval_file(store_path)

        # `scores` is never reset, so this is a running average over every
        # dataset processed so far, not a per-dataset figure.
        if scores:
            accuracy = sum(scores) / len(scores)
            print(f"Average accuracy for model {model_name} of method {method} is {accuracy}")
        else:
            # Avoid ZeroDivisionError when evaluation has produced no scores.
            print(f"No scores collected yet for model {model_name} of method {method}")