import subprocess
import multiprocessing
from itertools import product

# Benchmark dataset names mapped to a previously recorded result for each.
# main() reads only the keys (each becomes a --dataset value, in this order);
# the numeric values are not read by any code visible in this script.
# NOTE(review): the name suggests these are scores from an "iterative" method,
# and None presumably marks datasets with no recorded result — confirm.
iterative_res = {
    "clutrr": 0.585,
    "clevr": 0.75,
    "leaf": 0.418,
    "omnimath-1": 0.61,
    "bbeh_word_sorting": 0.49,
    "bbeh_dyck_languages": 0.075,
    "bbeh_object_counting": 0.375,
    "bbeh_object_properties": 0.165,
    "bbeh_boardgame_qa": 0.375,
    "bbeh_boolean_expressions": 0.485,
    "bbeh_zebra_puzzle": 0.215,
    "bbeh_buggy_tables": 0.245,
    "bbeh_spatial_reasoning": 0.345,
    "bbeh_multistep_arithmetic": 0.06,
    "bbeh_geometric_shapes": 0.295,
    "bbeh_shuffled_objects": 0.0,
    "bbeh_temporal_sequence": None,
    "bbeh_disambiguation_qa": 0.433,
    "bbeh_causal_understanding": 0.5,
    "bbeh_time_arithmetic": None,
    "bbeh_web_of_lies": 0.145,
    "bbeh_sarc_triples": 0.305,
    "bbeh_nycc": 0.11,
    "bbeh_sport_qa": 0.225,
    "bbeh_linguini": 0.105,
    "bbeh_movie_recommendation": 0.435
}

def run_evaluation(args):
    """Run scripts/code_eval.py for one (method, dataset) pair.

    The command is handed to the OS as an argument list instead of a shell
    string, so the method and dataset names are forwarded verbatim and are
    never re-parsed by a shell.

    Args:
        args: A ``(method, dataset)`` pair, packed into a single argument so
            the function can be passed directly to ``Pool.map``.

    Returns:
        The exit status of the evaluation process, or 127 if the ``python``
        executable could not be started. Failures are reported on stdout
        rather than raised, so a failing run does not abort the caller.
    """
    method, dataset = args
    cmd = [
        "python",
        "scripts/code_eval.py",
        "--model",
        "gemini-2.0-flash",
        "--method",
        str(method),
        "--dataset",
        str(dataset),
    ]
    # Joined only for display; the list itself is what gets executed.
    label = " ".join(cmd)
    print(f"Running: {label}")
    try:
        returncode = subprocess.run(cmd, check=False).returncode
    except OSError as exc:
        # The interpreter could not be launched at all (e.g. not on PATH).
        print(f"Failed to start: {label} ({exc})")
        return 127
    if returncode != 0:
        print(f"Failed (exit code {returncode}): {label}")
    return returncode

def main():
    """Evaluate every method on every dataset using a pool of 8 workers.

    Jobs are ordered method-first: every dataset for "code", then every
    dataset for "gen_sym_reason_prog", with datasets taken in the key order
    of ``iterative_res``.
    """
    worker_count = 8

    # Outer loop walks the methods, inner loop walks the dataset names, which
    # yields the same method-major ordering as a cartesian product.
    jobs = [
        (method, dataset)
        for method in ("code", "gen_sym_reason_prog")
        for dataset in iterative_res
    ]

    # map() blocks until every job has been processed by a worker.
    with multiprocessing.Pool(worker_count) as pool:
        pool.map(run_evaluation, jobs)

# Start the sweep only when this file is executed as a script. The guard is
# also what keeps multiprocessing workers that re-import this module (under
# the "spawn" start method) from launching pools of their own.
if __name__ == "__main__":
    main() 