"""Print summary statistics of evaluation scores per model and benchmark.

For every entry in ``models`` the scores are fetched with
``get_scores_all_benchmarks`` and, for every name in ``benchmarks``, the
individual scores, their mean, standard deviation, range and the number of
outliers are printed to stdout.
"""

from typing import NamedTuple

import numpy as np

# Display name -> identifier passed to ``get_scores_all_benchmarks``.
models = {
    "dcft-v3.1": "3b5b1b2e-e511-4b87-ac73-e562e56b2e59",
    "dcft-v1.2": "12d8042d-0ebc-44d8-9f04-630eee68c1ce",
    "gemma2-9b-it": "a7c2132c-c360-4cd9-bb09-4f6a7681ca7b",
    "teknium-mistral-2.5": "c71ac955-3a9a-48cb-b9d0-459e97e7fee5",
    "mistral-instruct-v0.3-7b": "ea55020a-157b-42b3-acd1-55626bb85c9b",
    "llama-3.1-instruct-8b": "39092328-fa6f-4ef2-914d-ad42b771679c",
}

# Keys looked up in the object returned by ``get_scores_all_benchmarks``.
benchmarks = [
    "alpaca_eval_length_controlled_winrate",
    "WildBench_score",
    "MixEval_gpt-4o-mini-2024-07-18/metrics/overall",
    "MTBench_Average",
    "IFEval_instruction-level",
    "MBPP_pass@1",
    "HumanEval_python_pass@1",
    "mmlu_acc,none",
    "arc_challenge_acc,none",
    "drop_f1,none",
]

# A score counts as an outlier when it lies more than this many standard
# deviations away from the mean.
OUTLIER_Z_THRESHOLD = 2


class ScoreSummary(NamedTuple):
    """Summary statistics for one list of scores."""

    mean: float
    std_dev: float
    min_val: float
    max_val: float
    num_outliers: int


def summarize_scores(scores, z_threshold=OUTLIER_Z_THRESHOLD):
    """Compute mean, standard deviation, range and outlier count of *scores*.

    Args:
        scores: Non-empty sequence of numeric scores.
        z_threshold: A score is an outlier when its absolute z-score is
            strictly greater than this value.

    Returns:
        A ``ScoreSummary``. ``std_dev`` is ``np.std`` with its default
        arguments (population standard deviation).

    Raises:
        ValueError: If *scores* is empty.
    """
    values = np.asarray(scores)
    if values.size == 0:
        raise ValueError("cannot summarize an empty list of scores")

    mean = np.mean(values)
    std_dev = np.std(values)

    if std_dev == 0:
        # All scores are identical (or there is only one): no score deviates
        # from the mean, and dividing by zero would only yield NaN plus a
        # RuntimeWarning.
        num_outliers = 0
    else:
        z_scores = np.abs((values - mean) / std_dev)
        num_outliers = int(np.count_nonzero(z_scores > z_threshold))

    return ScoreSummary(mean, std_dev, np.min(values), np.max(values), num_outliers)


def print_report(model_name, benchmark_name, scores):
    """Print the statistics block for one model/benchmark pair to stdout."""
    summary = summarize_scores(scores)

    print(model_name, benchmark_name)
    print([round(num, 3) for num in scores])
    print("Mean: ", round(summary.mean, 3))
    print("Std: ", round(summary.std_dev, 3))
    print(f"Range: [{round(summary.min_val, 3)} {round(summary.max_val, 3)}]")
    print("Outers: ", summary.num_outliers)
    print("=" * 20)


def main():
    """Fetch the scores of every configured model and print all reports."""
    # Imported here rather than at module level so that importing this module
    # (e.g. to reuse ``summarize_scores``) does not require the ``eval``
    # package to be importable.
    from eval.scripts.utils import get_scores_all_benchmarks

    for model_name, model_id in models.items():
        scores = get_scores_all_benchmarks(model_id)
        for benchmark_name in benchmarks:
            print_report(model_name, benchmark_name, scores[benchmark_name])


if __name__ == "__main__":
    main()