"""Export per-model benchmark scores from the evaluation database to a CSV file."""

import argparse
import logging
from pathlib import Path

import pandas as pd
from sqlalchemy import or_
from tqdm import tqdm

from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope


def get_scores_all_benchmarks(model_id, one_score_per_setting=True):
    """Collect every stored score for a model, grouped by eval-setting name.

    Args:
        model_id: Value matched against ``EvalRelt.model_id``.
        one_score_per_setting: When true, only the first score seen for a
            setting name is kept and a warning is logged for each later
            duplicate. When false, all scores for a name are accumulated.

    Returns:
        A dict mapping ``EvalSetting.name`` to a list of scores.
    """
    all_scores = {}
    with session_scope() as session:
        model_scores = session.query(EvalRelt).filter_by(model_id=model_id).all()

        # Cache settings by id so a repeated eval_setting_id costs one query,
        # not one query per score row.
        settings_by_id = {}
        for score in tqdm(model_scores):
            setting_id = score.eval_setting_id
            if setting_id not in settings_by_id:
                settings_by_id[setting_id] = (
                    session.query(EvalSetting).filter_by(id=setting_id).first()
                )
            setting = settings_by_id[setting_id]
            if setting is None:
                # A result row pointing at a missing setting cannot be named;
                # skip it instead of failing on ``None.name``.
                logging.warning(
                    "No EvalSetting with id %s (model %s); skipping score",
                    setting_id,
                    model_id,
                )
                continue

            name = setting.name
            if name not in all_scores:
                all_scores[name] = [score.to_dict()["score"]]
            elif one_score_per_setting:
                logging.warning("Dupcate setting name: %s", name)
            else:
                all_scores[name].append(score.to_dict()["score"])
    return all_scores


def get_clean_scores(model_id, normaze=True):
    """Select the benchmarks of interest for a model and average them.

    Args:
        model_id: Model whose scores are looked up.
        normaze: When true (and at least one selected benchmark is present),
            rescale the known out-of-range metrics and add an ``"average"``
            entry holding the mean of the first score of each benchmark.

    Returns:
        A dict mapping benchmark name to a list of scores, plus ``"model_id"``
        and, when computed, ``"average"``.
    """
    all_scores = get_scores_all_benchmarks(model_id)
    # Commented-out entries are benchmarks that exist but are currently
    # excluded from the report.
    benchmarks = [
        # "HumanEval_python_pass@1",
        # "MATH500_accuracy",
        "CodeElo_accuracy_avg",
        "CodeForces_accuracy_avg",
        # "GPQADiamond_accuracy_avg",
        # NOTE(review): "veCodeBench" may be a truncated "LiveCodeBench" --
        # confirm against the EvalSetting names in the database.
        "veCodeBench_accuracy_avg",
        # "AIME24_accuracy_avg",
        # "AMC23_accuracy_avg",
        # "JEEBench_accuracy_avg",
        # "MMLUPro_accuracy_avg",
        # "GPQADiamond_accuracy_std_err",
        # "MBPP_pass@1",
    ]
    # Metrics divided by a constant before averaging.
    divisors = {
        "alpaca_eval_length_controlled_winrate": 100,
        "WildBench_score": 10,
        "MTBench_Average": 10,
    }

    clean_scores = {b: all_scores[b] for b in benchmarks if b in all_scores}
    missing_benchmarks = [b for b in benchmarks if b not in all_scores]
    if missing_benchmarks:
        logging.warning(
            "Missing benchmarks: %s for model %s", missing_benchmarks, model_id
        )

    if normaze and clean_scores:
        for name, divisor in divisors.items():
            if name in clean_scores:
                clean_scores[name] = [clean_scores[name][0] / divisor]
        # Computed before "average"/"model_id" are inserted so the divisor
        # counts benchmarks only.
        clean_scores["average"] = sum(
            values[0] for values in clean_scores.values()
        ) / len(clean_scores)

    clean_scores["model_id"] = model_id
    return clean_scores


def scoresearch_string(bstrings, use_or=False, filename=""):
    """Write a CSV of scores for models whose weights location matches.

    Args:
        bstrings: Strings tested with ``Model.weights_location.contains``.
        use_or: When false, a model must contain all of ``bstrings``; when
            true, containing any one of them is enough.
        filename: Base name (without extension) of the CSV written under
            ``csvs/``. Defaults to the ``bstrings`` joined with ``"_"``.
    """
    filter_bstring = [Model.weights_location.contains(s) for s in bstrings]
    with session_scope() as session:
        query = session.query(Model)
        if use_or:
            query = query.filter(or_(*filter_bstring))
        else:
            query = query.filter(*filter_bstring)
        # Copy plain values out while the session is still open.
        model_ids = [(m.id, m.name) for m in query.all()]
    print(len(model_ids), model_ids)

    if not model_ids:
        logging.warning("No models matched %s; no CSV written", bstrings)
        return

    out = {}
    for model_id, model_name in model_ids:
        scores = get_clean_scores(model_id)
        if model_name in out:
            # Models sharing a name are merged into one row.
            out[model_name].update(scores)
        else:
            out[model_name] = scores

    out = pd.DataFrame.from_dict(out, orient="index")
    # "average" is absent when none of the models had any selected benchmark.
    if "average" in out.columns:
        out = out[["average"] + [col for col in out.columns if col != "average"]]

    out_str = filename if filename else "_".join(bstrings)
    output_dir = Path("csvs")
    output_dir.mkdir(parents=True, exist_ok=True)
    out.to_csv(f"{output_dir}/{out_str}.csv")


def main():
    """Generate a CSV from a list of bstrings given on the command line.

    By default the filter takes the intersection of all bstring occurrences,
    e.g. ``scoresearch_string(["hp_ablations", "mistral", "lr"])`` selects the
    rows containing ALL of "hp_ablations", "mistral" and "lr". Pass
    ``--use_or`` to match rows containing any of them instead.

    Run as follows:
        python eval/scripts/get_scores_csv.py --bstrings hp_ablations mistral lr
    """
    parser = argparse.ArgumentParser()
    # Required: without it ``args.bstrings`` is None and iteration fails.
    parser.add_argument("--bstrings", nargs="+", type=str, required=True)
    parser.add_argument("--use_or", action="store_true")
    parser.add_argument("--file_name", default="", type=str)
    args = parser.parse_args()
    scoresearch_string(args.bstrings, args.use_or, args.file_name)


if __name__ == "__main__":
    main()