"""Export benchmark scores of matching models to a CSV file.

Models are selected by substrings of their ``weights_location``; a model is
kept only if it contains ALL of the given substrings (intersection), e.g.
``scoresearch_string(["hp_ablations", "mistral", "lr"])`` selects the rows
containing "hp_ablations" AND "mistral" AND "lr".

Run as follows:
    python eval/scripts/get_scores_csv.py --bstrings hp_ablations mistral lr
"""

import argparse
import logging

import pandas as pd
from tqdm import tqdm

# NOTE(review): `EvalRelt` looks like a truncated `EvalResult` -- confirm
# against database.models before relying on this import.
from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope

# Benchmarks (eval-setting names) that make up the reported average.
_BENCHMARKS = (
    "MATH500_accuracy_avg",
    "AMC23_accuracy_avg",
    "AIME24_accuracy_avg",
    "MMLUPro_accuracy_avg",
    "JEEBench_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "LiveCodeBench_accuracy_avg",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
)

# Settings whose first score is divided by the given factor when normalizing.
_SCALE_DIVISORS = {
    "alpaca_eval_length_controlled_winrate": 100,
    "WildBench_score": 10,
    "MTBench_Average": 10,
}


def get_scores_all_benchmarks(model_id, one_score_per_setting=True):
    """Collect the stored scores of one model, grouped by eval-setting name.

    Args:
        model_id: Value matched against ``EvalRelt.model_id``.
        one_score_per_setting: If true, only the first score seen for a
            setting name is kept and later ones are logged as duplicates.
            If false, every score is appended to the setting's list.

    Returns:
        Dict mapping each setting name to a list of scores.
    """
    all_scores = {}
    with session_scope() as session:
        model_scores = session.query(EvalRelt).filter_by(model_id=model_id).all()
        for score in tqdm(model_scores):
            setting = (
                session.query(EvalSetting).filter_by(id=score.eval_setting_id).first()
            )
            if setting is None:
                # A result pointing at a missing setting cannot be named;
                # skip it instead of failing on `None.name`.
                logging.warning(
                    f"No eval setting with id {score.eval_setting_id} "
                    f"for model {model_id}; skipping score"
                )
                continue
            if setting.name not in all_scores:
                all_scores[setting.name] = [score.to_dict()["score"]]
            elif one_score_per_setting:
                logging.warning(f"Duplicate setting name: {setting.name}")
            else:
                all_scores[setting.name].append(score.to_dict()["score"])
    return all_scores


def get_clean_scores(model_id, normaze=True):
    """Return the tracked benchmark scores of a model plus their average.

    Args:
        model_id: Model whose scores are looked up.
        normaze: If true (and at least one tracked benchmark is present),
            rescale the settings listed in ``_SCALE_DIVISORS`` and add an
            ``"average"`` entry computed from the first score of each
            benchmark.

    Returns:
        Dict mapping benchmark name to its list of scores, with an optional
        float ``"average"`` entry and a ``"model_id"`` entry. Benchmarks with
        no stored score are left out and reported in a warning.
    """
    all_scores = get_scores_all_benchmarks(model_id)

    clean_scores = {name: all_scores[name] for name in _BENCHMARKS if name in all_scores}
    missing_benchmarks = [name for name in _BENCHMARKS if name not in all_scores]
    if missing_benchmarks:
        logging.warning(
            f"Missing benchmarks: {missing_benchmarks} for model {model_id}"
        )

    if normaze and clean_scores:
        # Iterate over a snapshot of the keys because values are reassigned.
        for name in list(clean_scores):
            divisor = _SCALE_DIVISORS.get(name)
            if divisor is not None:
                clean_scores[name] = [clean_scores[name][0] / divisor]
        # Computed before "average"/"model_id" are added, so only benchmark
        # entries take part in the mean.
        clean_scores["average"] = sum(
            value[0] for value in clean_scores.values()
        ) / len(clean_scores)

    clean_scores["model_id"] = model_id
    return clean_scores


def scoresearch_string(bstrings):
    """Write a CSV of clean scores for models matching every given substring.

    Args:
        bstrings: Substrings that must ALL occur in a model's
            ``weights_location``. They are also joined with ``_`` to form the
            output file name ``<joined>.csv`` in the current directory.
    """
    out = {}
    filter_bstring = [Model.weights_location.contains(s) for s in bstrings]
    with session_scope() as session:
        model_instances = session.query(Model).filter(*filter_bstring).all()
        # Copy the needed fields while the session is open.
        model_ids = [(m.id, m.weights_location) for m in model_instances]
    print(len(model_ids), model_ids)

    for model_id, weights_location in model_ids:
        scores = get_clean_scores(model_id)
        print(weights_location)
        print(scores)
        out[weights_location] = scores

    out = pd.DataFrame.from_dict(out, orient="index")
    # "average" is absent when no model matched or no model had any score.
    if "average" in out.columns:
        out = out[["average"] + [col for col in out.columns if col != "average"]]
    out_str = "_".join(bstrings)
    out.to_csv(f"{out_str}.csv")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required: iterating over a missing (None) value would fail later on.
    parser.add_argument("--bstrings", nargs="+", type=str, required=True)
    args = parser.parse_args()
    scoresearch_string(args.bstrings)