"""Export benchmark scores for a set of models to a CSV file.

Models are selected by substrings of ``Model.weights_location``. By default a
model must contain ALL of the given substrings; with ``--use_or`` it only has
to contain one of them.

Example::

    python eval/scripts/get_scores_csv.py --bstrings hp_ablations mistral lr

selects every model whose weights location contains "hp_ablations", "mistral"
and "lr", and writes ``csvs/hp_ablations_mistral_lr.csv``.

NOTE(review): several identifiers in this file (``EvalRelt``, ``normaze``,
``bstrings``, ``scoresearch_string``, ``"veCodeBench_accuracy_avg"``) look
truncated (presumably ``EvalResult``, ``normalize``, ``substrings``,
``LiveCodeBench_...``). They are kept verbatim because they are imported
names, public parameters, CLI flags or database keys -- confirm against the
``database`` package and the stored setting names before renaming anything.
"""

import argparse
import logging
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
from sqlalchemy import or_
from tqdm import tqdm

from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope

# Setting names that are kept in the cleaned output (and averaged).
_BENCHMARKS = (
    # "HumanEval_python_pass@1",
    "MATH500_accuracy",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "veCodeBench_accuracy_avg",  # NOTE(review): presumably "LiveCodeBench_accuracy_avg" -- confirm
    "AIME24_accuracy_avg",
    "AMC23_accuracy_avg",
    "JEEBench_accuracy_avg",
    "MMLUPro_accuracy_avg",
    # "GPQADiamond_accuracy_std_err",
    # "MBPP_pass@1",
)

# Divisor applied to the first score of a setting when normalising.
_SCORE_DIVISORS = {
    "alpaca_eval_length_controlled_winrate": 100,
    "WildBench_score": 10,
    "MTBench_Average": 10,
}

# Directory the CSV files are written to (relative to the working directory).
_OUTPUT_DIR = "csvs"


def _default_worker_count():
    """Return the CPU count minus one (leaving one core free), at least 1."""
    return max(1, multiprocessing.cpu_count() - 1)


def get_scores_all_benchmarks(model_id, one_score_per_setting=True):
    """Collect every stored score of one model, grouped by setting name.

    Args:
        model_id: Value matched against ``EvalRelt.model_id``.
        one_score_per_setting: If true, only the first score found for a
            setting name is kept and each further one is logged as a
            duplicate. If false, all scores are appended to the list.

    Returns:
        A dict mapping setting name to a list of scores.
    """
    all_scores = {}
    with session_scope() as session:
        model_scores = session.query(EvalRelt).filter_by(model_id=model_id).all()
        for score in tqdm(model_scores):
            setting = session.query(EvalSetting).filter_by(id=score.eval_setting_id).first()
            if setting is None:
                # A result row pointing at a missing setting cannot be named; skip it
                # instead of failing the whole export with an AttributeError.
                logging.warning(
                    "No eval setting with id %s (model %s); skipping score",
                    score.eval_setting_id,
                    model_id,
                )
                continue
            if setting.name not in all_scores:
                all_scores[setting.name] = [score.to_dict()["score"]]
            elif one_score_per_setting:
                logging.warning("Duplicate setting name: %s", setting.name)
            else:
                all_scores[setting.name].append(score.to_dict()["score"])
    return all_scores


def get_clean_scores(model_id, normaze=True):
    """Return the scores of one model restricted to the tracked benchmarks.

    Args:
        model_id: Model whose scores are looked up.
        normaze: If true (and at least one benchmark was found), settings
            listed in ``_SCORE_DIVISORS`` are rescaled and an ``"average"``
            entry is added.

    Returns:
        A dict mapping benchmark name to a one-element score list, plus
        ``"model_id"`` and, when computed, ``"average"`` (a float: the mean
        of the first score of every kept benchmark).
    """
    all_scores = get_scores_all_benchmarks(model_id)

    clean_scores = {name: all_scores[name] for name in _BENCHMARKS if name in all_scores}
    missing_benchmarks = [name for name in _BENCHMARKS if name not in all_scores]
    if missing_benchmarks:
        logging.warning("Missing benchmarks: %s for model %s", missing_benchmarks, model_id)

    # The non-empty check also protects the division below.
    if normaze and clean_scores:
        for name in clean_scores:
            divisor = _SCORE_DIVISORS.get(name)
            if divisor is not None:
                clean_scores[name] = [clean_scores[name][0] / divisor]
        # Computed before "average"/"model_id" are inserted, so only benchmark
        # entries take part in the mean.
        clean_scores["average"] = sum(value[0] for value in clean_scores.values()) / len(clean_scores)

    clean_scores["model_id"] = model_id
    return clean_scores


# Kept at module level so ProcessPoolExecutor can pickle it for worker processes.
def process_model(model_id_and_name):
    """Fetch the cleaned scores of a single model.

    Args:
        model_id_and_name: ``(model_id, label)`` pair; the label is returned
            unchanged so the caller can key its results by it.

    Returns:
        ``(label, scores)`` where ``scores`` is the dict produced by
        :func:`get_clean_scores`.
    """
    model_id, label = model_id_and_name
    scores = get_clean_scores(model_id)
    scores["model_id"] = model_id
    return label, scores


def scoresearch_string(bstrings, use_or=False, filename="", num_workers=None):
    """Write the scores of all models matching ``bstrings`` to a CSV file.

    Args:
        bstrings: Substrings looked for in ``Model.weights_location``.
        use_or: If false, a model must contain every substring; if true,
            containing any one of them is enough.
        filename: Base name of the CSV file; defaults to the substrings
            joined with ``"_"``.
        num_workers: Number of worker processes; defaults to the CPU count
            minus one (at least 1).

    The file is written to ``csvs/<name>.csv`` with one row per model name.
    """
    with session_scope() as session:
        conditions = [Model.weights_location.contains(s) for s in bstrings]
        query = session.query(Model)
        if use_or:
            query = query.filter(or_(*conditions))
        else:
            query = query.filter(*conditions)
        # Copy plain values out so nothing session-bound crosses into the workers.
        model_ids = [(model.id, model.name) for model in query.all()]

    print(f"Found {len(model_ids)} models matching the criteria")

    if not num_workers:
        num_workers = _default_worker_count()

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = list(
            tqdm(
                executor.map(process_model, model_ids),
                total=len(model_ids),
                desc="Processing models",
            )
        )

    # Models sharing a name are merged into a single row.
    out = {}
    for label, scores in results:
        if label in out:
            out[label].update(scores)
        else:
            out[label] = scores

    frame = pd.DataFrame.from_dict(out, orient="index")
    # "average" is absent when nothing matched or no benchmark was found.
    if "average" in frame.columns:
        frame = frame[["average"] + [col for col in frame.columns if col != "average"]]

    out_str = filename or "_".join(bstrings)
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    frame.to_csv(f"{_OUTPUT_DIR}/{out_str}.csv")
    print(f"Results saved to {_OUTPUT_DIR}/{out_str}.csv")


if __name__ == "__main__":
    # Generates a CSV from a list of substrings; see the module docstring for
    # the matching rules and an example invocation.
    parser = argparse.ArgumentParser()
    parser.add_argument("--bstrings", nargs="+", type=str, required=True)
    parser.add_argument("--use_or", action="store_true")
    parser.add_argument("--file_name", default="", type=str)
    parser.add_argument(
        "--num_workers", type=int, default=None, help="Number of worker processes to use (default: CPU count - 1)"
    )
    args = parser.parse_args()

    # Set number of workers if specified
    num_cores = args.num_workers if args.num_workers else _default_worker_count()
    print(f"Using {num_cores} worker processes")

    scoresearch_string(args.bstrings, args.use_or, args.file_name, num_workers=num_cores)