#!/usr/bin/env python
"""
Script to batch-add multiple evaluation results to the database.

This enables adding a full set of benchmark scores for one or more models at once.

NOTE(review): several identifiers in this file are spelled ``relt``
(``EvalRelt``, ``add_eval_relt``, ``batch_add_relts``, ...). They are kept
exactly as found because they are referenced from outside this file; confirm
the spelling against ``database.models`` and any callers before renaming.
"""

import argparse
import csv
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

from database.models import EvalSetting, EvalRelt
from database.utils import session_scope, get_or_add_model_by_name

# Keys every input record (CSV row or JSON object) must provide.
_REQUIRED_FIELDS = ("model_name_or_id", "benchmark", "score")


def _has_required_fields(record: Any) -> bool:
    """Return True if *record* is a dict with a usable value for every required key."""
    if not isinstance(record, dict):
        return False
    for key in _REQUIRED_FIELDS:
        value = record.get(key)
        # A score of 0 is valid, so only None / empty string count as missing.
        if value is None or value == "":
            return False
    return True


def _default_eval_params(benchmark_name: str) -> Dict[str, Any]:
    """Derive the default evaluation-setting parameters from a benchmark name."""
    # Text before the first "_" is the benchmark type; everything after it is
    # the metric. Names without "_" fall back to the "accuracy" metric.
    benchmark_type, separator, remainder = benchmark_name.partition("_")
    return {
        "benchmark_type": benchmark_type,
        "metric": remainder if separator else "accuracy",
        "version": "1.0",
    }


def _resolve_model_id(model_name_or_id: str) -> str:
    """Return *model_name_or_id* if it parses as a UUID, else look it up by name."""
    try:
        uuid.UUID(model_name_or_id)
    except ValueError:
        # Not a UUID, so treat it as a model name and get or register it.
        return get_or_add_model_by_name(model_name_or_id)
    return model_name_or_id


def add_eval_setting(
    name: str,
    parameters: Dict[str, Any],
    eval_version_hash: Optional[str] = None
) -> str:
    """
    Add a new evaluation setting to the database or get an existing one.

    Args:
        name: Name of the evaluation setting
        parameters: Dictionary of parameters for the evaluation
        eval_version_hash: Optional hash of the evaluation code version;
            stored as an empty string when not given

    Returns:
        UUID string of the created or existing evaluation setting
    """
    with session_scope() as session:
        # Settings are looked up by name only; an existing one is reused as-is
        # (its stored parameters are not compared with *parameters*).
        existing_setting = session.query(EvalSetting).filter(EvalSetting.name == name).first()
        if existing_setting:
            logging.debug(f"Using existing evaluation setting: {name} (ID: {existing_setting.id})")
            return str(existing_setting.id)

        # Determine display order for the new setting
        display_order = EvalSetting.determine_display_order(session, name)

        new_setting = EvalSetting(
            id=uuid.uuid4(),
            name=name,
            parameters=parameters,
            eval_version_hash=eval_version_hash or "",
            display_order=display_order
        )
        session.add(new_setting)
        session.commit()

        logging.info(f"Created new evaluation setting: {name} (ID: {new_setting.id})")
        return str(new_setting.id)


def add_eval_relt(
    model_id: str,
    eval_setting_id: str,
    score: float,
    dataset_id: Optional[str] = None,
    created_by: str = "script",
    completions_location: str = "N/A"
) -> str:
    """
    Add a new evaluation result (``EvalRelt`` row) to the database.

    Args:
        model_id: UUID string of the model
        eval_setting_id: UUID string of the evaluation setting
        score: Numeric score for the evaluation
        dataset_id: Optional UUID string of the dataset used for evaluation
        created_by: Name of the creator of this result
        completions_location: Location where completions data is stored

    Returns:
        UUID string of the created evaluation result

    Raises:
        ValueError: If one of the ID strings is not a valid UUID
    """
    with session_scope() as session:
        relt_id = uuid.uuid4()
        new_relt = EvalRelt(
            id=relt_id,
            model_id=uuid.UUID(model_id),
            eval_setting_id=uuid.UUID(eval_setting_id),
            score=score,
            # Any falsy dataset_id (None or "") is stored as NULL
            dataset_id=uuid.UUID(dataset_id) if dataset_id else None,
            created_by=created_by,
            creation_time=datetime.now(timezone.utc),
            creation_location="batch_script",
            completions_location=completions_location
        )
        session.add(new_relt)
        session.commit()

        logging.debug(f"Added evaluation relt: {score:.4f} for model {model_id}, eval setting {eval_setting_id}")
        return str(relt_id)


def batch_add_relts(
    model_benchmarks: List[Tuple[str, str, float, Optional[str]]],
    created_by: str = "batch_script",
    completions_location: str = "N/A",
    overwrite: bool = False
) -> List[str]:
    """
    Add multiple evaluation results in a batch.

    Args:
        model_benchmarks: List of tuples containing
            (model_id, benchmark_name, score, dataset_id)
        created_by: Name of the creator of these results
        completions_location: Location where completions data is stored
        overwrite: If True, delete existing results for the same
            model/benchmark combination before each insert

    Returns:
        List of created result IDs, in input order
    """
    # Cache of benchmark name -> evaluation setting ID to avoid repeated lookups
    eval_setting_cache: Dict[str, str] = {}
    relt_ids: List[str] = []

    for model_id, benchmark_name, score, dataset_id in model_benchmarks:
        # Get or create the evaluation setting
        eval_setting_id = eval_setting_cache.get(benchmark_name)
        if eval_setting_id is None:
            eval_setting_id = add_eval_setting(
                name=benchmark_name,
                parameters=_default_eval_params(benchmark_name)
            )
            eval_setting_cache[benchmark_name] = eval_setting_id

        # Handle the overwrite option by deleting existing results first
        if overwrite:
            with session_scope() as session:
                existing_relts = session.query(EvalRelt).filter(
                    EvalRelt.model_id == uuid.UUID(model_id),
                    EvalRelt.eval_setting_id == uuid.UUID(eval_setting_id)
                ).all()

                if existing_relts:
                    for existing in existing_relts:
                        session.delete(existing)
                    session.commit()
                    logging.info(f"Deleted {len(existing_relts)} existing relts for model {model_id}, benchmark {benchmark_name}")

        # Add the new evaluation result
        relt_id = add_eval_relt(
            model_id=model_id,
            eval_setting_id=eval_setting_id,
            score=score,
            dataset_id=dataset_id,
            created_by=created_by,
            completions_location=completions_location
        )
        relt_ids.append(relt_id)

    return relt_ids


def load_relts_from_csv(csv_file: str) -> List[Dict[str, Any]]:
    """
    Load evaluation results from a CSV file.

    Expected CSV format (header row required):
        model_name_or_id,benchmark,score,dataset_id(optional)

    Rows that lack a value for any required column are skipped with a warning.

    Args:
        csv_file: Path to the CSV file

    Returns:
        List of dictionaries with model_name_or_id, benchmark, score, and
        dataset_id (None when the column is absent or blank)

    Raises:
        ValueError: If a non-empty score cannot be converted to float
    """
    relts = []

    # newline="" is what the csv module expects so it can handle line endings
    # (including newlines embedded in quoted fields) itself.
    with open(csv_file, 'r', newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # DictReader fills short rows with None, so checking key presence
            # alone is not enough; the values must be checked as well.
            if not _has_required_fields(row):
                logging.warning(f"Skipping row due to missing required fields: {row}")
                continue

            relts.append({
                'model_name_or_id': row['model_name_or_id'],
                'benchmark': row['benchmark'],
                'score': float(row['score']),
                # Normalise a blank optional column to None
                'dataset_id': row.get('dataset_id') or None
            })

    return relts


def load_relts_from_json(json_file: str) -> List[Dict[str, Any]]:
    """
    Load evaluation results from a JSON file.

    Expected JSON format:
    [
        {
            "model_name_or_id": "model_name_or_uuid",
            "benchmark": "benchmark_name",
            "score": 0.75,
            "dataset_id": "dataset_uuid" (optional)
        },
        ...
    ]

    Entries that are not objects, or that lack a value for any required key,
    are skipped with a warning.

    Args:
        json_file: Path to the JSON file

    Returns:
        List of dictionaries with model_name_or_id, benchmark, score, and
        (when present in the input) dataset_id

    Raises:
        ValueError: If a score cannot be converted to float
    """
    with open(json_file, 'r') as f:
        relts = json.load(f)

    # Validate required fields
    valid_relts = []
    for relt in relts:
        if not _has_required_fields(relt):
            logging.warning(f"Skipping relt due to missing required fields: {relt}")
            continue

        # Ensure score is a float
        relt['score'] = float(relt['score'])
        valid_relts.append(relt)

    return valid_relts


def main():
    """
    Process command-line arguments and add evaluation results to the database.
    """
    parser = argparse.ArgumentParser(description="Batch add evaluation relts to the database")
    parser.add_argument("--csv", help="Path to CSV file with evaluation relts")
    parser.add_argument("--json", help="Path to JSON file with evaluation relts")
    parser.add_argument("--created-by", default="batch_script", help="Creator identifier")
    parser.add_argument("--completions-location", default="N/A", help="Location of completions data")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing relts for the same model/benchmark")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    if not args.csv and not args.json:
        parser.error("Either --csv or --json must be provided")

    # Load results from either CSV or JSON; --csv takes precedence if both are given
    if args.csv:
        relts = load_relts_from_csv(args.csv)
        logging.info(f"Loaded {len(relts)} evaluation relts from CSV")
    else:
        relts = load_relts_from_json(args.json)
        logging.info(f"Loaded {len(relts)} evaluation relts from JSON")

    # Resolve each record's model to an ID and build the batch input tuples
    model_benchmarks = [
        (
            _resolve_model_id(relt['model_name_or_id']),
            relt['benchmark'],
            relt['score'],
            relt.get('dataset_id')
        )
        for relt in relts
    ]

    # Add results in batch
    relt_ids = batch_add_relts(
        model_benchmarks=model_benchmarks,
        created_by=args.created_by,
        completions_location=args.completions_location,
        overwrite=args.overwrite
    )

    logging.info(f"Successfully added {len(relt_ids)} evaluation relts to the database")


if __name__ == "__main__":
    main()