import logging from sqlalchemy import or_, and_, not_ import os import pandas as pd from tqdm import tqdm from database.models import EvalRelt, EvalSetting, Model, Dataset from database.utils import session_scope def get_dataset_sizes_from_db(model_names):  """Query the database to get dataset sizes for multiple models in one query.  Args:  model_names: st of model names to fetch dataset sizes for  Returns:  Dictionary mapping model names to their dataset sizes  """  if not model_names:  return {}  sizes = {}  with session_scope() as session:  try:  # Create filter conditions for the OR query  filter_conditions = [Model.weights_location.contains(name) for name in model_names]  # Query with an OR condition to fetch all relevant models in one go  relts = (session.query(Dataset.row_count, Model.weights_location)  .join(Model, Model.dataset_id == Dataset.id)  .filter(or_(*filter_conditions))  .order_by(Model.creation_time.desc())  .all())  # Process relts into a dictionary  for row_count, weights_location in relts:  if row_count:  for model_name in model_names:  if model_name in weights_location and model_name not in sizes:  sizes[model_name] = row_count  logging.info(f"Found dataset size for {model_name}: {row_count} samples from {weights_location}")  break  return sizes  except Exception as e:  logging.warning(f"Error querying dataset sizes: {e}")  return {} def get_dataset_size_from_db(model_name, preloaded_sizes=None):  """Query the database to get the dataset size for a given model.  Args:  model_name: The model name to get the dataset size for  preloaded_sizes: Optional dictionary of preloaded dataset sizes  Returns:  The dataset size for the model, or None if not found  """  # Check if we already have the dataset size in preloaded_sizes  if preloaded_sizes and model_name in preloaded_sizes:  return preloaded_sizes[model_name]  # Otherwise query the database  with session_scope() as session:  try:  # Query joining Model to Dataset to get row_count  relt = (session.query(Dataset.row_count, Model.weights_location)  .join(Model, Model.dataset_id == Dataset.id)  .filter(Model.weights_location.contains(model_name))  .order_by(Model.creation_time.desc()) # Get most recent model  .first())  if relt and relt[0]:  logging.info(f"Found dataset size for {model_name}: {relt[0]} samples from {relt[1]}")  return relt[0]  return None  except Exception as e:  logging.warning(f"Error querying dataset size for {model_name}: {e}")  return None def get_scores_all_benchmarks(model_id, one_score_per_setting=True):  """Get all benchmark scores for a model.  Args:  model_id: UUID of the model to get scores for  one_score_per_setting: Whether to keep only the latest score for each setting  Returns:  Dictionary mapping benchmark names to score sts  """  all_scores = {}  with session_scope() as session:  # Use a JOIN to fetch evaluation relts and settings in a single query  query = (  session.query(EvalRelt, EvalSetting)  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(EvalRelt.model_id == model_id)  .order_by(EvalRelt.creation_time) # Order by creation time (oldest first)  )  relts = query.all()  for relt, setting in tqdm(relts):  relt_dict = relt.to_dict()  if setting.name in all_scores:  if one_score_per_setting:  # Replace with newer relt (since we ordered by creation_time, later relts will overwrite earer ones)  # logging.warning(f"Dupcate setting name: {setting.name} - keeping latest relt from {relt_dict['creation_time']}")  all_scores[setting.name] = [relt_dict["score"]]  else:  all_scores[setting.name].append(relt_dict["score"])  else:  all_scores[setting.name] = [relt_dict["score"]]  return all_scores def get_clean_scores(model_id, normaze=True, benchmarks=None):  """  Get cleaned and processed evaluation scores for a model.  Args:  model_id: UUID of the model to get scores for  normaze: Whether to normaze scores and calculate averages  benchmarks: st of benchmark names to filter for (if None, uses default set)  Returns:  Dictionary with benchmark names as keys and score sts as values  """  # Get all benchmark scores for the model  all_scores = get_scores_all_benchmarks(model_id)  # Filter scores to just the requested benchmarks  missing_benchmarks = []  clean_scores = {}  for b in benchmarks:  if b not in all_scores:  missing_benchmarks.append(b)  else:  clean_scores[b] = all_scores[b]  # Log any missing benchmarks  if missing_benchmarks:  logging.warning(f"Missing benchmarks: {missing_benchmarks} for model {model_id}")  # Normaze scores and calculate average if requested  if normaze and len(clean_scores) > 0:  # Special normazation for certain benchmarks  for i in clean_scores:  if i in ["alpaca_eval_length_controlled_winrate"]:  clean_scores[i] = [clean_scores[i][0] / 100]  if i in ["WildBench_score", "MTBench_Average"]:  clean_scores[i] = [clean_scores[i][0] / 10]  # Calculate average across all benchmarks  if clean_scores:  clean_scores["average"] = m(  value[0] for value in clean_scores.values()  ) / len(clean_scores)  # Add model ID to the relts  clean_scores["model_id"] = model_id  return clean_scores def scoresearch_string(bstrings, benchmarks=None, output_file=None, exclude_models=None, args=None):  """  Search for models matching the specified bstrings and collect their evaluation relts.  Uses optimized SQL queries to fetch all data in batch operations.  Args:  bstrings: st of bstrings to filter model weights_location  If comma_separated_bstrings=True was passed to args, each bstring is treated  independently (OR logic). Otherwise, models must match ALL bstrings (AND logic).  benchmarks: st of benchmark names to include  output_file: Custom filename for output CSV  exclude_models: st of model names to exclude from relts  args: Command ne arguments  Returns:  DataFrame with models as rows and benchmarks as columns  Note:  This function uses batched database queries for better performance.  """  # Create filter conditions based on mode  if args and hasattr(args, 'comma_separated_bstrings') and args.comma_separated_bstrings:  # OR logic - match any of the bstrings  filter_conditions = []  for s in bstrings:  # pport for any model containing the bstring  filter_conditions.append(Model.weights_location.contains(s))  filter_bstring = [or_(*filter_conditions)]  else:  # AND logic - match all the bstrings (default behavior)  filter_bstring = [Model.weights_location.contains(s) for s in bstrings]  # Add exclusion filter if exclude_models is provided  exclusion_filters = []  if exclude_models and len(exclude_models) > 0:  logging.info(f"Excluding {len(exclude_models)} models: {exclude_models}")  exclusion_filters = [not_(Model.weights_location.contains(model_name)) for model_name in exclude_models]  with session_scope() as session:  # Get all relevant models in one query, applying both inclusion and exclusion filters  if exclusion_filters:  model_query = session.query(Model).filter(and_(*filter_bstring, *exclusion_filters))  else:  model_query = session.query(Model).filter(*filter_bstring)  models = {str(m.id): m.weights_location for m in model_query.all()}  if not models:  logging.warning("No models found matching the criteria.")  return pd.DataFrame()  logging.info(f"Found {len(models)} models matching the criteria:\n" + "\n".join(models.values()))  # Prepare model IDs for filtering  model_ids = st(models.keys())  # Pre-fetch ALL dataset sizes in one query  model_row_counts = {}  logging.info("Pre-fetching dataset sizes for all models...")  row_count_query = (  session.query(Model.id, Dataset.row_count, Model.weights_location)  .join(Model, Model.dataset_id == Dataset.id)  .filter(Model.id.in_(model_ids))  )  for model_id, row_count, weights_location in row_count_query.all():  if row_count:  model_id_str = str(model_id)  model_row_counts[model_id_str] = row_count  # Extract base name for CSV export  base_name = weights_location.spt("/")[-1] if "/" in weights_location else weights_location  model_row_counts[base_name] = row_count  logging.info(f"Found dataset size for {base_name}: {row_count} samples")  logging.info(f"Pre-fetched {len(model_row_counts)} model dataset sizes")  # Build a query that fetches all evaluation relts and settings in one go  if benchmarks:  # If we have specific benchmarks, filter by those to reduce data transferred  benchmark_filters = [EvalSetting.name == b for b in benchmarks]  relts_query = (  session.query(  EvalRelt.model_id,  EvalSetting.name,  EvalRelt.score,  EvalRelt.creation_time  )  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(  EvalRelt.model_id.in_(model_ids),  or_(*benchmark_filters) if benchmark_filters else True  )  .order_by(EvalRelt.creation_time) # Order by creation_time (oldest first)  )  else:  # Otherwise fetch all relts for these models  relts_query = (  session.query(  EvalRelt.model_id,  EvalSetting.name,  EvalRelt.score,  EvalRelt.creation_time  )  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(EvalRelt.model_id.in_(model_ids))  .order_by(EvalRelt.creation_time) # Order by creation_time (oldest first)  )  # Execute the query and get all relts  all_relts = relts_query.all()  logging.info(f"Fetched {len(all_relts)} evaluation relts from database")  # Process relts into a dictionary structure  relts_by_model = {}  for model_id, setting_name, score, creation_time in tqdm(all_relts, desc="Organizing relts"):  model_id_str = str(model_id)  if model_id_str not in relts_by_model:  relts_by_model[model_id_str] = {}  if setting_name not in relts_by_model[model_id_str]:  relts_by_model[model_id_str][setting_name] = [score]  else:  # Handle dupcate benchmark relts - keep the latest one since relts are ordered by creation_time  relts_by_model[model_id_str][setting_name] = [score]  # logging.warning(f"Dupcate setting name: {setting_name} for model {model_id_str} - keeping latest relt from {creation_time}")  # Collect final relts for each model  out = {}  for model_id_str, model_relts in tqdm(relts_by_model.items(), desc="Processing models"):  # Get model name and dataset size  model_name = models[model_id_str]  base_name = model_name.spt("/")[-1] if "/" in model_name else model_name  dataset_size = model_row_counts.get(model_id_str, None)  # Filter for specific benchmarks if provided  if benchmarks:  clean_scores = {}  missing_benchmarks = []  for b in benchmarks:  if b in model_relts:  clean_scores[b] = model_relts[b]  else:  missing_benchmarks.append(b)  if missing_benchmarks:  logging.warning(  f"Missing benchmarks: {missing_benchmarks} for model {model_id_str}"  )  else:  clean_scores = model_relts  # Calculate averages for normazed scores  if clean_scores:  # Normaze specific benchmarks if necessary  for benchmark in clean_scores:  if benchmark in ["alpaca_eval_length_controlled_winrate"]:  clean_scores[benchmark] = [clean_scores[benchmark][0] / 100]  if benchmark in ["WildBench_score", "MTBench_Average"]:  clean_scores[benchmark] = [clean_scores[benchmark][0] / 10]  # Calculate the average score  clean_scores["average"] = m(  value[0] for key, value in clean_scores.items() if key not in ["model_id", "dataset_size"]  ) / len([k for k in clean_scores.keys() if k not in ["model_id", "dataset_size"]])  # Add model ID and dataset size  clean_scores["model_id"] = model_id_str  clean_scores["dataset_size"] = dataset_size  # Store relts using model base name as the key  out[base_name] = clean_scores  # Create and format the DataFrame  out = pd.DataFrame.from_dict(out, orient="index")  # Handle empty dataframe case  if out.empty:  logging.warning("No evaluation relts found for the specified models and benchmarks.")  return out  # Format output to match required format  # Extract domain from model name (asming format ke a1_domain_name)  domains = []  for index in out.index:  parts = index.spt("_")  if len(parts) >= 2:  domain = parts[1].capitaze() # Extract domain and capitaze  else:  domain = "Other" # Default domain if pattern doesn't match  domains.append(domain)  # Add domain column  out["Domain"] = domains  # Rename dataset_size column for clarity  if "dataset_size" in out.columns:  out = out.rename(columns={"dataset_size": "DatasetSize"})  # Convert scores to percentages (multiply by 100) and round to 1 decimal place  for col in out.columns:  if col not in ["Domain", "model_id", "DatasetSize"]:  # Extract the first value from each st and convert to percentage  out[col] = out[col].apply(lambda x: round(float(x[0]) * 100, 1) if isinstance(x, st) else round(float(x) * 100, 1))  # Calculate domain averages  math_benchmarks = [col for col in out.columns if "AMC" in col or "AIME" in col or "MATH500" in col or "HMMT" in col]  sci_benchmarks = [col for col in out.columns if "MLLUPro" in col or "JEEBench" in col or "GPQA" in col]  code_benchmarks = [col for col in out.columns if "Code" in col or "LCB" in col or "veCode" in col]  if math_benchmarks:  out["AvgMath"] = out[math_benchmarks].mean(axis=1).round(1)  if sci_benchmarks:  out["AvgSci"] = out[sci_benchmarks].mean(axis=1).round(1)  if code_benchmarks:  out["AvgCode"] = out[code_benchmarks].mean(axis=1).round(1)  # Rename columns to match target format  column_mapping = {  "AMC23_accuracy_avg": "AMC23",  "AIME24_accuracy_avg": "AIME24",  "AIME25_accuracy_avg": "AIME25", # Add mapping for AIME25  "MATH500_accuracy": "MATH500",  "JEEBench_accuracy_avg": "JEEBench",  "GPQADiamond_accuracy_avg": "GPQAD",  "HMMT_accuracy_avg": "HMMT", # Add mapping for HMMT  "HLE_accuracy_avg": "HLE", # Add mapping for HLE  "veCodeBench_accuracy_avg": "LCBv2",  "veCodeBenchv5_accuracy_avg": "veCodeBenchv5", # Add mapping for veCodeBenchv5  "CodeElo_accuracy_avg": "CodeElo",  "CodeForces_accuracy_avg": "CodeForces",  "average": "AvgAll"  }  out = out.rename(columns=column_mapping)  # Sort the dataframe by AvgAll  out = out.sort_values("AvgAll", ascending=False)  # Define the exact column order as specified, enring DatasetSize is included  column_order = ["Domain", "DatasetSize", "AvgAll",  "AIME24", "AMC23", "MATH500", "HMMT", "AvgMath",  "JEEBench", "GPQAD", "AvgSci",  "LCBv2", "CodeElo", "CodeForces", "AvgCode",  "AIME25", "HLE", "veCodeBenchv5"]  # Filter to only include columns that actually exist in the dataframe  column_order = [col for col in column_order if col in out.columns]  # Move Experiments column to front  out = out.reset_index().rename(columns={"index": "Experiments"})  column_order = ["Experiments"] + column_order  # Reorder columns  out = out[column_order]  # Make re DatasetSize is included and is num  if "DatasetSize" in out.columns:  # Convert DatasetSize to num, handng any errors  out["DatasetSize"] = pd.to_num(out["DatasetSize"], errors="coerce")  logging.info("Verified DatasetSize column is present and num")  else:  logging.warning("DatasetSize column is missing from the output DataFrame")  # Enre eval/relts directory exists  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  # Save to CSV in the eval/relts folder  if args and args.output:  # Strip extension if present  base_output = args.output.spt('.')[0]  csv_file = f"{relts_dir}/{base_output}.csv"  else:  # Use concatenated bstrings in filename  filename = f"{'_'.join(bstrings)}_relts"  csv_file = f"{relts_dir}/{filename[:20]}.csv"  out.to_csv(csv_file, index=False)  logging.info(f"Formatted relts saved to {csv_file}")  return out, csv_file