# NOTE(review): several identifiers and string literals in this module look
# truncated (e.g. "relts", "bstrings", "normaze", "veCodeBench", "no_pipene").
# They are kept exactly as found because they may be referenced from outside
# this file (database rows, CLI flags, on-disk paths) -- TODO confirm them
# against the rest of the project.
import argparse
import logging
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import join
from tqdm import tqdm

from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope

# Columns of the formatted CSV that never hold benchmark scores.
_NON_BENCHMARK_COLUMNS = ("Domain", "model_id", "🏆 (All)", "🏆 (Domain)")


def _rescale_special_benchmarks(scores):
    """Rescale, in place, the benchmarks that are not reported on a 0-1 scale.

    Args:
        scores: Dict mapping benchmark name to a list whose first element is
            the score. Only the first element is kept after rescaling.
    """
    for name in scores:
        if name in ("alpaca_eval_length_controlled_winrate",):
            scores[name] = [scores[name][0] / 100]
        if name in ("WildBench_score", "MTBench_Average"):
            scores[name] = [scores[name][0] / 10]


def _mean_of_present(scores, names):
    """Return the mean of ``scores[name]`` over the ``names`` present in ``scores``."""
    present = [scores[name] for name in names if name in scores]
    return sum(present) / len(present)


def _to_float_or_none(value):
    """Return ``float(value)``, or ``None`` when the value is not convertible."""
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


def get_scores_all_benchmarks(model_id, one_score_per_setting=True):
    """Fetch every stored evaluation score for one model.

    Args:
        model_id: ID of the model whose evaluation rows are queried.
        one_score_per_setting: If True, keep only the most recent score per
            setting name; otherwise collect every score in creation order.

    Returns:
        Dict mapping setting name to a list of scores.
    """
    all_scores = {}
    with session_scope() as session:
        # Use a JOIN to fetch evaluation results and settings in a single query.
        query = (
            session.query(EvalRelt, EvalSetting)
            .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)
            .filter(EvalRelt.model_id == model_id)
            .order_by(EvalRelt.creation_time)  # oldest first
        )
        results = query.all()

        for result, setting in tqdm(results):
            result_dict = result.to_dict()
            name = setting.name
            score = result_dict["score"]
            if name not in all_scores:
                all_scores[name] = [score]
            elif one_score_per_setting:
                # Rows are ordered by creation time, so a later row replaces
                # an earlier one.
                logging.warning(f"Dupcate setting name: {name} - keeping latest relt from {result_dict['creation_time']}")
                all_scores[name] = [score]
            else:
                all_scores[name].append(score)
    return all_scores


def get_clean_scores(model_id, normaze=True, benchmarks=None):
    """Get cleaned and processed evaluation scores for a model.

    Args:
        model_id: ID of the model to get scores for.
        normaze: Whether to rescale special benchmarks and add an "average" entry.
        benchmarks: List of benchmark names to keep (if None, a default set is used).

    Returns:
        Dict with benchmark names as keys and score lists as values, plus
        "average" (when requested and at least one benchmark is present) and
        "model_id".
    """
    all_scores = get_scores_all_benchmarks(model_id)

    if benchmarks is None:
        benchmarks = [
            "MATH500_accuracy_avg",
            "AMC23_accuracy_avg",
            "AIME24_accuracy_avg",
            # "MMLUPro_accuracy_avg",
            "JEEBench_accuracy_avg",
            "GPQADiamond_accuracy_avg",
            "veCodeBench_accuracy_avg",
            "CodeElo_accuracy_avg",
            "CodeForces_accuracy_avg",
        ]

    # Keep only the requested benchmarks and remember which ones are absent.
    missing_benchmarks = [b for b in benchmarks if b not in all_scores]
    clean_scores = {b: all_scores[b] for b in benchmarks if b in all_scores}

    if missing_benchmarks:
        logging.warning(f"Missing benchmarks: {missing_benchmarks} for model {model_id}")

    if normaze and clean_scores:
        _rescale_special_benchmarks(clean_scores)
        # The right-hand side is evaluated before "average" is inserted, so the
        # divisor counts benchmarks only.
        clean_scores["average"] = sum(
            value[0] for value in clean_scores.values()
        ) / len(clean_scores)

    clean_scores["model_id"] = model_id
    return clean_scores


def plot_from_csv(csv_file, bstrings, relts_dir, args=None):
    """Generate scaling curves from a saved CSV file.

    Args:
        csv_file: Path to the CSV file with formatted results.
        bstrings: List of substrings used to filter models.
        relts_dir: Directory to save the generated plots.
        args: Command line arguments.
    """
    logging.info(f"Generating scang curves from CSV file: {csv_file}")

    df = pd.read_csv(csv_file)
    df = df.set_index("Experiments")

    # The CSV stores percentages; the plotting code works with fractions.
    for col in df.columns:
        if col not in _NON_BENCHMARK_COLUMNS:
            df[col] = df[col] / 100.0

    generate_scang_curves_from_df(df, bstrings, relts_dir, args)


def generate_scang_curves_from_df(df, bstrings, relts_dir, args=None):
    """Plot score versus dataset size for every benchmark column of ``df``.

    One figure is produced with benchmarks as rows and the Math / Code /
    Science experiment groups as columns, saved as both PNG and PDF in
    ``relts_dir``. This function reads only the dataframe; it does not query
    the database.

    Args:
        df: DataFrame with models as rows (index) and benchmarks as columns.
        bstrings: List of substrings used to filter models.
        relts_dir: Directory to save the generated plots.
        args: Command line arguments; only ``args.scale`` is read here.
    """
    # Keep only the columns that can be interpreted as numbers.
    num_columns = []
    for col in df.columns:
        if col in _NON_BENCHMARK_COLUMNS:
            logging.info(f"Skipping non-benchmark column: {col}")
            continue
        try:
            df[col].astype(float)
        except (ValueError, TypeError):
            logging.info(f"Skipping non-num column: {col}")
        else:
            num_columns.append(col)

    df_num = df[num_columns]

    logging.info("Generating scang curve plots...")

    # Scale suffixes and their corresponding sample counts.
    scales = {
        "0.3k": 316,
        "1k": 1000,
        "3k": 3160,
        "10k": 10000,
        # Note: 30k is in the list but data may not exist for all models.
        "30k": 31600,
        "100k": 100000,
        "300k": 316000,
        "1000k": 1000000,
    }

    # Baseline scores drawn as a dashed horizontal line per benchmark.
    baseline_scores = {
        "AIME24": 0.15,
        "AMC23": 0.535,
        "MATH500": 0.706,
        # "MMLUPro": 0.43,
        "JEEBench": 0.336,
        "GPQAD": 0.237,
        "LCBv2": 0.33,
        "CodeElo": 0.051,
        "CodeForces": 0.099,
        "AIME25": 0.08,
        "HLE": 0.118,
        "veCodeBenchv5": 0.172,
    }

    # Scale-line scores (additional horizontal line for --scale plots).
    # NOTE(review): these values are computed but nothing below draws them.
    scale_line_scores = {
        "AIME24": 0.583,
        "AMC23": 0.898,
        "MATH500": 0.896,
        "MMLUPro": 0.308,
        "JEEBench": 0.651,
        "GPQAD": 0.47,
        "LCBv2": 0.562,
        "CodeElo": 0.228,
        "CodeForces": 0.266,
        "AIME25": 0.393,
        "HLE": 0.029,
        "veCodeBenchv5": 0.407,
    }

    math_benchmarks = ["AIME24", "AMC23", "MATH500"]
    code_benchmarks = ["LCBv2", "CodeElo", "CodeForces"]
    # science_benchmarks = ["MMLUPro", "JEEBench", "GPQAD"]
    science_benchmarks = ["JEEBench", "GPQAD"]
    all_domain_benchmarks = math_benchmarks + code_benchmarks + science_benchmarks

    # Per-domain and overall averages for both reference tables.
    for reference in (baseline_scores, scale_line_scores):
        averages = {
            "AvgMath": _mean_of_present(reference, math_benchmarks),
            "AvgCode": _mean_of_present(reference, code_benchmarks),
            "AvgSci": _mean_of_present(reference, science_benchmarks),
            "AvgAll": _mean_of_present(reference, all_domain_benchmarks),
        }
        reference.update(averages)

    # Display labels for the x-axis (different from the database names).
    x_labels = {
        "0.3k": "0.3k",
        "1k": "1k",
        "3k": "3k",
        "10k": "10k",
        "30k": "30k",
        "100k": "100k",
        "300k": "300k",
        "1000k": "1M",  # Display 1M instead of 1000k
    }
    x_values = list(scales.values())
    display_labels = [x_labels.get(scale, scale) for scale in scales]

    # Models taking part in a scaling experiment carry a scale suffix,
    # e.g. <base>_1k, <base>_3k, ...
    scale_pattern = r'(.+?)_(0\.3k|0.3k|1k|3k|10k|30k|100k|300k|1000k)$'

    # Group models by base name; each entry is a list of (scale, model_name).
    scale_models = {}
    for model_name in df.index:
        match = re.match(scale_pattern, model_name)
        if match:
            base_name, scale = match.groups()
            scale_models.setdefault(base_name, []).append((scale, model_name))
        elif model_name in bstrings:
            # A model without a suffix is treated as the 30k scale.
            scale_models.setdefault(model_name, []).append(("30k", model_name))

    plt.rcParams.update({
        "font.size": 16,
        "axes.titlesize": 20,
        "axes.labelsize": 18,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "legend.fontsize": 14,
        "figure.titlesize": 24,
    })

    # Group experiments by domain for separate plotting.
    math_experiments = {}
    code_experiments = {}
    science_experiments = {}
    for base_name, model_scales in scale_models.items():
        lowered = base_name.lower()
        if "math" in lowered:
            math_experiments[base_name] = model_scales
        elif "code" in lowered:
            code_experiments[base_name] = model_scales
        elif "science" in lowered or "sci" in lowered:
            science_experiments[base_name] = model_scales
        else:
            # Models without a domain in their name appear in every column.
            math_experiments[base_name] = model_scales
            code_experiments[base_name] = model_scales
            science_experiments[base_name] = model_scales

    benchmarks = [col for col in df_num.columns if col not in ["model_id"]]
    if not benchmarks:
        # plt.subplots cannot build a grid with zero rows.
        logging.warning("No numeric benchmark columns to plot; skipping plot generation.")
        return

    # One row per benchmark, one column per domain.
    fig, axes = plt.subplots(len(benchmarks), 3, figsize=(18, 5 * len(benchmarks)))
    if len(benchmarks) == 1:
        # With a single row matplotlib returns a 1-D array of axes.
        axes = axes.reshape(1, 3)

    axes[0, 0].set_title("Math Experiments", fontsize=16, pad=15)
    axes[0, 1].set_title("Code Experiments", fontsize=16, pad=15)
    axes[0, 2].set_title("Science Experiments", fontsize=16, pad=15)

    title_text = " ".join(s.strip().upper() for s in bstrings)
    fig.suptitle(f"{title_text} Data Scang", fontsize=24, y=0.93)

    domain_exps = [
        {"name": "Math", "experiments": math_experiments},
        {"name": "Code", "experiments": code_experiments},
        {"name": "Science", "experiments": science_experiments},
    ]

    high_contrast_colors = [
        (0.8, 0.2, 0.2, 1.0),  # Muted red
        (0.2, 0.4, 0.8, 1.0),  # Muted blue
        (0.2, 0.6, 0.3, 1.0),  # Muted green
        (0.9, 0.6, 0.2, 1.0),  # Muted orange
        (0.6, 0.4, 0.7, 1.0),  # Muted purple
        (0.4, 0.7, 0.7, 1.0),  # Muted cyan
        (0.6, 0.4, 0.2, 1.0),  # Muted brown
        (0.7, 0.3, 0.7, 1.0),  # Muted magenta
        (0.5, 0.5, 0.1, 1.0),  # Muted olive
        (0.1, 0.5, 0.3, 1.0),  # Muted teal
    ]

    for i, benchmark in enumerate(benchmarks):
        row_title = "Average (All Benchmarks)" if benchmark == "AvgAll" else benchmark

        for j, domain in enumerate(domain_exps):
            ax = axes[i, j]

            # Only the bottom row gets an x-label.
            if i == len(benchmarks) - 1:
                ax.set_xlabel("Dataset Size (samples)", fontsize=14)

            # The first column carries the benchmark name as its y-label.
            if j == 0:
                ax.set_ylabel(row_title, fontsize=24)

            ax.grid(True, linestyle='--', alpha=0.7)

            domain_experiments = domain["experiments"]

            # Models drawn by the special-case pass below, to avoid duplicates.
            plotted_models = set()

            # Special case: plain substrings (no comma, no underscore) are
            # plotted directly from their <name>_<scale> rows.
            if args and args.scale:
                for model_to_plot in bstrings:
                    if "," in model_to_plot or "_" in model_to_plot:
                        continue
                    if not (model_to_plot in df.index
                            or any(model_to_plot in model_name for model_name in df.index)):
                        continue

                    data_points = []
                    for scale, size in scales.items():
                        model_name = f"{model_to_plot}_{scale}"
                        if model_name in df.index and benchmark in df.columns:
                            score = _to_float_or_none(df.loc[model_name, benchmark])
                            if score is not None:
                                data_points.append((size, score))

                    if not data_points:
                        continue

                    data_points.sort()  # sort by x value
                    x, y = zip(*data_points)

                    if model_to_plot == 'openthoughts2':
                        line_color = 'black'
                    elif model_to_plot == 'no_pipene':
                        line_color = 'red'
                    else:
                        line_color = high_contrast_colors[0]

                    ax.plot(x, y, 'o-', label=model_to_plot,
                            color=line_color, linewidth=2.5, markersize=8)

                    # Label the end of the line with the final score.
                    last_x, last_y = data_points[-1]
                    ax.text(last_x*1.05, last_y, f"{model_to_plot}: {last_y*100:.1f}%",
                            fontsize=8, verticalalignment='center',
                            horizontalalignment='left', color=line_color)

                    plotted_models.add(model_to_plot)

            # Remaining scaling experiments of this domain.
            for k, (base_name, model_scales) in enumerate(domain_experiments.items()):
                if base_name in plotted_models:
                    continue
                # Only plot base models that match one of the substrings.
                if base_name not in bstrings and not any(b in base_name for b in bstrings):
                    continue

                model_scales.sort(key=lambda item: scales.get(item[0], 0))

                data_points = []
                for scale, model_name in model_scales:
                    if model_name in df.index and benchmark in df_num.columns:
                        score = _to_float_or_none(df.loc[model_name, benchmark])
                        if score is not None:
                            data_points.append((scales.get(scale, 0), score))

                if data_points:
                    data_points.sort()  # sort by x value
                    x, y = zip(*data_points)

                    # The palette index follows the experiment's position so
                    # the same base name keeps its color across subplots.
                    plot_color = high_contrast_colors[k % len(high_contrast_colors)]
                    if base_name == 'no_pipene':
                        plot_color = 'red'
                    elif base_name == 'openthoughts2':
                        plot_color = 'black'

                    ax.plot(x, y, 'o-', label=base_name,
                            color=plot_color, linewidth=2.5, markersize=8)

            # Log-scaled x-axis with human-readable tick labels.
            ax.set_xscale('log')
            ax.set_xticks(x_values)
            ax.set_xticklabels(display_labels)

            baseline = baseline_scores.get(benchmark, None)

            # y-limits cover every experiment of every domain (plus the
            # baseline) so the three columns of a row share one range.
            all_values = []
            for exp_dict in domain_exps:
                for exp_models in exp_dict["experiments"].values():
                    for _, model in exp_models:
                        if model in df.index and benchmark in df_num.columns:
                            score = _to_float_or_none(df.loc[model, benchmark])
                            if score is not None:
                                all_values.append(score)

            if baseline is not None:
                all_values.append(baseline)

            if all_values:
                min_y = max(0, min(all_values) - 0.05)
                max_y = min(1.0, max(all_values) + 0.05)
                ax.set_ylim(min_y, max_y)

            if baseline is not None:
                ax.axhline(y=baseline, color='black', linestyle='--', linewidth=1.5, alpha=0.6)
                # Small label with the baseline value at the right edge.
                ax.text(x_values[-1]*1.05, baseline, f"{baseline*100:.1f}%", fontsize=8,
                        verticalalignment='center', horizontalalignment='left', color='black')

            if domain_experiments:
                handles, labels = ax.get_legend_handles_labels()
                if handles:
                    ax.legend(handles, labels, loc='best', fontsize=10)

    plt.tight_layout(rect=[0.06, 0, 1, 0.91])
    fig.subplots_adjust(hspace=0.4, wspace=0.25, top=0.91, left=0.08)

    # Save as both PNG and PDF; the base name is cut to 20 characters.
    base_filename = f"{'_'.join(bstrings)}_scang_curves"

    png_filename = f"{base_filename[:20]}.png"
    plt.savefig(f"{relts_dir}/{png_filename}", dpi=300, bbox_inches='tight')
    logging.info(f"Scang curves saved to {relts_dir}/{png_filename}")

    pdf_filename = f"{base_filename[:20]}.pdf"
    plt.savefig(f"{relts_dir}/{pdf_filename}", format='pdf', bbox_inches='tight')
    logging.info(f"PDF version saved to {relts_dir}/{pdf_filename}")


def generate_scang_curves(df, bstrings, relts_dir, args=None):
    """Legacy entry point that forwards to ``generate_scang_curves_from_df``."""
    generate_scang_curves_from_df(df, bstrings, relts_dir, args)


def generate_latex_table(df, bstrings, relts_dir):
    """Generate a LaTeX table from the results dataframe and save it to disk.

    Args:
        df: DataFrame with model results; cells may be floats or lists whose
            first element is the score.
        bstrings: List of substrings used for filtering models (used to build
            the output file name).
        relts_dir: Directory under which ``tables/<name>.tex`` is written.

    Returns:
        The LaTeX source of the table as a string.
    """
    logging.info("Generating LaTeX table...")

    # Work on a copy so the caller's dataframe is left untouched.
    df_latex = df.copy()

    # Underscores must be escaped in LaTeX text.
    df_latex.index = df_latex.index.map(lambda x: x.replace('_', '\\_'))

    if 'model_id' in df_latex.columns:
        df_latex = df_latex.drop(columns=['model_id'])

    # Unwrap single-element score lists into plain floats.
    for col in df_latex.columns:
        if col != "model_id":
            df_latex[col] = df_latex[col].apply(lambda x: float(x[0]) if isinstance(x, list) else float(x))

    num_cols = df_latex.select_dtypes(include=['float64']).columns
    df_latex[num_cols] = df_latex[num_cols].round(3)

    # Best model first, using whichever average column exists.
    if 'average' in df_latex.columns:
        df_latex = df_latex.sort_values('average', ascending=False)
    elif 'AvgAll' in df_latex.columns:
        df_latex = df_latex.sort_values('AvgAll', ascending=False)

    # Convert to percentage format.
    for col in num_cols:
        df_latex[col] = df_latex[col] * 100

    # Column maxima, used to set the best value of each column in bold.
    max_values = {col: df_latex[col].max() for col in num_cols}

    # First column is left-aligned, all others are centered.
    col_spec = "l" + "c" * len(df_latex.columns)
    latex_lines = [
        "\\begin{table}",
        "\\centering",
        "\\caption{Model Performance on Benchmarks}",
        "\\label{tab:model_performance}",
        f"\\begin{{tabular}}{{{col_spec}}}",
        "\\toprule",
    ]

    # Header: drop the _accuracy_avg / _accuracy suffixes from column names.
    header = ["Model"] + list(df_latex.columns)
    clean_header = [
        col.replace("_accuracy_avg", "").replace("_accuracy", "") for col in header
    ]
    latex_lines.append(" & ".join(clean_header) + " \\\\")
    latex_lines.append("\\midrule")

    # Body: one row per model.
    for model_name, row in df_latex.iterrows():
        row_values = [model_name]
        for col in df_latex.columns:
            value = row[col]
            if col in num_cols and value == max_values[col]:
                value_str = f"\\textbf{{{value:.1f}}}"
            else:
                value_str = f"{value:.1f}"
            row_values.append(value_str)
        latex_lines.append(" & ".join(row_values) + " \\\\")

    latex_lines.append("\\bottomrule")
    latex_lines.append("\\end{tabular}")
    latex_lines.append("\\end{table}")

    latex_output = "\n".join(latex_lines)

    os.makedirs(f"{relts_dir}/tables", exist_ok=True)

    base_filename = f"{'_'.join(bstrings)}_table"
    latex_filename = f"{relts_dir}/tables/{base_filename[:20]}.tex"
    with open(latex_filename, "w") as f:
        f.write(latex_output)

    logging.info(f"LaTeX table saved to {latex_filename}")

    return latex_output


def scoresearch_string(bstrings, benchmarks=None, output_file=None, formatted_output=False, exclude_models=None, generate_scang_plot=False, args=None):
    """Collect evaluation results for all models whose location matches ``bstrings``.

    Models and their scores are fetched with two batch queries, assembled into
    a dataframe, written to a CSV under ``eval/relts`` and optionally plotted.

    Args:
        bstrings: List of substrings to filter ``Model.weights_location``.
            If ``args.comma_separated_bstrings`` is truthy, each substring is
            treated independently (OR logic). Otherwise, models must match
            ALL substrings (AND logic).
        benchmarks: List of benchmark names to include (all when falsy).
        output_file: Custom filename for the output CSV (unformatted mode only).
        formatted_output: If True, format output similar to a1_target.csv with
            domain grouping and rankings.
        exclude_models: List of model names to exclude from results.
        generate_scang_plot: If True, generate scaling curve plots.
        args: Command line arguments, forwarded to the plotting functions.

    Returns:
        DataFrame with models as rows and benchmarks as columns (empty when no
        model or no result matches).
    """
    from sqlalchemy import or_, and_, not_

    comma_separated_mode = getattr(args, 'comma_separated_bstrings', False)

    if comma_separated_mode:
        # OR logic - match any of the substrings.
        filter_bstring = [or_(*[Model.weights_location.contains(s) for s in bstrings])]
    else:
        # AND logic - match all the substrings (default behavior).
        filter_bstring = [Model.weights_location.contains(s) for s in bstrings]

    exclusion_filters = []
    if exclude_models:
        logging.info(f"Excluding {len(exclude_models)} models: {exclude_models}")
        exclusion_filters = [not_(Model.weights_location.contains(model_name)) for model_name in exclude_models]

    with session_scope() as session:
        # All relevant models in one query, with inclusion and exclusion filters.
        if exclusion_filters:
            model_query = session.query(Model).filter(and_(*filter_bstring, *exclusion_filters))
        else:
            model_query = session.query(Model).filter(*filter_bstring)

        models = {str(m.id): m.weights_location for m in model_query.all()}

        if not models:
            logging.warning("No models found matching the criteria.")
            return pd.DataFrame()

        logging.info(f"Found {len(models)} models matching the criteria")

        model_ids = list(models.keys())

        # One query for every (model, setting, score) row of interest.
        relts_query = (
            session.query(
                EvalRelt.model_id,
                EvalSetting.name,
                EvalRelt.score,
                EvalRelt.creation_time,
            )
            .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)
            .filter(EvalRelt.model_id.in_(model_ids))
        )
        if benchmarks:
            # Restrict to the requested benchmarks to reduce data transferred.
            relts_query = relts_query.filter(
                or_(*[EvalSetting.name == b for b in benchmarks])
            )
        relts_query = relts_query.order_by(EvalRelt.creation_time)  # oldest first

        all_relts = relts_query.all()

    logging.info(f"Fetched {len(all_relts)} evaluation relts from database")

    # model id -> {setting name -> [score]}
    relts_by_model = {}
    for model_id, setting_name, score, creation_time in tqdm(all_relts, desc="Organizing relts"):
        model_id_str = str(model_id)
        model_scores = relts_by_model.setdefault(model_id_str, {})
        is_duplicate = setting_name in model_scores
        # Rows are ordered by creation time, so the latest score wins.
        model_scores[setting_name] = [score]
        if is_duplicate:
            logging.warning(f"Dupcate setting name: {setting_name} for model {model_id_str} - keeping latest relt from {creation_time}")

    out = {}
    for model_id_str, model_relts in tqdm(relts_by_model.items(), desc="Processing models"):
        if benchmarks:
            clean_scores = {b: model_relts[b] for b in benchmarks if b in model_relts}
            missing_benchmarks = [b for b in benchmarks if b not in model_relts]
            if missing_benchmarks:
                logging.warning(
                    f"Missing benchmarks: {missing_benchmarks} for model {model_id_str}"
                )
        else:
            clean_scores = model_relts

        if clean_scores:
            _rescale_special_benchmarks(clean_scores)
            scored = [value[0] for key, value in clean_scores.items() if key not in ["model_id"]]
            clean_scores["average"] = sum(scored) / len(scored)

        clean_scores["model_id"] = model_id_str

        # Rows are keyed by the last path component of weights_location.
        model_name = models[model_id_str]
        base_name = model_name.split("/")[-1] if "/" in model_name else model_name
        out[base_name] = clean_scores

    out = pd.DataFrame.from_dict(out, orient="index")

    if out.empty:
        logging.warning("No evaluation relts found for the specified models and benchmarks.")
        return out

    relts_dir = "eval/relts"

    if not formatted_output:
        # Column order: average, benchmarks, model_id.
        if "average" in out.columns:
            cols = ["average"] + [col for col in out.columns if col != "average" and col != "model_id"]
            if "model_id" in out.columns:
                cols.append("model_id")
            out = out[cols]

        os.makedirs(relts_dir, exist_ok=True)

        if output_file:
            csv_file = f"{relts_dir}/{output_file[:20]}.csv"
        else:
            csv_file = f"{relts_dir}/{'_'.join(bstrings)[:20]}.csv"

        out.to_csv(csv_file)
        logging.info(f"Relts saved to {csv_file}")

        if generate_scang_plot:
            generate_scang_curves(out, bstrings, relts_dir, args)

        present_benchmarks = [col for col in out.columns if col not in ["average", "model_id"]]
        logging.info(f"Retrieved scores for {len(present_benchmarks)} benchmarks")
    else:
        # Format output to match the a1_target.csv layout.
        # The domain is the second "_"-separated part of the model name
        # (assuming a format like a1_domain_name).
        domains = []
        for index in out.index:
            parts = index.split("_")
            domains.append(parts[1].capitalize() if len(parts) >= 2 else "Other")
        out["Domain"] = domains

        # Convert scores to percentages rounded to one decimal place.
        for col in out.columns:
            if col not in ["Domain", "model_id"]:
                out[col] = out[col].apply(lambda x: round(float(x[0]) * 100, 1) if isinstance(x, list) else round(float(x) * 100, 1))

        # Domain averages over whichever benchmark columns are present.
        math_benchmarks = [col for col in out.columns if "AMC" in col or "AIME" in col or "MATH500" in col]
        # "MMLUPro" was previously misspelled "MLLUPro" and could never match.
        sci_benchmarks = [col for col in out.columns if "MMLUPro" in col or "JEEBench" in col or "GPQA" in col]
        code_benchmarks = [col for col in out.columns if "Code" in col or "LCB" in col or "veCode" in col]

        if math_benchmarks:
            out["AvgMath"] = out[math_benchmarks].mean(axis=1).round(1)
        if sci_benchmarks:
            out["AvgSci"] = out[sci_benchmarks].mean(axis=1).round(1)
        if code_benchmarks:
            out["AvgCode"] = out[code_benchmarks].mean(axis=1).round(1)

        # Rename columns to match target format.
        column_mapping = {
            "AMC23_accuracy_avg": "AMC23",
            "AIME24_accuracy_avg": "AIME24",
            "MATH500_accuracy": "MATH500",
            # "MMLUPro_accuracy_avg": "MMLUPro",
            "JEEBench_accuracy_avg": "JEEBench",
            "GPQADiamond_accuracy_avg": "GPQAD",
            "veCodeBench_accuracy_avg": "LCBv2",
            "CodeElo_accuracy_avg": "CodeElo",
            "CodeForces_accuracy_avg": "CodeForces",
            "average": "AvgAll",
        }
        out = out.rename(columns=column_mapping)

        # Sort by domain, then by AvgAll (best first) within each domain, so a
        # running counter per domain yields the "🏆 (All)" rank.
        out = out.sort_values(["Domain", "AvgAll"], ascending=[True, False])

        out["🏆 (All)"] = None
        out["🏆 (Domain)"] = None

        domain_average_columns = {
            "math": "AvgMath",
            "science": "AvgSci",
            "sci": "AvgSci",
            "code": "AvgCode",
        }
        for domain in out["Domain"].unique():
            domain_mask = out["Domain"] == domain
            domain_df = out[domain_mask]

            # Ranking restarts at 1 for each domain.
            out.loc[domain_mask, "🏆 (All)"] = pd.Series(
                range(1, len(domain_df) + 1), index=domain_df.index
            )

            # Rank by the domain-specific average when there is one,
            # otherwise fall back to AvgAll.
            rank_col = domain_average_columns.get(domain.lower())
            if rank_col is None or rank_col not in domain_df.columns:
                rank_col = "AvgAll"
            out.loc[domain_mask, "🏆 (Domain)"] = domain_df[rank_col].rank(ascending=False, method="min")

        out["🏆 (All)"] = out["🏆 (All)"].astype(int)
        out["🏆 (Domain)"] = out["🏆 (Domain)"].astype(int)

        # Final order: best overall model first.
        out = out.sort_values("AvgAll", ascending=False)

        column_order = ["Domain", "🏆 (All)", "🏆 (Domain)", "AvgAll",
                        "AIME24", "AMC23", "MATH500", "AvgMath",
                        # "MMLUPro", "JEEBench", "GPQAD", "AvgSci",
                        "JEEBench", "GPQAD", "AvgSci",
                        "LCBv2", "CodeElo", "CodeForces", "AvgCode"]
        # Keep only the columns that actually exist in the dataframe.
        column_order = [col for col in column_order if col in out.columns]

        # The model names become the leading "Experiments" column.
        out = out.reset_index().rename(columns={"index": "Experiments"})
        out = out[["Experiments"] + column_order]

        os.makedirs(relts_dir, exist_ok=True)

        # NOTE(review): the 20-character slice is applied after ".csv" is
        # appended, so longer names lose (part of) their extension -- confirm
        # whether that is intended before changing it.
        filename = f"{'_'.join(bstrings)}_relts.csv"
        csv_file = f"{relts_dir}/{filename[:20]}"

        out.to_csv(csv_file, index=False)
        logging.info(f"Formatted relts saved to {csv_file}")

        if generate_scang_plot:
            # Plot from the saved CSV so the plot matches the file on disk.
            plot_from_csv(csv_file, bstrings, relts_dir, args)

    return out


if __name__ == "__main__":
    """
    This allows generating a CSV from a list of bstrings.
    The filter takes the intersection of all bstring occurrences.
    e.g. scoresearch_string(["hp_ablations", "mistral", "lr"])
    filters for all rows containing ALL the bstrings "hp_ablations", "mistral", "lr"

    Run as follows:
    python eval/scripts/get_paper_relts.py --bstrings hp_ablations mistral lr

    You can also exclude specific models using the --exclude parameter:
    python eval/scripts/get_paper_relts.py --bstrings a1 --formatted --exclude a1_science_kaggle_llm,a1_math_openmathinstruct_aime
    """
    parser = argparse.ArgumentParser(description="Generate evaluation relts CSV for models matching specific criteria.")
    parser.add_argument("--bstrings", nargs="+", type=str, required=True,
                        help="st of bstrings to filter model names. Use commas to separate multiple independent bstrings to plot (e.g., 'b1_,openthoughts2'). Without commas, models must contain ALL bstrings to match.")
    parser.add_argument("--evalset", type=str, default="pipene", choices=["pipene", "full", "chat"],
                        help="Evaluation set to use (pipene, full, or chat)")
    parser.add_argument("--output", type=str, help="Custom filename for output CSV (default: concatenated bstrings)")
    parser.add_argument("--formatted", action="store_true",
                        help="Format output similar to a1_target.csv with domain grouping")
    parser.add_argument("--exclude", type=str,
                        help="Comma-separated st of model names to exclude from relts")
    parser.add_argument("--scale", action="store_true",
                        help="Include models with scang ffixes (e.g. _1k, _10k) and generate scang curve plots")
    parser.add_argument("--csv", type=str,
                        help="Path to existing CSV file to plot (skips database query)")
    parser.add_argument("--latex", action="store_true",
                        help="Generate LaTeX table from relts")
    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    # Define benchmark sets
    pipene_benchmarks = [
        "MATH500_accuracy",
        "AMC23_accuracy_avg",
        "AIME24_accuracy_avg",
        # "MMLUPro_accuracy_avg",
        "JEEBench_accuracy_avg",
        "GPQADiamond_accuracy_avg",
        "veCodeBench_accuracy_avg",
        "CodeElo_accuracy_avg",
        "CodeForces_accuracy_avg",
    ]

    heldout_benchmarks = [
        "AIME25_accuracy_avg",
        "HLE_accuracy_avg",
        "veCodeBenchv5_accuracy_avg",
    ]

    chat_benchmarks = []

    # Select benchmark set based on argument
    if args.evalset == "pipene":
        benchmarks = pipene_benchmarks
    elif args.evalset == "full":
        benchmarks = pipene_benchmarks + heldout_benchmarks
    elif args.evalset == "chat":
        benchmarks = chat_benchmarks
    else:
        raise ValueError(f"Invad evalset: {args.evalset}")

    # Process bstrings - a comma-separated argument contributes one entry per
    # comma-delimited part.
    processed_bstrings = []
    has_comma_separated = False

    for bstring_arg in args.bstrings:
        if ',' in bstring_arg:
            processed_bstrings.extend([s.strip() for s in bstring_arg.split(',')])
            has_comma_separated = True
        else:
            processed_bstrings.append(bstring_arg)

    # Replace the original bstrings with processed ones
    args.bstrings = processed_bstrings

    # Special case for plotting: If scale is True and no_pipene is in the bstrings,
    # we need to modify the way plotting works by treating it as comma-separated
    if args.scale and 'no_pipene' in args.bstrings and len(args.bstrings) == 1:
        # For plotting purposes, no_pipene needs special handling
        logging.info("Special case: Treating 'no_pipene' as a comma-separated plotting model")
        has_comma_separated
= True  # Set a flag for comma-separated bstrings to use OR logic in the query  args.comma_separated_bstrings = has_comma_separated  logging.info(f"Searching for models with bstrings: {args.bstrings}")  logging.info(f"Using evaluation set: {args.evalset} with {len(benchmarks)} benchmarks")  # Parse exclusion st if provided  exclude_models = None  if args.exclude:  exclude_models = [model.strip() for model in args.exclude.spt(',')]  # Add scale exclusion filter if --scale is not provided  if not args.scale:  import re  # Create a regex pattern to match models with _Nk ffix where N is a number  scale_pattern = r'_(0\.3k|0.3k|\d+k)'  logging.info("Excluding models with scang ffixes (e.g. _1k, _10k)")  # Get models matching the provided bstrings  with session_scope() as session:  filter_bstring = [Model.weights_location.contains(s) for s in args.bstrings]  model_query = session.query(Model).filter(*filter_bstring)  models = {str(m.id): m.weights_location for m in model_query.all()}  # Filter out models with scale ffixes  scale_models_to_exclude = []  for model_id, weights_location in models.items():  # Extract just the base name without path prefix  base_name = weights_location.spt("/")[-1] if "/" in weights_location else weights_location  if re.search(scale_pattern, base_name):  scale_models_to_exclude.append(weights_location)  if scale_models_to_exclude:  logging.info(f"Found {len(scale_models_to_exclude)} models with scang ffixes to exclude")  if exclude_models is None:  exclude_models = scale_models_to_exclude  else:  exclude_models.extend(scale_models_to_exclude)  # If CSV file is provided, skip the database query and plot directly  if args.csv:  # Enre relts directory exists  import os  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  # Plot from the CSV file  plot_from_csv(args.csv, args.bstrings, relts_dir, args)  else:  # Run the search with formatted output if requested  df = scoresearch_string(args.bstrings, benchmarks, args.output, 
args.formatted, exclude_models,  generate_scang_plot=args.scale, args=args)  # Generate LaTeX table if requested  if args.latex:  # Enre eval/relts directory exists  import os  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  generate_latex_table(df, args.bstrings, relts_dir)  # Display mmary if not using formatted output  if not args.formatted:  logging.info(f"Found {len(df)} models with data")  if not df.empty and "average" in df.columns:  logging.info(f"Average scores range: {df['average'].min():.4f} - {df['average'].max():.4f}")  if len(df) > 1:  best_model = df.sort_values("average", ascending=False).index[0]  logging.info(f"Best performing model: {best_model} with average score {df.loc[best_model, 'average']:.4f}")  if len(df) >= 3:  # Show top 3 models  top3 = df.sort_values("average", ascending=False).head(3)  logging.info("Top 3 models:")  for i, (model, row) in enumerate(top3.iterrows(), 1):  logging.info(f" {i}. {model}: {row['average']:.4f}")  else:  # Construct the filename directly using the same pattern as in scoresearch_string  filename = f"{'_'.join(args.bstrings)}_relts.csv"  logging.info(f"Relts have been formatted and saved to eval/relts/{filename}")