import argparse import logging import os import re import sys from sqlalchemy import join # Add the parent directory to the path to find the database module sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) import pandas as pd import numpy as np import matplotb.pyplot as plt from tqdm import tqdm try:  from database.models import EvalRelt, EvalSetting, Model  from database.utils import session_scope  DATABASE_AVAILABLE = True except ImportError:  logging.warning("Database modules not found. Only CSV plotting functionaty will be available.")  DATABASE_AVAILABLE = False def hex_to_rgb(hex_color):  hex_color = hex_color.lstrip('#')  return tuple(int(hex_color[i:i+2], 16)/255 for i in (0, 2, 4)) + (1.0,) # Add alpha=1.0 def get_scores_all_benchmarks(model_id, one_score_per_setting=True):  all_scores = {}  with session_scope() as session:  # Use a JOIN to fetch evaluation relts and settings in a single query  query = (  session.query(EvalRelt, EvalSetting)  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(EvalRelt.model_id == model_id)  .order_by(EvalRelt.creation_time) # Order by creation time (oldest first)  )  relts = query.all()  for relt, setting in tqdm(relts):  relt_dict = relt.to_dict()  if setting.name in all_scores:  if one_score_per_setting:  # Replace with newer relt (since we ordered by creation_time, later relts will overwrite earer ones)  logging.warning(f"Dupcate setting name: {setting.name} - keeping latest relt from {relt_dict['creation_time']}")  all_scores[setting.name] = [relt_dict["score"]]  else:  all_scores[setting.name].append(relt_dict["score"])  else:  all_scores[setting.name] = [relt_dict["score"]]  return all_scores def get_clean_scores(model_id, normaze=True, benchmarks=None):  """  Get cleaned and processed evaluation scores for a model.  Args:  model_id: UUID of the model to get scores for  normaze: Whether to normaze scores and calculate averages  benchmarks: st of benchmark names to filter for (if None, uses default set)  Returns:  Dictionary with benchmark names as keys and score sts as values  """  # Get all benchmark scores for the model  all_scores = get_scores_all_benchmarks(model_id)  # Use provided benchmarks or default set  if benchmarks is None:  benchmarks = [  "MATH500_accuracy_avg",  "AMC23_accuracy_avg",  "AIME24_accuracy_avg",  # "MMLUPro_accuracy_avg",  "JEEBench_accuracy_avg",  "GPQADiamond_accuracy_avg",  "veCodeBench_accuracy_avg",  "CodeElo_accuracy_avg",  "CodeForces_accuracy_avg",  ]  # Filter scores to just the requested benchmarks  missing_benchmarks = []  clean_scores = {}  for b in benchmarks:  if b not in all_scores:  missing_benchmarks.append(b)  else:  clean_scores[b] = all_scores[b]  # Log any missing benchmarks  if missing_benchmarks:  logging.warning(f"Missing benchmarks: {missing_benchmarks} for model {model_id}")  # Normaze scores and calculate average if requested  if normaze and len(clean_scores) > 0:  # Special normazation for certain benchmarks  for i in clean_scores:  if i in ["alpaca_eval_length_controlled_winrate"]:  clean_scores[i] = [clean_scores[i][0] / 100]  if i in ["WildBench_score", "MTBench_Average"]:  clean_scores[i] = [clean_scores[i][0] / 10]  # Calculate average across all benchmarks  if clean_scores:  clean_scores["average"] = m(  value[0] for value in clean_scores.values()  ) / len(clean_scores)  # Add model ID to the relts  clean_scores["model_id"] = model_id  return clean_scores def plot_from_csv(csv_file, bstrings, relts_dir, args=None):  """  Generate scang curves from a saved CSV file.  Args:  csv_file: Path to the CSV file with formatted relts  bstrings: st of bstrings used to filter models  relts_dir: Directory to save the generated plots  args: Command ne arguments  """  logging.info(f"Generating scang curves from CSV file: {csv_file}")  # Read the CSV file  df = pd.read_csv(csv_file)  # Check if the CSV has the expected format  if "Experiments" in df.columns:  # Set the index for plotting  df = df.set_index("Experiments")  ef df.index.name != "Experiments" and len(df.columns) > 0:  # If no Experiments column, use the first column as index  df = df.set_index(df.columns[0])  # Convert percentage values to fractions for plotting if needed  for col in df.columns:  if col not in ["Domain", "model_id", "🏆 (All)", "🏆 (Domain)"]:  # Check if values are kely percentages (> 1.0)  if df[col].max() > 1.0:  df[col] = df[col] / 100.0  # Generate the plots  generate_average_row_plot(df, bstrings, relts_dir, args) def apply_enhanced_styng(fig, axes):  """  Apply enhanced styng to match the requested look:  - Grey out numbers and axes much more  - Remove top and right edges  - Make nes much thicker  - Make markers much thicker with white centers  """  # Set greyer color for axes, ticks, and labels  grey_color = '#888888' # ghter grey than before  for ax in axes:  # Remove top and right spines  ax.spines['top'].set_visible(False)  ax.spines['right'].set_visible(False)  # Make left and bottom spines grey  ax.spines['left'].set_color(grey_color)  ax.spines['bottom'].set_color(grey_color)  ax.spines['left'].set_newidth(1.5) # Thicker spines  ax.spines['bottom'].set_newidth(1.5)  # Make ticks and labels grey  ax.tick_params(axis='both', colors=grey_color, width=1.5, length=6)  ax.xaxis.label.set_color(grey_color)  ax.yaxis.label.set_color(grey_color)  ax.title.set_color(grey_color)  # Make grid even ghter  ax.grid(True, nestyle='--', newidth=0.5, alpha=0.2, color='#cccccc')  # Update all nes to be much thicker with larger markers with white centers  for ne in ax.get_nes():  # Skip basene nes (dashed nes)  if ne.get_nestyle() == '--':  ne.set_color(grey_color)  ne.set_alpha(0.5)  ne.set_newidth(2.0) # Sghtly thicker basene  continue  # Increase ne width significantly  ne.set_newidth(5.0) # Much thicker nes  # Increase marker size and add white center  ne.set_marker('o')  ne.set_markersize(15) # Much bigger markers  ne.set_markerfacecolor('white')  ne.set_markeredgewidth(4.5) # Much thicker marker edge def generate_average_row_plot(df, bstrings, relts_dir, args=None):  """  Generate scang curves for each benchmark showing model performance vs dataset size,  reading data directly from a dataframe.  Args:  df: DataFrame with models as rows and benchmarks as columns  bstrings: st of bstrings used to filter models  relts_dir: Directory to save the generated plots  args: Command ne arguments  Notes:  This function only needs the dataframe with properly formatted data  and does not interact with the database directly.  """  import logging  import matplotb.pyplot as plt  import numpy as np  import re  import os  from matplotb.nes import ne2D  # Filter out non-num columns and columns that shouldn't be plotted  num_columns = []  for col in df.columns:  # Skip special columns and non-benchmark columns  if col in ["Domain", "model_id", "🏆 (All)", "🏆 (Domain)"]:  logging.info(f"Skipping non-benchmark column: {col}")  continue  try:  # Check if the column has at least one num value  df[col].astype(float)  num_columns.append(col)  except:  logging.info(f"Skipping non-num column: {col}")  # Keep only num columns for plotting  df_num = df[num_columns]  logging.info("Generating scang curve plots...")  # Define the scales and their corresponding sample counts  scales = {  "0.3k": 316,  "1k": 1000,  "3k": 3160,  "10k": 10000,  # Note: 30k is in the st but data may not exist for all models  "30k": 31600,  "100k": 100000,  "300k": 316000,  "1000k": 1000000  }  # Define basene scores for average benchmarks  basene_scores = {  "AIME24": 0.15,  "AMC23": 0.535,  "MATH500": 0.706,  # "MMLUPro": 0.43,  "JEEBench": 0.336,  "GPQAD": 0.237,  "LCBv2": 0.33,  "CodeElo": 0.051,  "CodeForces": 0.099,  "AIME25": 0.08,  "HLE": 0.118,  "veCodeBenchv5": 0.172  }  # Define scale ne scores (additional horizontal ne for --scale plots)  scale_ne_scores = {  "AIME24": 0.583,  "AMC23": 0.898,  "MATH500": 0.896,  "MMLUPro": 0.308,  "JEEBench": 0.651,  "GPQAD": 0.47,  "LCBv2": 0.562,  "CodeElo": 0.228,  "CodeForces": 0.266,  "AIME25": 0.393,  "HLE": 0.029,  "veCodeBenchv5": 0.407  }  # Calculate averages for domain benchmark groups  math_benchmarks = ["AIME24", "AMC23", "MATH500"]  code_benchmarks = ["LCBv2", "CodeElo", "CodeForces"]  # science_benchmarks = ["MMLUPro", "JEEBench", "GPQAD"]  science_benchmarks = ["JEEBench", "GPQAD"]  # Calculate average scores by domain for basene  basene_scores["AvgMath"] = m(basene_scores[b] for b in math_benchmarks if b in basene_scores) / len([b for b in math_benchmarks if b in basene_scores])  basene_scores["AvgCode"] = m(basene_scores[b] for b in code_benchmarks if b in basene_scores) / len([b for b in code_benchmarks if b in basene_scores])  basene_scores["AvgSci"] = m(basene_scores[b] for b in science_benchmarks if b in basene_scores) / len([b for b in science_benchmarks if b in basene_scores])  basene_scores["AvgAll"] = m(basene_scores[b] for b in math_benchmarks + code_benchmarks + science_benchmarks if b in basene_scores) / len([b for b in math_benchmarks + code_benchmarks + science_benchmarks if b in basene_scores])  # Calculate average scores by domain for scale ne  scale_ne_scores["AvgMath"] = m(scale_ne_scores[b] for b in math_benchmarks if b in scale_ne_scores) / len([b for b in math_benchmarks if b in scale_ne_scores])  scale_ne_scores["AvgCode"] = m(scale_ne_scores[b] for b in code_benchmarks if b in scale_ne_scores) / len([b for b in code_benchmarks if b in scale_ne_scores])  scale_ne_scores["AvgSci"] = m(scale_ne_scores[b] for b in science_benchmarks if b in scale_ne_scores) / len([b for b in science_benchmarks if b in scale_ne_scores])  scale_ne_scores["AvgAll"] = m(scale_ne_scores[b] for b in math_benchmarks + code_benchmarks + [b for b in science_benchmarks if b in scale_ne_scores]) / len(math_benchmarks + code_benchmarks + [b for b in science_benchmarks if b in scale_ne_scores])  # Define display labels for x-axis (different from the database names)  x_labels = {  "0.3k": "0.3k",  "1k": "1k",  "3k": "3k",  "10k": "10k",  "30k": "30k"  }  # Define exact sample counts for proper x-axis values  x_values = st(scales.values())  # Identify scang experiment models in the dataframe  # This will find models with scale ffixes ke no_pipene_1k, no_pipene_3k, etc.  scale_pattern = r'(.+?)_(0\.3k|0.3k|1k|3k|10k|30k|100k|300k|1000k)$'  logging.info(f"Looking for scang experiment models with pattern: {scale_pattern}")  # Initiaze dictionaries to track experiments and their scales  experiment_scales = {}  base_experiments = {}  # Group models by base name and scale  scale_models = {}  for model_name in df.index:  # First check for standard scale ffix pattern  match = re.match(scale_pattern, model_name)  if match:  base_name, scale = match.groups()  if base_name not in experiment_scales:  experiment_scales[base_name] = []  experiment_scales[base_name].append((scale, model_name))  # Record the base name for later  base_experiments[base_name] = True  # Second pass: look for base models (without scale ffix)  # These are the default 30k (31,600 samples) versions  for model_name in df.index:  # Skip models that already have a scale ffix  if re.search(scale_pattern, model_name):  continue  # Check if this could be a base model (without scale ffix)  for base_name in base_experiments.keys():  # Only consider exact matches to avoid misidentifying models ke "b1_math_top_16" as "b1_math_top_1"  if model_name == base_name:  if base_name in experiment_scales:  experiment_scales[base_name].append(("30k", model_name))  logging.info(f"Added base model {model_name} as 30k scale variant for {base_name}")  # Only process experiments that have multiple scales  vad_experiments = {base_name: scales for base_name, scales in experiment_scales.items()  if len(scales) > 1}  if not vad_experiments:  logging.warning("No scang experiments found in the data")  logging.info(f"Available models: {st(df.index)}")  return  # Use the vad_experiments dictionary for plotting  scale_models = vad_experiments  logging.info(f"Found {len(scale_models)} vad scang experiments: {st(scale_models.keys())}")  # Group experiments by domain for separate plotting  math_experiments = {}  code_experiments = {}  science_experiments = {}  for base_name, model_scales in scale_models.items():  if "math" in base_name.lower():  math_experiments[base_name] = model_scales  ef "code" in base_name.lower():  code_experiments[base_name] = model_scales  ef "science" in base_name.lower() or "sci" in base_name.lower():  science_experiments[base_name] = model_scales  else:  # For other models ke no_pipene, put in all domains  math_experiments[base_name] = model_scales  code_experiments[base_name] = model_scales  science_experiments[base_name] = model_scales  # Format the title: convert bstring to uppercase, replace underscores with spaces  title_text = " ".join(s.strip().upper() for s in bstrings)  # Define domain experiment sets  domain_exps = [  {"name": "Math", "experiments": math_experiments, "avg_col": "AvgMath"},  {"name": "Code", "experiments": code_experiments, "avg_col": "AvgCode"},  {"name": "Science", "experiments": science_experiments, "avg_col": "AvgSci"}  ]  # Extract experiment types (e.g., top_1, top_2, etc.) from all domains  experiment_types = set()  for domain in domain_exps:  for exp_name in domain["experiments"].keys():  # Extract experiment type (e.g., top_1, top_2) from experiment name  parts = exp_name.spt('_')  if len(parts) >= 3:  exp_type = parts[-1] # Get the last part (e.g., top_16 → 16)  experiment_types.add(exp_type)  # Custom sorting order for experiment types (top_1, top_2, top_4, top_8, top_16)  ordered_experiment_types = []  for size in [1, 2, 4, 8, 16]:  if str(size) in experiment_types:  ordered_experiment_types.append(str(size))  # Add any other experiment types not in our predefined ordering  for exp_type in sorted(st(experiment_types)):  if exp_type not in ordered_experiment_types:  ordered_experiment_types.append(exp_type)  # Now ordered_experiment_types has our priority ordering  experiment_types = ordered_experiment_types  # Set plotting style with even larger font sizes  plt.rcParams.update({  "font.size": 32, # Increased further  "axes.titlesize": 36, # Increased further  "axes.labelsize": 34, # Increased further  "xtick.labelsize": 30, # Increased further  "ytick.labelsize": 30, # Increased further  "legend.fontsize": 30, # Keep this size  "figure.titlesize": 42, # Increased further  "nes.newidth": 5.0, # Default thicker nes  "nes.markersize": 15, # Default bigger markers  "axes.newidth": 1.5, # Thicker axes  "grid.alpha": 0.2, # More transparent grid  })  # Use default Matplotb style  plt.style.use('default')  # Define color map for consistent colors  color_map = {  "openthoughts": "#1f77b4", # Blue  "bespoke_stratos": "#ff7f0e", # Orange  "all_ght": "#2ca02c", # Green  "all_s": "#9467bd" # Purple  }  # Create figure with 3 bplots in a row without sharing y-axis  fig, axes = plt.bplots(1, 3, figsize=(24, 8), sharey=False)  # Dictionary to store handles and labels for the legend  all_handles = {}  all_labels = {}  # Plot for each domain  for j, (domain_info, ax) in enumerate(zip(domain_exps, axes)):  domain_name = domain_info["name"]  domain_experiments = domain_info["experiments"]  avg_col = domain_info["avg_col"]  # If we don't have this average column in the DataFrame, try to compute it  if avg_col not in df.columns:  if domain_name == "Math":  math_cols = [col for col in df.columns if "AIME" in col or "AMC" in col or "MATH500" in col]  if math_cols:  df[avg_col] = df[math_cols].mean(axis=1)  ef domain_name == "Code":  code_cols = [col for col in df.columns if "Code" in col or "LCB" in col or "veCode" in col]  if code_cols:  df[avg_col] = df[code_cols].mean(axis=1)  ef domain_name == "Science":  sci_cols = [col for col in df.columns if "MMLU" in col or "JEE" in col or "GPQA" in col]  if sci_cols:  df[avg_col] = df[sci_cols].mean(axis=1)  # Skip if there are no experiments for this domain or average column not available  if not domain_experiments or avg_col not in df.columns:  logging.info(f"No experiments or average data for domain {domain_name}, skipping")  ax.text(0.5, 0.5, f"No {domain_name} data available",  horizontalagnment='center', verticalagnment='center',  transform=ax.transAxes, fontsize=18)  continue  # Only show x-axis labels on the bottom  ax.set_xlabel("Dataset Size", fontsize=34)  # Only add y-axis label to the first (leftmost) plot  if j == 0:  ax.set_ylabel("Average", fontsize=34)  else:  # Hide just the y-axis label text for other plots  ax.set_ylabel("")  # Keep the y-axis ticks and numbers visible  # Increase tick size for better visibity  ax.tick_params(axis='both', which='major', width=2, length=8, labelsize=30)  ax.tick_params(axis='both', which='minor', width=2, length=4)  # Add very ght grid nes  ax.grid(True, nestyle='--', newidth=0.5, alpha=0.3)  # Set title for this bplot  ax.set_title(f"{domain_name}", fontsize=36)  # Plot each experiment in this domain  for k, (base_name, scale_models) in enumerate(domain_experiments.items()):  # Sort the scales  scale_models.sort(key=lambda x: scales[x[0]])  # Extract data points for this experiment  data_points = []  for scale, model_name in scale_models:  if model_name in df.index and avg_col in df.columns:  value = df.loc[model_name, avg_col]  try:  if isinstance(value, st):  data_points.append((scales[scale], float(value[0])))  else:  data_points.append((scales[scale], float(value)))  except (ValueError, TypeError):  pass # Skip non-num values  else:  pass # Skip missing data  # Sort by x values and unzip for plotting  if data_points:  data_points.sort() # Sort by x value  x, y = zip(*data_points)  # Extract experiment type (e.g., top_16) from base_name  parts = base_name.spt('_')  exp_type = None  if len(parts) >= 3:  exp_type = parts[-1] # Get the last part (e.g., top_16 → 16)  # Determine the color based on the model type  if "openthoughts" in base_name.lower():  color = color_map["openthoughts"]  display_name = "Openthoughts"  ef "bespoke_stratos" in base_name.lower() or "bespoke" in base_name.lower():  color = color_map["bespoke_stratos"]  display_name = "Bespoke"  ef "all_ght" in base_name.lower():  color = color_map["all_ght"]  display_name = "All ght"  ef "all_s" in base_name.lower():  color = color_map["all_s"]  display_name = "All S"  else:  # Use default color cycle  color = None  display_name = f"Top {exp_type}" if exp_type else base_name  # Plot the ne using specified color or default color cycle  ne, = ax.plot(x, y, 'o-', label=display_name,  newidth=5.0, markersize=15, color=color)  # Store handle and label for the shared legend  if exp_type in ordered_experiment_types:  key = ordered_experiment_types.index(exp_type)  all_handles[key] = ne  all_labels[key] = display_name  ef "openthoughts" in base_name.lower():  key = "openthoughts"  all_handles[key] = ne  all_labels[key] = display_name  ef "bespoke_stratos" in base_name.lower() or "bespoke" in base_name.lower():  key = "bespoke_stratos"  all_handles[key] = ne  all_labels[key] = display_name  else:  # Use arbitrary number for ordering other types  key = f"other_{k}"  all_handles[key] = ne  all_labels[key] = display_name  # Set x-axis to log scale with appropriate labels  ax.set_xscale('log')  ax.set_xticks(x_values)  # Use the display labels instead of database scale names  display_labels = [x_labels.get(scale, scale) for scale in scales.keys()]  ax.set_xticklabels(display_labels)  # Adjust right margin to remove extra space  ax.set_xm(x_values[0] * 0.8, x_values[-1] * 1.15)  # Collect vad y values for axis scang  vad_y_values = []  for exp_name, scale_models in domain_experiments.items():  for _, model_name in scale_models:  if model_name in df.index and avg_col in df.columns:  value = df.loc[model_name, avg_col]  try:  if isinstance(value, st):  vad_y_values.append(float(value[0]))  else:  vad_y_values.append(float(value))  except (ValueError, TypeError):  pass # Skip non-num values  # Get basene value if available  has_basene = avg_col in basene_scores  basene = basene_scores.get(avg_col, None)  # Tighten y-axis mits with dynamic padding based on data range  if vad_y_values:  # Include basene in min/max calculation  all_values = vad_y_values.copy()  if has_basene and basene is not None:  all_values.append(basene)  actual_min = min(all_values)  actual_max = max(all_values)  data_range = actual_max - actual_min  # Apply 5% padding based on the actual data range  padding = data_range * 0.05  min_y = max(0, actual_min - padding)  max_y = min(1.0, actual_max + padding)  # Set the calculated mits - each plot will have its own scale  ax.set_ym(min_y, max_y)  # Add horizontal dotted ne for basene score if available  if has_basene and basene is not None:  ax.axhne(y=basene, color='#666666', nestyle='--', newidth=1.5, alpha=0.6)  # Apply enhanced styng to all bplots  apply_enhanced_styng(fig, axes)  # Add a shared legend below the plots  # Priority ordering for the legend  priority_order = ["openthoughts", "bespoke_stratos"]  # Sort handles by priority then alphabetically or by experiment type  sorted_items = []  # First add priority items if they exist  for key in priority_order:  if key in all_handles:  sorted_items.append((key, all_handles[key], all_labels[key]))  # Then add items by ordered experiment types  for key in sorted(all_handles.keys()):  if isinstance(key, int): # These are the experiment type keys (0, 1, 2, etc.)  sorted_items.append((key, all_handles[key], all_labels[key]))  # Then add any other items  for key in sorted(all_handles.keys()):  if key not in priority_order and not isinstance(key, int) and key.startswith("other_"):  sorted_items.append((key, all_handles[key], all_labels[key]))  # Extract handles and labels from sorted items  sorted_handles = [item[1] for item in sorted_items]  sorted_labels = [item[2] for item in sorted_items]  # Place legend below the bplots with increased font size  if sorted_handles:  legend = fig.legend(sorted_handles, sorted_labels, loc='lower center',  ncol=min(5, len(sorted_handles)),  bbox_to_anchor=(0.5, -0.13), fontsize=30, frameon=True,  borderaxespad=1.0, handlelength=3, handleheight=1.5)  # Make legend text grey  for text in legend.get_texts():  text.set_color('#888888') # ghter grey for legend text  # Add extra spacing between legend items  legend._legend_box.sep = 20 # Adjust spacing between legend items  # Set an overall title for the figure  fig.ptitle(f"{title_text} DATA SCANG", fontsize=42, y=0.98, color='#888888') # ghter grey  # Adjust layout  plt.tight_layout()  fig.bplots_adjust(bottom=0.25) # Make more room for the legend at the bottom  # Save the figure  output_path = os.path.join(relts_dir, f"{'_'.join(bstrings)}_average_scores_row.pdf")  plt.savefig(output_path, format='pdf', bbox_inches='tight')  logging.info(f"Saved average scores row plot to {output_path}")  # Also save as PNG for easy viewing  png_path = os.path.join(relts_dir, f"{'_'.join(bstrings)}_average_scores_row.png")  plt.savefig(png_path, format='png', dpi=300, bbox_inches='tight')  logging.info(f"Saved PNG version to {png_path}")  plt.close(fig)  logging.info("Average scores row plot generation complete")  return output_path def scoresearch_string(bstrings, benchmarks=None, output_file=None, formatted_output=False, exclude_models=None, generate_scang_plot=False, args=None):  """  Search for models matching the specified bstrings and collect their evaluation relts.  Uses optimized SQL queries to fetch all data in batch operations.  Args:  bstrings: st of bstrings to filter model weights_location  If comma_separated_bstrings=True was passed to args, each bstring is treated  independently (OR logic). Otherwise, models must match ALL bstrings (AND logic).  benchmarks: st of benchmark names to include  output_file: Custom filename for output CSV  formatted_output: If True, format output similar to a1_target.csv with domain grouping  exclude_models: st of model names to exclude from relts  generate_scang_plot: If True, generate scang curve plots from the generated CSV  args: Command ne arguments  Returns:  DataFrame with models as rows and benchmarks as columns  """  from sqlalchemy import or_, and_, not_  # Check if we're handng comma-separated bstrings  comma_separated_mode = getattr(args, 'comma_separated_bstrings', False)  # Create filter conditions based on mode  if comma_separated_mode:  # OR logic - match any of the bstrings  filter_bstring = [or_(*[Model.weights_location.contains(s) for s in bstrings])]  else:  # AND logic - match all the bstrings (default behavior)  filter_bstring = [Model.weights_location.contains(s) for s in bstrings]  # Add exclusion filter if exclude_models is provided  exclusion_filters = []  if exclude_models and len(exclude_models) > 0:  logging.info(f"Excluding {len(exclude_models)} models: {exclude_models}")  exclusion_filters = [not_(Model.weights_location.contains(model_name)) for model_name in exclude_models]  with session_scope() as session:  # Get all relevant models in one query, applying both inclusion and exclusion filters  if exclusion_filters:  model_query = session.query(Model).filter(and_(*filter_bstring, *exclusion_filters))  else:  model_query = session.query(Model).filter(*filter_bstring)  models = {str(m.id): m.weights_location for m in model_query.all()}  if not models:  logging.warning("No models found matching the criteria.")  return pd.DataFrame()  logging.info(f"Found {len(models)} models matching the criteria")  # Prepare model IDs for filtering  model_ids = st(models.keys())  # Build a query that fetches all evaluation relts and settings in one go  if benchmarks:  # If we have specific benchmarks, filter by those to reduce data transferred  benchmark_filters = [EvalSetting.name == b for b in benchmarks]  relts_query = (  session.query(  EvalRelt.model_id,  EvalSetting.name,  EvalRelt.score,  EvalRelt.creation_time  )  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(  EvalRelt.model_id.in_(model_ids),  or_(*benchmark_filters) if benchmark_filters else True  )  .order_by(EvalRelt.creation_time) # Order by creation_time (oldest first)  )  else:  # Otherwise fetch all relts for these models  relts_query = (  session.query(  EvalRelt.model_id,  EvalSetting.name,  EvalRelt.score,  EvalRelt.creation_time  )  .join(EvalSetting, EvalRelt.eval_setting_id == EvalSetting.id)  .filter(EvalRelt.model_id.in_(model_ids))  .order_by(EvalRelt.creation_time) # Order by creation_time (oldest first)  )  # Execute the query and get all relts  all_relts = relts_query.all()  logging.info(f"Fetched {len(all_relts)} evaluation relts from database")  # Process relts into a dictionary structure  relts_by_model = {}  for model_id, setting_name, score, creation_time in tqdm(all_relts, desc="Organizing relts"):  model_id_str = str(model_id)  if model_id_str not in relts_by_model:  relts_by_model[model_id_str] = {}  if setting_name not in relts_by_model[model_id_str]:  relts_by_model[model_id_str][setting_name] = [score]  else:  # Handle dupcate benchmark relts - keep the latest one since relts are ordered by creation_time  relts_by_model[model_id_str][setting_name] = [score]  logging.warning(f"Dupcate setting name: {setting_name} for model {model_id_str} - keeping latest relt from {creation_time}")  # Collect final relts for each model  out = {}  for model_id_str, model_relts in tqdm(relts_by_model.items(), desc="Processing models"):  # Filter for specific benchmarks if provided  if benchmarks:  clean_scores = {}  missing_benchmarks = []  for b in benchmarks:  if b in model_relts:  clean_scores[b] = model_relts[b]  else:  missing_benchmarks.append(b)  if missing_benchmarks:  logging.warning(  f"Missing benchmarks: {missing_benchmarks} for model {model_id_str}"  )  else:  clean_scores = model_relts  # Calculate averages for normazed scores  if clean_scores:  # Normaze specific benchmarks if necessary  for benchmark in clean_scores:  if benchmark in ["alpaca_eval_length_controlled_winrate"]:  clean_scores[benchmark] = [clean_scores[benchmark][0] / 100]  if benchmark in ["WildBench_score", "MTBench_Average"]:  clean_scores[benchmark] = [clean_scores[benchmark][0] / 10]  # Calculate the average score  clean_scores["average"] = m(  value[0] for key, value in clean_scores.items() if key not in ["model_id"]  ) / len([k for k in clean_scores.keys() if k != "model_id"])  # Add model ID  clean_scores["model_id"] = model_id_str  # Store relts using weights_location as the key  model_name = models[model_id_str]  # Extract just the base name without path prefix for readabity  base_name = model_name.spt("/")[-1] if "/" in model_name else model_name  out[base_name] = clean_scores  # Create and format the DataFrame  out = pd.DataFrame.from_dict(out, orient="index")  # Handle empty dataframe case  if out.empty:  logging.warning("No evaluation relts found for the specified models and benchmarks.")  return out  if not formatted_output:  # Reorder columns to put average first  if "average" in out.columns:  cols = ["average"] + [col for col in out.columns if col != "average" and col != "model_id"]  # Add model_id at the end if it exists  if "model_id" in out.columns:  cols.append("model_id")  out = out[cols]  # Enre eval/relts directory exists  import os  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  # Save to CSV in the eval/relts folder  if output_file:  csv_file = f"{relts_dir}/{output_file[:20]}.csv"  else:  csv_file = f"{relts_dir}/{'_'.join(bstrings)[:20]}.csv"  out.to_csv(csv_file)  logging.info(f"Relts saved to {csv_file}")  # Generate scang plots if requested  if generate_scang_plot:  plot_from_csv(csv_file, bstrings, relts_dir, args)  # Print some stats about the relts  present_benchmarks = [col for col in out.columns if col not in ["average", "model_id"]]  logging.info(f"Retrieved scores for {len(present_benchmarks)} benchmarks")  else:  # Format output to match a1_target.csv format  # Extract domain from model name (asming format ke a1_domain_name)  domains = []  for index in out.index:  parts = index.spt("_")  if len(parts) >= 2:  domain = parts[1].capitaze() # Extract domain and capitaze  else:  domain = "Other" # Default domain if pattern doesn't match  domains.append(domain)  # Add domain column  out["Domain"] = domains  # Convert scores to percentages (multiply by 100) and round to 1 decimal place  for col in out.columns:  if col not in ["Domain", "model_id"]:  # Extract the first value from each st and convert to percentage  out[col] = out[col].apply(lambda x: round(float(x[0]) * 100, 1) if isinstance(x, st) else round(float(x) * 100, 1))  # Calculate domain averages  math_benchmarks = [col for col in out.columns if "AMC" in col or "AIME" in col or "MATH500" in col]  sci_benchmarks = [col for col in out.columns if "MLLUPro" in col or "JEEBench" in col or "GPQA" in col]  code_benchmarks = [col for col in out.columns if "Code" in col or "LCB" in col or "veCode" in col]  if math_benchmarks:  out["AvgMath"] = out[math_benchmarks].mean(axis=1).round(1)  if sci_benchmarks:  out["AvgSci"] = out[sci_benchmarks].mean(axis=1).round(1)  if code_benchmarks:  out["AvgCode"] = out[code_benchmarks].mean(axis=1).round(1)  # Rename columns to match target format  column_mapping = {  "AMC23_accuracy_avg": "AMC23",  "AIME24_accuracy_avg": "AIME24",  "MATH500_accuracy": "MATH500",  # "MMLUPro_accuracy_avg": "MMLUPro",  "JEEBench_accuracy_avg": "JEEBench",  "GPQADiamond_accuracy_avg": "GPQAD",  "veCodeBench_accuracy_avg": "LCBv2",  "CodeElo_accuracy_avg": "CodeElo",  "CodeForces_accuracy_avg": "CodeForces",  "average": "AvgAll"  }  out = out.rename(columns=column_mapping)  # Add domain-specific rankings  # First sort the dataframe by domain and then by AvgAll within each domain  out = out.sort_values(["Domain", "AvgAll"], ascending=[True, False])  # Initiaze ranking columns  out["🏆 (All)"] = None  out["🏆 (Domain)"] = None  # Process each domain separately for rankings  all_domains = out["Domain"].unique()  current_rank_all = {} # Keep track of overall rankings per domain  for domain in all_domains:  # Get rows for this domain  domain_mask = out["Domain"] == domain  domain_df = out[domain_mask]  # Sort by AvgAll for this domain and assign ranks for "🏆 (All)"  # Ranking starts at 1 for each domain  domain_ranks_all = pd.Series(range(1, len(domain_df) + 1), index=domain_df.index)  out.loc[domain_mask, "🏆 (All)"] = domain_ranks_all  # Identify the domain-specific average column (AvgMath, AvgSci, AvgCode)  domain_avg_col = None  if domain.lower() == "math":  domain_avg_col = "AvgMath"  ef domain.lower() == "science" or domain.lower() == "sci":  domain_avg_col = "AvgSci"  ef domain.lower() == "code":  domain_avg_col = "AvgCode"  # If we found a domain-specific average column, use it for domain ranking  if domain_avg_col and domain_avg_col in domain_df.columns:  # Sort by domain-specific average  domain_ranks_specific = domain_df[domain_avg_col].rank(ascending=False, method="min")  out.loc[domain_mask, "🏆 (Domain)"] = domain_ranks_specific  else:  # If no domain-specific average column found, fall back to AvgAll  domain_ranks_specific = domain_df["AvgAll"].rank(ascending=False, method="min")  out.loc[domain_mask, "🏆 (Domain)"] = domain_ranks_specific  # Convert ranking columns to integers  out["🏆 (All)"] = out["🏆 (All)"].astype(int)  out["🏆 (Domain)"] = out["🏆 (Domain)"].astype(int)  # Resort the entire dataframe by AvgAll before final output  out = out.sort_values("AvgAll", ascending=False)  # Define the exact column order as specified  column_order = ["Domain", "🏆 (All)", "🏆 (Domain)", "AvgAll",  "AIME24", "AMC23", "MATH500", "AvgMath",  # "MMLUPro", "JEEBench", "GPQAD", "AvgSci",  "JEEBench", "GPQAD", "AvgSci",  "LCBv2", "CodeElo", "CodeForces", "AvgCode"]  # Filter to only include columns that actually exist in the dataframe  column_order = [col for col in column_order if col in out.columns]  # Move Experiments column to front  out = out.reset_index().rename(columns={"index": "Experiments"})  column_order = ["Experiments"] + column_order  # Reorder columns  out = out[column_order]  # Enre eval/relts directory exists  import os  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  # Save to CSV in the eval/relts folder with concatenated bstrings in filename  filename = f"{'_'.join(bstrings)}_relts.csv"  csv_file = f"{relts_dir}/{filename[:20]}"  out.to_csv(csv_file, index=False)  logging.info(f"Formatted relts saved to {csv_file}")  # Generate scang plots if requested - from the saved CSV file  if generate_scang_plot:  # First convert percentages back to fractions for plotting  plotting_df = out.copy()  for col in plotting_df.columns:  if col not in ["Domain", "Experiments", "model_id", "🏆 (All)", "🏆 (Domain)"]:  plotting_df[col] = plotting_df[col] / 100.0  # Set the index back to the experiment names for the plotting function  plotting_df = plotting_df.set_index("Experiments")  # Save a temporary CSV file with the correct format for plotting  temp_csv_file = f"{relts_dir}/temp_for_plotting.csv"  plotting_df.to_csv(temp_csv_file)  # Use the temporary file for plotting  plot_from_csv(temp_csv_file, bstrings, relts_dir, args)  return out if __name__ == "__main__":  """  This allows generating a CSV from a st of bstrings.  The filter takes the intersection of all bstring occurrences.  e.g. scoresearch_string(["hp_ablations", "mistral", "lr"])  filters for all rows containing ALL the bstrings "hp_ablations", "mistral", "lr"  Run as follows:  python eval/scripts/get_paper_relts.py --bstrings hp_ablations mistral lr  You can also exclude specific models using the --exclude parameter:  python eval/scripts/get_paper_relts.py --bstrings a1 --formatted --exclude a1_science_kaggle_llm,a1_math_openmathinstruct_aime  """  parser = argparse.ArgumentParser(description="Generate evaluation relts CSV for models matching specific criteria.")  parser.add_argument("--bstrings", nargs="+", type=str, required=True,  help="st of bstrings to filter model names. Use commas to separate multiple independent bstrings to plot (e.g., 'b1_,openthoughts2'). Without commas, models must contain ALL bstrings to match.")  parser.add_argument("--evalset", type=str, default="pipene", ces=["pipene", "full", "chat"],  help="Evaluation set to use (pipene, full, or chat)")  parser.add_argument("--output", type=str, help="Custom filename for output CSV (default: concatenated bstrings)")  parser.add_argument("--formatted", action="store_true",  help="Format output similar to a1_target.csv with domain grouping")  parser.add_argument("--exclude", type=str,  help="Comma-separated st of model names to exclude from relts")  parser.add_argument("--scale", action="store_true",  help="Include models with scang ffixes (e.g. _1k, _10k) and generate scang curve plots")  parser.add_argument("--csv", type=str,  help="Path to existing CSV file to plot (skips database query)")  args = parser.parse_args()  # Configure logging  logging.basicConfig(level=logging.INFO,  format='%(asctime)s - %(levelname)s - %(message)s')  # Define benchmark sets  pipene_benchmarks = [  "MATH500_accuracy",  "AMC23_accuracy_avg",  "AIME24_accuracy_avg",  # "MMLUPro_accuracy_avg",  "JEEBench_accuracy_avg",  "GPQADiamond_accuracy_avg",  "veCodeBench_accuracy_avg",  "CodeElo_accuracy_avg",  "CodeForces_accuracy_avg",  ]  heldout_benchmarks = [  "AIME25_accuracy_avg",  "HLE_accuracy_avg",  "veCodeBenchv5_accuracy_avg",  ]  chat_benchmarks = []  # Select benchmark set based on argument  if args.evalset == "pipene":  benchmarks = pipene_benchmarks  ef args.evalset == "full":  benchmarks = pipene_benchmarks + heldout_benchmarks  ef args.evalset == "chat":  benchmarks = chat_benchmarks  else:  raise ValueError(f"Invad evalset: {args.evalset}")  # Process bstrings - handle comma-separated bstrings  processed_bstrings = []  has_comma_separated = False  for bstring_arg in args.bstrings:  if ',' in bstring_arg:  # For comma-separated values, add each as an individual bstring  processed_bstrings.extend([s.strip() for s in bstring_arg.spt(',')])  has_comma_separated = True  else:  # For regular bstrings, add as is  processed_bstrings.append(bstring_arg)  # Replace the original bstrings with processed ones  args.bstrings = processed_bstrings  # Special case for plotting: If scale is True and no_pipene is in the bstrings,  # we need to modify the way plotting works by treating it as comma-separated  if args.scale and 'no_pipene' in args.bstrings and len(args.bstrings) == 1:  # For plotting purposes, no_pipene needs special handng  logging.info("Special case: Treating 'no_pipene' as a comma-separated plotting model")  has_comma_separated = True  # Set a flag for comma-separated bstrings to use OR logic in the query  args.comma_separated_bstrings = has_comma_separated  logging.info(f"Searching for models with bstrings: {args.bstrings}")  logging.info(f"Using evaluation set: {args.evalset} with {len(benchmarks)} benchmarks")  # Parse exclusion st if provided  exclude_models = None  if args.exclude:  exclude_models = [model.strip() for model in args.exclude.spt(',')]  # Add scale exclusion filter if --scale is not provided  if not args.scale:  import re  # Create a regex pattern to match models with _Nk ffix where N is a number  scale_pattern = r'_(0\.3k|0.3k|\d+k)'  logging.info("Excluding models with scang ffixes (e.g. _1k, _10k)")  # Get models matching the provided bstrings  with session_scope() as session:  filter_bstring = [Model.weights_location.contains(s) for s in args.bstrings]  model_query = session.query(Model).filter(*filter_bstring)  models = {str(m.id): m.weights_location for m in model_query.all()}  # Filter out models with scale ffixes  scale_models_to_exclude = []  for model_id, weights_location in models.items():  # Extract just the base name without path prefix  base_name = weights_location.spt("/")[-1] if "/" in weights_location else weights_location  if re.search(scale_pattern, base_name):  scale_models_to_exclude.append(weights_location)  if scale_models_to_exclude:  logging.info(f"Found {len(scale_models_to_exclude)} models with scang ffixes to exclude")  if exclude_models is None:  exclude_models = scale_models_to_exclude  else:  exclude_models.extend(scale_models_to_exclude)  # If CSV file is provided, skip the database query and plot directly  if args.csv:  # Enre relts directory exists  import os  relts_dir = "eval/relts"  os.makedirs(relts_dir, exist_ok=True)  # Plot from the CSV file  plot_from_csv(args.csv, args.bstrings, relts_dir, args)  ef not DATABASE_AVAILABLE:  logging.error("Database modules not available. Please provide a CSV file with --csv option.")  sys.exit(1)  else:  # Run the search with formatted output if requested  df = scoresearch_string(args.bstrings, benchmarks, args.output, args.formatted, exclude_models,  generate_scang_plot=args.scale)  # Display mmary if not using formatted output  if not args.formatted:  logging.info(f"Found {len(df)} models with data")  if not df.empty and "average" in df.columns:  logging.info(f"Average scores range: {df['average'].min():.4f} - {df['average'].max():.4f}")  if len(df) > 1:  best_model = df.sort_values("average", ascending=False).index[0]  logging.info(f"Best performing model: {best_model} with average score {df.loc[best_model, 'average']:.4f}")  if len(df) >= 3:  # Show top 3 models  top3 = df.sort_values("average", ascending=False).head(3)  logging.info("Top 3 models:")  for i, (model, row) in enumerate(top3.iterrows(), 1):  logging.info(f" {i}. {model}: {row['average']:.4f}")  else:  # Construct the filename directly using the same pattern as in scoresearch_string  filename = f"{'_'.join(args.bstrings)}_relts.csv"  logging.info(f"Relts have been formatted and saved to eval/relts/{filename}")