import argparse
import logging
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Import your database utilities
from database.models import EvalRelt, EvalSetting, Model
from database.utils import session_scope

# NOTE(review): this file reached review with its line structure collapsed and
# with short letter runs missing from many identifiers. Tokens that Python,
# matplotlib or numpy would reject (e.g. `elif`, `list`, `linewidth`,
# `subplots`) have been repaired. Project-level names, CLI flags, import paths
# and every runtime string (log messages, column-name fragments, file names)
# are kept exactly as received -- confirm their spelling against the repository
# before relying on them.

# Grey used for axes, tick labels, titles and legend text.
_GREY = '#888888'

# Dataset-size suffixes and the sample count each one stands for.
_SCALE_SAMPLES = {
    "1k": 1000,
    "3k": 3160,
    "10k": 10000,
    "30k": 31600,  # This is the base size
    "100k": 100000,
}

# Matches model names carrying a scale suffix (e.g., base_1k, base_30k).
_SCALE_PATTERN = re.compile(r'(.+?)_(1k|3k|10k|30k|100k)')

# Fixed colors for the experiment families that are recognised by name.
_FAMILY_COLORS = {
    "openthoughts": "#1f77b4",  # Blue
    "bespoke_stratos": "#ff7f0e",  # Orange
    "all_ght": "#2ca02c",  # Green
    "all_s": "#9467bd",  # Purple
}

# Fallback palette for experiments that match none of the known families.
_FALLBACK_PALETTE = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
    '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
]


def apply_enhanced_styng(fig, axes):
    """
    Restyle every axes in *axes* in place.

    - Top and right spines are hidden; left and bottom spines, ticks, axis
      labels and titles are turned grey.
    - A faint dashed grid is enabled.
    - Dashed lines (the baseline markers) become thin, semi-transparent grey.
    - All other lines become thick with large, white-filled circle markers.

    Args:
        fig: The parent figure. It is not used here; the figure title is
            coloured by the caller.
        axes: Iterable of matplotlib axes to restyle.
    """
    for ax in axes:
        # Remove top and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Make left and bottom spines grey and thicker
        for side in ('left', 'bottom'):
            ax.spines[side].set_color(_GREY)
            ax.spines[side].set_linewidth(1.5)

        # Make ticks and labels grey
        ax.tick_params(axis='both', colors=_GREY, width=1.5, length=6)
        ax.xaxis.label.set_color(_GREY)
        ax.yaxis.label.set_color(_GREY)
        ax.title.set_color(_GREY)

        # Very faint grid
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.2, color='#cccccc')

        for line in ax.get_lines():
            # Dashed lines are the baseline markers: keep them subdued.
            if line.get_linestyle() == '--':
                line.set_color(_GREY)
                line.set_alpha(0.5)
                line.set_linewidth(2.0)
                continue

            # Data series: thick line, large marker with a white centre.
            line.set_linewidth(5.0)
            line.set_marker('o')
            line.set_markersize(15)
            line.set_markerfacecolor('white')
            line.set_markeredgewidth(4.5)


def _numeric_frame(model_data):
    """Return a float-only copy of *model_data*; the input is left untouched.

    List-valued cells are reduced to their first element. Columns that cannot
    be converted to float are dropped and logged.
    """
    working = model_data.copy()
    kept = []
    for col in working.columns:
        series = working[col]
        try:
            if series.apply(lambda x: isinstance(x, list)).any():
                series = series.apply(
                    lambda x: float(x[0]) if isinstance(x, list) and len(x) > 0 else x
                )
            # Keep the converted values so later means operate on real floats.
            working[col] = series.astype(float)
        except (TypeError, ValueError):
            logging.info(f"Skipping non-num column: {col}")
        else:
            kept.append(col)
    return working[kept].copy()


def _lookup_score(df, model_name, column):
    """Return df.loc[model_name, column] as a float, or None if missing, non-scalar or NaN."""
    if model_name not in df.index or column not in df.columns:
        return None
    try:
        value = float(df.loc[model_name, column])
    except (TypeError, ValueError):
        return None
    # NaN would break line drawing and is rejected by set_ylim, so treat it as missing.
    return None if np.isnan(value) else value


def _group_by_experiment(model_names):
    """Group model names by experiment base name; keep experiments with more than one variant.

    Returns a dict mapping base name -> list of (scale, model_name) pairs.
    A model whose name equals a base name exactly is added as its "30k" variant.
    """
    groups = {}
    unscaled = []
    for model_name in model_names:
        match = _SCALE_PATTERN.search(model_name)
        if match:
            base_name, scale = match.groups()
            groups.setdefault(base_name, []).append((scale, model_name))
        else:
            unscaled.append(model_name)

    # Only exact name matches count as base models, to avoid misidentifying models.
    for model_name in unscaled:
        if model_name in groups:
            groups[model_name].append(("30k", model_name))
            logging.info(f"Added base model {model_name} as 30k scale variant for {model_name}")

    return {base: variants for base, variants in groups.items() if len(variants) > 1}


def _assign_styles(base_names):
    """Choose (color, display_name, legend_key) once per experiment.

    Doing this once, before plotting, keeps an experiment's colour identical in
    every panel and in the shared legend.
    """
    taken = set(_FAMILY_COLORS.values())
    styles = {}
    for position, base_name in enumerate(base_names):
        lowered = base_name.lower()
        if "openthoughts" in lowered:
            styles[base_name] = (_FAMILY_COLORS["openthoughts"], "Openthoughts", "openthoughts")
        elif "bespoke" in lowered:  # also covers "bespoke_stratos"
            styles[base_name] = (_FAMILY_COLORS["bespoke_stratos"], "Bespoke", "bespoke_stratos")
        elif "all_ght" in lowered:
            styles[base_name] = (_FAMILY_COLORS["all_ght"], "All ght", f"other_{position}")
        elif "all_s" in lowered:
            styles[base_name] = (_FAMILY_COLORS["all_s"], "All S", f"other_{position}")
        else:
            free = [c for c in _FALLBACK_PALETTE if c not in taken]
            if free:
                color = free[0]
            else:
                # Palette exhausted: fall back to a random hex color.
                color = '#%06x' % np.random.randint(0, 0xFFFFFF)
            taken.add(color)
            display_name = base_name.replace("_", " ").title()
            styles[base_name] = (color, display_name, f"other_{position}")
    return styles


def generate_scang_curves(model_data, bstrings, relts_dir='./relts'):
    """
    Draw one row of three panels (Math, Code, Science) showing the average
    score of each experiment against dataset size, and save it as PDF and PNG.

    Args:
        model_data: DataFrame indexed by model name with one column per
            benchmark metric. It is not modified.
        bstrings: Sequence of strings used for the figure title and the
            output file name.
        relts_dir: Directory the plots are written to (created if missing).

    Returns:
        Path of the saved PDF, or None when no experiment has more than one
        scale variant.
    """
    # Create output directory if it doesn't exist
    os.makedirs(relts_dir, exist_ok=True)

    df = _numeric_frame(model_data)
    logging.info("Generating average score row plot...")

    # Per-domain averages over the benchmark columns whose name contains a marker.
    benchmark_cols = list(df.columns)
    domain_markers = {
        "AvgMath": ("AIME", "AMC", "MATH500"),
        "AvgCode": ("Code", "LCB", "veCode"),
        "AvgSci": ("MMLU", "JEE", "GPQA"),
    }
    for avg_col, markers in domain_markers.items():
        cols = [col for col in benchmark_cols if any(m in col for m in markers)]
        if cols:
            df[avg_col] = df[cols].mean(axis=1)

    scales = _SCALE_SAMPLES

    # Baseline scores drawn as a dashed horizontal line in each panel.
    basene_scores = {
        "AvgMath": 0.463,  # Average of AIME24, AMC23, MATH500
        "AvgCode": 0.16,  # Average of veCodeBench, CodeElo, CodeForces
        "AvgSci": 0.334,  # Average of MMLUPro, JEEBench, GPQA
    }

    # Exact sample counts are the x positions; the scale names are the tick labels.
    x_values = list(scales.values())
    display_labels = list(scales.keys())

    valid_experiments = _group_by_experiment(df.index)
    if not valid_experiments:
        logging.warning("No scang experiments found in the data")
        return

    styles = _assign_styles(valid_experiments)

    # Title: each entry stripped and upper-cased, joined with spaces.
    title_text = " ".join(s.strip().upper() for s in bstrings)

    # Every experiment is drawn in every domain panel.
    domain_exps = [
        {"name": "Math", "avg_col": "AvgMath"},
        {"name": "Code", "avg_col": "AvgCode"},
        {"name": "Science", "avg_col": "AvgSci"},
    ]

    # Reset to the default style FIRST: style.use('default') rewrites rcParams,
    # so calling it after the update below would discard these settings.
    plt.style.use('default')
    plt.rcParams.update({
        "font.size": 32,
        "axes.titlesize": 36,
        "axes.labelsize": 34,
        "xtick.labelsize": 30,
        "ytick.labelsize": 30,
        "legend.fontsize": 30,
        "figure.titlesize": 42,
        "lines.linewidth": 5.0,
        "lines.markersize": 15,
        "axes.linewidth": 1.5,
        "grid.alpha": 0.2,
    })

    # Three panels in a row, each with its own y-axis scale.
    fig, axes = plt.subplots(1, 3, figsize=(24, 8), sharey=False)

    # Handles and labels for the shared legend, keyed for ordering.
    all_handles = {}
    all_labels = {}

    for j, (domain_info, ax) in enumerate(zip(domain_exps, axes)):
        domain_name = domain_info["name"]
        avg_col = domain_info["avg_col"]

        if avg_col not in df.columns:
            logging.info(f"No average data for domain {domain_name}, skipping")
            ax.text(0.5, 0.5, f"No {domain_name} data available",
                    horizontalalignment='center', verticalalignment='center',
                    transform=ax.transAxes, fontsize=18)
            continue

        ax.set_xlabel("Dataset Size", fontsize=34)
        # Only the leftmost panel carries the y-axis label; ticks stay visible everywhere.
        ax.set_ylabel("Average" if j == 0 else "", fontsize=34)

        ax.tick_params(axis='both', which='major', width=2, length=8, labelsize=30)
        ax.tick_params(axis='both', which='minor', width=2, length=4)
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.3, color='#cccccc')
        ax.set_title(f"{domain_name}", fontsize=36)

        # Legend entries come from the first panel that actually draws data.
        collect_legend = not all_handles
        y_values = []

        for base_name, scale_models in valid_experiments.items():
            points = []
            for scale, model_name in scale_models:
                score = _lookup_score(df, model_name, avg_col)
                if score is not None:
                    points.append((scales[scale], score))
            if not points:
                continue

            points.sort()  # Sort by x value
            xs, ys = zip(*points)
            y_values.extend(ys)

            color, display_name, legend_key = styles[base_name]
            line, = ax.plot(xs, ys, 'o-', label=display_name,
                            linewidth=5.0, markersize=15, color=color)

            if collect_legend:
                all_handles[legend_key] = line
                all_labels[legend_key] = display_name

        # Log-scaled x axis with one labelled tick per dataset size.
        ax.set_xscale('log')
        ax.set_xticks(x_values)
        ax.set_xticklabels(display_labels)
        ax.set_xlim(x_values[0] * 0.8, x_values[-1] * 1.15)

        baseline = basene_scores.get(avg_col)

        if y_values:
            # Include the baseline so its line always falls inside the limits.
            candidates = list(y_values)
            if baseline is not None:
                candidates.append(baseline)
            actual_min = min(candidates)
            actual_max = max(candidates)

            # 5% padding; a small constant keeps the limits distinct when all values coincide.
            padding = (actual_max - actual_min) * 0.05 or 0.01
            min_y = max(0, actual_min - padding)
            max_y = actual_max + padding
            # Cap at 1.0 only for fractional scores; capping larger values would invert the axis.
            if actual_max <= 1.0:
                max_y = min(1.0, max_y)
            ax.set_ylim(min_y, max_y)

        if baseline is not None:
            ax.axhline(y=baseline, color='#666666', linestyle='--', linewidth=1.5, alpha=0.6)

    apply_enhanced_styng(fig, axes)

    # Shared legend: priority families first, then the remaining keys in sorted order.
    priority_order = ["openthoughts", "bespoke_stratos"]
    ordered_keys = [key for key in priority_order if key in all_handles]
    ordered_keys += [key for key in sorted(all_handles) if key.startswith("other_")]
    sorted_handles = [all_handles[key] for key in ordered_keys]
    sorted_labels = [all_labels[key] for key in ordered_keys]

    if sorted_handles:
        legend = fig.legend(sorted_handles, sorted_labels, loc='lower center',
                            ncol=min(5, len(sorted_handles)), bbox_to_anchor=(0.5, -0.13),
                            fontsize=30, frameon=True, borderaxespad=1.0,
                            handlelength=3, handleheight=1.5)
        for text in legend.get_texts():
            text.set_color(_GREY)

    fig.suptitle(f"{title_text} DATA SCANG", fontsize=42, y=0.98, color=_GREY)

    plt.tight_layout()
    fig.subplots_adjust(bottom=0.25)  # Make more room for the legend at the bottom

    output_path = os.path.join(relts_dir, f"{'_'.join(bstrings)}_scang_curves.pdf")
    plt.savefig(output_path, format='pdf', bbox_inches='tight')
    logging.info(f"Saved scang curves plot to {output_path}")

    # Also save as PNG for easy viewing
    png_path = os.path.join(relts_dir, f"{'_'.join(bstrings)}_scang_curves.png")
    plt.savefig(png_path, format='png', dpi=300, bbox_inches='tight')
    logging.info(f"Saved PNG version to {png_path}")

    plt.close(fig)
    logging.info("Scang curves plot generation complete")
    return output_path


def main():
    """
    Command-line entry point: fetch scores for the requested models through
    `scoresearch_string` and plot them with `generate_scang_curves`.
    """
    parser = argparse.ArgumentParser(description="Generate scang curves using bstring filters")
    parser.add_argument("--bstrings", nargs="+", type=str, required=True,
                        help="st of bstrings to filter model names (AND condition)")
    parser.add_argument("--benchmarks", nargs="+", type=str, default=None,
                        help="Specific benchmarks to include (default: all pipene benchmarks)")
    parser.add_argument("--output_dir", type=str, default="eval/relts",
                        help="Directory to save relts")
    parser.add_argument("--exclude", type=str, default=None,
                        help="Comma-separated st of model names to exclude")
    parser.add_argument("--output_file", type=str, default=None,
                        help="Custom filename for output CSV (default: bstrings joined with underscore)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    exclude_models = args.exclude.split(',') if args.exclude else None

    # Default benchmark set when none is given on the command line.
    if args.benchmarks is None:
        benchmarks = [
            "MATH500_accuracy",
            "AMC23_accuracy_avg",
            "AIME24_accuracy_avg",
            "JEEBench_accuracy_avg",
            "GPQADiamond_accuracy_avg",
            "veCodeBench_accuracy_avg",
            "CodeElo_accuracy_avg",
            "CodeForces_accuracy_avg",
        ]  # MMLUPro removed as requested
    else:
        benchmarks = args.benchmarks

    logging.info(f"Searching for models matching bstrings: {args.bstrings}")

    # Imported here, as in the original, so the module loads without this dependency.
    from eval.scripts.get_paper_relts import scoresearch_string

    df = scoresearch_string(
        args.bstrings,
        benchmarks=benchmarks,
        output_file=args.output_file,
        exclude_models=exclude_models,
        generate_scang_plot=False  # Skip the original plotting
    )

    # NOTE(review): scoresearch_string's return contract is not visible here;
    # the None guard is defensive -- confirm whether it can return None.
    if df is None or df.empty:
        logging.error("No data found for the specified bstrings.")
        return

    output_path = generate_scang_curves(df, args.bstrings, args.output_dir)
    # generate_scang_curves returns None (after logging a warning) when nothing was plotted.
    if output_path is not None:
        logging.info(f"Generated scang curves: {output_path}")


if __name__ == "__main__":
    main()