"""Plot accuracy-vs-dataset-size curves from an evaluation results CSV.

The script either queries the results database (via ``eval.scripts.db_utils``)
or reads an existing CSV, groups models that differ only by a scale suffix
onto one curve, and draws one subplot per metric.

NOTE(review): several project identifiers and runtime strings in this file
(e.g. ``BASENE``, ``PIPENE``, ``relts``, ``bstrings``, ``veCodeBench``) look
truncated.  They are kept exactly as found because callers, CSV column names
and ``db_utils`` may depend on the spelling -- confirm before renaming.
"""

import argparse
import logging
import math
import os
import re

import matplotlib.pyplot as plt
import pandas as pd

# Import database functions from new module
# NOTE(review): `scoresearch_string` is kept as spelled in this file; verify
# that it matches the name exported by eval.scripts.db_utils.
from eval.scripts.db_utils import (
    get_dataset_sizes_from_db,
    get_dataset_size_from_db,
    scoresearch_string,
)

# Reference scores drawn as horizontal lines when the plotted metric name is
# one of these keys.
QWEN_25_INSTRUCT_7B_BASENE = {
    "AIME24": 0.15,
    "AMC23": 0.535,
    "MATH500": 0.706,
    "JEEBench": 0.336,
    "GPQAD": 0.237,
    "LCBv2": 0.33,
    "CodeElo": 0.051,
    "CodeForces": 0.099,
    "AIME25": 0.08,
    "HLE": 0.118,
    "veCodeBenchv5": 0.172,
}

DEEPSEEK_R1_DISTILL_QWEN_7B_BASENE = {
    "AIME24": 0.567,
    "AMC23": 0.882,
    "MATH500": 0.886,
    "JEEBench": 0.504,
    "GPQAD": 0.481,
    "LCBv2": 0.478,
    "CodeElo": 0.199,
    "CodeForces": 0.211,
    "AIME25": 0.39,
    "HLE": 0.095,
    "veCodeBenchv5": 0.351,
}

DEFAULT_SCALES = {
    "0.3k": 316,
    "1k": 1000,
    "3k": 3160,
    "10k": 10000,
    "30k": 31600,
    "100k": 100000,
    "300k": 316000,
    "1000k": 1000000,
    "3000k": 3160000,
}

X_AXIS_LABELS = {
    "0.3k": "0.3k",
    "1k": "1k",
    "3k": "3k",
    "10k": "10k",
    "30k": "30k",
    "100k": "100k",
    "300k": "300k",
    "1000k": "1M",
    "3000k": "3M",
}

PIPENE_BENCHMARKS = [
    "MATH500_accuracy",
    "AMC23_accuracy_avg",
    "AIME24_accuracy_avg",
    "JEEBench_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "veCodeBench_accuracy_avg",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
]

HELDOUT_BENCHMARKS = [
    "AIME25_accuracy_avg",
    "HLE_accuracy_avg",
    "veCodeBenchv5_accuracy_avg",
]

# Used to group models on the same curve by identifying base name
SCALE_FFIX_PATTERN = r'(.+?)_(0\.3k|0.3k|1k|3k|10k|30k|100k|300k|1000k|3000k)$'

# Domain markers looked for inside model names.
_DOMAIN_MARKERS = ('_math', '_code', '_science')

# Labels of the reference (baseline) curves; they are never renamed in legends.
_BASELINE_LABELS = ("Qwen-2.5-7B-Instruct", "DeepSeek-R1-Distill-Qwen-7B")


def _as_finite_float(value):
    """Return ``float(value)``, or None if it is not convertible or not finite."""
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    # NaN/inf would poison min()/max() and are rejected by Axes.set_ylim.
    return number if math.isfinite(number) else None


def _valid_dataset_size(value):
    """Return *value* if it is a usable dataset size, else None.

    None, NaN and falsy values (0, empty string) count as "unknown".
    """
    if value is None:
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # pd.isna gave a result without a single truth value (array-like input).
        return None
    return value if value else None


def get_models_to_query(df, bstrings):
    """Return every model name in *df* so their dataset sizes can be queried.

    Args:
        df: DataFrame with models as rows and benchmarks as columns
        bstrings: substrings used to filter models (currently unused here)

    Returns:
        List of model names (the DataFrame index) to query
    """
    return list(df.index)


def group_models_by_scale(df, bstrings, dataset_sizes, scales):
    """Group models that differ only by a scale suffix onto one curve.

    The dataset size is taken from the ``DatasetSize`` CSV column when that
    column holds a usable value for the model, otherwise from *dataset_sizes*.
    Models for which neither source yields a size are skipped with a warning.

    Args:
        df: DataFrame with models as rows and benchmarks as columns
        bstrings: substrings used to filter models (unused here)
        dataset_sizes: Dict mapping model names to dataset sizes
        scales: Dict mapping scale names to sample counts (unused here)

    Returns:
        Dict mapping base model names to a list of
        (dataset_size, model_name) tuples
    """
    csv_sizes = {}
    if "DatasetSize" in df.columns:
        csv_sizes = df["DatasetSize"].to_dict()
        logging.info("Using DatasetSize column from CSV")
    db_sizes = dataset_sizes or {}

    scale_models = {}
    for model_name in df.index:
        # CSV first; a missing/NaN CSV entry falls through to the database.
        dataset_size = _valid_dataset_size(csv_sizes.get(model_name))
        if dataset_size is None:
            dataset_size = _valid_dataset_size(db_sizes.get(model_name))
        if dataset_size is None:
            logging.warning(f"Skipping model {model_name} as dataset size could not be determined")
            continue

        # Strip a trailing scale suffix (e.g. "_10k") to get the curve name.
        match = re.match(SCALE_FFIX_PATTERN, model_name)
        if match:
            base_name = match.group(1)
            logging.info(f"Grouped model {model_name} under base name {base_name}")
        else:
            base_name = model_name

        # The actual dataset size is the x-axis coordinate of this point.
        scale_models.setdefault(base_name, []).append((dataset_size, model_name))
        logging.info(f"Using actual dataset size for {model_name}: {dataset_size} samples")

    return scale_models


def group_experiments_by_domain(scale_models):
    """Split experiments into math / code / science groups by name.

    A base name containing "math", "code" or "sci" (checked in that order,
    case-insensitively) goes to that domain only; any other name is placed
    in all three domains.

    Args:
        scale_models: Dict mapping base model names to a list of
            (dataset_size, model_name) tuples

    Returns:
        Tuple of (math_experiments, code_experiments, science_experiments) dicts
    """
    math_experiments = {}
    code_experiments = {}
    science_experiments = {}

    for base_name, model_scales in scale_models.items():
        lowered = base_name.lower()
        if "math" in lowered:
            targets = (math_experiments,)
        elif "code" in lowered:
            targets = (code_experiments,)
        elif "sci" in lowered:  # also covers "science"
            targets = (science_experiments,)
        else:
            # Models without a domain marker appear in every domain.
            targets = (math_experiments, code_experiments, science_experiments)
        for target in targets:
            target[base_name] = model_scales

    return math_experiments, code_experiments, science_experiments


def _build_metric_row(metrics, experiments):
    """Create a one-row figure with one subplot per metric, all showing *experiments*."""
    num_metrics = len(metrics)
    fig, axes = plt.subplots(1, num_metrics, figsize=(6 * num_metrics, 6))
    # With a single subplot, plt.subplots returns a bare Axes; wrap it.
    if num_metrics == 1:
        axes = [axes]
    domain_exps = [{"name": metric, "experiments": dict(experiments)} for metric in metrics]
    domain_to_avg = dict(enumerate(metrics))
    return fig, axes, domain_exps, domain_to_avg


def determine_metrics_and_experiments(df, args, math_experiments, code_experiments, science_experiments):
    """Decide which metrics to plot and create the figure for them.

    With ``args.graphs`` set, the requested metrics that exist in *df* are
    plotted, each with all experiments.  If none exist, the default average
    metrics present in *df* are used instead.  Without ``args.graphs`` a fixed
    three-panel Math / Code / Science figure is created.

    Args:
        df: DataFrame with models as rows and benchmarks as columns
        args: Command line arguments (may be None)
        math_experiments: Dict of math experiments
        code_experiments: Dict of code experiments
        science_experiments: Dict of science experiments

    Returns:
        Tuple of (fig, axes, domain_exps, domain_to_avg); all four are None
        when custom metrics were requested but nothing can be plotted.
    """
    avg_benchmarks = ["AvgMath", "AvgCode", "AvgSci"]

    if not (args and args.graphs):
        # Default behavior: one column per domain with its average metric.
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        domain_exps = [
            {"name": "Math", "experiments": math_experiments},
            {"name": "Code", "experiments": code_experiments},
            {"name": "Science", "experiments": science_experiments},
        ]
        domain_to_avg = dict(enumerate(avg_benchmarks))
        return fig, axes, domain_exps, domain_to_avg

    requested = [metric.strip() for metric in args.graphs.split(',')]
    available_metrics = []
    for metric in requested:
        if metric in df.columns:
            available_metrics.append(metric)
        else:
            logging.warning(f"Requested metric '{metric}' not found in data. Available metrics: {list(df.columns)}")

    if not available_metrics:
        logging.error("None of the requested metrics exist in the data. Falng back to default metrics.")
        available_metrics = [m for m in avg_benchmarks if m in df.columns]
        if not available_metrics:
            logging.error("No default metrics available. Cannot generate plots.")
            return None, None, None, None

    # Custom metrics are not tied to a domain, so every panel gets everything.
    all_experiments = math_experiments | code_experiments | science_experiments
    return _build_metric_row(available_metrics, all_experiments)


def create_model_color_mapping(domain_exps):
    """Create a consistent color mapping for all models across all charts.

    Models whose name contains a domain marker and starts with a token like
    ``a1`` / ``b2`` share one color per leading token; every other model is
    its own color family.

    Args:
        domain_exps: List of domain experiment dictionaries

    Returns:
        Tuple of (all_models, model_colors, model_display_names,
        family_display_names, model_name_mapping)
    """
    all_models = set()
    for domain in domain_exps:
        all_models.update(domain["experiments"].keys())

    # Group models into color families keyed by their root pattern.
    model_families = {}
    for model in all_models:
        root_name = model
        if any(marker in model for marker in _DOMAIN_MARKERS):
            first_token = model.split('_')[0]
            if re.match(r'^[a-z]\d+$', first_token):  # e.g. a1, b2
                root_name = first_token
        model_families.setdefault(root_name, []).append(model)

    # Pick the first qualitative colormap with enough distinct colors; if none
    # is large enough, use the largest one rather than whichever came last.
    candidates = [
        plt.get_cmap(name)
        for name in ('tab10', 'tab20', 'Set1', 'Set2', 'Set3', 'Dark2', 'Paired')
    ]
    largest = max(candidates, key=lambda c: c.N)
    cmap = next((c for c in candidates if c.N >= len(model_families)), largest)

    # Families are colored in sorted-key order so the assignment is stable.
    model_colors = {}
    for i, (_, family_models) in enumerate(sorted(model_families.items())):
        family_color = cmap(i % cmap.N)
        for model in family_models:
            model_colors[model] = family_color

    logging.info(f"Created color mapping for {len(model_colors)} models in {len(model_families)} famies")

    # Legend display names for known model roots.
    model_name_mapping = {
        "nemo_nano": "Nemotron Nano",
        "openthoughts3": "OpenThoughts3",
        "openthoughts2": "OpenThoughts2",
        "am": "AM",
        "mo": "MO",
        "s1": "s1.1",
    }

    model_display_names = {}   # model -> itself, or -> its family key
    family_display_names = {}  # family key -> legend text

    for model in all_models:
        for marker in _DOMAIN_MARKERS:
            if not model.endswith(marker):
                continue
            base_name = model[:-len(marker)]
            family_key = f"{base_name}_domain_specific"
            if family_key not in family_display_names:
                base_display = model_name_mapping.get(base_name, base_name)
                family_display_names[family_key] = f"{base_display} (domain-specific)"
            model_display_names[model] = family_key
            break
        else:
            # No domain suffix: the model stands for itself.
            model_display_names[model] = model

    return all_models, model_colors, model_display_names, family_display_names, model_name_mapping


def plot_domain_experiments(axes, domain_exps, domain_to_avg, model_colors, df, df_num,
                            bstrings, scales, basene_models, plotted_models=None):
    """Plot the experiments of each domain on its own axes.

    Args:
        axes: Sequence of matplotlib axes, one per entry of *domain_exps*
        domain_exps: List of domain experiment dictionaries
        domain_to_avg: Dict mapping domain indices to metric names
        model_colors: Dict mapping model names to colors
        df: DataFrame with models as rows and benchmarks as columns
        df_num: DataFrame restricted to the plottable columns
        bstrings: substrings used to filter models; a curve is drawn only if
            one of them equals the base name or one of its '_'-separated parts
        scales: Dict mapping scale names to sample counts (unused here)
        basene_models: Dict mapping model names to dicts of benchmark scores
        plotted_models: Set of base names to skip (optional)

    Returns:
        The *plotted_models* set.  NOTE(review): nothing is added to it here,
        so it is returned as passed in (or empty) -- confirm that is intended.
    """
    if plotted_models is None:
        plotted_models = set()

    basene_colors = {
        "Qwen-2.5-7B-Instruct": "black",
        "DeepSeek-R1-Distill-Qwen-7B": "red",
    }

    for j, domain in enumerate(domain_exps):
        ax = axes[j]
        benchmark = domain_to_avg[j]
        benchmark_available = benchmark in df_num.columns

        ax.set_xlabel("Dataset Size", fontsize=26)
        ax.set_ylabel("Accuracy", fontsize=26)
        ax.set_title(benchmark, fontsize=28)
        ax.grid(True, linestyle='--', alpha=0.25)

        metric_name = benchmark.lower()
        is_math_metric = "math" in metric_name
        is_code_metric = "code" in metric_name
        is_science_metric = "sci" in metric_name

        for base_name, scale_models in domain["experiments"].items():
            if base_name in plotted_models:
                continue

            # Domain-specific models are only drawn on a metric of their domain.
            model_name_lower = base_name.lower()
            if (("math" in model_name_lower and not is_math_metric)
                    or ("code" in model_name_lower and not is_code_metric)
                    or ("sci" in model_name_lower and not is_science_metric)):
                continue

            name_parts = base_name.split('_')
            if not any(b == base_name or b in name_parts for b in bstrings):
                logging.info(f"Skipping model {base_name} as it doesn't match any bstring pattern")
                continue
            logging.info(f"Including model {base_name} for plotting")

            # Collect (dataset_size, score) points; missing scores are skipped.
            data_points = []
            if benchmark_available:
                for dataset_size, model_name in scale_models:
                    if model_name not in df.index:
                        continue
                    score = _as_finite_float(df.loc[model_name, benchmark])
                    if score is not None:
                        data_points.append((dataset_size, score))

            if data_points:
                # Sort a local list only; the caller's lists stay untouched.
                data_points.sort()
                x, y = zip(*data_points)
                ax.plot(x, y, 'o-', label=base_name,
                        color=model_colors.get(base_name), linewidth=4, markersize=12,
                        solid_capstyle='round', solid_joinstyle='round')

        ax.set_xscale('log')
        ax.set_xticks([1000, 10000, 100000, 1000000])
        ax.set_xticklabels(['1K', '10K', '100K', '1M'])

        # Baseline scores available for this metric.
        basene_values = {
            model_name: scores[benchmark]
            for model_name, scores in basene_models.items()
            if benchmark in scores
        }

        # y-limits span every model's score for this metric (all panels'
        # experiments, regardless of the bstrings filter) plus the baselines.
        all_values = []
        if benchmark_available:
            for exp_dict in domain_exps:
                for exp_models in exp_dict["experiments"].values():
                    for _, model in exp_models:
                        if model not in df.index:
                            continue
                        score = _as_finite_float(df.loc[model, benchmark])
                        if score is not None:
                            all_values.append(score)
        all_values.extend(basene_values.values())

        if all_values:
            min_y = max(0, min(all_values) - 0.05)
            max_y = min(1.0, max(all_values) + 0.05)
            ax.set_ylim(min_y, max_y)

        # Baselines as dashed horizontal lines, labelled for the legend.
        for model_name, basene_value in basene_values.items():
            ax.axhline(y=basene_value, color=basene_colors.get(model_name, "black"),
                       linestyle='--', linewidth=1.5, alpha=0.6, label=model_name)

        # Legend inside the subplot, entries ordered alphabetically by label.
        handles, labels = ax.get_legend_handles_labels()
        if handles:
            ordered = sorted(zip(handles, labels), key=lambda pair: pair[1])
            handles, labels = zip(*ordered)
            ax.legend(
                handles, labels,
                loc='lower right',
                fontsize=10,
                frameon=True,
                fancybox=True,
                framealpha=0.7,
            )

    return plotted_models


def create_separate_legends(fig, axes, all_models, model_display_names, family_display_names, model_name_mapping):
    """Place a legend below each of (at most) the first three subplots.

    Args:
        fig: matplotlib figure (unused here)
        axes: Sequence of matplotlib axes
        all_models: Set of all model names (unused here)
        model_display_names: Dict mapping model names to display names or family keys
        family_display_names: Dict mapping family keys to display names
        model_name_mapping: Dict mapping model names to legend display names

    Returns:
        List of (handles, labels) tuples, one per subplot that got a legend
    """
    legend_entries = []

    for j in range(min(3, len(axes))):
        ax = axes[j]
        handles, labels = ax.get_legend_handles_labels()

        display_handles = []
        display_labels = []
        for handle, label in zip(handles, labels):
            if label in _BASELINE_LABELS or label not in model_display_names:
                # Baselines and unknown labels keep their original text.
                shown = label
            else:
                family_key = model_display_names[label]
                if family_key.endswith("_domain_specific"):
                    # Fall back to the raw label if the family is unregistered.
                    shown = family_display_names.get(family_key, label)
                else:
                    shown = model_name_mapping.get(label, label)
            display_handles.append(handle)
            display_labels.append(shown)

        if not display_handles:
            continue

        ax.legend(
            display_handles, display_labels,
            loc='upper center',
            bbox_to_anchor=(0.5, -0.15),  # below the subplot
            ncol=min(3, len(display_handles)),
            fontsize=14,
            frameon=True,
            fancybox=True,
            framealpha=0.7,
        )
        legend_entries.append((display_handles, display_labels))

        domain_name = f"Domain {j}"
        print(f"\nModels in {domain_name} legend:")
        for label in display_labels:
            print(f"- {label}")

    return legend_entries


def save_plots(fig, bstrings, relts_dir, args=None):
    """Adjust the layout of *fig* and write it to *relts_dir*.

    A PNG is always written; a PDF is written as well when ``args.pdf`` is set.

    Args:
        fig: matplotlib figure
        bstrings: substrings used to filter models (used in the default filename)
        relts_dir: Directory to save plots
        args: Command line arguments (optional)
    """
    # Global default for artists created after this point; curves already
    # drawn with an explicit markersize are unaffected.
    plt.rcParams['lines.markersize'] = 50
    fig.tight_layout(rect=[0.04, 0.04, 0.98, 0.96])
    fig.subplots_adjust(wspace=0.25)

    if args and args.output:
        # Drop only a trailing extension; dots elsewhere in the path are kept.
        base_filename = os.path.splitext(args.output)[0]
    elif args and args.graphs:
        # Include the metrics in the name, capped to avoid very long filenames.
        metrics_part = f"_{args.graphs.replace(',', '_')}"[:30]
        base_filename = f"{'_'.join(bstrings)}_scang{metrics_part}"
    else:
        base_filename = f"{'_'.join(bstrings)}_scang_curves"

    png_filename = f"{base_filename}.png"
    fig.savefig(f"{relts_dir}/{png_filename}", dpi=300, bbox_inches='tight')
    logging.info(f"Scang curves saved to {relts_dir}/{png_filename}")

    if args and args.pdf:
        pdf_filename = f"{base_filename}.pdf"
        fig.savefig(f"{relts_dir}/{pdf_filename}", format='pdf', bbox_inches='tight')
        logging.info(f"PDF version saved to {relts_dir}/{pdf_filename}")


def plot_scang_curves(df, bstrings, relts_dir, args=None):
    """Generate and save curves of model performance vs. dataset size.

    Args:
        df: DataFrame with models as rows and benchmarks as columns
        bstrings: substrings used to filter models
        relts_dir: Directory to save the generated plots
        args: Command line arguments (optional)

    Notes:
        Dataset sizes for all models are requested from the database in a
        single ``get_dataset_sizes_from_db`` call.
    """
    logging.info("Generating scang curve plots...")

    # Everything except the metadata columns is treated as plottable.
    num_columns = [x for x in df.columns if x not in ["Domain", "model_id", "DatasetSize"]]
    df_num = df[num_columns]

    models_to_query = get_models_to_query(df, bstrings)
    dataset_sizes = {}
    if models_to_query:
        logging.info(f"Querying dataset sizes for {len(models_to_query)} models in one batch...")
        dataset_sizes = get_dataset_sizes_from_db(models_to_query)
        logging.info(f"Retrieved {len(dataset_sizes)} model sizes from database")

    scale_models = group_models_by_scale(df, bstrings, dataset_sizes, DEFAULT_SCALES)

    plt.rcParams.update({
        "font.size": 16,
        "axes.titlesize": 20,
        "axes.labelsize": 18,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "legend.fontsize": 14,
        "figure.titlesize": 24,
    })

    math_experiments, code_experiments, science_experiments = group_experiments_by_domain(scale_models)

    fig, axes, domain_exps, domain_to_avg = determine_metrics_and_experiments(
        df, args, math_experiments, code_experiments, science_experiments
    )
    if fig is None:
        return  # nothing plottable

    _, model_colors, _, _, _ = create_model_color_mapping(domain_exps)

    all_basene_models = {
        "Qwen-2.5-7B-Instruct": QWEN_25_INSTRUCT_7B_BASENE,
        "DeepSeek-R1-Distill-Qwen-7B": DEEPSEEK_R1_DISTILL_QWEN_7B_BASENE,
    }

    # Legends are added inside each subplot by plot_domain_experiments.
    plot_domain_experiments(
        axes, domain_exps, domain_to_avg, model_colors, df, df_num,
        bstrings, DEFAULT_SCALES, all_basene_models
    )

    save_plots(fig, bstrings, relts_dir, args)


def main():
    """Command-line entry point: build (or load) the results CSV and plot it.

    The filter takes the intersection of all given substrings, e.g.
    scoresearch_string(["hp_ablations", "mistral", "lr"]) keeps rows that
    contain ALL of "hp_ablations", "mistral" and "lr".

    Run as follows:
        python eval/scripts/get_paper_relts.py --bstrings hp_ablations mistral lr

    Exclude specific models with --exclude:
        python eval/scripts/get_paper_relts.py --bstrings a1 --exclude a1_science_kaggle_llm,a1_math_openmathinstruct_aime

    Plot specific metrics with --graphs:
        python eval/scripts/fig_1_plots.py --bstrings no_pipene --scale --graphs AvgAll,AvgCode,AvgSci
        python eval/scripts/fig_1_plots.py --bstrings no_pipene --scale --graphs AIME24
        python eval/scripts/fig_1_plots.py --bstrings no_pipene --scale --graphs AIME25,GPQAD

    Set a custom basename for all output files with --output:
        python eval/scripts/fig_1_plots.py --bstrings no_pipene --scale --output my_relts

    Add --pdf to also write a PDF:
        python eval/scripts/fig_1_plots.py --bstrings no_pipene --scale --output my_relts --pdf

    NOTE(review): the examples pass --scale, but this parser defines no such
    option -- confirm whether the flag was dropped or the examples are stale.
    """
    parser = argparse.ArgumentParser(description="Generate evaluation relts CSV for models matching specific criteria.")
    parser.add_argument("--bstrings", nargs="+", type=str, required=True,
                        help="st of bstrings to filter model names. Use commas to separate multiple independent bstrings to plot (e.g., 'b1_,openthoughts2'). Without commas, models must contain ALL bstrings to match.")
    parser.add_argument("--evalset", type=str, default="pipene", choices=["pipene", "full", "chat"],
                        help="Evaluation set to use (pipene, full, or chat)")
    parser.add_argument("--output", type=str, help="Custom basename for all output files (CSV/PNG) without extension. Defaults to concatenated bstrings")
    parser.add_argument("--pdf", action="store_true", help="Also output PDF version of plots")
    parser.add_argument("--exclude", type=str,
                        help="Comma-separated st of model names to exclude from relts")
    parser.add_argument("--csv", type=str,
                        help="Path to existing CSV file to plot (skips database query)")
    parser.add_argument("--graphs", type=str,
                        help="Comma-separated st of metrics to plot horizontally (e.g. 'AvgAll,AvgCode,AvgSci' or 'AIME24' or 'AIME25,GPQAD')")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    # NOTE(review): "chat" is an accepted choice but has no benchmark set here,
    # so it ends in the ValueError below -- confirm the intended behavior.
    if args.evalset == "pipene":
        benchmarks = PIPENE_BENCHMARKS
    elif args.evalset == "full":
        benchmarks = PIPENE_BENCHMARKS + HELDOUT_BENCHMARKS
    else:
        raise ValueError(f"Invad evalset: {args.evalset}")

    # Expand comma-separated arguments into individual substrings.
    processed_bstrings = []
    has_comma_separated = False
    for bstring_arg in args.bstrings:
        if ',' in bstring_arg:
            processed_bstrings.extend(s.strip() for s in bstring_arg.split(','))
            has_comma_separated = True
        else:
            processed_bstrings.append(bstring_arg)
    args.bstrings = processed_bstrings
    # Flag read downstream (args is handed to scoresearch_string) to switch
    # comma-separated substrings to OR logic.
    args.comma_separated_bstrings = has_comma_separated

    logging.info(f"Searching for models with bstrings: {args.bstrings}")
    logging.info(f"Using evaluation set: {args.evalset} with {len(benchmarks)} benchmarks")

    exclude_models = None
    if args.exclude:
        exclude_models = [model.strip() for model in args.exclude.split(',')]

    if args.csv:
        logging.info(f"Using provided CSV file: {args.csv} instead of querying database")
        csv_file = args.csv
    else:
        # Get records from database and write to CSV
        _, csv_file = scoresearch_string(args.bstrings, benchmarks, args.output, exclude_models, args=args)

    relts_dir = "eval/relts"
    os.makedirs(relts_dir, exist_ok=True)

    logging.info(f"Generating scang curves from CSV file: {csv_file}")
    df = pd.read_csv(csv_file)
    df = df.set_index("Experiments")

    # Scores are divided by 100 before plotting (the y-axis is clamped to
    # [0, 1]).  Metadata and non-numeric columns are left alone.
    for col in df.columns:
        if col in ("Domain", "DatasetSize", "model_id"):
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col] / 100.0

    plot_scang_curves(df, args.bstrings, relts_dir, args)


if __name__ == "__main__":
    main()