import argparse
import logging
import os
import re
import io

import pandas as pd
import matplotlib.pyplot as plt

# Import database functions from db_utils
from eval.scripts.db_utils import (
    get_dataset_sizes_from_db,
    get_dataset_size_from_db,
    scoresearch_string,
)

# NOTE(review): several project-specific identifiers and string literals in
# this file look truncated (e.g. "BASENE", "PIPENE", "relts", "bstrings",
# "veCodeBench", "mo", and some help/error messages). They are kept exactly as
# found because other modules, the database and CSV headers may depend on
# them -- confirm against db_utils and the CSV schema before renaming.

# Reference model scores, stored as fractions; they are multiplied by 100
# when drawn as horizontal reference lines.
QWEN_25_INSTRUCT_7B_BASENE = {
    "AIME24": 0.15,
    "AMC23": 0.535,
    "MATH500": 0.706,
    "JEEBench": 0.336,
    "GPQAD": 0.237,
    "HMMT": 0.02,  # Updated to 2.0%
    "LCBv2": 0.33,
    "CodeElo": 0.051,
    "CodeForces": 0.099,
    "AIME25": 0.08,
    "HLE": 0.118,
    "veCodeBenchv5": 0.172,
}

DEEPSEEK_R1_DISTILL_QWEN_7B_BASENE = {
    "AIME24": 0.567,
    "AMC23": 0.882,
    "MATH500": 0.886,
    "JEEBench": 0.504,
    "GPQAD": 0.481,
    "HMMT": 0.25,  # Updated to 25.0%
    "LCBv2": 0.478,
    "CodeElo": 0.199,
    "CodeForces": 0.211,
    "AIME25": 0.39,
    "HLE": 0.095,
    "veCodeBenchv5": 0.351,
}

# Scale label -> sample count. Passed to group_models_by_scale(), which
# currently ignores it.
DEFAULT_SCALES = {
    "0.3k": 316,
    "1k": 1000,
    "3k": 3160,
    "10k": 10000,
    "30k": 31600,
    "100k": 100000,
    "300k": 316000,
    "1000k": 1000000,
    "3000k": 3160000,
}

# Scale label -> display label (not referenced elsewhere in this file).
X_AXIS_LABELS = {
    "0.3k": "0.3k",
    "1k": "1k",
    "3k": "3k",
    "10k": "10k",
    "30k": "30k",
    "100k": "100k",
    "300k": "300k",
    "1000k": "1M",
    "3000k": "3M",
}

# Standard benchmark metrics
PIPENE_BENCHMARKS = [
    "MATH500_accuracy",
    "AMC23_accuracy_avg",
    "AIME24_accuracy_avg",
    "JEEBench_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "HMMT_accuracy_avg",  # Added HMMT to the standard benchmarks
    "veCodeBench_accuracy_avg",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
]

HELDOUT_BENCHMARKS = [
    "AIME25_accuracy_avg",
    "HLE_accuracy_avg",
    "veCodeBenchv5_accuracy_avg",
]

# Used to group models on the same curve by identifying the base name:
# group 1 is everything before a trailing "_<scale>" suffix.
SCALE_FFIX_PATTERN = r'(.+?)_(0\.3k|0.3k|1k|3k|10k|30k|100k|300k|1000k|3000k)$'


def get_models_to_query(df, bstrings):
    """Return every model name in ``df.index`` as a list.

    Args:
        df: DataFrame indexed by model name.
        bstrings: Accepted for interface compatibility; not used.

    Returns:
        A list with one entry per row of ``df``.
    """
    return list(df.index)


def _has_usable_size(value):
    """Return True when ``value`` is a present, non-NaN, non-zero size."""
    if value is None:
        return False
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def group_models_by_scale(df, bstrings, dataset_sizes, scales):
    """Group models that belong on the same curve, keyed by base name.

    The dataset size of each model is taken from the ``DatasetSize`` column
    of ``df`` when that column holds a usable value, otherwise from
    ``dataset_sizes``. Models with no usable size are skipped with a warning.
    The base name is the model name with a trailing scale suffix (see
    ``SCALE_FFIX_PATTERN``) removed, or the full name if there is no suffix.

    Args:
        df: DataFrame indexed by model name.
        bstrings: Accepted for interface compatibility; not used.
        dataset_sizes: Mapping of model name -> dataset size (may be empty
            or None).
        scales: Accepted for interface compatibility; not used.

    Returns:
        Dict mapping base name -> list of ``(dataset_size, model_name)``
        tuples in ``df.index`` order.
    """
    dataset_sizes = dataset_sizes or {}
    scale_models = {}

    # Prefer the DatasetSize column from the CSV when it exists.
    csv_sizes = {}
    if "DatasetSize" in df.columns:
        csv_sizes = df["DatasetSize"].to_dict()
        logging.info("Using DatasetSize column from CSV")

    for model_name in df.index:
        # A NaN/empty CSV cell must not block the database fallback.
        dataset_size = csv_sizes.get(model_name)
        if not _has_usable_size(dataset_size):
            dataset_size = (
                dataset_sizes[model_name] if model_name in dataset_sizes else None
            )

        if not _has_usable_size(dataset_size):
            logging.warning(f"Skipping model {model_name} as dataset size could not be determined")
            continue

        match = re.match(SCALE_FFIX_PATTERN, model_name)
        if match:
            base_name = match.group(1)
            logging.info(f"Grouped model {model_name} under base name {base_name}")
        else:
            # No scale suffix: the model forms its own group.
            base_name = model_name

        # The real dataset size is the x-axis coordinate for this model.
        scale_models.setdefault(base_name, []).append((dataset_size, model_name))
        logging.info(f"Using actual dataset size for {model_name}: {dataset_size} samples")

    return scale_models


def create_model_color_mapping(models):
    """Assign a plot colour to every model group.

    Groups whose name matches a known family get that family's fixed colour;
    all other groups cycle through a list of extra colours in iteration order.

    Args:
        models: Mapping whose keys are model group names.

    Returns:
        ``(colors, display_names)`` where ``colors`` maps group name ->
        matplotlib colour name and ``display_names`` maps each group name to
        itself.
    """
    standard_colors = {
        "nemo_nano": "blue",
        "am": "green",
        "openthoughts3": "red",
        "s1": "purple",
        "mo": "orange",
    }

    # Colours for groups outside the known families. "ove" in the previous
    # revision is not a matplotlib colour name; "olive" is.
    extra_colors = ["brown", "pink", "gray", "cyan", "magenta", "olive", "teal", "navy", "coral", "gold"]

    colors = {}
    color_index = 0
    for model_group in models:
        if model_group in standard_colors:
            colors[model_group] = standard_colors[model_group]
        else:
            colors[model_group] = extra_colors[color_index % len(extra_colors)]
            color_index += 1
            logging.info(f"Assigned color {colors[model_group]} to new model group: {model_group}")

    # No display-name mapping: group names are shown as-is.
    display_names = {key: key for key in models}

    return colors, display_names


def _annotate_no_data(ax, metric, fontsize):
    """Mark ``ax`` as empty: centred message and fixed linear 0-100 axes."""
    ax.text(0.5, 0.5, f"No data available for {metric}",
            horizontalalignment='center',
            verticalalignment='center',
            transform=ax.transAxes,
            fontsize=fontsize,
            alpha=0.7)
    ax.set_xscale('linear')
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)


def _select_domains(args):
    """Return the benchmark columns to plot, honouring ``args.graphs``."""
    requested = getattr(args, "graphs", None) if args else None
    if requested:
        # Drop empty entries such as those produced by "a,,b" or a trailing comma.
        domains = [name.strip() for name in requested.split(',') if name.strip()]
        if domains:
            logging.info(f"Using custom benchmarks specified by --graphs: {domains}")
            return domains

    domains = ["AIME24", "LCBv2", "GPQAD", "AMC23", "MATH500", "JEEBench", "HMMT", "CodeElo", "CodeForces"]
    logging.info(f"Using default benchmarks: {domains}")
    return domains


def plot_all_benchmarks(df, bstrings, relts_dir, args=None):
    """Plot model accuracy against dataset size, one subplot per benchmark.

    Each model group (see ``group_models_by_scale``) becomes one line; the two
    reference models are drawn as dashed horizontal lines. Values in ``df``
    are multiplied by 100 for display. The figure is written as a PNG into
    ``relts_dir``.

    Args:
        df: DataFrame indexed by model name, one column per benchmark, and
            optionally a ``DatasetSize`` column.
        bstrings: Sequence of strings; used to build the default output
            file name.
        relts_dir: Directory the PNG is written to (must already exist).
        args: Optional namespace; ``args.graphs`` (comma-separated benchmark
            names) and ``args.output`` (output base name) are read if present.
    """
    logging.info("Generating all benchmark plots...")

    # Look up dataset sizes for every model in a single batch.
    models_to_query = get_models_to_query(df, bstrings)
    dataset_sizes = {}
    if models_to_query:
        logging.info(f"Querying dataset sizes for {len(models_to_query)} models in one batch...")
        dataset_sizes = get_dataset_sizes_from_db(models_to_query)
        logging.info(f"Retrieved {len(dataset_sizes)} model sizes from database")

    scale_models = group_models_by_scale(df, bstrings, dataset_sizes, DEFAULT_SCALES)

    # Style parameters
    FONTSIZE = 20
    MARKERSIZE = 10
    LINEWIDTH = 3
    SUBPLOT_WIDTH = 6  # Width per subplot, in inches

    plt.rcParams.update({
        "font.size": FONTSIZE,
        "axes.titlesize": FONTSIZE,
        "axes.labelsize": FONTSIZE,
        "xtick.labelsize": FONTSIZE,
        "ytick.labelsize": FONTSIZE,
        "legend.fontsize": FONTSIZE,
    })

    domains = _select_domains(args)
    titles = domains

    # One subplot per benchmark; total width is capped at 24 inches.
    fig_width = min(len(domains) * SUBPLOT_WIDTH, 24)
    fig_height = 6
    fig, axes = plt.subplots(1, len(domains), figsize=(fig_width, fig_height), sharey=True)

    # A single subplot comes back as a bare Axes; wrap it for uniform handling.
    if len(domains) == 1:
        axes = [axes]

    # Each base name is its own group, with its points sorted by dataset size.
    models = {}
    for base_name, model_list in scale_models.items():
        models[base_name] = sorted(model_list, key=lambda entry: entry[0])
        logging.info(f"Using exact model group name: {base_name}")

    colors, display_names = create_model_color_mapping(models)

    # Reference models; scores are kept apart from the line style so a metric
    # name can never collide with a style key.
    baselines = {
        "Qwen-2.5-7B-Instruct": {
            "scores": QWEN_25_INSTRUCT_7B_BASENE,
            "color": "black",
            "linestyle": "--",
        },
        "DeepSeek-R1-Distill-Qwen-7B": {
            "scores": DEEPSEEK_R1_DISTILL_QWEN_7B_BASENE,
            "color": "red",
            "linestyle": "--",
        },
    }

    for i, metric in enumerate(domains):
        ax = axes[i]
        ax.set_title(titles[i], fontsize=FONTSIZE)
        ax.set_xlabel("Dataset Size", fontsize=FONTSIZE)
        if i == 0:
            ax.set_ylabel("Accuracy (%)", fontsize=FONTSIZE)
        ax.grid(True, linestyle='--', alpha=0.25)

        # Nothing to draw when the metric is not a column of the dataframe.
        if metric not in df.columns:
            logging.warning(f"Metric '{metric}' not found in the dataframe. Skipping this plot.")
            _annotate_no_data(ax, metric, FONTSIZE - 4)
            continue

        data_plotted = False

        # One line per model group.
        for prefix, model_list in models.items():
            x_values = []
            y_values = []
            for size, model_name in model_list:
                if model_name not in df.index:
                    continue
                score = df.loc[model_name, metric]
                # Only keep points where both coordinates are present.
                if pd.notna(score) and pd.notna(size):
                    x_values.append(size)
                    y_values.append(score)

            if x_values:
                ax.plot(x_values, [z * 100 for z in y_values], 'o-',
                        label=prefix,  # The group's base name is the legend label
                        color=colors[prefix],
                        linewidth=LINEWIDTH,
                        markersize=MARKERSIZE)
                data_plotted = True

        # Horizontal reference lines for the baseline models.
        for baseline_name, baseline in baselines.items():
            if metric in baseline["scores"]:
                ax.axhline(y=baseline["scores"][metric] * 100,
                           color=baseline["color"],
                           linestyle=baseline["linestyle"],
                           linewidth=LINEWIDTH - 1,
                           alpha=0.5,
                           label=baseline_name)

        if data_plotted:
            # A log x-axis only makes sense once there are positive sizes.
            ax.set_xscale('log')
            ax.set_xticks([1000, 10000, 100000, 1000000])
            ax.set_xticklabels(['1K', '10K', '100K', '1M'], fontsize=FONTSIZE - 2)
        else:
            _annotate_no_data(ax, metric, FONTSIZE - 4)

        # The y-range is intentionally not set per subplot: the subplots are
        # created with sharey=True and share one range.

    # Build one combined legend; the first handle seen for a label wins.
    legend_entries = {}
    for ax in axes:
        handles, labels = ax.get_legend_handles_labels()
        for handle, label in zip(handles, labels):
            legend_entries.setdefault(label, handle)
    all_labels = list(legend_entries)
    all_handles = list(legend_entries.values())

    plt.tight_layout()
    fig.subplots_adjust(wspace=0.1, bottom=0.2)

    if len(domains) <= 2:
        # For 1-2 benchmarks, place the legend at the bottom centre.
        fig.legend(all_handles, all_labels, loc='lower center',
                   bbox_to_anchor=(0.5, -0.1), ncol=max(1, len(all_handles)),
                   fontsize=FONTSIZE - 2, frameon=False, fancybox=True, framealpha=0.7)
    else:
        # For 3+ benchmarks, anchor the legend near the lower left.
        fig.legend(all_handles, all_labels, loc=(0.1, 0.03), ncol=max(1, min(3, len(all_handles))),
                   fontsize=FONTSIZE - 2, frameon=False, fancybox=True, framealpha=0.7)

    # Custom output name if provided, otherwise derived from bstrings.
    custom_output = getattr(args, "output", None) if args else None
    if custom_output:
        # splitext only strips a trailing extension, so "./name" stays intact.
        base_filename = os.path.splitext(custom_output)[0]
    else:
        base_filename = f"{'_'.join(bstrings)}_all_benchmarks"

    png_filename = f"{base_filename}.png"
    fig.savefig(f"{relts_dir}/{png_filename}", dpi=300, bbox_inches='tight')
    logging.info(f"Plot saved to {relts_dir}/{png_filename}")

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def main():
    """Parse the command line, obtain the results CSV and plot it."""
    parser = argparse.ArgumentParser(description="Generate all benchmark plots for models matching specific criteria.")
    parser.add_argument("--bstrings", nargs="+", type=str, required=True,
                        help="st of bstrings to filter model names. Use commas to separate multiple independent bstrings to plot.")
    parser.add_argument("--evalset", type=str, default="pipene", choices=["pipene", "full", "chat"],
                        help="Evaluation set to use (pipene, full, or chat)")
    parser.add_argument("--output", type=str, help="Custom basename for all output files (PNG/PDF) without extension. Defaults to concatenated bstrings")
    parser.add_argument("--exclude", type=str,
                        help="Comma-separated st of model names to exclude from relts")
    parser.add_argument("--csv", type=str,
                        help="Path to existing CSV file to plot (skips database query)")
    parser.add_argument("--graphs", type=str,
                        help="Comma-separated st of benchmarks to display (e.g. 'AIME24,HMMT,LCBv2')")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    # Select the benchmark set.
    # NOTE(review): "chat" is an accepted choice but has no benchmark list
    # here, so it still ends in the ValueError below.
    if args.evalset == "pipene":
        benchmarks = PIPENE_BENCHMARKS
    elif args.evalset == "full":
        benchmarks = PIPENE_BENCHMARKS + HELDOUT_BENCHMARKS
    else:
        raise ValueError(f"Invad evalset: {args.evalset}")

    # Expand comma-separated values into individual entries.
    processed_bstrings = []
    has_comma_separated = False
    for bstring_arg in args.bstrings:
        if ',' in bstring_arg:
            processed_bstrings.extend([s.strip() for s in bstring_arg.split(',')])
            has_comma_separated = True
        else:
            processed_bstrings.append(bstring_arg)

    args.bstrings = processed_bstrings
    # Flag read downstream via args (presumably to switch the query to OR
    # logic -- confirm in db_utils).
    args.comma_separated_bstrings = has_comma_separated

    logging.info(f"Searching for models with bstrings: {args.bstrings}")
    logging.info(f"Using evaluation set: {args.evalset} with {len(benchmarks)} benchmarks")

    exclude_models = None
    if args.exclude:
        exclude_models = [model.strip() for model in args.exclude.split(',')]

    if args.csv:
        # An existing CSV skips the database query entirely.
        logging.info(f"Using provided CSV file: {args.csv} instead of querying database")
        csv_file = args.csv
    else:
        _, csv_file = scoresearch_string(args.bstrings, benchmarks, args.output, exclude_models, args=args)

    relts_dir = "eval/relts"
    os.makedirs(relts_dir, exist_ok=True)

    logging.info(f"Generating all benchmarks plots from CSV file: {csv_file}")
    df = pd.read_csv(csv_file)
    df = df.set_index("Experiments")

    # Scores in the CSV are divided by 100 here and multiplied back when
    # plotted. Metadata and non-numeric columns must be left untouched.
    for col in df.columns:
        if col in ("Domain", "DatasetSize", "model_id"):
            continue
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = df[col] / 100.0

    plot_all_benchmarks(df, args.bstrings, relts_dir, args)


if __name__ == "__main__":
    main()