"""Generate bar charts comparing model performance across benchmark metrics."""

import argparse
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

# Import database functions from db_utils.
# NOTE(review): several project-specific identifiers and string literals in
# this file look truncated (e.g. "scoresearch_string", "PIPENE", "BASENE",
# "bstrings", "relts", "veCodeBench"). They are preserved byte-for-byte
# because other modules, CLI users, and stored data may depend on the exact
# spelling -- confirm against db_utils and the database before renaming.
from eval.scripts.db_utils import scoresearch_string

# Reference scores for the Qwen2.5-7B-Instruct model, keyed by benchmark.
QWEN_25_INSTRUCT_7B_BASENE = {
    "AIME24": 0.15,
    "AMC23": 0.535,
    "MATH500": 0.706,
    "JEEBench": 0.336,
    "GPQAD": 0.237,
    "LCBv2": 0.33,
    "CodeElo": 0.051,
    "CodeForces": 0.099,
    "AIME25": 0.08,
    "HLE": 0.118,
    "veCodeBenchv5": 0.172,
}

# Default benchmark sets for evaluation.
PIPENE_BENCHMARKS = [
    "MATH500_accuracy",
    "AMC23_accuracy_avg",
    "AIME24_accuracy_avg",
    "JEEBench_accuracy_avg",
    "GPQADiamond_accuracy_avg",
    "veCodeBench_accuracy_avg",
    "CodeElo_accuracy_avg",
    "CodeForces_accuracy_avg",
]

HELDOUT_BENCHMARKS = [
    "AIME25_accuracy_avg",
    "HLE_accuracy_avg",
    "veCodeBenchv5_accuracy_avg",
]

# Columns that are never plotted, even if their values happen to be numeric.
_NON_BENCHMARK_COLUMNS = ("Domain", "model_id", "DatasetSize", "🏆 (All)", "🏆 (Domain)")

# Hatch patterns cycled within a model family so same-colored bars stay
# distinguishable.
_HATCHES = ['', '//', '\\\\', 'xx', '++', '**', 'oo', 'OO', '...', '---']

# Colors cycled over models that match no known family.
_OTHER_COLORS = [
    '#1f77b4', '#9467bd', '#8c564b', '#e377c2',
    '#bcbd22', '#17becf', '#006ba4', '#ababab',
    '#595959', '#5f9ed1', '#c85200', '#898989',
    '#a2c8ec', '#ffbc79', '#cfcfcf',
]

_DEFAULT_COLOR = "#7f7f7f"


def _numeric_fraction_frame(df):
    """Return the plottable columns of *df* as floats divided by 100.

    Columns listed in ``_NON_BENCHMARK_COLUMNS`` and columns that cannot be
    converted to float are skipped. The input frame is not modified.
    """
    numeric_columns = []
    for col in df.columns:
        if col in _NON_BENCHMARK_COLUMNS:
            logging.info(f"Skipping non-benchmark column: {col}")
            continue
        try:
            df[col].astype(float)
        except (ValueError, TypeError):
            logging.info(f"Skipping non-num column: {col}")
        else:
            numeric_columns.append(col)

    # Convert only after the numeric check: dividing a text column by 100
    # would raise before it could be skipped.
    return df[numeric_columns].astype(float) / 100.0


def _choose_metrics(df_num, args):
    """Return the list of column names to plot, in plotting order.

    With ``args.graphs`` set, the comma-separated names are used (names absent
    from *df_num* are dropped with a warning). Otherwise the average metrics
    are used if any exist, falling back to the individual benchmarks.
    """
    requested = getattr(args, "graphs", None) if args else None
    if requested:
        selected = []
        for metric in (part.strip() for part in requested.split(',')):
            if metric in df_num.columns:
                selected.append(metric)
            else:
                logging.warning(
                    f"Requested metric '{metric}' not found in data. "
                    f"Available metrics: {list(df_num.columns)}"
                )
        return selected

    default_avg_metrics = ["AvgMath", "AvgCode", "AvgSci", "AvgAll"]
    default_individual_metrics = [
        "AIME24", "AMC23", "MATH500",
        "JEEBench", "GPQAD",
        "LCBv2", "CodeElo", "CodeForces",
    ]
    selected = [m for m in default_avg_metrics if m in df_num.columns]
    if not selected:
        # Individual benchmarks are only used when no averages are present.
        selected = [m for m in default_individual_metrics if m in df_num.columns]
    return selected


def _build_model_styles(model_names):
    """Map each model name to a ``{'color', 'hatch'}`` dict based on its family.

    Family membership is decided by substring match on the model name, checked
    in this order: Qwen reference model, NVIDIA, OpenThoughts/Bespoke, OpenR1,
    and finally "other".
    """
    nvidia_patterns = ["Nemotron"]
    openthoughts_bespoke_patterns = ["OpenThinker", "Bespoke"]
    openr1_patterns = ["OpenR1", "OlympicCoder"]

    nvidia_models = []
    openthoughts_bespoke_models = []
    openr1_models = []
    qwen_models = []
    other_models = []

    logging.info(f"Categorizing models: {model_names}")
    for model_name in model_names:
        # str() so that non-string index labels do not break the `in` tests.
        text = str(model_name)
        if "Qwen2.5-7B-Instruct" in text:
            qwen_models.append(model_name)
            logging.info(f"Assigned {model_name} to Qwen basene (gray)")
        elif any(pattern in text for pattern in nvidia_patterns):
            nvidia_models.append(model_name)
            logging.info(f"Assigned {model_name} to NVIDIA family (green)")
        elif any(pattern in text for pattern in openthoughts_bespoke_patterns):
            openthoughts_bespoke_models.append(model_name)
            logging.info(f"Assigned {model_name} to OpenThoughts/Bespoke family (red)")
        elif any(pattern in text for pattern in openr1_patterns):
            openr1_models.append(model_name)
            logging.info(f"Assigned {model_name} to OpenR1 family (orange)")
        else:
            other_models.append(model_name)
            logging.info(f"Assigned {model_name} to Other family")

    styles = {}
    # One fixed color per known family; hatches cycle within the family.
    for family_color, members in (
        ('#2ca02c', nvidia_models),                # green
        ('#d62728', openthoughts_bespoke_models),  # red
        ('#ff7f0e', openr1_models),                # orange
    ):
        for position, model in enumerate(members):
            styles[model] = {
                'color': family_color,
                'hatch': _HATCHES[position % len(_HATCHES)],
            }

    for model in qwen_models:
        styles[model] = {'color': _DEFAULT_COLOR, 'hatch': ''}

    # Unrecognized models get distinct colors instead of hatches.
    for position, model in enumerate(other_models):
        styles[model] = {
            'color': _OTHER_COLORS[position % len(_OTHER_COLORS)],
            'hatch': '',
        }
    return styles


def _short_label(model_name):
    """Return a compact x-axis label: the text after the last '/' or else '_'."""
    text = str(model_name)
    if "/" in text:
        # Last path component of HF-style "org/model" names.
        return text.split("/")[-1]
    if "_" in text:
        return text.split("_")[-1]
    return text


def plot_bar_charts(df, bstrings, relts_dir, args=None):
    """Generate bar charts from a DataFrame comparing model performance across metrics.

    One subplot is drawn per metric, with models sorted by descending score.
    The figure is written as a PNG (and as a PDF when ``args.pdf`` is set).
    The input DataFrame is not modified.

    Args:
        df: DataFrame with models as rows and benchmarks as columns. An
            "Experiments" column, if present, becomes the index. Benchmark
            values are divided by 100 before plotting.
        bstrings: Strings joined with '_' to build the default output
            basename when ``args.output`` is not given.
        relts_dir: Directory to save the generated plots.
        args: Optional namespace; the attributes ``graphs``, ``output`` and
            ``pdf`` are read when present.
    """
    logging.info("Generating bar charts...")

    if "Experiments" in df.columns:
        df = df.set_index("Experiments")

    df_num = _numeric_fraction_frame(df)
    logging.info(f"Found {len(df_num.columns)} metrics for plotting")

    metrics_to_plot = _choose_metrics(df_num, args)
    if not metrics_to_plot:
        logging.error("No metrics available for plotting. Exiting.")
        return
    logging.info(f"Plotting the following metrics: {metrics_to_plot}")

    plt.rcParams.update({
        "font.size": 14,
        "axes.titlesize": 16,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "legend.fontsize": 12,
        "figure.titlesize": 18,
    })

    model_styles = _build_model_styles(list(df_num.index))
    fallback_style = {'color': _DEFAULT_COLOR, 'hatch': ''}

    # Layout: a single row for up to four metrics, otherwise a grid that is
    # four subplots wide.
    num_metrics = len(metrics_to_plot)
    if num_metrics > 4:
        num_cols = 4
        num_rows = (num_metrics + 3) // 4
        figsize = (8 * num_cols, 5 * num_rows)
    else:
        num_cols = num_metrics
        num_rows = 1
        figsize = (min(32, 8 * num_metrics), 6)

    # squeeze=False always yields a 2-D array, so one code path covers the
    # single-subplot, single-row and grid cases.
    fig, axes_grid = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
    axes = axes_grid.ravel()
    for unused_ax in axes[num_metrics:]:
        unused_ax.set_visible(False)

    bar_width = 0.8
    for ax, metric in zip(axes, metrics_to_plot):
        sorted_df = df_num.sort_values(metric, ascending=False)
        x_positions = np.arange(len(sorted_df.index))

        # Bars are drawn one at a time so each can carry its own color/hatch.
        for x_pos, model_name, value in zip(x_positions, sorted_df.index, sorted_df[metric]):
            style = model_styles.get(model_name, fallback_style)
            ax.bar(x_pos, value, width=bar_width, color=style['color'],
                   edgecolor='black', linewidth=0.5, hatch=style['hatch'],
                   label=model_name)

        ax.set_xticks(x_positions)
        ax.set_title(metric, fontsize=14, fontweight='bold')
        ax.set_ylabel('Score')
        ax.set_xlabel('')

        # Values are fractions; show them as percentages.
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

        ax.set_xticklabels([_short_label(name) for name in sorted_df.index])
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
        ax.tick_params(axis='x', which='major', pad=8)

        ax.grid(axis='y', linestyle='--', alpha=0.7)

        # Headroom above the tallest bar, capped at 100%. Series.max() skips
        # NaN; an empty, all-NaN or non-positive column falls back to the
        # full 0-100% range instead of a degenerate axis.
        peak = sorted_df[metric].max()
        upper = min(1.0, peak * 1.15) if peak > 0 else 1.0
        ax.set_ylim(0, upper)

    # Leave room at the bottom for the rotated tick labels.
    fig.tight_layout(rect=[0, 0.08, 1, 0.96])
    fig.subplots_adjust(wspace=0.3, bottom=0.08)

    custom_output = getattr(args, "output", None) if args else None
    if custom_output:
        # Drop a trailing extension only; dots in directory names are kept.
        base_filename = os.path.splitext(custom_output)[0]
    else:
        base_filename = f"{'_'.join(bstrings)}_bar_charts"

    png_path = os.path.join(relts_dir, f"{base_filename}.png")
    # The basename may contain '/' (e.g. "org/model" strings), so make sure
    # the directory that will actually hold the file exists.
    os.makedirs(os.path.dirname(png_path) or ".", exist_ok=True)

    try:
        fig.savefig(png_path, dpi=300, bbox_inches='tight')
        logging.info(f"Bar charts saved to {png_path}")

        if args and getattr(args, "pdf", False):
            pdf_path = os.path.join(relts_dir, f"{base_filename}.pdf")
            fig.savefig(pdf_path, format='pdf', bbox_inches='tight')
            logging.info(f"PDF version saved to {pdf_path}")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


def main():
    """Parse command-line arguments, load the scores, and plot the bar charts.

    Run as follows:
        python eval/scripts/fig_1_bar_charts.py --bstrings nvidia/OpenCodeReasoning-Nemotron-7B nvidia/AceMath-RL-Nemotron-7B

    To specify which metrics to plot, use the --graphs parameter:
        python eval/scripts/fig_1_bar_charts.py --bstrings nvidia/OpenCodeReasoning-Nemotron-7B --graphs AvgMath,AvgCode,AvgSci

    You can also specify a custom basename for output files:
        python eval/scripts/fig_1_bar_charts.py --bstrings nvidia/OpenCodeReasoning-Nemotron-7B --output my_barcharts

    To also generate PDF output, add the --pdf flag:
        python eval/scripts/fig_1_bar_charts.py --bstrings nvidia/OpenCodeReasoning-Nemotron-7B --pdf

    To exclude specific models, use the --exclude parameter:
        python eval/scripts/fig_1_bar_charts.py --bstrings nvidia --exclude nvidia/LLama-3.1-Nemotron-Nano-8B

    To use a specific CSV file instead of querying the database:
        python eval/scripts/fig_1_bar_charts.py --csv path/to/results.csv
    """
    parser = argparse.ArgumentParser(description="Generate bar charts comparing model performance across metrics.")
    parser.add_argument("--bstrings", nargs="+", type=str, required=False,
                        help="st of bstrings to filter model names. Use commas to separate multiple independent bstrings.")
    parser.add_argument("--csv", type=str,
                        help="Path to existing CSV file with model evaluation relts (skips database query)")
    parser.add_argument("--evalset", type=str, default="pipene", choices=["pipene", "full"],
                        help="Evaluation set to use (pipene or full)")
    parser.add_argument("--output", type=str,
                        help="Custom basename for output files (PNG/PDF) without extension")
    parser.add_argument("--pdf", action="store_true",
                        help="Also output PDF version of charts")
    parser.add_argument("--graphs", type=str,
                        help="Comma-separated st of metrics to plot (e.g., 'AvgMath,AvgCode,AvgSci')")
    parser.add_argument("--exclude", type=str,
                        help="Comma-separated st of model names to exclude from relts")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    results_dir = "eval/relts"
    os.makedirs(results_dir, exist_ok=True)

    # Expand comma-separated values into individual entries.
    processed = []
    has_comma_separated = False
    for raw_value in args.bstrings or []:
        if ',' in raw_value:
            # Empty pieces (e.g. from a trailing comma) are dropped.
            processed.extend(piece.strip() for piece in raw_value.split(',') if piece.strip())
            has_comma_separated = True
        else:
            processed.append(raw_value)
    if args.bstrings:
        args.bstrings = processed
        logging.info(f"Searching for models with bstrings: {args.bstrings}")
    # Stored on args, which is forwarded to the database helper below.
    # NOTE(review): presumably the helper uses this to pick OR logic -- confirm
    # in db_utils.
    args.comma_separated_bstrings = has_comma_separated

    if args.evalset == "pipene":
        benchmarks = PIPENE_BENCHMARKS
    elif args.evalset == "full":
        benchmarks = PIPENE_BENCHMARKS + HELDOUT_BENCHMARKS
    else:
        raise ValueError(f"Invad evalset: {args.evalset}")
    logging.info(f"Using evaluation set: {args.evalset} with {len(benchmarks)} benchmarks")

    exclude_models = None
    if args.exclude:
        exclude_models = [model.strip() for model in args.exclude.split(',')]
        logging.info(f"Excluding models: {exclude_models}")

    if args.csv:
        # A CSV takes precedence over the database query.
        logging.info(f"Using provided CSV file: {args.csv} instead of querying database")
        df = pd.read_csv(args.csv)
        if exclude_models:
            # The exclusion list is only passed to the database query.
            logging.warning("--exclude is ignored when --csv is provided")
        if not args.bstrings:
            # Only used to build the default output file name.
            args.bstrings = ["model_comparison"]
    elif args.bstrings:
        # Get records from the database (the helper also returns a CSV path).
        df, _csv_file = scoresearch_string(args.bstrings, benchmarks, args.output, exclude_models, args=args)
    else:
        logging.error("Either --bstrings or --csv must be provided")
        raise SystemExit(1)

    plot_bar_charts(df, args.bstrings, results_dir, args)


if __name__ == "__main__":
    main()