#!/usr/bin/env python
"""
Plot the s1 and mo standalone models alongside the standard scaling curves.

Two CSV files are read from the results directory and a figure with one
panel per benchmark is written next to them as PNG and PDF.

NOTE(review): "mo" presumably refers to the LIMO model and "relts" to
"results"; the identifiers in this file appear to have lost the substrings
"li"/"su"/"alon" at some point. Library API names have been repaired, but
on-disk paths, file names and lookup keys are left exactly as found --
confirm them against the real data layout before renaming.
"""

import logging
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Locations of the inputs/outputs (kept verbatim, see module NOTE above).
RESULTS_DIR = "eval/relts"
SCALING_CSV = "fig_1_aime24_lcbv2_gpqa_data.csv"
STANDALONE_CSV = "s1_mo_models.csv"
OUTPUT_STEM = "s1_mo_comparison"

# One subplot is drawn per benchmark; these are also the score columns that
# are picked up from the standalone-models CSV.
BENCHMARKS = ["AIME24", "LCBv2", "GPQAD"]

# Horizontal reference line drawn on each panel.
BASELINE_SCORES = {
    "AIME24": 0.15,
    "LCBv2": 0.33,
    "GPQAD": 0.237,
}
BASELINE_LABEL = "Qwen-2.5-7B-Instruct"

# Colours are chosen by case-insensitive substring match against the model
# name; the first matching key (in this order) wins.
MODEL_COLORS = {
    "openthoughts3": "blue",
    "nemo_nano": "orange",
    "am": "green",
    "s1": "red",
    "mo": "purple",
}
DEFAULT_COLOR = "gray"

# Standard x-axis ticks (dataset sizes) and their display labels.
X_TICKS = [192, 316, 800, 1000, 3160, 10000, 31600, 100000, 316000, 1000000]
X_TICK_LABELS = ["0.2k", "0.3k", "0.8k", "1k", "3k", "10k", "30k", "100k",
                 "300k", "1M"]


def _color_for(model_name):
    """Return the colour of the first MODEL_COLORS key found in *model_name*."""
    lowered = model_name.lower()
    return next(
        (color for key, color in MODEL_COLORS.items() if key in lowered),
        DEFAULT_COLOR,
    )


def _load_standalone_models(standalone_df):
    """Turn each row of the standalone CSV into a model record.

    Scores are divided by 100 to convert from percentage to fraction.
    """
    score_columns = [col for col in standalone_df.columns if col in BENCHMARKS]
    return [
        {
            "Model": row["Model"],
            "Dataset_Size": row["Dataset Size"],
            "Benchmarks": {col: row[col] / 100 for col in score_columns},
        }
        for _, row in standalone_df.iterrows()
    ]


def _scaling_points(scaling_df, benchmark):
    """Group (dataset size, score) pairs by model for one benchmark.

    Models keep their first-appearance order; each model's points are sorted
    by dataset size so the curve is drawn left to right.
    """
    grouped = defaultdict(list)
    rows = scaling_df[scaling_df["Benchmark"] == benchmark]
    for _, row in rows.iterrows():
        grouped[row["Model"]].append((row["Dataset_Size"], row["Score"]))
    for points in grouped.values():
        points.sort(key=lambda point: point[0])
    return grouped


def _apply_plot_style():
    """Set the global matplotlib rcParams used by the figure."""
    # Prefer Times New Roman, falling back to the existing serif list.
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.serif'] = (
        ['Times New Roman'] + plt.rcParams['font.serif']
    )
    plt.rcParams.update({
        "font.size": 16,
        "axes.titlesize": 28,
        "axes.labelsize": 26,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "legend.fontsize": 18,
        "figure.titlesize": 32,
        "axes.grid": True,
        "grid.color": "gray",
        "grid.alpha": 0.1,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "lines.markersize": 10,
        "lines.linewidth": 2,
    })


def _draw_benchmark(ax, benchmark, scaling_df, standalone_models):
    """Draw one benchmark panel: scaling curves, standalone points, baseline."""
    ax.set_title(benchmark, fontsize=28, family='Times New Roman')
    ax.set_xlabel("Dataset Size", fontsize=26, family='Times New Roman')
    ax.set_ylabel("Accuracy", fontsize=26, family='Times New Roman')
    ax.grid(True, linestyle='--', alpha=0.25)

    # Dataset sizes span several orders of magnitude, hence the log x-axis.
    ax.set_xscale('log')
    ax.set_xticks(X_TICKS)
    ax.set_xticklabels(X_TICK_LABELS)

    # Scaling curves: one connected line per model.
    for model, points in _scaling_points(scaling_df, benchmark).items():
        sizes, scores = zip(*points)
        ax.plot(sizes, scores, 'o-', label=model, color=_color_for(model),
                linewidth=4, markersize=12)

    # Standalone models: a single square marker each.
    for model in standalone_models:
        if benchmark not in model["Benchmarks"]:
            continue
        ax.plot(model["Dataset_Size"], model["Benchmarks"][benchmark], 's',
                label=model["Model"], color=_color_for(model["Model"]),
                markersize=14)

    # Baseline reference line.
    if benchmark in BASELINE_SCORES:
        ax.axhline(y=BASELINE_SCORES[benchmark], color='black',
                   linestyle='--', linewidth=1.5, alpha=0.6,
                   label=BASELINE_LABEL)

    # Same y-axis limits on every panel.
    ax.set_ylim(0, 0.6)


def _collect_legend_entries(axes):
    """Return a {label: handle} mapping de-duplicated across all panels.

    Entries from the first panel come first; later panels only contribute
    labels that have not been seen yet, so a model that is missing from the
    first benchmark still shows up in the shared legend.
    """
    by_label = {}
    for ax in axes:
        handles, labels = ax.get_legend_handles_labels()
        for label, handle in dict(zip(labels, handles)).items():
            by_label.setdefault(label, handle)
    return by_label


def create_comparison_plot(results_dir: str = RESULTS_DIR) -> None:
    """
    Create a plot comparing s1 and mo models with the scaling models.

    Args:
        results_dir: Directory holding the two input CSV files; the PNG and
            PDF outputs are written to the same directory. It is created if
            it does not exist.

    Returns:
        None. If either CSV file is missing, the error is logged and the
        function returns without producing a plot.
    """
    # Ensure results directory exists
    os.makedirs(results_dir, exist_ok=True)

    # Load data from the CSV files
    try:
        scaling_df = pd.read_csv(f"{results_dir}/{SCALING_CSV}")
        standalone_df = pd.read_csv(f"{results_dir}/{STANDALONE_CSV}")
    except FileNotFoundError as e:
        logger.error(f"Error loading CSV files: {e}")
        logger.error(
            f"Please make sure the required CSV files exist in {results_dir}/"
        )
        return

    standalone_models = _load_standalone_models(standalone_df)
    _apply_plot_style()

    # squeeze=False keeps `axes` two-dimensional regardless of the number of
    # benchmarks, so row 0 is always an indexable sequence of Axes.
    fig, axes_grid = plt.subplots(
        1, len(BENCHMARKS), figsize=(6 * len(BENCHMARKS), 6), squeeze=False
    )
    axes = axes_grid[0]

    for ax, benchmark in zip(axes, BENCHMARKS):
        _draw_benchmark(ax, benchmark, scaling_df, standalone_models)

    # Add a shared legend at the bottom of the figure
    by_label = _collect_legend_entries(axes)
    if by_label:
        fig.legend(
            by_label.values(), by_label.keys(),
            loc='lower center',
            bbox_to_anchor=(0.5, 0.05),
            ncol=min(5, len(by_label)),
            fontsize=18,
            frameon=True,
            fancybox=True,
            framealpha=0.7,
        )

    # Adjust layout, leaving space for the legend at the bottom
    fig.tight_layout(rect=[0.04, 0.12, 0.98, 0.96])
    fig.subplots_adjust(wspace=0.25)

    # Save as PNG and PDF, then release the figure
    fig.savefig(f"{results_dir}/{OUTPUT_STEM}.png", dpi=300,
                bbox_inches='tight')
    fig.savefig(f"{results_dir}/{OUTPUT_STEM}.pdf", format='pdf',
                bbox_inches='tight')
    plt.close(fig)
    logger.info(f"Plots saved to {results_dir}/{OUTPUT_STEM}.png and .pdf")


if __name__ == "__main__":
    create_comparison_plot()