"""Scatter-plot 30k-sample vs. 300k-sample accuracy for each benchmark.

Reads a comparison CSV, pairs every ~30k-sample experiment with its 300k
counterpart (matched by experiment name), and draws one subplot per benchmark
with the 30k accuracy on the x axis and the 300k accuracy on the y axis.
Points are coloured by model family and a dashed y = x reference line is
drawn in each subplot. The figure is saved as both PNG and PDF.
"""
import os

import matplotlib.pyplot as plt
import numpy as np  # kept from the original import list (currently unused)
import pandas as pd

# Read the data
# NOTE(review): "relts" (here and in the output directory at the bottom) looks
# like a garbled "results". The path is left untouched because it has to match
# the files on disk -- confirm before renaming.
df = pd.read_csv("eval/relts/30k_300k_comparison.csv")

# Convert percentages to decimal for every column that is not metadata.
NON_METRIC_COLUMNS = ["Experiments", "Domain", "DatasetSize"]
for col in df.columns:
    if col not in NON_METRIC_COLUMNS:
        df[col] = df[col] / 100.0

# Benchmarks to plot, in subplot order (4 rows x 3 columns, row-major).
# NOTE(review): "veCodeBenchv5" may be a garbled "LiveCodeBenchv5"; it is kept
# as-is because it must match the CSV header -- confirm against the data.
benchmarks = [
    "AIME24", "AMC23", "HMMT",
    "AIME25", "veCodeBenchv5", "CodeElo",
    "CodeForces", "HLE", "GPQAD",
    "JEEBench", "LCBv2", "AvgAll",
]
titles = benchmarks

FONTSIZE = 20
MARKERSIZE = 12
LINEWIDTH = 3  # defined by the original script but not referenced anywhere
FIGSIZE = (18, 24)

# Create figure with 4x3 layout
plt.rcParams.update({
    "font.size": FONTSIZE,
    "axes.titlesize": FONTSIZE,
    "axes.labelsize": FONTSIZE,
    "xtick.labelsize": FONTSIZE,
    "ytick.labelsize": FONTSIZE,
    "legend.fontsize": FONTSIZE,
})

fig, axes = plt.subplots(4, 3, figsize=FIGSIZE, sharey=False)
axes = axes.flatten()  # Flatten to make indexing easier

# Experiment-name prefix -> marker colour. Dict order matters: the first
# prefix that matches an experiment name wins.
# NOTE(review): "no_pipene" may be a garbled "no_pipeline"; kept as-is because
# it must match the experiment names in the CSV -- confirm against the data.
model_families = {
    "openthoughts3": "red",
    "openthoughts2": "orange",
    "openmathreasoning": "green",
    "opencodereasoning": "blue",
    "nemo_nano": "purple",
    "am": "brown",
    "no_pipene": "pink",
}

# Create mapping of 30k to 300k models
experiment_names = set(df["Experiments"])
model_pairs = []
for _, row in df.iterrows():
    model_name = row["Experiments"]
    dataset_size = row["DatasetSize"]

    # Skip models that don't match our target size (~31.6k samples).
    if pd.isna(dataset_size):
        continue
    if abs(dataset_size - 31600) >= 1000:
        continue

    # Strip the size suffix to get the name shared with the 300k run.
    base_name = (
        model_name
        .replace("_30k", "")
        .replace("_math_30k", "_math")
        .replace("_code_30k", "_code")
        .replace("_science_30k", "_science")
    )

    # Try the plain 300k name first, then the domain-specific variants.
    candidates = [base_name + "_300k"]
    for domain_tag in ("_math", "_code", "_science"):
        if domain_tag in model_name:
            candidates.append(base_name + domain_tag + "_300k")

    for candidate in candidates:
        if candidate in experiment_names:
            model_pairs.append({
                "model_30k": model_name,
                "model_300k": candidate,
                "family": None,  # Filled in below
            })
            break

# Determine family for each pair (first matching prefix, else "other").
for pair in model_pairs:
    pair["family"] = next(
        (name for name in model_families
         if pair["model_30k"].startswith(name)),
        "other",
    )

print(f"Found {len(model_pairs)} model pairs:")
for pair in model_pairs:
    print(f" {pair['model_30k']} -> {pair['model_300k']} ({pair['family']})")

# Look up the rows of every pair once instead of once per benchmark.
pair_frames = []
for pair in model_pairs:
    data_30k = df[df["Experiments"] == pair["model_30k"]]
    data_300k = df[df["Experiments"] == pair["model_300k"]]
    if data_30k.empty or data_300k.empty:
        continue
    pair_frames.append((pair["family"], data_30k, data_300k))

# Plot each benchmark
for i, benchmark in enumerate(benchmarks):
    ax = axes[i]
    ax.set_title(titles[i], fontsize=FONTSIZE + 4)

    # Only set xlabel for the last row (indices 9, 10, 11)
    if i >= 9:
        ax.set_xlabel("30k Accuracy (%)", fontsize=FONTSIZE + 2)

    # Only set ylabel for the first subplot in each row (0, 3, 6, 9)
    if i % 3 == 0:
        ax.set_ylabel("300k Accuracy (%)", fontsize=FONTSIZE + 2)

    ax.grid(True, linestyle="--", alpha=0.25)

    # Hide top and right spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Plot one point per model pair and remember the plotted coordinates so
    # the axis limits can be fitted to the data afterwards.
    family_plotted = set()
    x_data = []
    y_data = []
    if benchmark in df.columns:
        for family, data_30k, data_300k in pair_frames:
            x_val = data_30k[benchmark].iloc[0]
            y_val = data_300k[benchmark].iloc[0]

            # Skip if either value is NaN
            if pd.isna(x_val) or pd.isna(y_val):
                continue

            color = model_families.get(family, "gray")

            # Only add to legend once per family
            label = family if family not in family_plotted else None
            if label:
                family_plotted.add(family)

            # Values were scaled to fractions above; plot them as percent.
            ax.scatter(
                x_val * 100,
                y_val * 100,
                color=color,
                s=MARKERSIZE ** 2,
                alpha=0.7,
                label=label,
            )
            x_data.append(x_val * 100)
            y_data.append(y_val * 100)

    if x_data and y_data:
        # Set axis limits based on data range with some padding
        x_min, x_max = min(x_data), max(x_data)
        y_min, y_max = min(y_data), max(y_data)

        # 10% of the data range, but at least 2 percentage points.
        x_padding = max((x_max - x_min) * 0.1, 2)
        y_padding = max((y_max - y_min) * 0.1, 2)

        # Clamp to the valid percentage range [0, 100].
        x_min = max(0, x_min - x_padding)
        x_max = min(100, x_max + x_padding)
        y_min = max(0, y_min - y_padding)
        y_max = min(100, y_max + y_padding)

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        # Add diagonal line (y = x) within the visible range
        line_min = max(x_min, y_min)
        line_max = min(x_max, y_max)
        if line_min < line_max:
            ax.plot(
                [line_min, line_max],
                [line_min, line_max],
                "k--",
                alpha=0.5,
                zorder=0,
                linewidth=1,
            )
    else:
        # Fallback to 0-100 if no data
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)

# Collect all legend handles and labels from all subplots
all_handles = []
all_labels = []
for ax in axes[:len(benchmarks)]:
    handles, labels = ax.get_legend_handles_labels()
    for handle, label in zip(handles, labels):
        if label not in all_labels:  # Only add if not already in the list
            all_handles.append(handle)
            all_labels.append(label)

# Create a combined legend at the bottom. ncol is clamped to at least 1 so an
# empty plot (no pairs found) does not request a zero-column legend.
fig.legend(
    all_handles,
    all_labels,
    loc="lower center",
    bbox_to_anchor=(0.51, 0.02),
    ncol=max(1, len(all_handles)),
    fontsize=FONTSIZE,
    frameon=False,
    fancybox=True,
    framealpha=0.7,
)

plt.tight_layout(rect=[0.04, 0.08, 0.98, 0.96])
fig.subplots_adjust(wspace=0.15, hspace=0.3)

# Save plots
results_dir = "eval/relts"
os.makedirs(results_dir, exist_ok=True)

# Save as PNG and PDF
plt.savefig(f"{results_dir}/scale_30k_vs_300k_comparison.png", dpi=300, bbox_inches="tight")
plt.savefig(f"{results_dir}/scale_30k_vs_300k_comparison.pdf", dpi=300, bbox_inches="tight")

print(f"Plots saved to {results_dir}/scale_30k_vs_300k_comparison.png and {results_dir}/scale_30k_vs_300k_comparison.pdf")