"""Plot OpenThoughts3 data-scaling curves for Qwen vs. Llama3 fine-tunes.

Draws a 4x3 grid (one panel per benchmark) of accuracy against training-set
size on a log x-axis, one curve per model family, with the instruct baselines
drawn as dashed horizontal lines.  The figure is written as PNG and PDF.

The scores are embedded below as a CSV literal so that the script runs without
database access.
"""

import io
import os

import matplotlib.pyplot as plt
import pandas as pd

# ---------------------------------------------------------------------------
# Archived (disabled) export code that originally produced the table below.
# Kept for provenance only; it is not executed.
# NOTE(review): names that touch the `db_utils` API (`scoresearch_string`,
# `*_bstrings`) and the benchmark strings (`veCodeBench...`) are reproduced as
# found and look garbled -- verify against eval/scripts/db_utils before
# reviving this code.
# ---------------------------------------------------------------------------
# import sys
# sys.path.append('.')
# from eval.scripts.db_utils import scoresearch_string

# def get_openthoughts3_data():
#     """Get both qwen and llama openthoughts3 data from database"""
#
#     # Define benchmarks like in simple_all_benchmarks.py
#     benchmarks = [
#         'AIME24_accuracy_avg', 'AMC23_accuracy_avg', 'MATH500_accuracy', 'HMMT_accuracy_avg', 'AIME25_accuracy_avg',
#         'veCodeBench_accuracy_avg', 'veCodeBenchv5_accuracy_avg', 'CodeElo_accuracy_avg', 'CodeForces_accuracy_avg',
#         'GPQADiamond_accuracy_avg', 'JEEBench_accuracy_avg', 'HLE_accuracy_avg'
#     ]
#
#     class Args:
#         comma_separated_bstrings = False
#         output = None
#
#     args = Args()
#
#     # Get qwen data (original openthoughts3)
#     qwen_bstrings = ['openthoughts3']
#     qwen_exclude = ['llama3', 'herorun', 'buggy', 'ckpts', 'python_filtered', 'leonardo', 'complete-thoughts', 'filter-thought', 'remove-wait', 'code_swap_r1', 'annotated']
#     qwen_df, _ = scoresearch_string(qwen_bstrings, benchmarks=benchmarks, exclude_models=qwen_exclude, args=args)
#
#     # Get llama data
#     llama_bstrings = ['openthoughts3', 'llama3']
#     llama_df, _ = scoresearch_string(llama_bstrings, benchmarks=benchmarks, args=args)
#
#     return qwen_df, llama_df
#
# def create_combined_csv():
#     """Create combined CSV with both qwen and llama data"""
#     qwen_df, llama_df = get_openthoughts3_data()
#
#     # Filter qwen models to scaling variants only
#     qwen_scaling_models = []
#     for idx, row in qwen_df.iterrows():
#         name = row['Experiments']
#         # Keep main scaling models
#         if (name in ['openthoughts3', 'openthoughts3_300k', 'openthoughts3_100k', 'openthoughts3_30k', 'openthoughts3_10k', 'openthoughts3_3k', 'openthoughts3_1k', 'openthoughts3_0.3k'] or
#             (name.startswith('openthoughts3_') and any(size in name for size in ['300k', '100k', '30k', '10k', '3k', '1k', '0.3k']) and
#              not any(exclude in name for exclude in ['llama3', 'herorun', 'buggy', 'ckpts', 'python_filtered', 'leonardo', 'complete-thoughts', 'filter-thought', 'remove-wait', 'code_swap_r1', 'annotated']))):
#             qwen_scaling_models.append(row)
#
#     # Filter llama models to scaling variants only
#     llama_scaling_models = []
#     for idx, row in llama_df.iterrows():
#         name = row['Experiments']
#         # Keep main scaling models with llama3
#         if (name in ['openthoughts3_100k_llama3', 'openthoughts3_30k_llama3', 'openthoughts3_10k_llama3', 'openthoughts3_3k_llama3', 'openthoughts3_1k_llama3'] or
#             (name.startswith('openthoughts3_') and 'llama3' in name and any(size in name for size in ['100k', '30k', '10k', '3k', '1k', '0.3k']) and
#              not any(exclude in name for exclude in ['herorun', 'buggy', 'ckpts', 'python_filtered', 'leonardo', 'complete-thoughts', 'filter-thought', 'remove-wait', 'code_swap_r1', 'annotated']))):
#             llama_scaling_models.append(row)
#
#     # Convert to dataframes
#     qwen_filtered = pd.DataFrame(qwen_scaling_models) if qwen_scaling_models else pd.DataFrame()
#     llama_filtered = pd.DataFrame(llama_scaling_models) if llama_scaling_models else pd.DataFrame()
#
#     # Combine
#     combined_df = pd.concat([qwen_filtered, llama_filtered], ignore_index=True)
#
#     # Convert to CSV format (multiply by 100 to get percentages, round to 1 decimal)
#     csv_data = combined_df.copy()
#     for col in csv_data.columns:
#         if col not in ["Domain", "DatasetSize", "Experiments"]:
#             csv_data[col] = csv_data[col].round(1)
#
#     return csv_data

# # Get data and create CSV string
# df_data = create_combined_csv()
#
# # Create CSV string manually to match simple_all_benchmarks format
# csv_lines = []
# csv_lines.append("Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,HMMT,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode,AIME25,HLE,veCodeBenchv5")
#
# # Add data rows
# for _, row in df_data.iterrows():
#     line = f"{row['Experiments']},{row.get('Domain', 'Other')},{row.get('DatasetSize', '')},{row.get('AvgAll', '')},{row.get('AIME24', '')},{row.get('AMC23', '')},{row.get('MATH500', '')},{row.get('HMMT', '')},{row.get('AvgMath', '')},{row.get('JEEBench', '')},{row.get('GPQAD', '')},{row.get('AvgSci', '')},{row.get('LCBv2', '')},{row.get('CodeElo', '')},{row.get('CodeForces', '')},{row.get('AvgCode', '')},{row.get('AIME25', '')},{row.get('HLE', '')},{row.get('veCodeBenchv5', '')}"
#     csv_lines.append(line)
#
# fig1_csv = "\n".join(csv_lines)

# Embedded score table (includes both Qwen and Llama3 variants).  Scores are
# percentages; DatasetSize is the number of training examples.
# NOTE(review): the last column is spelled "veCodeBenchv5" here and everywhere
# it is referenced below; it is titled "LCB 06/24-01/25" in the plot, so it is
# presumably "LiveCodeBenchv5" -- left untouched to stay consistent with the
# exported data.
fig1_csv = """Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,HMMT,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode,AIME25,HLE,veCodeBenchv5
openthoughts3,Other,1200000.0,55.3,69.0,93.5,90.0,42.7,69.7,72.4,53.7,63.0,64.5,31.0,32.2,44.8,53.3,10.2,51.7
openthoughts3_300k,300k,316000.0,57.4,61.0,90.5,89.2,36.7,80.2,70.3,51.0,60.6,48.2,24.0,25.2,32.5,39.7,10.6,34.0
openthoughts3_100k,100k,100000.0,54.1,54.3,86.8,89.0,29.3,76.7,61.8,51.0,56.4,43.7,21.5,24.4,29.9,41.0,9.8,31.5
openthoughts3_30k,30k,31600.0,49.8,40.7,83.2,87.4,21.7,70.4,58.2,48.0,53.1,45.1,16.6,19.1,26.9,34.7,10.9,31.0
openthoughts3_10k,10k,10000.0,43.8,32.0,75.2,82.6,20.7,63.3,45.7,48.7,47.2,40.0,11.1,15.2,22.1,28.3,1.4,26.8
openthoughts3_3k,3k,3160.0,41.0,30.7,66.5,81.0,16.0,59.4,51.7,48.1,49.9,32.6,7.1,10.7,16.8,23.0,1.9,20.0
openthoughts3_1k,1k,1000.0,34.3,18.0,58.5,78.6,9.3,51.7,46.0,39.1,42.6,22.4,5.1,6.4,11.3,17.7,0.3,18.2
openthoughts3_0.3k,0.3k,316.0,33.7,17.3,56.5,79.8,11.0,51.2,45.2,39.2,42.2,19.7,4.5,7.5,10.6,16.0,5.4,11.2
openthoughts3_100k_llama3,100k,100000.0,37.9,37.0,75.2,83.8,19.3,49.1,45.2,45.1,45.2,44.4,13.8,18.3,27.1,30.3,9.7,31.9
openthoughts3_30k_llama3,30k,31600.0,33.7,29.3,70.8,80.0,19.7,44.5,40.3,43.8,42.0,35.7,11.3,14.6,21.7,22.7,11.3,25.2
openthoughts3_10k_llama3,10k,10000.0,27.4,14.0,56.0,73.4,12.0,34.8,32.4,46.1,39.2,27.2,7.3,10.7,16.3,18.7,11.4,19.9
openthoughts3_3k_llama3,3k,3160.0,23.7,9.7,48.8,67.8,6.0,28.5,29.6,47.0,38.3,23.3,5.9,7.9,13.4,10.3,12.1,16.4
openthoughts3_1k_llama3,1k,1000.0,17.8,4.7,36.8,58.8,0.3,20.9,27.6,31.5,29.6,17.0,3.8,5.7,9.6,4.0,11.6,11.9
"""

# Parse the embedded table; experiment names become the row index.
df = pd.read_csv(io.StringIO(fig1_csv))
df = df.set_index("Experiments")

# Store scores as fractions (0-1) so they share units with the baseline
# values defined below; they are scaled back to percent when plotted.
for col in df.columns:
    if col not in ["Domain", "DatasetSize"]:
        df[col] = df[col] / 100.0

FONTSIZE = 24
MARKERSIZE = 10
LINEWIDTH = 3
FIGSIZE = (18, 30)

plt.rcParams.update({
    "font.size": FONTSIZE,
    "axes.titlesize": FONTSIZE,
    "axes.labelsize": FONTSIZE,
    "xtick.labelsize": FONTSIZE,
    "ytick.labelsize": FONTSIZE,
    "legend.fontsize": FONTSIZE,
})

# Benchmarks to plot, in panel order; `titles` holds the matching panel titles.
domains = ["AIME24", "AMC23", "MATH500", "HMMT", "AIME25", "LCBv2", "veCodeBenchv5", "CodeElo", "CodeForces", "GPQAD", "JEEBench", "HLE"]
titles = ["AIME24", "AMC23", "MATH500", "HMMT", "AIME25", "LCB 05/23-05/24", "LCB 06/24-01/25", "CodeElo", "CodeForces", "GPQAD", "JEEBench", "HLE"]

# 4x3 grid of panels; flatten so that panel i can be addressed as axes[i].
fig, axes = plt.subplots(4, 3, figsize=FIGSIZE, sharey=False)
axes = axes.flatten()

# Model families, each a list of (dataset_size, experiment_name) pairs.
models = {
    "openthoughts3_qwen": [],
    "openthoughts3_llama": [],
}

# Any experiment whose name contains 'llama3' belongs to the Llama family;
# everything else is treated as Qwen.
for model in df.index:
    dataset_size = df.loc[model, "DatasetSize"]
    family = "openthoughts3_llama" if 'llama3' in model else "openthoughts3_qwen"
    models[family].append((dataset_size, model))

# Order each family by dataset size (missing sizes sort first) so the curves
# are drawn left to right.
for prefix in models:
    models[prefix].sort(key=lambda x: x[0] if pd.notna(x[0]) else 0)

# Curve colour per model family.
colors = {
    "openthoughts3_qwen": "red",
    "openthoughts3_llama": "blue",
}

# Legend label per model family.
display_names = {
    "openthoughts3_qwen": "OpenThoughts3 (Qwen)",
    "openthoughts3_llama": "OpenThoughts3 (Llama3)",
}

# Baseline models: per-benchmark accuracy as a fraction, plus the colour and
# line style used for their horizontal reference lines.
baselines = {
    "Qwen-2.5-7B-Instruct": {
        "AIME24": 0.15,
        "LCBv2": 0.33,
        "GPQAD": 0.237,
        "AMC23": 0.535,
        "MATH500": 0.706,
        "HMMT": 0.003,  # 0.3% from HMMT baseline
        "JEEBench": 0.336,
        "CodeElo": 0.051,
        "CodeForces": 0.099,
        "AIME25": 0.08,
        "HLE": 0.118,
        "veCodeBenchv5": 0.172,
        "color": "red",  # Same red as Qwen scaling curves
        "linestyle": "--",
    },
    "Llama-3.1-8B-Instruct": {
        "AIME24": 0.047,
        "AMC23": 0.158,
        "MATH500": 0.432,
        "HMMT": 0.003,
        "AIME25": 0.003,
        "LCBv2": 0.131,
        "veCodeBenchv5": 0.089,
        "CodeElo": 0.021,
        "CodeForces": 0.067,
        "GPQAD": 0.258,
        "JEEBench": 0.141,
        "HLE": 0.17,
        "color": "blue",  # Same blue as Llama scaling curves
        "linestyle": "--",
    },
}

for i, metric in enumerate(domains):
    ax = axes[i]
    ax.set_title(titles[i], fontsize=FONTSIZE + 4)

    # Only the bottom row (panels 9, 10, 11) gets an x-axis label.
    if i >= 9:
        ax.set_xlabel("Dataset Size", fontsize=FONTSIZE + 2)

    # Only the first panel of each row (0, 3, 6, 9) gets a y-axis label.
    if i % 3 == 0:
        ax.set_ylabel("Accuracy (%)", fontsize=FONTSIZE + 2)

    ax.grid(True, linestyle='--', alpha=0.25)

    # Hide top and right spines.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # One curve per model family.
    for prefix, model_list in models.items():
        # Keep only the experiments that have a value for this metric and
        # convert the stored fraction back to percent.
        points = [
            (size, df.loc[model_name, metric] * 100)
            for size, model_name in model_list
            if model_name in df.index
            and metric in df.columns
            and pd.notna(df.loc[model_name, metric])
        ]
        if not points:
            continue

        x_values = [size for size, _ in points]
        y_values = [score for _, score in points]
        curve_style = {"color": colors[prefix], "linewidth": LINEWIDTH}

        if len(x_values) > 1:
            # Circles on every point except the largest dataset size ...
            ax.plot(x_values[:-1], y_values[:-1], 'o-',
                    label=display_names[prefix],
                    markersize=MARKERSIZE,
                    **curve_style)
            # ... and a marker-less segment connecting to the last point.
            ax.plot(x_values[-2:], y_values[-2:], '-', **curve_style)
        else:
            # Single point: draw an empty line only to register the legend entry.
            ax.plot([], [], 'o-',
                    label=display_names[prefix],
                    markersize=MARKERSIZE,
                    **curve_style)

        # The largest dataset size is marked with a square instead of a circle.
        ax.plot(x_values[-1], y_values[-1], 's',
                color=colors[prefix],
                markersize=MARKERSIZE)

    # Baselines as horizontal reference lines (fraction -> percent).
    for baseline_name, baseline_data in baselines.items():
        if metric in baseline_data:
            ax.axhline(y=baseline_data[metric] * 100,
                       color=baseline_data["color"],
                       linestyle=baseline_data["linestyle"],
                       linewidth=LINEWIDTH - 1,
                       alpha=1.0,
                       label=baseline_name)

    ax.set_xscale('log')
    ax.set_xticks([1000, 10000, 100000, 1000000])
    ax.set_xticklabels(['1K', '10K', '100K', '1M'], fontsize=FONTSIZE)

    # Same 0-100 range on every panel, with ticks every 10 points.
    ax.set_ylim(0, 100)
    ax.set_yticks(list(range(0, 110, 10)))

# Build one shared legend: keep the first handle seen for each label, in the
# order the labels first appear across the panels.
legend_entries = {}
for ax in axes[:len(domains)]:
    for handle, label in zip(*ax.get_legend_handles_labels()):
        legend_entries.setdefault(label, handle)
all_labels = list(legend_entries)
all_handles = list(legend_entries.values())

# Shared legend below the grid, laid out in two columns.
fig.legend(all_handles, all_labels, loc='lower center',
           bbox_to_anchor=(0.51, 0.16), ncol=2,
           fontsize=FONTSIZE, frameon=False, fancybox=True, framealpha=0.7)

# Leave room at the bottom of the figure for the shared legend.
plt.tight_layout(rect=[0.04, 0.2, 0.98, 0.96])
fig.subplots_adjust(wspace=0.15)

# Output directory.
# NOTE(review): spelled "eval/relts" in the original; possibly meant
# "eval/results" -- confirm before changing, since other tooling may rely on it.
results_dir = "eval/relts"
os.makedirs(results_dir, exist_ok=True)

# Save as PNG and PDF.
plt.savefig(f"{results_dir}/simple_llama_vs_qwen.png", dpi=300, bbox_inches='tight')
plt.savefig(f"{results_dir}/simple_llama_vs_qwen.pdf", dpi=300, bbox_inches='tight')

print(f"Plots saved to {results_dir}/simple_llama_vs_qwen.png and {results_dir}/simple_llama_vs_qwen.pdf")