import io
import os

import matplotlib.pyplot as plt
import pandas as pd

# Hardcoded CSV data: one row per experiment, with the training-set size and
# the benchmark scores (in percent) for that run.
# NOTE(review): the identifier looks like a garbled "pipeline_best_csv"; it is
# kept unchanged so that any other reference to it keeps working.
pipene_best_csv = """Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode
a1_math_open2math,Math,31600.0,38.1,26.0,67.8,82.8,58.9,44.8,46.3,45.6,24.0,6.1,7.1,12.4
a1_math_open2math_3k,Math,3160.0,32.8,16.3,57.5,77.6,50.5,40.6,48.0,44.3,12.4,4.1,5.7,7.4
a1_math_open2math_0.3k,Math,316.0,28.6,15.7,53.5,72.0,47.1,37.7,32.5,35.1,10.5,2.1,4.6,5.7
a1_math_open2math_1k,Math,1000.0,28.3,15.3,50.2,74.2,46.6,34.5,36.0,35.2,9.7,2.8,3.8,5.4
b1_math_top_1,Math,31600.0,37.6,25.7,71.8,83.0,60.2,44.0,39.7,41.8,23.0,6.5,6.8,12.1
b1_math_top_1_10k,Math,10000.0,34.1,19.7,63.0,79.0,53.9,37.6,43.4,40.5,19.0,4.4,6.5,10.0
b1_math_top_1_3k,Math,3160.0,36.0,20.0,63.2,79.2,54.1,39.8,46.0,42.9,25.5,7.2,7.2,13.3
b1_math_top_1_1k,Math,1000.0,33.9,17.3,60.2,77.6,51.7,43.1,40.2,41.7,22.5,4.9,5.4,10.9
b1_math_top_1_0.3k,Math,316.0,32.1,14.7,54.2,72.8,47.2,39.1,42.6,40.8,23.5,4.3,5.5,11.1
b2_math_length,Math,31600.0,41.9,32.7,79.0,86.2,66.0,51.5,45.8,48.6,24.3,7.4,8.6,13.4
b2_math_length_10k,Math,10000.0,39.3,28.7,69.5,82.8,60.3,45.9,43.1,44.5,28.5,6.6,9.3,14.8
b2_math_length_3k,Math,3160.0,38.3,20.7,68.2,81.4,56.8,45.8,48.0,46.9,28.2,7.0,7.4,14.2
b2_math_length_1k,Math,1000.0,33.5,20.3,59.8,74.8,51.6,41.0,37.9,39.4,24.1,4.2,5.7,11.3
b2_math_length_0.3k,Math,316.0,33.2,17.0,59.0,76.6,50.9,46.1,27.4,36.8,26.9,5.2,7.1,13.1
c1_math_0d_16s,Math,31597.0,40.1,32.3,74.0,84.4,63.6,49.8,47.6,48.7,17.9,7.8,7.3,11.0
c1_math_0d_16s_10k,Math,10000.0,38.5,25.3,68.5,84.4,59.4,44.6,44.1,44.4,25.2,7.3,8.5,13.7
c1_math_0d_16s_3k,Math,3160.0,38.3,25.7,65.2,81.4,57.4,45.8,46.5,46.2,27.1,6.8,8.1,14.0
c1_math_0d_16s_1k,Math,1000.0,32.9,16.7,59.2,78.8,51.6,44.8,28.5,36.6,23.3,5.6,6.3,11.7
c1_math_0d_16s_0.3k,Math,316.0,33.9,16.7,59.5,77.6,51.3,44.2,30.8,37.5,26.7,6.7,8.8,14.1
e1_math_all_qwq_together,Math,31600.0,44.2,43.0,83.2,88.4,71.5,58.9,47.5,53.2,18.3,7.5,7.0,10.9
e1_math_all_qwq_together_10k,Math,10000.0,40.5,34.7,75.0,85.6,65.1,50.6,43.8,47.2,20.2,6.8,7.2,11.4
e1_math_all_qwq_together_3k,Math,3160.0,38.9,30.0,69.3,80.8,60.0,51.7,46.0,48.8,22.7,5.3,5.4,11.1
e1_math_all_qwq_together_0.3k,Math,316.0,33.8,19.7,62.5,78.0,53.4,44.8,40.1,42.4,17.3,3.9,4.0,8.4
e1_math_all_qwq_together_1k,Math,1000.0,32.9,20.7,56.8,78.8,52.1,45.9,35.5,40.7,15.2,4.9,5.3,8.5
a1_code_code_golf,Code,31600.0,38.8,17.7,58.0,77.0,50.9,38.8,42.6,40.7,44.4,14.4,17.2,25.3
a1_code_code_golf_10k,Code,10000.0,36.9,17.7,62.0,78.4,52.7,45.0,44.4,44.7,19.8,12.1,15.9,15.9
a1_code_code_golf_3k,Code,3160.0,33.1,15.3,58.0,75.2,49.5,42.1,40.9,41.5,15.0,8.0,10.4,11.1
a1_code_code_golf_0.3k,Code,316.0,31.1,13.0,57.2,74.8,48.3,41.4,38.0,39.7,9.0,7.1,8.0,8.0
a1_code_code_golf_1k,Code,1000.0,31.0,17.3,54.2,75.4,49.0,40.2,36.7,38.4,10.3,5.9,8.1,8.1
b1_code_top_2,Code,31600.0,41.3,24.0,65.0,75.0,54.7,44.7,39.6,42.2,46.7,16.4,18.8,27.3
b1_code_top_2_10k,Code,10000.0,38.3,21.0,63.3,71.8,52.0,43.3,39.4,41.3,40.8,11.0,15.8,22.5
b1_code_top_2_3k,Code,3160.0,36.7,18.3,59.5,76.4,51.4,43.1,40.2,41.7,35.0,8.2,12.5,18.6
b1_code_top_2_1k,Code,1000.0,33.0,16.7,51.8,74.4,47.6,40.1,39.2,39.7,25.2,7.2,9.7,14.0
b1_code_top_2_0.3k,Code,316.0,33.8,18.0,54.5,71.6,48.0,41.0,40.6,40.8,29.1,6.6,9.3,15.0
b2_code_difficulty,Code,31597.0,43.0,21.0,69.5,77.4,56.0,47.3,45.5,46.4,47.4,16.1,19.6,27.7
b2_code_difficulty_10k,Code,10000.0,38.0,21.3,59.0,68.8,49.7,46.1,42.8,44.4,40.1,11.9,13.7,21.9
b2_code_difficulty_3k,Code,3160.0,37.2,21.7,62.0,76.0,53.2,44.9,38.2,41.6,31.4,10.3,13.5,18.4
b2_code_difficulty_1k,Code,1000.0,32.8,16.7,54.0,74.0,48.2,40.0,35.0,37.5,26.7,7.4,8.8,14.3
b2_code_difficulty_0.3k,Code,316.0,31.4,16.0,56.0,76.0,49.3,39.7,36.5,38.1,14.4,6.1,6.2,8.9
c1_code_0d_16s,Code,31600.0,40.6,20.3,61.0,79.2,53.5,43.1,45.1,44.1,43.6,13.9,18.8,25.4
c1_code_0d_16s_10k,Code,10000.0,39.8,21.3,65.5,77.4,54.7,45.7,43.6,44.7,36.7,13.1,15.2,21.7
c1_code_0d_16s_3k,Code,3160.0,37.2,18.3,61.0,77.4,52.2,44.7,44.8,44.8,33.3,7.0,11.0,17.1
c1_code_0d_16s_0.3k,Code,316.0,33.3,18.7,55.5,75.2,49.8,40.1,40.1,40.1,21.9,6.6,8.0,12.2
c1_code_0d_16s_1k,Code,1000.0,33.2,15.7,56.3,71.8,47.9,40.1,38.6,39.4,27.3,6.6,9.1,14.3
e1_code_fasttext_qwq_together,Code,31600.0,44.2,27.0,68.8,80.4,58.7,49.2,39.9,44.6,51.7,17.5,19.4,29.5
e1_code_fasttext_qwq_together_10k,Code,10000.0,42.0,28.0,67.5,78.8,58.1,49.0,39.4,44.2,43.2,12.5,17.5,24.4
e1_code_fasttext_qwq_together_3k,Code,3160.0,39.4,22.7,64.0,79.4,55.4,47.4,38.4,42.9,38.0,10.4,14.7,21.0
e1_code_fasttext_qwq_together_0.3k,Code,316.0,35.9,22.3,58.5,77.2,52.7,46.1,42.8,44.4,27.1,5.8,7.8,13.6
e1_code_fasttext_qwq_together_1k,Code,1000.0,34.9,18.3,60.5,74.8,51.2,43.6,39.1,41.4,26.8,7.8,8.5,14.4
a1_science_stackexchange_physics,Science,31600.0,34.3,19.7,57.5,75.6,50.9,43.9,42.6,43.2,22.7,5.3,7.6,11.9
a1_science_stackexchange_physics_10k,Science,10000.0,32.3,16.7,57.7,73.0,49.1,40.0,46.3,43.2,13.1,4.7,6.5,8.1
a1_science_stackexchange_physics_3k,Science,3160.0,31.2,16.0,57.0,73.4,48.8,40.7,40.4,40.6,11.7,3.9,6.1,7.2
a1_science_stackexchange_physics_0.3k,Science,316.0,28.6,13.7,53.0,70.6,45.8,38.5,35.4,37.0,7.5,4.3,6.1,6.0
a1_science_stackexchange_physics_1k,Science,1000.0,30.8,17.3,53.0,74.6,48.3,36.5,28.3,32.4,10.4,2.6,4.9,6.0
b1_science_top_2,Science,31600.0,33.7,17.7,58.7,74.6,50.3,42.7,47.0,44.8,16.6,4.7,7.2,9.5
b1_science_top_2_10k,Science,10000.0,38.5,18.3,57.2,75.0,50.2,39.0,41.2,40.1,6.4,4.1,7.6,6.0
b1_science_top_2_3k,Science,3160.0,33.0,18.0,57.0,73.8,49.6,40.2,42.3,41.2,22.0,4.2,6.8,11.0
b1_science_top_2_1k,Science,1000.0,33.7,17.0,56.2,77.2,50.1,42.9,43.1,43.0,21.7,4.8,6.8,11.1
b1_science_top_2_0.3k,Science,316.0,32.6,17.3,58.2,73.2,49.6,41.2,33.0,37.1,25.0,5.6,7.3,12.6
b2_science_length_10k,Science,10000.0,35.9,21.0,61.3,77.4,53.2,43.8,47.3,45.6,22.1,5.7,8.6,12.1
b2_science_length,Science,31598.0,35.9,21.7,61.3,80.2,54.4,49.2,52.5,50.8,12.9,4.2,4.9,7.3
b2_science_length_3k,Science,3160.0,35.2,20.7,60.0,75.8,52.2,40.4,49.0,44.7,24.5,4.3,6.7,11.8
b2_science_length_0.3k,Science,316.0,34.0,15.3,57.8,73.2,48.8,42.4,41.2,41.8,28.6,5.3,7.7,13.9
b2_science_length_1k,Science,1000.0,33.1,17.7,58.0,73.8,49.8,39.8,42.3,41.0,22.5,3.9,6.5,11.0
c1_science_0d_16s,Science,31600.0,36.2,21.0,65.0,77.4,54.5,46.3,53.0,49.6,15.7,4.9,6.4,9.0
c1_science_0d_16s_10k,Science,10000.0,35.9,20.3,64.8,77.6,54.2,42.2,46.6,44.4,20.7,6.3,8.7,11.9
c1_science_0d_16s_3k,Science,3160.0,35.6,21.3,60.5,76.0,52.6,40.0,52.5,46.2,24.1,4.3,6.1,11.5
c1_science_0d_16s_0.3k,Science,316.0,32.8,15.0,56.7,76.6,49.4,36.7,41.6,39.2,24.9,4.3,6.7,12.0
c1_science_0d_16s_1k,Science,1000.0,32.3,15.3,57.0,74.8,49.0,38.4,41.4,39.9,22.3,3.8,5.1,10.4
e1_science_longest_qwq_together,Science,31600.0,39.1,32.0,72.2,82.0,62.1,53.3,42.8,48.0,15.2,6.8,8.2,10.1
e1_science_longest_qwq_together_0.3k,Science,316.0,35.4,20.7,58.5,78.6,52.6,42.3,47.3,44.8,24.3,5.4,6.0,11.9
e1_science_longest_qwq_together_10k,Science,10000.0,33.7,20.3,63.7,81.0,55.0,46.3,37.0,41.6,9.7,5.6,6.0,7.1
e1_science_longest_qwq_together_1k,Science,1000.0,33.2,19.0,60.7,75.6,51.8,44.8,36.4,40.6,19.8,4.2,5.1,9.7
"""

# Parse the embedded table and key every row by its experiment name.
df = pd.read_csv(io.StringIO(pipene_best_csv))
df = df.set_index("Experiments")

# Sort by domain (math, code, science) and then by pipeline stage
# (a1, b1, b2, c1, e1).
domain_order = {'Math': 0, 'Code': 1, 'Science': 2}
stage_order = {'a1': 0, 'b1': 1, 'b2': 2, 'c1': 3, 'e1': 4}


def get_stage(experiment_name):
    """Return the stage prefix of *experiment_name*, or ``'z'`` if none matches.

    A name belongs to a stage when it starts with ``"<stage>_"`` for one of
    the keys of ``stage_order``.
    """
    for stage in stage_order:
        if experiment_name.startswith(stage + '_'):
            return stage
    return 'z'  # Unknown stages go to the end


# Temporary sort keys; an unknown stage gets a rank past every known one so it
# sorts last explicitly instead of relying on NaN ordering.
df['domain_sort'] = df['Domain'].map(domain_order)
df['stage_sort'] = [
    stage_order.get(get_stage(name), len(stage_order)) for name in df.index
]
df = df.sort_values(['domain_sort', 'stage_sort'])
df = df.drop(['domain_sort', 'stage_sort'], axis=1)

# Print the sorted CSV for updating the hardcoded data
# print("Sorted CSV data:")
# print(df.to_csv())

# Convert percentages to decimal (every column except the two non-score ones).
for col in df.columns:
    if col not in ["Domain", "DatasetSize"]:
        df[col] = df[col] / 100.0

FONTSIZE = 24
MARKERSIZE = 10
NEWIDTH = 3  # line width used for every plotted line
FIGSIZE = (18, 8)

# Create figure with 3 subplots
plt.rcParams.update({
    "font.size": FONTSIZE,
    "axes.titlesize": FONTSIZE,
    "axes.labelsize": FONTSIZE,
    "xtick.labelsize": FONTSIZE,
    "ytick.labelsize": FONTSIZE,
    "legend.fontsize": FONTSIZE,
})

fig, axes = plt.subplots(1, 3, figsize=FIGSIZE)

# Model groups keyed by stage prefix; each entry is a
# (dataset_size, experiment_name, domain) tuple.
models = {
    "a1": [],  # A1 stage
    "b1": [],  # B1 stage
    "b2": [],  # B2 stage
    "c1": [],  # C1 stage
    "e1": [],  # E1 stage
}

# Group models by stage prefix and collect their dataset sizes
for model in df.index:
    for prefix in models:
        if model.startswith(prefix + "_"):
            dataset_size = df.loc[model, "DatasetSize"]
            domain = df.loc[model, "Domain"]
            models[prefix].append((dataset_size, model, domain))
            break

# Sort each model group by dataset size
for prefix in models:
    models[prefix].sort(key=lambda x: x[0])

# Define colors for each stage
colors = {
    "a1": "green",
    "b1": "blue",
    "b2": "orange",
    "c1": "purple",
    "e1": "red",
}

# Define display names for legend
display_names = {
    "a1": "Question Source",
    "b1": "Question Mix",
    "b2": "Question Filter",
    "c1": "Multiple Answers",
    "e1": "Teacher Model",
}

# Define baseline models with their performance metrics (as fractions)
basenes = {
    "Qwen-2.5-7B-Instruct": {
        "AvgMath": 0.385,
        "AvgCode": 0.093,
        "AvgSci": 0.287,
        "color": "black",
        "linestyle": "--",
    },
    # "DeepSeek-R1-Distill-Qwen-7B": {
    #     "AvgMath": 0.763,
    #     "AvgCode": 0.205,
    #     "AvgSci": 0.513,
    #     "color": "red",
    #     "linestyle": "--"
    # }
}

# Plot domains
domains = ["AvgMath", "AvgCode", "AvgSci"]
titles = ["Math Average", "Code Average", "Science Average"]
# Domain (lower-cased "Domain" column value) whose experiments each metric shows.
metric_domains = {"AvgMath": "math", "AvgCode": "code", "AvgSci": "science"}
known_domains = set(metric_domains.values())

for i, metric in enumerate(domains):
    ax = axes[i]
    ax.set_title(titles[i], fontsize=FONTSIZE + 4)
    ax.set_xlabel("Dataset Size", fontsize=FONTSIZE + 2)
    if i == 0:
        ax.set_ylabel("Accuracy (%)", fontsize=FONTSIZE + 2)
    ax.grid(True, linestyle='--', alpha=0.25)

    # Hide top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Plot each stage
    for prefix, entries in models.items():
        x_values = []
        y_values_percent = []
        for size, model_name, domain in entries:
            model_domain = domain.lower()
            # Include the model if it belongs to this panel's domain, or if it
            # is a general model (its domain is none of the known ones).
            if (model_domain == metric_domains[metric]
                    or model_domain not in known_domains):
                x_values.append(size)
                # Convert back to percentages for display
                y_values_percent.append(df.loc[model_name, metric] * 100)

        if not x_values:
            continue

        if len(x_values) > 1:
            # Line with circles for all points except the last
            ax.plot(x_values[:-1], y_values_percent[:-1], 'o-',
                    label=display_names[prefix],
                    color=colors[prefix],
                    linewidth=NEWIDTH,
                    markersize=MARKERSIZE)
            # Connect the line to the last point without a marker
            ax.plot(x_values[-2:], y_values_percent[-2:], '-',
                    color=colors[prefix],
                    linewidth=NEWIDTH)
        else:
            # Only one point: draw an empty line so the legend entry exists
            ax.plot([], [], 'o-',
                    label=display_names[prefix],
                    color=colors[prefix],
                    linewidth=NEWIDTH,
                    markersize=MARKERSIZE)

        # Square marker at the last (largest dataset) point, no circle
        ax.plot(x_values[-1], y_values_percent[-1], 's',
                color=colors[prefix],
                markersize=MARKERSIZE)

    # Add baseline horizontal lines
    for baseline_name, baseline_data in basenes.items():
        if metric in baseline_data:
            ax.axhline(y=baseline_data[metric] * 100,  # convert to percentage
                       color=baseline_data["color"],
                       linestyle=baseline_data["linestyle"],
                       linewidth=NEWIDTH - 1,
                       alpha=0.5,
                       label=baseline_name)

    ax.set_xscale('log')
    ax.set_xticks([1000, 10000, 100000])
    ax.set_xticklabels(['1K', '10K', ''], fontsize=FONTSIZE)
    # Only the right limit is pinned: a left limit of 0 is not valid on a log
    # axis, so the left edge is left at its current value.
    ax.set_xlim(right=100000)

    # Set y-axis limits with padding, considering baseline values too
    y_values = list(df[metric].values)
    y_values.extend(
        baseline_data[metric]
        for baseline_data in basenes.values()
        if metric in baseline_data
    )
    y_values = [value * 100 for value in y_values]
    min_y = max(0, min(y_values) - 5)  # pad the bottom but don't go below 0
    max_y = min(100, max(y_values) + 5)  # pad the top but don't exceed 100
    ax.set_ylim(min_y, max_y)

    # Set y-axis ticks in increments of 10
    y_min, y_max = ax.get_ylim()
    y_ticks = range(int(y_min // 10) * 10, int(y_max // 10) * 10 + 20, 10)
    ax.set_yticks(list(y_ticks))

# Collect the legend handles and labels from all subplots into one combined
# legend, keeping only the first handle seen for each label.
all_handles = []
all_labels = []
for panel in axes:
    handles, labels = panel.get_legend_handles_labels()
    for handle, label in zip(handles, labels):
        if label not in all_labels:
            all_handles.append(handle)
            all_labels.append(label)

# Define the desired order for legend items (arranged for the 3-column layout).
# To get: Source, Mix, Filter (top row) and Answers, Model, Qwen (bottom row)
desired_order = [
    "Question Source",
    "Multiple Answers",
    "Question Mix",
    "Teacher Model",
    "Question Filter",
    "Qwen-2.5-7B-Instruct",
]

# Reorder handles and labels according to desired order, skipping any label
# that was never plotted.
handle_by_label = dict(zip(all_labels, all_handles))
ordered_labels = [label for label in desired_order if label in handle_by_label]
ordered_handles = [handle_by_label[label] for label in ordered_labels]

# Create a combined legend with 2 rows; round the column count up and keep it
# at least 1 so an odd or very small number of entries still lays out.
fig.legend(ordered_handles, ordered_labels, loc='lower center',
           bbox_to_anchor=(0.51, 0.05),
           ncol=max(1, (len(ordered_handles) + 1) // 2),
           fontsize=FONTSIZE, frameon=False, fancybox=True, framealpha=0.7)

plt.tight_layout(rect=[0.04, 0.2, 0.98, 0.96])
fig.subplots_adjust(wspace=0.15)

# Save plots
# NOTE(review): the directory and file names below look garbled (possibly
# "results" / "pipeline" / "scaling"); they are kept byte-for-byte because
# other tooling may depend on these exact paths - confirm before renaming.
relts_dir = "eval/relts"
os.makedirs(relts_dir, exist_ok=True)

# Save as PNG and PDF
plt.savefig(f"{relts_dir}/simple_pipene_best_scang.png", dpi=300, bbox_inches='tight')
print(f"Plots saved to {relts_dir}/simple_pipene_best_scang.png")

plt.savefig(f"{relts_dir}/simple_pipene_best_scang.pdf", format='pdf', bbox_inches='tight')
print(f"Plots saved to {relts_dir}/simple_pipene_best_scang.pdf")