"""Plot the best model of each data-pipeline stage for math, code and science.

The script reads a hardcoded CSV of benchmark results, groups the models by
domain and pipeline stage (derived from the experiment-name prefix), and draws
one panel per domain showing the domain-average accuracy across stages, with a
horizontal baseline for reference.  The figure is saved as PNG and PDF.
"""
import io
import os

import matplotlib.pyplot as plt
import pandas as pd

# Hardcoded CSV data for the best pipeline models from each stage.
pipeline_csv = """Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode
a1_math_open2math,Math,31600.0,38.1,26.0,67.8,82.8,58.9,44.8,46.3,45.6,24.0,6.1,7.1,12.4
b1_math_top_1,Math,31600.0,37.6,25.7,71.8,83.0,60.2,44.0,39.7,41.8,23.0,6.5,6.8,12.1
b2_math_length,Math,31600.0,41.9,32.7,79.0,86.2,66.0,51.5,45.8,48.6,24.3,7.4,8.6,13.4
c1_math_0d_16s,Math,31597.0,40.1,32.3,74.0,84.4,63.6,49.8,47.6,48.7,17.9,7.8,7.3,11.0
e1_math_all_qwq_together,Math,31600.0,44.2,43.0,83.2,88.4,71.5,58.9,47.5,53.2,18.3,7.5,7.0,10.9
a1_code_code_golf,Code,31600.0,38.8,17.7,58.0,77.0,50.9,38.8,42.6,40.7,44.4,14.4,17.2,25.3
b1_code_top_2,Code,31600.0,41.3,24.0,65.0,75.0,54.7,44.7,39.6,42.2,46.7,16.4,18.8,27.3
b2_code_difficulty,Code,31597.0,43.0,21.0,69.5,77.4,56.0,47.3,45.5,46.4,47.4,16.1,19.6,27.7
c1_code_0d_16s,Code,31600.0,40.6,20.3,61.0,79.2,53.5,43.1,45.1,44.1,43.6,13.9,18.8,25.4
e1_code_fasttext_qwq_together,Code,31600.0,44.2,27.0,68.8,80.4,58.7,49.2,39.9,44.6,51.7,17.5,19.4,29.5
a1_science_stackexchange_physics,Science,31600.0,34.3,19.7,57.5,75.6,50.9,43.9,42.6,43.2,22.7,5.3,7.6,11.9
b1_science_top_2,Science,31600.0,33.7,17.7,58.7,74.6,50.3,42.7,47.0,44.8,16.6,4.7,7.2,9.5
b2_science_length,Science,31598.0,35.9,21.7,61.3,80.2,54.4,49.2,52.5,50.8,12.9,4.2,4.9,7.3
c1_science_0d_16s,Science,31600.0,36.2,21.0,65.0,77.4,54.5,46.3,53.0,49.6,15.7,4.9,6.4,9.0
e1_science_longest_qwq_together,Science,31600.0,39.1,32.0,72.2,82.0,62.1,53.3,42.8,48.0,15.2,6.8,8.2,10.1
"""

# Parse the CSV and index the rows by experiment name.
df = pd.read_csv(io.StringIO(pipeline_csv)).set_index("Experiments")

# Store every metric column as a fraction in [0, 1], matching the baselines
# below; values are converted back to percentages only when plotted.
metric_columns = [col for col in df.columns if col not in ("Domain", "DatasetSize")]
df[metric_columns] = df[metric_columns] / 100.0

FONTSIZE = 20
MARKERSIZE = 10
LINEWIDTH = 3
FIGSIZE = (18, 6)

# Pipeline stages in pipeline order; this order drives both the x-axis and
# the legend.
STAGE_ORDER = [
    "Instruction Sourcing",
    "Instruction Mixing",
    "Instruction Filtering",
    "Answer Sampling",
    "Answer Filtering",
    "Answer Model",
]
STAGE_RANK = {stage: rank for rank, stage in enumerate(STAGE_ORDER)}

# The three-character experiment-name prefix identifies the pipeline stage.
STAGE_BY_PREFIX = {
    "a1_": "Instruction Sourcing",
    "b1_": "Instruction Mixing",
    "b2_": "Instruction Filtering",
    "c1_": "Answer Sampling",
    "d1_": "Answer Filtering",
    "e1_": "Answer Model",
}

# Short tick labels; the legend spells out what each abbreviation means.
STAGE_ABBREVS = {
    "Instruction Sourcing": "IS",
    "Instruction Mixing": "IM",
    "Instruction Filtering": "IF",
    "Answer Sampling": "AS",
    "Answer Filtering": "AF",
    "Answer Model": "AM",
}

STAGE_COLORS = {
    "Instruction Sourcing": "#1f77b4",   # Blue
    "Instruction Mixing": "#ff7f0e",     # Orange
    "Instruction Filtering": "#2ca02c",  # Green
    "Answer Sampling": "#9467bd",        # Purple
    "Answer Filtering": "#8c564b",       # Brown
    "Answer Model": "#d62728",           # Red
}

# Baseline models with their performance metrics (fractions, like ``df``).
baselines = {
    "Qwen-2.5-7B-Instruct": {
        "AvgMath": 0.437,  # Average of AIME24, AMC23, MATH500
        "AvgCode": 0.164,  # Average of LCBv2, CodeElo, CodeForces
        "AvgSci": 0.287,   # Average of JEEBench, GPQAD
        "color": "black",
        "linestyle": "--",
    }
}

plt.rcParams.update({
    "font.size": FONTSIZE,
    "axes.titlesize": FONTSIZE,
    "axes.labelsize": FONTSIZE,
    "xtick.labelsize": FONTSIZE,
    "ytick.labelsize": FONTSIZE,
    "legend.fontsize": FONTSIZE,
})

# One panel per domain.
fig, axes = plt.subplots(1, 3, figsize=FIGSIZE)

# Group the models by (lower-cased) domain as (stage, model_name) pairs.
models = {"math": [], "code": [], "science": []}
for model in df.index:
    domain = df.loc[model, "Domain"].lower()
    stage = STAGE_BY_PREFIX.get(model[:3], "Other")
    # setdefault keeps an unexpected domain from raising; it is simply not
    # plotted because only the three panels below are drawn.
    models.setdefault(domain, []).append((stage, model))

# Order each group by pipeline stage; unknown stages sort last.
for stage_models in models.values():
    stage_models.sort(key=lambda entry: STAGE_RANK.get(entry[0], 999))

# Plot domains.
domains = ["AvgMath", "AvgCode", "AvgSci"]
titles = ["Math Average", "Code Average", "Science Average"]
domain_keys = ["math", "code", "science"]

for i, (ax, metric, title, domain_key) in enumerate(
    zip(axes, domains, titles, domain_keys)
):
    ax.set_title(title, fontsize=FONTSIZE + 4)
    ax.set_xlabel("Pipeline Stage", fontsize=FONTSIZE)
    if i == 0:
        # Only the left-most panel carries the y-axis label.
        ax.set_ylabel("Accuracy (%)", fontsize=FONTSIZE)
    ax.grid(True, linestyle="--", alpha=0.25)

    # Hide top and right spines.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Collect (x position, accuracy in percent, stage) for this panel.  The
    # list is rebuilt for every panel so that an empty domain can never reuse
    # values left over from the previous one.
    stage_models = models[domain_key]
    points = []
    if metric in df.columns:
        points = [
            (x, df.loc[model_name, metric] * 100, stage)
            for x, (stage, model_name) in enumerate(stage_models)
        ]

    if points:
        # Black line connecting the stages, then one coloured marker per stage.
        ax.plot(
            [x for x, _, _ in points],
            [y for _, y, _ in points],
            "o-",
            color="black",
            linewidth=LINEWIDTH,
            markersize=MARKERSIZE,
            alpha=0.7,
        )
        for x, y, stage in points:
            ax.scatter(
                x,
                y,
                color=STAGE_COLORS.get(stage, "gray"),
                s=MARKERSIZE * 20,
                label=stage,
                zorder=5,
            )

    # Label the x-axis with the stage abbreviations.
    if stage_models:
        ax.set_xticks(range(len(stage_models)))
        ax.set_xticklabels(
            [STAGE_ABBREVS.get(stage, stage) for stage, _ in stage_models],
            rotation=0,
            ha="center",
        )

    # Add baseline horizontal lines.
    for baseline_name, baseline_data in baselines.items():
        if metric in baseline_data:
            ax.axhline(
                y=baseline_data[metric] * 100,  # Convert to percentage
                color=baseline_data["color"],
                linestyle=baseline_data["linestyle"],
                linewidth=LINEWIDTH - 1,
                alpha=0.5,
                label=baseline_name,
            )

    # Set y-axis limits with 5 points of padding, clamped to [0, 100], taking
    # the baselines into account so their lines are always visible.
    if points:
        limit_values = [y for _, y, _ in points]
        limit_values.extend(
            baseline_data[metric] * 100
            for baseline_data in baselines.values()
            if metric in baseline_data
        )
        ax.set_ylim(max(0, min(limit_values) - 5), min(100, max(limit_values) + 5))

        # Ticks in increments of 10 that bracket the limits.  NOTE: matplotlib
        # may widen the view limits so that every requested tick is visible.
        y_min, y_max = ax.get_ylim()
        ax.set_yticks(range(int(y_min // 10) * 10, int(y_max // 10) * 10 + 20, 10))

# Build one combined legend from the unique entries of all panels
# (first handle seen for each label wins).
legend_entries = {}
for ax in axes:
    for handle, label in zip(*ax.get_legend_handles_labels()):
        legend_entries.setdefault(label, handle)

# Pipeline stages first, in pipeline order; everything else (the baselines)
# afterwards, in the order encountered.
ordered_labels = [stage for stage in STAGE_ORDER if stage in legend_entries]
ordered_labels += [label for label in legend_entries if label not in STAGE_RANK]
ordered_handles = [legend_entries[label] for label in ordered_labels]

# Spell out the tick abbreviations in the legend, e.g. "Answer Model (AM)".
updated_labels = [
    f"{label} ({STAGE_ABBREVS[label]})" if label in STAGE_ABBREVS else label
    for label in ordered_labels
]

if ordered_handles:
    # Ceiling division keeps the legend to at most two rows even when the
    # number of entries is odd.
    ncol = max(1, (len(ordered_handles) + 1) // 2)
    fig.legend(
        ordered_handles,
        updated_labels,
        loc="lower center",
        bbox_to_anchor=(0.51, 0),
        ncol=ncol,
        fontsize=FONTSIZE,
        frameon=False,
    )

# Save plots.
# NOTE(review): the directory and file names were restored from garbled text
# ("eval/relts", "simple_pipene_best") -- confirm they match what downstream
# consumers expect.
results_dir = "eval/results"
os.makedirs(results_dir, exist_ok=True)

# Leave room at the bottom of the figure for the combined legend.
plt.tight_layout(rect=[0.04, 0.2, 0.98, 0.96])
fig.subplots_adjust(wspace=0.15)

# Save as PNG and PDF.
plt.savefig(f"{results_dir}/simple_pipeline_best.png", dpi=300, bbox_inches="tight")
print(f"Plots saved to {results_dir}/simple_pipeline_best.png")
plt.savefig(f"{results_dir}/simple_pipeline_best.pdf", dpi=300, bbox_inches="tight")
print(f"Plots saved to {results_dir}/simple_pipeline_best.pdf")