"""Plot the best model of each pipeline stage, one line per domain.

The script reads a hardcoded CSV of benchmark scores, maps every experiment
to a pipeline stage via its name prefix (``a1_``, ``b1_``, ``b2_``, ``c1_``,
``d1_``, ``e1_``), and draws the ``AvgAll`` score of each experiment against
its stage.  Each domain (math, code, science) gets its own line and marker
shape, each stage its own marker colour, and a dashed horizontal line marks
the baseline model.  The figure is written as PNG and PDF under
``eval/results``.
"""

import io
import os

import matplotlib.pyplot as plt
import pandas as pd

# Hardcoded CSV data for the best pipeline models from each stage.
# Scores are given in percent.
pipeline_csv = """Experiments,Domain,DatasetSize,AvgAll,AIME24,AMC23,MATH500,AvgMath,JEEBench,GPQAD,AvgSci,LCBv2,CodeElo,CodeForces,AvgCode
a1_math_open2math,Math,31600.0,38.1,26.0,67.8,82.8,58.9,44.8,46.3,45.6,24.0,6.1,7.1,12.4
b1_math_top_1,Math,31600.0,37.6,25.7,71.8,83.0,60.2,44.0,39.7,41.8,23.0,6.5,6.8,12.1
b2_math_length,Math,31600.0,41.9,32.7,79.0,86.2,66.0,51.5,45.8,48.6,24.3,7.4,8.6,13.4
c1_math_0d_16s,Math,31597.0,40.1,32.3,74.0,84.4,63.6,49.8,47.6,48.7,17.9,7.8,7.3,11.0
e1_math_all_qwq_together,Math,31600.0,44.2,43.0,83.2,88.4,71.5,58.9,47.5,53.2,18.3,7.5,7.0,10.9
a1_code_code_golf,Code,31600.0,38.8,17.7,58.0,77.0,50.9,38.8,42.6,40.7,44.4,14.4,17.2,25.3
b1_code_top_2,Code,31600.0,41.3,24.0,65.0,75.0,54.7,44.7,39.6,42.2,46.7,16.4,18.8,27.3
b2_code_difficulty,Code,31597.0,43.0,21.0,69.5,77.4,56.0,47.3,45.5,46.4,47.4,16.1,19.6,27.7
c1_code_0d_16s,Code,31600.0,40.6,20.3,61.0,79.2,53.5,43.1,45.1,44.1,43.6,13.9,18.8,25.4
e1_code_fasttext_qwq_together,Code,31600.0,44.2,27.0,68.8,80.4,58.7,49.2,39.9,44.6,51.7,17.5,19.4,29.5
a1_science_stackexchange_physics,Science,31600.0,34.3,19.7,57.5,75.6,50.9,43.9,42.6,43.2,22.7,5.3,7.6,11.9
b1_science_top_2,Science,31600.0,33.7,17.7,58.7,74.6,50.3,42.7,47.0,44.8,16.6,4.7,7.2,9.5
b2_science_length,Science,31598.0,35.9,21.7,61.3,80.2,54.4,49.2,52.5,50.8,12.9,4.2,4.9,7.3
c1_science_0d_16s,Science,31600.0,36.2,21.0,65.0,77.4,54.5,46.3,53.0,49.6,15.7,4.9,6.4,9.0
e1_science_longest_qwq_together,Science,31600.0,39.1,32.0,72.2,82.0,62.1,53.3,42.8,48.0,15.2,6.8,8.2,10.1
"""

# Convert string to dataframe, indexed by experiment name.
df = pd.read_csv(io.StringIO(pipeline_csv)).set_index("Experiments")

# Convert percentages to fractions for every score column, so the frame uses
# the same unit as the baseline table below.  Values are scaled back to
# percent only at plotting time.
metric_columns = [
    col for col in df.columns if col not in ("Domain", "DatasetSize")
]
df[metric_columns] = df[metric_columns] / 100.0

FONTSIZE = 20
MARKERSIZE = 12
LINEWIDTH = 3
FIGSIZE = (12, 8)

# Create figure with a single subplot.
plt.rcParams.update({
    "font.size": FONTSIZE,
    "axes.titlesize": FONTSIZE,
    "axes.labelsize": FONTSIZE,
    "xtick.labelsize": FONTSIZE,
    "ytick.labelsize": FONTSIZE,
    "legend.fontsize": FONTSIZE,
})
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

# Experiment-name prefix -> pipeline stage.  The insertion order of this
# mapping is also the left-to-right order of the stages on the x-axis.
STAGE_BY_PREFIX = {
    "a1_": "Instruction Sourcing",
    "b1_": "Instruction Mixing",
    "b2_": "Instruction Filtering",
    "c1_": "Answer Sampling",
    "d1_": "Answer Filtering",
    "e1_": "Answer Model",
}
stage_order = list(STAGE_BY_PREFIX.values())

# x-position of every known stage.
stage_positions = {stage: i for i, stage in enumerate(stage_order)}

# Group models by domain and identify their pipeline stage.
models = {
    "math": [],
    "code": [],
    "science": [],
}
for model, domain, dataset_size in zip(
    df.index, df["Domain"], df["DatasetSize"]
):
    stage = next(
        (
            name
            for prefix, name in STAGE_BY_PREFIX.items()
            if model.startswith(prefix)
        ),
        "Other",
    )
    models[domain.lower()].append((stage, model, dataset_size))

# Sort each model group by stage order; unrecognised stages go last.
for model_list in models.values():
    model_list.sort(
        key=lambda entry: stage_positions.get(entry[0], len(stage_order))
    )

# Define colors for each pipeline stage.
stage_colors = {
    "Instruction Sourcing": "#1f77b4",   # Blue
    "Instruction Mixing": "#ff7f0e",     # Orange
    "Instruction Filtering": "#2ca02c",  # Green
    "Answer Sampling": "#9467bd",        # Purple
    "Answer Filtering": "#8c564b",       # Brown
    "Answer Model": "#d62728",           # Red
}

# Define marker shapes for each domain.
domain_shapes = {
    "math": "o",     # Circle
    "code": "s",     # Square
    "science": "^",  # Triangle
}

# Define display names for domains (used as legend labels).
domain_display_names = {
    "math": "Math",
    "code": "Code",
    "science": "Science",
}

# Define baseline models with their performance metrics.
baselines = {
    "Qwen-2.5-7B-Instruct": {
        "AvgAll": 0.29,  # Average of all benchmarks, as a fraction
        "color": "black",
        "linestyle": "--",
    },
}

ax.set_title("Pipeline Performance by Stage", fontsize=FONTSIZE + 4)
ax.set_xlabel("Pipeline Stage", fontsize=FONTSIZE)
ax.set_ylabel("Accuracy (%)", fontsize=FONTSIZE)
ax.grid(True, linestyle="--", alpha=0.25)

# Hide top and right spines.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Plot models for each domain, remembering every plotted score so the y-axis
# limits can be derived from exactly what is on the figure.
all_y_values = []
for domain_key, model_list in models.items():
    # A model whose prefix matched no known stage ("Other") has no
    # x-position; skip it instead of failing on the position lookup.
    plotted = [
        (stage, model_name)
        for stage, model_name, _dataset_size in model_list
        if stage in stage_positions
    ]
    if not plotted:
        continue

    x_positions = [stage_positions[stage] for stage, _ in plotted]
    # Convert the stored fractions back to percentages.
    y_values_pct = [df.loc[name, "AvgAll"] * 100 for _, name in plotted]
    colors = [stage_colors.get(stage, "gray") for stage, _ in plotted]
    all_y_values.extend(y_values_pct)

    # Plot the line connecting the points for this domain.
    ax.plot(
        x_positions,
        y_values_pct,
        marker=domain_shapes[domain_key],
        color="black",
        linewidth=LINEWIDTH,
        markersize=MARKERSIZE,
        alpha=0.7,
        label=domain_display_names[domain_key],
    )

    # Overlay the individual points in their stage colors.
    ax.scatter(
        x_positions,
        y_values_pct,
        marker=domain_shapes[domain_key],
        c=colors,
        s=MARKERSIZE * 20,
        zorder=5,
        edgecolors="black",
        linewidths=1,
    )

# Set x-axis tick labels to the stage abbreviations.
stage_abbrevs = {
    "Instruction Sourcing": "IS",
    "Instruction Mixing": "IM",
    "Instruction Filtering": "IF",
    "Answer Sampling": "AS",
    "Answer Filtering": "AF",
    "Answer Model": "AM",
}
abbrev_labels = [stage_abbrevs[stage] for stage in stage_order]
ax.set_xticks(range(len(stage_order)))
ax.set_xticklabels(abbrev_labels, rotation=0, ha="center")

# Add a horizontal line for every baseline.
for baseline_name, baseline_data in baselines.items():
    ax.axhline(
        y=baseline_data["AvgAll"] * 100,  # Convert to percentage
        color=baseline_data["color"],
        linestyle=baseline_data["linestyle"],
        linewidth=LINEWIDTH - 1,
        alpha=0.5,
        label=baseline_name,
    )

# Include the baseline values in the y-limit calculation.
all_y_values.extend(
    baseline_data["AvgAll"] * 100 for baseline_data in baselines.values()
)

# Set y-axis limits with 5 points of padding, clamped to [0, 100].
if all_y_values:
    min_y = max(0, min(all_y_values) - 5)
    max_y = min(100, max(all_y_values) + 5)
    ax.set_ylim(min_y, max_y)

    # Set y-axis ticks in increments of 10.
    # NOTE(review): the first/last tick can lie outside the padded limits
    # set above, and Matplotlib may widen the y-range to show them --
    # confirm this is the intended look.
    y_min, y_max = ax.get_ylim()
    y_ticks = list(
        range(int(y_min // 10) * 10, int(y_max // 10) * 10 + 20, 10)
    )
    ax.set_yticks(y_ticks)

# Create a combined legend (domain lines plus baselines), without a frame.
handles, labels = ax.get_legend_handles_labels()
if handles:
    ax.legend(
        handles,
        labels,
        loc="upper left",
        fontsize=FONTSIZE - 2,
        frameon=False,
    )

# Save the plot as PNG and PDF.
results_dir = "eval/results"
os.makedirs(results_dir, exist_ok=True)
plt.tight_layout()

for extension in ("png", "pdf"):
    output_path = f"{results_dir}/simple_pipeline_best_combined.{extension}"
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    print(f"Plots saved to {output_path}")