"""Compare per-question response lengths between two math datasets.

Loads two datasets, restricts the second to the ``instruction_seed`` values
present in the first, averages the response length per ``instruction_seed``
in each, and saves a scatter plot annotated with the correlation coefficient
between the two averaged lengths.
"""

import os

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset

# Worker count shared by every `filter` / `map` call below.
NUM_PROC = os.cpu_count()

# NOTE(review): both dataset ids begin with "-dev/", which looks like a
# truncated organisation name -- confirm the full repo ids before running.
ds = load_dataset("-dev/a1_math_open2math", split="train")
r1_seeds = ds.unique('instruction_seed')
print(len(r1_seeds))

ds2 = load_dataset("-dev/open2math-1M-gpt-4.1-mini", split="train")
print(len(ds2.unique('instruction_seed')))

# ds3 = load_dataset("-dev/b1_math_top_1", split="train")
# print(len(ds3.unique('instruction_seed')))

# ds4 = load_dataset("-dev/b2_math_length", split="train")
# print(len(ds4.unique('instruction_seed')))

# Keep only the rows of ds2 whose question also appears in ds; a set gives
# O(1) membership tests inside the filter predicate.
instructions = set(r1_seeds)
ds2 = ds2.filter(lambda x: x['instruction_seed'] in instructions, num_proc=NUM_PROC)
print(len(ds2.unique('instruction_seed')))

# Add response length (len() of the response field; the plot labels below
# describe this as a character count).
ds = ds.map(
    lambda x: {"response_length": len(x["final_reasoning_trace"])},
    num_proc=NUM_PROC,
)
ds2 = ds2.map(
    lambda x: {"response_length": len(x["gpt41_mini_response"])},
    num_proc=NUM_PROC,
)

# Convert to pandas, keeping only the columns needed for the comparison.
ds_pd = ds.to_pandas()[["instruction_seed", "response_length"]]
ds2_pd = ds2.to_pandas()[["instruction_seed", "response_length"]]

# Group by instruction_seed and calculate mean of response_length, so each
# question contributes exactly one point per dataset.
ds_grouped = ds_pd.groupby("instruction_seed")["response_length"].mean().reset_index()
ds2_grouped = ds2_pd.groupby("instruction_seed")["response_length"].mean().reset_index()

# Merge the grouped dataframes (pandas' default inner join). The suffixes
# disambiguate the two "response_length" columns and are relied on below.
merged_df = pd.merge(
    ds_grouped,
    ds2_grouped,
    on="instruction_seed",
    suffixes=('_r1', '_gpt41mini'),
)

# Plot correlation between response lengths and calculate correlation
print(f"Number of data points: {len(merged_df)}")
correlation = merged_df["response_length_r1"].corr(merged_df["response_length_gpt41mini"])
print(f"Correlation coefficient: {correlation:.4f}")

plt.figure(figsize=(10, 8))
plt.scatter(
    merged_df["response_length_r1"],
    merged_df["response_length_gpt41mini"],
    alpha=0.5,
)
plt.xlabel("r1 response length (chars)")
plt.ylabel("gpt4.1-mini response length (chars)")
plt.title(f"Correlation of Response Lengths: R1 vs GPT-4.1-mini (r = {correlation:.4f})")
plt.grid(True, alpha=0.3)
plt.savefig("r1_gpt4.1-mini_length_correlation.png")
plt.close()