"""Compare token-count distributions of the Nemotron and OpenThoughts samples.

The script tokenizes the first two turns of every conversation in both
datasets, plots overlaid histograms of the question / response / full
conversation token counts, prints how many responses fall below a length
threshold, and finally breaks the short Nemotron responses down by generator,
reasoning setting and category.  Both figures are written to ``figures/``.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# Load both datasets
ds_nemotron = load_dataset("/nemotron-sft_100000", split="train")
ds_openthought = load_dataset("/openthoughts_100000", split="train")
print("Nemotron Dataset:", ds_nemotron)
print("OpenThoughts Dataset:", ds_openthought)

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True
)


def count_tokens(example):
    """Annotate one dataset row with its token counts.

    The first entry of ``example["conversations"]`` is treated as the question
    and the second as the response (their text is read from the ``"value"``
    key).  Three integer fields are added to the row:

    * ``question_num_tokens`` -- length of the encoded question,
    * ``response_num_tokens`` -- length of the encoded response,
    * ``conversation_num_tokens`` -- length of both turns rendered through
      the tokenizer's chat template.

    Args:
        example: A dataset row containing a ``"conversations"`` list with at
            least two turns.

    Returns:
        The same ``example`` mapping with the three count fields added.
    """
    conversation = example["conversations"]
    question = conversation[0]["value"]
    response = conversation[1]["value"]

    example["question_num_tokens"] = len(tokenizer.encode(question))
    example["response_num_tokens"] = len(tokenizer.encode(response))

    # Create a proper conversation format for the tokenizer's chat template
    chat_conversation = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": response},
    ]
    # tokenize=True is spelled out so that len() counts token ids rather than
    # characters of the rendered template string.
    example["conversation_num_tokens"] = len(
        tokenizer.apply_chat_template(chat_conversation, tokenize=True)
    )
    return example


def _shared_bins(first, second, num_edges=50):
    """Return ``num_edges`` evenly spaced bin edges spanning both samples.

    Sharing one set of edges keeps the two overlaid histograms comparable.
    """
    lowest = min(min(first), min(second))
    highest = max(max(first), max(second))
    return np.linspace(lowest, highest, num_edges)


def _plot_dataset_comparison(ax, nemotron_values, openthought_values, title):
    """Overlay the Nemotron and OpenThoughts histograms of one metric on ``ax``."""
    bins = _shared_bins(nemotron_values, openthought_values)
    ax.hist(
        nemotron_values,
        bins=bins,
        color=nemotron_color,
        alpha=0.5,
        label="Nemotron",
    )
    ax.hist(
        openthought_values,
        bins=bins,
        color=openthoughts_color,
        alpha=0.5,
        label="OpenThoughts",
    )
    ax.set_title(title)
    ax.set_ylabel("Frequency")
    ax.legend()


def _plot_breakdown(
    ax, frame, column, title, legend_fontsize, colors=None, label_prefix=""
):
    """Overlay one response-length histogram per distinct value of ``column``.

    Args:
        ax: Axes to draw on.
        frame: DataFrame with ``column`` and ``response_num_tokens``.
        column: Name of the grouping column.
        title: Axes title.
        legend_fontsize: Font size passed to ``ax.legend``.
        colors: Optional palette; when omitted, one ``tab10`` colour is
            sampled per group.  A palette shorter than the number of groups
            is cycled instead of raising ``IndexError``.
        label_prefix: Text prepended to each group's legend label.

    Note: ``bins=50`` is resolved per group, so bin edges are not shared
    between the overlaid histograms.
    """
    values = frame[column].unique()
    if colors is None:
        colors = plt.cm.tab10(np.linspace(0, 1, len(values)))

    for index, value in enumerate(values):
        subset = frame[frame[column] == value]
        ax.hist(
            subset["response_num_tokens"],
            bins=50,
            alpha=0.6,
            label=f"{label_prefix}{value}" if label_prefix else value,
            color=colors[index % len(colors)],
        )
    ax.set_title(title, fontsize=20)
    ax.set_ylabel("Frequency", fontsize=18)
    ax.legend(fontsize=legend_fontsize)


num_cpus = os.cpu_count()
nemotron_counts = ds_nemotron.map(count_tokens, num_proc=num_cpus)
openthought_counts = ds_openthought.map(count_tokens, num_proc=num_cpus)

# Set global font sizes
plt.rcParams.update(
    {
        "font.size": 16,
        "axes.titlesize": 20,
        "axes.labelsize": 18,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "legend.fontsize": 16,
        "figure.titlesize": 24,
    }
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)

# Create figure with subplots for all token counts
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(14, 14))
fig.suptitle("Token Count Distributions by Component")

# Define consistent colors for each dataset
nemotron_color = "blue"
openthoughts_color = "red"

# One panel per metric: question, response, and full conversation token counts
comparison_panels = [
    (ax1, "question_num_tokens", "Question Token Counts"),
    (ax2, "response_num_tokens", "Response Token Counts"),
    (ax3, "conversation_num_tokens", "Total Conversation Token Counts"),
]
for panel_ax, column, panel_title in comparison_panels:
    _plot_dataset_comparison(
        panel_ax,
        nemotron_counts[column],
        openthought_counts[column],
        panel_title,
    )
ax3.set_xlabel("Number of Tokens")

plt.tight_layout()
plt.savefig("figures/nemotron_vs_openthoughts_token_histograms.png")

threshold = 1000
nemotron_short = sum(
    1 for r in nemotron_counts["response_num_tokens"] if r < threshold
)
openthought_short = sum(
    1 for r in openthought_counts["response_num_tokens"] if r < threshold
)
print(
    f"Nemotron responses (out of 100k) with length < {threshold:,} tokens: {nemotron_short:,}"
)
print(
    f"OpenThoughts responses (out of 100k) with length < {threshold:,} tokens: {openthought_short:,}"
)

# Convert to pandas dataframe for easier analysis
df = nemotron_counts.remove_columns(["conversations", "question", "answer"]).to_pandas()
print(df["generator"].value_counts())
print(df["reasoning"].value_counts())
print(df["category"].value_counts())

# Filter responses to those below the threshold for better visualization
filtered_df = df[df["response_num_tokens"] < threshold].copy()
print("\nAfter token length filter:")
print(filtered_df["generator"].value_counts())
print(filtered_df["reasoning"].value_counts())
print(filtered_df["category"].value_counts())

# Create a new figure with three subplots for Nemotron dataset breakdowns
fig2, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 18))
fig2.suptitle("Nemotron Response Token Distributions by Properties", fontsize=24)

# 1. Plot token histograms for each generator
_plot_breakdown(
    ax1,
    filtered_df,
    "generator",
    title="Response Token Counts by Generator",
    legend_fontsize=12,
)

# 2. Plot token histograms for reasoning on/off
_plot_breakdown(
    ax2,
    filtered_df,
    "reasoning",
    title="Response Token Counts by Reasoning Setting",
    legend_fontsize=14,
    colors=["green", "orange"],
    label_prefix="Reasoning: ",
)

# 3. Plot token histograms for each category
_plot_breakdown(
    ax3,
    filtered_df,
    "category",
    title="Response Token Counts by Category",
    legend_fontsize=14,
)
ax3.set_xlabel("Number of Tokens", fontsize=18)

plt.tight_layout()
plt.savefig("figures/nemotron_breakdowns_token_histograms.png")