"""Count rows of the Llama-Nemotron post-training SFT dataset.

Loads the metadata columns of every SFT split, then prints row counts by
generator and category: one table for the full dataset and one table per
model named in the ``used_in_training`` column.  Each table is printed in
plain-text form and again as Markdown, followed by a short summary of which
generators produced the rows *not* used for each model.
"""

import os
from typing import Optional, Sequence

import pandas as pd

DATASET_NAME = "nvidia/Llama-Nemotron-Post-Training-Dataset-v1"
DATASET_CONFIG = "SFT"
SPLITS = ["code", "math", "science", "chat", "safety"]
COLUMNS = ["used_in_training", "reasoning", "generator", "category"]
MODELS = ["Ultra", "Super", "Nano"]
ALL_ROW = "ALL"


def load_metadata() -> pd.DataFrame:
    """Download all SFT splits and return the metadata columns as a DataFrame."""
    # Imported lazily so the pure-pandas helpers below can be imported (and
    # tested) without the `datasets` package or network access.
    from datasets import concatenate_datasets, load_dataset

    # Passing a list of splits makes load_dataset return one dataset per split.
    parts = load_dataset(DATASET_NAME, DATASET_CONFIG, split=SPLITS)
    merged = concatenate_datasets(parts)
    return merged.select_columns(COLUMNS).to_pandas()


def used_by_model_mask(df: pd.DataFrame, model: str) -> pd.Series:
    """Return a boolean mask of rows whose ``used_in_training`` mentions *model*.

    Missing values count as "not used".
    """
    return df["used_in_training"].str.contains(model, na=False, regex=False)


def count_by_generator_and_category(
    df: pd.DataFrame,
    total_column: str,
    generators: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Pivot *df* into a generator x category table of row counts.

    Args:
        df: Frame with at least ``generator`` and ``category`` columns.
        total_column: Name of the extra column holding each row's sum across
            all categories.
        generators: If given, the result has exactly these rows in this order;
            generators absent from *df* get a row of zeros.

    Returns:
        The count table, without the ``ALL`` summary row.
    """
    table = pd.pivot_table(
        df, index="generator", columns="category", aggfunc="size", fill_value=0
    )
    table[total_column] = table.sum(axis=1)
    if generators is not None:
        # One reindex both adds the missing generators (as zeros) and puts the
        # rows in the same order as the reference table.
        table = table.reindex(generators, fill_value=0)
    return table


def add_all_row(table: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *table* with an ``ALL`` row summing every column."""
    result = table.copy()
    result.loc[ALL_ROW] = result.sum()
    return result


def format_number(num) -> str:
    """Format a count with K / M suffixes.

    Values of one million or more are shown with one decimal and an ``M``
    suffix, values of one thousand or more are rounded to whole thousands
    with a ``K`` suffix, and smaller values are returned unchanged as text.
    """
    if num == 0:
        return "0"
    if num >= 1_000_000:
        return f"{num / 1_000_000:.1f}M"
    if num >= 1_000:
        return f"{num / 1_000:.0f}K"
    return str(num)


def main() -> None:
    """Load the dataset metadata and print all count tables."""
    df = load_metadata()

    # Show the value distribution of each metadata column for debugging
    for column in ("used_in_training", "category", "generator", "reasoning"):
        print(df[column].value_counts(), "\n")

    # Count rows by generator and category, with a full_dataset column that
    # sums across all categories
    pivot_df = count_by_generator_and_category(df, "full_dataset")

    # Capture the generators before the ALL row is added so the per-model
    # tables can be aligned to the same rows
    generators = pivot_df.index.tolist()
    pivot_df = add_all_row(pivot_df)

    # Build one table per model from the rows used in that model's training
    model_pivots = {}
    for model in MODELS:
        df_model = df[used_by_model_mask(df, model)]
        pivot_model_df = count_by_generator_and_category(
            df_model, f"total_{model.lower()}", generators=generators
        )
        model_pivots[model] = add_all_row(pivot_model_df)

    # Format values with K and M suffixes
    formatted_df = pivot_df.map(format_number)
    formatted_model_pivots = {
        model: pivot.map(format_number) for model, pivot in model_pivots.items()
    }

    # Display the formatted tables
    print("\nCount of all rows by generator and category:")
    print(formatted_df)

    for model in MODELS:
        print(f"\nCount of rows used in {model} training by generator and category:")
        print(formatted_model_pivots[model])

    # Print tables in markdown format
    print("\n\n--- MARKDOWN FORMAT ---\n")
    print("\nCount of all rows by generator and category (Markdown):")
    print(formatted_df.to_markdown())

    for model in MODELS:
        print(
            f"\nCount of rows used in {model} training by generator and category (Markdown):"
        )
        print(formatted_model_pivots[model].to_markdown())

    # Print summary of data not used in each model's training
    for model in MODELS:
        df_notused = df[~used_by_model_mask(df, model)]
        print(f"\nGenerators in data not used for {model} training:")
        # Show top 10 to keep output manageable
        print(df_notused["generator"].value_counts().head(10))


if __name__ == "__main__":
    main()