#!/usr/bin/env python3
"""
Filter and format the domain scaling dataset for final plotting.

This script focuses on the main model families and their scaling curves.
"""

import pandas as pd
import numpy as np
import os

# NOTE(review): several string literals below ('relts', 'scang', 'no_pipene_*')
# look like they have lost characters (presumably 'results', 'scaling',
# 'no_pipeline_*'). They are kept exactly as found because they are matched
# against file names / CSV contents at runtime -- confirm against the data.
DEFAULT_INPUT_CSV = '/Users//dcft_private/eval/relts/domain_scang_updated.csv'
DEFAULT_OUTPUT_DIR = "eval/relts"
OUTPUT_FILENAME = "domain_scang_final.csv"

# Name prefixes of the core model families tracked for domain scaling.
CORE_FAMILIES = (
    'openthoughts3_math',
    'openthoughts3_code',
    'openthoughts3_science',
    'no_pipene_math',
    'no_pipene_code',
    'no_pipene_science',
    'nemo_nano_math',
    'nemo_nano_code',
    'nemo_nano_science',
)
# Substrings that disqualify a core-family name (checkpoints, special variants).
CORE_EXCLUDED_TOKENS = ('ckpt', 'buggy', 'annotated', 'filtered')

# Substrings identifying large-scale models kept for comparison.
LARGE_SCALE_MARKERS = ('openthoughts2_1000k', 'nemo_nano_1000k', 'openthoughts3_300k')
# Note: unlike the core families, 'filtered' variants are NOT excluded here.
LARGE_SCALE_EXCLUDED_TOKENS = ('ckpt', 'buggy', 'annotated')

# Ordered (substring, label) pairs; the first substring found in the name wins.
DOMAIN_MARKERS = (
    ('_math', 'Math'),
    ('_code', 'Code'),
    ('_science', 'Science'),
)
# Size tokens that mark a large-scale mixed-domain model.
MIXED_SIZE_TOKENS = ('1000k', '300k')

# Columns of the final dataset, in output order.
REQUIRED_COLUMNS = [
    'Experiments', 'Domain', 'DatasetSize', 'AvgAll',
    'AIME24', 'AMC23', 'MATH500', 'AvgMath',
    'JEEBench', 'GPQAD', 'AvgSci',
    'LCBv2', 'CodeElo', 'CodeForces', 'AvgCode',
]


def _contains_any(text, tokens):
    """Return True if any of *tokens* occurs as a substring of *text*."""
    return any(token in text for token in tokens)


def _is_tracked_model(model_name):
    """Return True if *model_name* is a core-family or large-scale model."""
    is_core_family = (
        model_name.startswith(CORE_FAMILIES)
        and not _contains_any(model_name, CORE_EXCLUDED_TOKENS)
    )
    is_large_scale = (
        _contains_any(model_name, LARGE_SCALE_MARKERS)
        and not _contains_any(model_name, LARGE_SCALE_EXCLUDED_TOKENS)
    )
    return is_core_family or is_large_scale


def _classify_domain(model_name, fallback):
    """Return the domain label for *model_name*, or *fallback* if none applies."""
    for marker, label in DOMAIN_MARKERS:
        if marker in model_name:
            return label
    # Large-scale models without a domain token are mixed-domain. This also
    # covers the exact name 'openthoughts3_300k', so no separate check is needed.
    if _contains_any(model_name, MIXED_SIZE_TOKENS):
        return 'Mixed'
    return fallback


def main(input_csv: str = DEFAULT_INPUT_CSV, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Create a clean domain scaling dataset.

    Reads the results CSV, keeps only the tracked models, assigns each a
    domain label, restricts/reorders the columns to ``REQUIRED_COLUMNS``
    (missing ones are filled with NaN), sorts by ``AvgAll`` descending,
    writes the result to ``<output_dir>/domain_scang_final.csv`` and prints
    a short summary.

    Args:
        input_csv: Path of the CSV to read. It must contain an
            ``Experiments`` column holding the model names.
        output_dir: Directory the final CSV is written to; created if needed.

    Returns:
        A ``(clean_df, output_file)`` tuple: the filtered DataFrame and the
        path of the CSV that was written.
    """
    df = pd.read_csv(input_csv)

    # Missing names become '' so they simply fail to match instead of crashing.
    names = df['Experiments'].fillna('').astype(str)
    keep = np.array([_is_tracked_model(name) for name in names], dtype=bool)

    # A boolean mask (rather than rebuilding the frame row by row) preserves
    # column dtypes; .copy() makes the column assignment below safe.
    clean_df = df.loc[keep].copy()
    kept_names = names[keep]

    # Add proper domain classification, falling back to the existing value.
    if 'Domain' in clean_df.columns:
        fallback_domains = clean_df['Domain'].tolist()
    else:
        fallback_domains = [np.nan] * len(clean_df)
    clean_df['Domain'] = [
        _classify_domain(name, fallback)
        for name, fallback in zip(kept_names, fallback_domains)
    ]

    # Keep exactly the required columns, in order; absent ones become NaN.
    clean_df = clean_df.reindex(columns=REQUIRED_COLUMNS)

    # Sort by average performance (NaN scores end up last).
    clean_df = clean_df.sort_values('AvgAll', ascending=False)

    # Save the cleaned dataset.
    os.makedirs(output_dir, exist_ok=True)
    output_file = f"{output_dir}/{OUTPUT_FILENAME}"
    clean_df.to_csv(output_file, index=False)

    print("=== DOMAIN SCANG FINAL DATASET ===")
    print(f"Total models: {len(clean_df)}")
    print(f"Saved to: {output_file}")

    # Print summary by domain.
    print("\n=== MODELS BY DOMAIN ===")
    for domain, count in clean_df['Domain'].value_counts().items():
        print(f"{domain}: {count} models")

    # Print dataset size range (only when at least one size is known).
    sizes = clean_df['DatasetSize'].dropna()
    if not sizes.empty:
        print("\n=== DATASET SIZES ===")
        print(f"Range: {sizes.min():.0f} to {sizes.max():.0f} samples")
        print(f"Models with size info: {len(sizes)}/{len(clean_df)}")

    # Show top 15 models (the frame is already sorted by AvgAll).
    print("\n=== TOP 15 MODELS ===")
    for row in clean_df.head(15).itertuples(index=False):
        size_str = f"({row.DatasetSize:.0f})" if pd.notna(row.DatasetSize) else "(Unknown)"
        print(f"{row.Experiments} {size_str}: {row.AvgAll:.1f}% ({row.Domain})")

    return clean_df, output_file


if __name__ == "__main__":
    main()